summaryrefslogtreecommitdiffstats
path: root/R Scripts
diff options
context:
space:
mode:
authorBen Green <bgreen@g.harvard.edu>2015-08-01 12:01:19 -0400
committerBen Green <bgreen@g.harvard.edu>2015-08-01 12:01:19 -0400
commit9350ee2c6359562a23cf8efdefdd7de80b2a682e (patch)
treeb1217fcc2098bebaa6fed641bb759ac71eae139f /R Scripts
parent37c23e0d1fcbdee116856d59c0dab98686e265ae (diff)
downloadcriminal_cascades-9350ee2c6359562a23cf8efdefdd7de80b2a682e.tar.gz
minor updates
Diffstat (limited to 'R Scripts')
-rwxr-xr-xR Scripts/data-exploration.R4
-rw-r--r--R Scripts/demographics.R24
-rw-r--r--R Scripts/predict-victims-plots.R6
-rw-r--r--R Scripts/predict-victims.R98
4 files changed, 80 insertions, 52 deletions
diff --git a/R Scripts/data-exploration.R b/R Scripts/data-exploration.R
index 404ce9e..11e33cf 100755
--- a/R Scripts/data-exploration.R
+++ b/R Scripts/data-exploration.R
@@ -1,6 +1,6 @@
library(igraph)
-setwd("Documents/Cascade Project/Raw Data/")
-load('chi-19mar2015.RData')
+setwd("~/Documents/Violence Cascades/")
+load('Results/hyper-lcc.RData')
d = remove.edge.attribute(person,'weight')
lcc = induced.subgraph(d,which(clusters(d)$membership==which.max(clusters(d)$csize)))
diff --git a/R Scripts/demographics.R b/R Scripts/demographics.R
new file mode 100644
index 0000000..349eda9
--- /dev/null
+++ b/R Scripts/demographics.R
@@ -0,0 +1,24 @@
+library(igraph)
+setwd('~/Documents/Violence Cascades/')
+load('Raw Data/lcc.RData')
+
+length(vic_ids)
+
+mean(lcc_verts$sex=="M")
+mean(lcc_verts$sex[vic_ids]=="M")
+
+mean(lcc_verts$race=="BLK")
+
+mean(lcc_verts$race %in% c("WHI", "WWH"))
+
+mean(lcc_verts$gang.member)
+
+d = degree(lcc)
+mean(d)
+
+vic_frac = rep(NA,vcount(lcc))
+for(i in 1:vcount(lcc)){
+ if (i %% 10000==0) print(i)
+ neighbors = unlist(neighborhood(lcc, nodes=i, order=2, mode='all'))[-1]
+ vic_frac[i] = sum(lcc_verts$vic[neighbors])/length(neighbors)
+} \ No newline at end of file
diff --git a/R Scripts/predict-victims-plots.R b/R Scripts/predict-victims-plots.R
index 553aa89..b872201 100644
--- a/R Scripts/predict-victims-plots.R
+++ b/R Scripts/predict-victims-plots.R
@@ -1,5 +1,5 @@
##### Plot results
-hist(correct_rank1,150,xlim=c(0,vcount(lcc)),col=rgb(0,0,1,1/8),
+hist(correct_rank2,150,xlim=c(0,vcount(lcc)),col=rgb(0,0,1,1/8),
xlab='Risk Ranking of Victims',main='')
hist(correct_rank1,150,xlim=c(0,vcount(lcc)),col=rgb(1,0,1,1/8),add=T)
hist(correct_rank2,150,xlim=c(0,vcount(lcc)),col=rgb(0,0,1,1/8),add=T)
@@ -15,7 +15,7 @@ plot(lambdas,counts[1,],log='x',type='l')
correct_rank1 = correct_rank[,length(lambdas)] # demographics model
correct_rank2 = correct_rank[,1] # cascade model
correct_rank3 = correct_rank[,which.min(colMeans(correct_rank))] # best combined model
-counts = matrix(c(sum(correct_rank1<(vcount(lcc)*0.001)),
+counts = matrix(c( sum(correct_rank1<(vcount(lcc)*0.001)),
sum(correct_rank1<(vcount(lcc)*0.005)),
sum(correct_rank1<(vcount(lcc)*0.01)),
sum(correct_rank2<(vcount(lcc)*0.001)),
@@ -33,7 +33,7 @@ barplot(counts,
col=c(rgb(0,0,1,1/2),rgb(1,0,0,1/2),rgb(0,1,0,1/2)),
beside=TRUE)
legend("topleft", inset=0.05,
- c("Demographics Model", "Cascade Model", "Combined Model"),
+ c("Demographics (Logit)", "Cascades (Sum parent)", "Combined Model"),
fill=c(rgb(0,0,1,1/2),rgb(1,0,0,1/2),rgb(0,1,0,1/2)))
box(which='plot')
diff --git a/R Scripts/predict-victims.R b/R Scripts/predict-victims.R
index 2bda7e2..c662651 100644
--- a/R Scripts/predict-victims.R
+++ b/R Scripts/predict-victims.R
@@ -1,8 +1,9 @@
library(igraph)
+library(data.table)
library(foreach)
library(doMC)
registerDoMC(cores=4)
-setwd('~/Documents/Cascade Project')
+setwd('~/Documents/Violence Cascades/')
load('Raw Data/lcc.RData')
load('Results/hyper-lcc.RData')
load('Results/dag_dat_all.RData')
@@ -10,65 +11,68 @@ source('criminal_cascades/R Scripts/temporal.R')
source('criminal_cascades/R Scripts/structural.R')
##### Initialize data
-formula = vic ~ sex + race + age + gang.member #+ gang.name
+formula = vic ~ sex + race + age + gang.member + gang.name
lcc_verts$sex = as.factor(lcc_verts$sex)
lcc_verts$race = as.factor(lcc_verts$race)
lcc_verts$age = as.numeric(lcc_verts$age)
lcc_verts$gang.name = as.factor(lcc_verts$gang.name)
-# sum(hyp_lcc_verts$vic)/length(days)
-df = data.frame(ir=lcc_verts$ir_no, dem=0, cas=0, comb=0)
+dt = data.table(ir=lcc_verts$ir_no, dem=0, cas=0, comb=0)
alpha = 0.0028
delta = 0.06
days = sort(unique(hyp_lcc_verts$vic.day)) # 70:max(hyp_lcc_verts$vic.day, na.rm=T)
-lambdas = c(0,1)#c(0, exp(seq(log(0.0000001), log(.0005), length.out=150)), 1)
-nvics = sum(hyp_lcc_verts$vic.day %in% days)
-edges_all = dag_dat_all
+days = split(days, ceiling(seq_along(days)/456))
+lambdas = c(0, exp(seq(log(0.0000001), log(.95), length.out=50)), 1)
+nvics = sum(hyp_lcc_verts$vic)
+edges_all = data.table(dag_dat_all)
##### Loop through days
-writeLines(c(""), "Results/log.txt")
-ptm = proc.time()
-correct_rank = foreach (day = days, .combine=rbind) %dopar% {
- if (which(day==days) %% 100 == 0){sink("Results/log.txt", append=TRUE);cat(paste("day:",day,"\n"))}
-
- ##### Demographics model
- vics = match(unique(hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day<day)]),lcc_verts$name)
- victims = lcc_verts[,c('vic','sex','race','age','gang.member','gang.name')]
- victims$vic[vics] = TRUE
- victims$vic[-vics] = FALSE
- fit = lm(formula, data=victims)
-# fit = glm(formula, data=victims, family=binomial)
-# fit = randomForest(formula, data=victims[,1:5], ntree=100)
- probs = predict(fit, newdata=lcc_verts, type='response')
+correct_rank = c()
+for(i in 1:length(days)){
+ ptm = proc.time()
+ print(c(i,length(days)))
+ ds = unlist(days[i], use.names=F)
+ cr = foreach (day = ds, .combine=rbind) %dopar% {
- ##### Cascade Model
- edges = edges_all[which(edges_all$t1<day),]
- f = temporal(edges$t1, day, alpha)
- h = structural(delta,edges$dist)
- weights = f*h
- ids = edges$to
- irs = hyp_lcc_verts$ir_no[ids]
- risk = data.frame(id=ids, ir=irs, weight=weights)
- risk = risk[order(weights, decreasing=T),]
- risk = risk[match(unique(risk$ir),risk$ir),]
-# maybe need to change this to reflect new algorithm that accounts for \tilde{p}
-
- ##### Combined Model
- combined = df#data.frame(ir=attr(probs,'name'), dem=as.numeric(probs), cas=0, comb=0)
- combined$dem[match(attr(probs,'name'), df$ir)] = as.numeric(probs)
- combined$cas[match(risk$ir, attr(probs,'name'))] = risk$weight
+ ##### Demographics model
+ vics = match(unique(hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day<day)]),lcc_verts$name)
+ victims = lcc_verts[,c('vic','sex','race','age','gang.member','gang.name')]
+ victims$vic[vics] = TRUE
+ victims$vic[-vics] = FALSE
+# fit = lm(formula, data=victims)
+ fit = glm(formula, data=victims, family=binomial)
+ # fit = randomForest(formula, data=victims[,1:5], ntree=100)
+ probs = predict(fit, newdata=lcc_verts, type='response')
+
+ ##### Cascade Model
+ edges = edges_all[which(edges_all$t1<day),]
+ f = temporal(edges$t1, day, alpha)
+ h = structural(delta,edges$dist)
+ weights = f*h
+ ids = edges$to
+ irs = hyp_lcc_verts$ir_no[ids]
+
+ risk = data.table(id=ids, ir=irs, weight=weights)
+ if (dim(risk)[1]>0) risk = risk[, list(weight=sum(weight)), by=ir] # max or sum
- ##### Gather results
- infected_irs = hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day==day)]
- crday = matrix(nrow=length(infected_irs), ncol=length(lambdas))
- for (lambda in lambdas){
- combined$comb = lambda*combined$dem + (1-lambda)*combined$cas
- c_idx = which(lambdas==lambda)
- crday[,c_idx] = rank(-combined$comb,ties.method='average')[match(infected_irs,combined$ir)]
+ ##### Combined Model
+ combined = dt
+ combined$dem[match(attr(probs,'name'), dt$ir)] = as.numeric(probs)
+ combined$cas[match(risk$ir, dt$ir)] = risk$weight
+
+ ##### Gather results
+ infected_irs = hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day==day)]
+ crday = matrix(nrow=length(infected_irs), ncol=length(lambdas))
+ for (lambda in lambdas){
+ combined$comb = lambda*combined$dem + (1-lambda)*combined$cas
+ c_idx = which(lambdas==lambda)
+ crday[,c_idx] = rank(-combined$comb,ties.method='average')[match(infected_irs,combined$ir)]
+ }
+
+ return(crday)
}
-
- return(crday)
+ correct_rank = rbind(correct_rank,cr)
+ print(proc.time()-ptm)
}
-print(proc.time()-ptm)
# save(correct_rank, file='Results/correct_rank_62815.RData') \ No newline at end of file