diff options
| author | Ben Green <bgreen@g.harvard.edu> | 2015-06-08 15:21:51 -0400 |
|---|---|---|
| committer | Ben Green <bgreen@g.harvard.edu> | 2015-06-08 15:21:51 -0400 |
| commit | 1739e9f5706bb8a73de5dbf0b467de49ea040898 (patch) | |
| tree | 6f1d0f166986c5f0757be9b40d8eeb3409ab022c /R Scripts/benchmarking.R | |
| parent | e5dada202c34521618bf82a086093c342841e5e8 (diff) | |
| download | criminal_cascades-1739e9f5706bb8a73de5dbf0b467de49ea040898.tar.gz | |
added my R scripts
Diffstat (limited to 'R Scripts/benchmarking.R')
| -rw-r--r-- | R Scripts/benchmarking.R | 130 |
1 files changed, 130 insertions, 0 deletions
diff --git a/R Scripts/benchmarking.R b/R Scripts/benchmarking.R new file mode 100644 index 0000000..1cf99e0 --- /dev/null +++ b/R Scripts/benchmarking.R @@ -0,0 +1,130 @@ +library(igraph) +setwd('~/Documents/Cascade Project') +load('Raw Data/lcc.RData') +load('Results/hyper-lcc.RData') +load('Results/dag_dat_all.RData') +source('Scripts/temporal.R') +source('Scripts/structural.R') + +##### Initialize data +formula = vic ~ sex + race + age + gang.member + gang.name +lcc_verts$sex = as.factor(lcc_verts$sex) +lcc_verts$race = as.factor(lcc_verts$race) +lcc_verts$age = as.numeric(lcc_verts$age) +lcc_verts$gang.name = as.factor(lcc_verts$gang.name) +# sum(hyp_lcc_verts$vic)/length(days) + +##### Loop through days +alpha = 1/100 +gamma = 0.18 +days = 70:max(hyp_lcc_verts$vic.day, na.rm=T) +lambdas = 0#c(0, exp(seq(log(0.0000001), log(.0005), length.out=150)), 1) +nvics = sum(hyp_lcc_verts$vic.day %in% days) +correct_rank = matrix(nrow=nvics, ncol=length(lambdas)) +# correct_rank1 = correct_rank2 = correct_rank3 = c() +edges_all = dag_dat_all[dag_dat_all$dist<2,] +ptm = proc.time() +for (day in days){ + if (day %% 100 == 0) print(day) + + ##### Demographics model +# vics = match(unique(hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day<day)]),lcc_verts$name) +# victims = lcc_verts[,c('vic','sex','race','age','gang.member','gang.name')] +# victims$vic[vics] = TRUE +# victims$vic[-vics] = FALSE +# # glm.fit = glm(formula, data=victims, family=binomial) +# glm.fit = lm(formula, data=victims) +# glm.probs = predict(glm.fit, newdata=lcc_verts, type='response') + + ##### Cascade Model + edges = edges_all[which(edges_all$t1<day),] + f = temporal(edges$t1, day, alpha) + h = structural(gamma,edges$dist) + weights = f*h + ids = edges$to + irs = hyp_lcc_verts$ir_no[ids] + risk = data.frame(id=ids, ir=irs, weight=weights) + risk = risk[order(weights, decreasing=T),] + risk = risk[match(unique(risk$ir),risk$ir),] +# maybe need to change this to reflect new algorithm that accounts for \tilde{p} + + ##### Combined Model +# combined = data.frame(ir=attr(glm.probs,'name'), dem=as.numeric(glm.probs), cas=0, comb=0) + combined$cas[match(risk$ir, attr(glm.probs,'name'))] = risk$weight + + ##### Gather results + infected_irs = hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day==day)] + for (lambda in lambdas){ + combined$comb = combined$cas#lambda*combined$dem + (1-lambda)*(1-combined$dem)*combined$cas + c_idx = which(lambdas==lambda) + r_idx = head(which(is.na(correct_rank[,c_idx])),length(infected_irs)) + # !! order should be first: rank of (3,5,5,7) should be (1,2,2,4), may need to do n-rank + correct_rank[r_idx,c_idx] = match(infected_irs, combined$ir[order(combined$comb, decreasing=T)]) + # maybe should also mark down vic/nonvic status of each? + } + +} +print(proc.time()-ptm) + + +##### Plot results +hist(correct_rank3,150,xlim=c(0,vcount(lcc)),col=rgb(0,0,1,1/8), + xlab='Risk Ranking of Victims',main='') +hist(correct_rank1,150,xlim=c(0,vcount(lcc)),col=rgb(1,0,1,1/8),add=T) +hist(correct_rank2,150,xlim=c(0,vcount(lcc)),col=rgb(1,0,1,1/8),add=T) +legend("topright", c("Demographics Model", "Cascade Model"), + fill=c(rgb(1,0,1,1/8), rgb(0,0,1,1/8))) + +counts = matrix(c(colSums(correct_rank<(vcount(lcc)/1000))*100/nvics, + colSums(correct_rank<(vcount(lcc)/200))*100/nvics, + colSums(correct_rank<(vcount(lcc)/100))*100/nvics), + nrow=3, byrow=T) +plot(lambdas,counts[1,],log='x',type='l') + +correct_rank1 = correct_rank[,length(lambdas)] +correct_rank2 = correct_rank[,1] +correct_rank3 = correct_rank[,which.min(colMeans(correct_rank))] +counts = matrix(c(sum(correct_rank1<(vcount(lcc)*0.001)), + sum(correct_rank1<(vcount(lcc)*0.005)), + sum(correct_rank1<(vcount(lcc)*0.01)), + sum(correct_rank2<(vcount(lcc)*0.001)), + sum(correct_rank2<(vcount(lcc)*0.005)), + sum(correct_rank2<(vcount(lcc)*0.01)), + sum(correct_rank3<(vcount(lcc)*0.001)), + sum(correct_rank3<(vcount(lcc)*0.005)), + sum(correct_rank3<(vcount(lcc)*0.01))), + nrow=3, byrow=T) +counts = counts*100/nvics +barplot(counts, + xlab="Size of High-Risk Population", + ylab="Percent of Victims Predicted", + names.arg=c('0.1%','0.5%','1%'),ylim=c(0,max(counts)*1.1), + col=c(rgb(0,0,1,1/2),rgb(1,0,0,1/2),rgb(0,1,0,1/2)), + beside=TRUE) +legend("topleft", inset=0.05, + c("Demographics Model", "Cascade Model", "Combined Model"), + fill=c(rgb(0,0,1,1/2),rgb(1,0,0,1/2),rgb(0,1,0,1/2))) +box(which='plot') +par(new=T) +counts = counts/(100/nvics) +barplot(counts, + ylim=c(0,max(counts)*1.1), + col=c(rgb(0,0,1,0),rgb(1,0,0,0),rgb(0,1,0,0)), + beside=TRUE) +axis(side = 4) +mtext(side = 4, line = 3, "Number of Victims Predicted") + +popsizes = c(0.1, 0.5, 1) +plot(popsizes,counts[1,],type='l',ylim=c(0,max(counts))) +lines(popsizes,counts[2,]) +lines(popsizes,counts[3,]) +lines(c(0,1),c(0,1)) + +#### Precision-Recall Curve +plot(ecdf(correct_rank1),col='red',xlim=c(0,vcount(lcc)),lwd=2) +plot(ecdf(correct_rank2),col='darkblue',lwd=2,add=T) +plot(ecdf(correct_rank3),col='darkgreen',lwd=2,add=T) +legend("bottomright", inset=0.05, + c("Demographics Model", "Cascade Model", "Combined Model"), + fill=c('red','darkblue','darkgreen')) +lines(c(0,vcount(lcc)),c(0,1)) |
