diff options
| author | Ben Green <ben@SEASITs-MacBook-Pro.local> | 2015-06-28 17:38:33 -0400 |
|---|---|---|
| committer | Ben Green <ben@SEASITs-MacBook-Pro.local> | 2015-06-28 17:38:33 -0400 |
| commit | 6e527bbf612465bf5d739b9652abc0165550993c (patch) | |
| tree | 9525bed16d9e4568747855afd84a03937090f1cb /R Scripts/benchmarking.R | |
| parent | 7167a81cfb8b872dd1547e5a8669004b191417db (diff) | |
| download | criminal_cascades-6e527bbf612465bf5d739b9652abc0165550993c.tar.gz | |
Worked on synthetic data recovery so we can tell how high the actual
infector is ranked among all potential parents. Cleaned up code for the
predicting victims benchmarking test.
Diffstat (limited to 'R Scripts/benchmarking.R')
| -rw-r--r-- | R Scripts/benchmarking.R | 130 |
1 files changed, 0 insertions, 130 deletions
diff --git a/R Scripts/benchmarking.R b/R Scripts/benchmarking.R deleted file mode 100644 index 1cf99e0..0000000 --- a/R Scripts/benchmarking.R +++ /dev/null @@ -1,130 +0,0 @@ -library(igraph) -setwd('~/Documents/Cascade Project') -load('Raw Data/lcc.RData') -load('Results/hyper-lcc.RData') -load('Results/dag_dat_all.RData') -source('Scripts/temporal.R') -source('Scripts/structural.R') - -##### Initialize data -formula = vic ~ sex + race + age + gang.member + gang.name -lcc_verts$sex = as.factor(lcc_verts$sex) -lcc_verts$race = as.factor(lcc_verts$race) -lcc_verts$age = as.numeric(lcc_verts$age) -lcc_verts$gang.name = as.factor(lcc_verts$gang.name) -# sum(hyp_lcc_verts$vic)/length(days) - -##### Loop through days -alpha = 1/100 -gamma = 0.18 -days = 70:max(hyp_lcc_verts$vic.day, na.rm=T) -lambdas = 0#c(0, exp(seq(log(0.0000001), log(.0005), length.out=150)), 1) -nvics = sum(hyp_lcc_verts$vic.day %in% days) -correct_rank = matrix(nrow=nvics, ncol=length(lambdas)) -# correct_rank1 = correct_rank2 = correct_rank3 = c() -edges_all = dag_dat_all[dag_dat_all$dist<2,] -ptm = proc.time() -for (day in days){ - if (day %% 100 == 0) print(day) - - ##### Demographics model -# vics = match(unique(hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day<day)]),lcc_verts$name) -# victims = lcc_verts[,c('vic','sex','race','age','gang.member','gang.name')] -# victims$vic[vics] = TRUE -# victims$vic[-vics] = FALSE -# # glm.fit = glm(formula, data=victims, family=binomial) -# glm.fit = lm(formula, data=victims) -# glm.probs = predict(glm.fit, newdata=lcc_verts, type='response') - - ##### Cascade Model - edges = edges_all[which(edges_all$t1<day),] - f = temporal(edges$t1, day, alpha) - h = structural(gamma,edges$dist) - weights = f*h - ids = edges$to - irs = hyp_lcc_verts$ir_no[ids] - risk = data.frame(id=ids, ir=irs, weight=weights) - risk = risk[order(weights, decreasing=T),] - risk = risk[match(unique(risk$ir),risk$ir),] -# maybe need to change this to reflect new algorithm that accounts for \tilde{p} - - ##### Combined Model -# combined = data.frame(ir=attr(glm.probs,'name'), dem=as.numeric(glm.probs), cas=0, comb=0) - combined$cas[match(risk$ir, attr(glm.probs,'name'))] = risk$weight - - ##### Gather results - infected_irs = hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day==day)] - for (lambda in lambdas){ - combined$comb = combined$cas#lambda*combined$dem + (1-lambda)*(1-combined$dem)*combined$cas - c_idx = which(lambdas==lambda) - r_idx = head(which(is.na(correct_rank[,c_idx])),length(infected_irs)) - # !! order should be first: rank of (3,5,5,7) should be (1,2,2,4), may need to do n-rank - correct_rank[r_idx,c_idx] = match(infected_irs, combined$ir[order(combined$comb, decreasing=T)]) - # maybe should also mark down vic/nonvic status of each? - } - -} -print(proc.time()-ptm) - - -##### Plot results -hist(correct_rank3,150,xlim=c(0,vcount(lcc)),col=rgb(0,0,1,1/8), - xlab='Risk Ranking of Victims',main='') -hist(correct_rank1,150,xlim=c(0,vcount(lcc)),col=rgb(1,0,1,1/8),add=T) -hist(correct_rank2,150,xlim=c(0,vcount(lcc)),col=rgb(1,0,1,1/8),add=T) -legend("topright", c("Demographics Model", "Cascade Model"), - fill=c(rgb(1,0,1,1/8), rgb(0,0,1,1/8))) - -counts = matrix(c(colSums(correct_rank<(vcount(lcc)/1000))*100/nvics, - colSums(correct_rank<(vcount(lcc)/200))*100/nvics, - colSums(correct_rank<(vcount(lcc)/100))*100/nvics), - nrow=3, byrow=T) -plot(lambdas,counts[1,],log='x',type='l') - -correct_rank1 = correct_rank[,length(lambdas)] -correct_rank2 = correct_rank[,1] -correct_rank3 = correct_rank[,which.min(colMeans(correct_rank))] -counts = matrix(c(sum(correct_rank1<(vcount(lcc)*0.001)), - sum(correct_rank1<(vcount(lcc)*0.005)), - sum(correct_rank1<(vcount(lcc)*0.01)), - sum(correct_rank2<(vcount(lcc)*0.001)), - sum(correct_rank2<(vcount(lcc)*0.005)), - sum(correct_rank2<(vcount(lcc)*0.01)), - sum(correct_rank3<(vcount(lcc)*0.001)), - sum(correct_rank3<(vcount(lcc)*0.005)), - sum(correct_rank3<(vcount(lcc)*0.01))), - nrow=3, byrow=T) -counts = counts*100/nvics -barplot(counts, - xlab="Size of High-Risk Population", - ylab="Percent of Victims Predicted", - names.arg=c('0.1%','0.5%','1%'),ylim=c(0,max(counts)*1.1), - col=c(rgb(0,0,1,1/2),rgb(1,0,0,1/2),rgb(0,1,0,1/2)), - beside=TRUE) -legend("topleft", inset=0.05, - c("Demographics Model", "Cascade Model", "Combined Model"), - fill=c(rgb(0,0,1,1/2),rgb(1,0,0,1/2),rgb(0,1,0,1/2))) -box(which='plot') -par(new=T) -counts = counts/(100/nvics) -barplot(counts, - ylim=c(0,max(counts)*1.1), - col=c(rgb(0,0,1,0),rgb(1,0,0,0),rgb(0,1,0,0)), - beside=TRUE) -axis(side = 4) -mtext(side = 4, line = 3, "Number of Victims Predicted") - -popsizes = c(0.1, 0.5, 1) -plot(popsizes,counts[1,],type='l',ylim=c(0,max(counts))) -lines(popsizes,counts[2,]) -lines(popsizes,counts[3,]) -lines(c(0,1),c(0,1)) - -#### Precision-Recall Curve -plot(ecdf(correct_rank1),col='red',xlim=c(0,vcount(lcc)),lwd=2) -plot(ecdf(correct_rank2),col='darkblue',lwd=2,add=T) -plot(ecdf(correct_rank3),col='darkgreen',lwd=2,add=T) -legend("bottomright", inset=0.05, - c("Demographics Model", "Cascade Model", "Combined Model"), - fill=c('red','darkblue','darkgreen')) -lines(c(0,vcount(lcc)),c(0,1)) |
