diff options
Diffstat (limited to 'R Scripts')
| -rw-r--r-- | R Scripts/benchmarking.R | 130 | ||||
| -rw-r--r-- | R Scripts/find-cascades.R | 11 | ||||
| -rw-r--r-- | R Scripts/find-parents.R | 40 | ||||
| -rw-r--r-- | R Scripts/generate-network.R | 5 | ||||
| -rw-r--r-- | R Scripts/predict-victims-plots.R | 61 | ||||
| -rw-r--r-- | R Scripts/predict-victims.R | 67 |
6 files changed, 177 insertions, 137 deletions
diff --git a/R Scripts/benchmarking.R b/R Scripts/benchmarking.R deleted file mode 100644 index 1cf99e0..0000000 --- a/R Scripts/benchmarking.R +++ /dev/null @@ -1,130 +0,0 @@ -library(igraph) -setwd('~/Documents/Cascade Project') -load('Raw Data/lcc.RData') -load('Results/hyper-lcc.RData') -load('Results/dag_dat_all.RData') -source('Scripts/temporal.R') -source('Scripts/structural.R') - -##### Initialize data -formula = vic ~ sex + race + age + gang.member + gang.name -lcc_verts$sex = as.factor(lcc_verts$sex) -lcc_verts$race = as.factor(lcc_verts$race) -lcc_verts$age = as.numeric(lcc_verts$age) -lcc_verts$gang.name = as.factor(lcc_verts$gang.name) -# sum(hyp_lcc_verts$vic)/length(days) - -##### Loop through days -alpha = 1/100 -gamma = 0.18 -days = 70:max(hyp_lcc_verts$vic.day, na.rm=T) -lambdas = 0#c(0, exp(seq(log(0.0000001), log(.0005), length.out=150)), 1) -nvics = sum(hyp_lcc_verts$vic.day %in% days) -correct_rank = matrix(nrow=nvics, ncol=length(lambdas)) -# correct_rank1 = correct_rank2 = correct_rank3 = c() -edges_all = dag_dat_all[dag_dat_all$dist<2,] -ptm = proc.time() -for (day in days){ - if (day %% 100 == 0) print(day) - - ##### Demographics model -# vics = match(unique(hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day<day)]),lcc_verts$name) -# victims = lcc_verts[,c('vic','sex','race','age','gang.member','gang.name')] -# victims$vic[vics] = TRUE -# victims$vic[-vics] = FALSE -# # glm.fit = glm(formula, data=victims, family=binomial) -# glm.fit = lm(formula, data=victims) -# glm.probs = predict(glm.fit, newdata=lcc_verts, type='response') - - ##### Cascade Model - edges = edges_all[which(edges_all$t1<day),] - f = temporal(edges$t1, day, alpha) - h = structural(gamma,edges$dist) - weights = f*h - ids = edges$to - irs = hyp_lcc_verts$ir_no[ids] - risk = data.frame(id=ids, ir=irs, weight=weights) - risk = risk[order(weights, decreasing=T),] - risk = risk[match(unique(risk$ir),risk$ir),] -# maybe need to change this to reflect new algorithm that accounts for \tilde{p} - - ##### Combined Model -# combined = data.frame(ir=attr(glm.probs,'name'), dem=as.numeric(glm.probs), cas=0, comb=0) - combined$cas[match(risk$ir, attr(glm.probs,'name'))] = risk$weight - - ##### Gather results - infected_irs = hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day==day)] - for (lambda in lambdas){ - combined$comb = combined$cas#lambda*combined$dem + (1-lambda)*(1-combined$dem)*combined$cas - c_idx = which(lambdas==lambda) - r_idx = head(which(is.na(correct_rank[,c_idx])),length(infected_irs)) - # !! order should be first: rank of (3,5,5,7) should be (1,2,2,4), may need to do n-rank - correct_rank[r_idx,c_idx] = match(infected_irs, combined$ir[order(combined$comb, decreasing=T)]) - # maybe should also mark down vic/nonvic status of each? - } - -} -print(proc.time()-ptm) - - -##### Plot results -hist(correct_rank3,150,xlim=c(0,vcount(lcc)),col=rgb(0,0,1,1/8), - xlab='Risk Ranking of Victims',main='') -hist(correct_rank1,150,xlim=c(0,vcount(lcc)),col=rgb(1,0,1,1/8),add=T) -hist(correct_rank2,150,xlim=c(0,vcount(lcc)),col=rgb(1,0,1,1/8),add=T) -legend("topright", c("Demographics Model", "Cascade Model"), - fill=c(rgb(1,0,1,1/8), rgb(0,0,1,1/8))) - -counts = matrix(c(colSums(correct_rank<(vcount(lcc)/1000))*100/nvics, - colSums(correct_rank<(vcount(lcc)/200))*100/nvics, - colSums(correct_rank<(vcount(lcc)/100))*100/nvics), - nrow=3, byrow=T) -plot(lambdas,counts[1,],log='x',type='l') - -correct_rank1 = correct_rank[,length(lambdas)] -correct_rank2 = correct_rank[,1] -correct_rank3 = correct_rank[,which.min(colMeans(correct_rank))] -counts = matrix(c(sum(correct_rank1<(vcount(lcc)*0.001)), - sum(correct_rank1<(vcount(lcc)*0.005)), - sum(correct_rank1<(vcount(lcc)*0.01)), - sum(correct_rank2<(vcount(lcc)*0.001)), - sum(correct_rank2<(vcount(lcc)*0.005)), - sum(correct_rank2<(vcount(lcc)*0.01)), - sum(correct_rank3<(vcount(lcc)*0.001)), - sum(correct_rank3<(vcount(lcc)*0.005)), - sum(correct_rank3<(vcount(lcc)*0.01))), - nrow=3, byrow=T) -counts = counts*100/nvics -barplot(counts, - xlab="Size of High-Risk Population", - ylab="Percent of Victims Predicted", - names.arg=c('0.1%','0.5%','1%'),ylim=c(0,max(counts)*1.1), - col=c(rgb(0,0,1,1/2),rgb(1,0,0,1/2),rgb(0,1,0,1/2)), - beside=TRUE) -legend("topleft", inset=0.05, - c("Demographics Model", "Cascade Model", "Combined Model"), - fill=c(rgb(0,0,1,1/2),rgb(1,0,0,1/2),rgb(0,1,0,1/2))) -box(which='plot') -par(new=T) -counts = counts/(100/nvics) -barplot(counts, - ylim=c(0,max(counts)*1.1), - col=c(rgb(0,0,1,0),rgb(1,0,0,0),rgb(0,1,0,0)), - beside=TRUE) -axis(side = 4) -mtext(side = 4, line = 3, "Number of Victims Predicted") - -popsizes = c(0.1, 0.5, 1) -plot(popsizes,counts[1,],type='l',ylim=c(0,max(counts))) -lines(popsizes,counts[2,]) -lines(popsizes,counts[3,]) -lines(c(0,1),c(0,1)) - -#### Precision-Recall Curve -plot(ecdf(correct_rank1),col='red',xlim=c(0,vcount(lcc)),lwd=2) -plot(ecdf(correct_rank2),col='darkblue',lwd=2,add=T) -plot(ecdf(correct_rank3),col='darkgreen',lwd=2,add=T) -legend("bottomright", inset=0.05, - c("Demographics Model", "Cascade Model", "Combined Model"), - fill=c('red','darkblue','darkgreen')) -lines(c(0,vcount(lcc)),c(0,1)) diff --git a/R Scripts/find-cascades.R b/R Scripts/find-cascades.R index 0a69c5c..7673909 100644 --- a/R Scripts/find-cascades.R +++ b/R Scripts/find-cascades.R @@ -9,14 +9,14 @@ source('criminal_cascades/R Scripts/structural.R') vic_ids = which(V(hyp_lcc)$vic==TRUE) #### Initialize model parameters -alpha = 1/21 +alpha = 0.041 # gamma = 0.3 -delta = 0.001 -beta = 0.00796964464237 +delta = 0.1 +# beta = 0.00796964464237 ##### Get weights # find max n days where infection possible given alpha -edges = dag_dat_vics +edges = dag_dat_test[!is.na(dag_dat_test$t2),] # edges = edges[(edges$t2-edges$t1)<300,] p_t = temporal(edges$t1, edges$t2, alpha) @@ -31,7 +31,7 @@ weights = p/p_tilde # probs = as.numeric(lm.probs) # betas = probs[match(hyp_lcc_verts$ir_no[edges$to],names)] # betas = 0.055 -thresh = beta/(1-beta) +# thresh = beta/(1-beta) realized = c() # edges = edges[weights>thresh,] @@ -51,6 +51,7 @@ for (u in vic_ids){ } realized = c(realized, max_edge) } +edges[realized,c('from','to')] # if (length(Ei)>0){ # max_edge = Ei[which.max(weights[Ei])] # how to deal with ties???? # realized = c(realized, max_edge) diff --git a/R Scripts/find-parents.R b/R Scripts/find-parents.R new file mode 100644 index 0000000..3ec8809 --- /dev/null +++ b/R Scripts/find-parents.R @@ -0,0 +1,40 @@ +# library(igraph) +# setwd("~/Documents/Cascade Project/") +# load('Results/hyper-lcc.RData') +# load('Results/dag_dat_vics.RData') +# source('criminal_cascades/R Scripts/temporal.R') +# source('criminal_cascades/R Scripts/structural.R') + +##### Initialize parameters based on what ml2 found +alpha = 0.061 +delta = 0.082 + +##### Get weights +edges = dag_dat_test[!is.na(dag_dat_test$t2),] + +dt = edges$t2 - edges$t1 +p_t = exp(-alpha*dt) * (exp(alpha)-1) +p_s = structural(delta, edges$dist) +p = p_s * p_t +p_tilde = 1 - p_s + p_s * exp(-alpha*dt) +weights = p/p_tilde +edges$weight = weights + +##### Find most likely parents +parents = data.frame(vic=0,Npars=0,par_rank=0) +vics = setdiff(vic_ids,seeds) +for (u in vics){ + u_parents = edges[edges$to==u,] + u_parents = u_parents[order(u_parents$weight,decreasing=T),] + Nparents = dim(u_parents)[1] + infector = V(g)$infector[u] + infectorID = which(u_parents$from==infector) + parents[which(vics==u),] = c(u, Nparents, infectorID) +} + +##### Get some summary statistics on how well +median(parents$par_rank[parents$Npars>9]) +median(parents$par_rank[parents$Npars>99]) + + + diff --git a/R Scripts/generate-network.R b/R Scripts/generate-network.R index dc4a4f8..3b40969 100644 --- a/R Scripts/generate-network.R +++ b/R Scripts/generate-network.R @@ -9,7 +9,6 @@ delta = 0.15 # lmbda = 1/10 t_max = 1000 -# g = watts.strogatz.game(1, 100, 3, 0.25) N = 5000 g = forest.fire.game(nodes=N, fw.prob=0.3, ambs=1, directed=F) plot(g, vertex.size=5, vertex.label=NA) @@ -31,7 +30,7 @@ for (day in 1:t_max){ infected = setdiff(infected,seeds) # don't try to infect seeds inf.days = day + ceiling(alpha*rexp(length(infected),alpha)) V(g)$vic[infected] = TRUE - infects = (inf.days < V(g)$vic.day[infected]) %in% c(NA,T) + infects = (inf.days <= V(g)$vic.day[infected]) %in% c(NA,T) V(g)$vic.day[infected[infects]] = inf.days[infects] V(g)$infector[infected[infects]] = vic } @@ -76,3 +75,5 @@ infectors = cbind(setdiff(vic_ids,seeds), V(g)$infector[setdiff(vic_ids,seeds)], recovered$infector[recovered$victim %in% setdiff(vic_ids,seeds)]) mean(infectors[,2]==infectors[,3]) + +dag_dat_test[dag_dat_test$to==4984,]
\ No newline at end of file diff --git a/R Scripts/predict-victims-plots.R b/R Scripts/predict-victims-plots.R new file mode 100644 index 0000000..8a93667 --- /dev/null +++ b/R Scripts/predict-victims-plots.R @@ -0,0 +1,61 @@ +##### Plot results +hist(correct_rank3,150,xlim=c(0,vcount(lcc)),col=rgb(0,0,1,1/8), + xlab='Risk Ranking of Victims',main='') +hist(correct_rank1,150,xlim=c(0,vcount(lcc)),col=rgb(1,0,1,1/8),add=T) +hist(correct_rank2,150,xlim=c(0,vcount(lcc)),col=rgb(1,0,1,1/8),add=T) +legend("topright", c("Demographics Model", "Cascade Model"), + fill=c(rgb(1,0,1,1/8), rgb(0,0,1,1/8))) + +counts = matrix(c(colSums(correct_rank<(vcount(lcc)/1000))*100/nvics, + colSums(correct_rank<(vcount(lcc)/200))*100/nvics, + colSums(correct_rank<(vcount(lcc)/100))*100/nvics), + nrow=3, byrow=T) +plot(lambdas,counts[1,],log='x',type='l') + +correct_rank1 = correct_rank[,length(lambdas)] +correct_rank2 = correct_rank[,1] +correct_rank3 = correct_rank[,which.min(colMeans(correct_rank))] +counts = matrix(c(sum(correct_rank1<(vcount(lcc)*0.001)), + sum(correct_rank1<(vcount(lcc)*0.005)), + sum(correct_rank1<(vcount(lcc)*0.01)), + sum(correct_rank2<(vcount(lcc)*0.001)), + sum(correct_rank2<(vcount(lcc)*0.005)), + sum(correct_rank2<(vcount(lcc)*0.01)), + sum(correct_rank3<(vcount(lcc)*0.001)), + sum(correct_rank3<(vcount(lcc)*0.005)), + sum(correct_rank3<(vcount(lcc)*0.01))), + nrow=3, byrow=T) +counts = counts*100/nvics +barplot(counts, + xlab="Size of High-Risk Population", + ylab="Percent of Victims Predicted", + names.arg=c('0.1%','0.5%','1%'),ylim=c(0,max(counts)*1.1), + col=c(rgb(0,0,1,1/2),rgb(1,0,0,1/2),rgb(0,1,0,1/2)), + beside=TRUE) +legend("topleft", inset=0.05, + c("Demographics Model", "Cascade Model", "Combined Model"), + fill=c(rgb(0,0,1,1/2),rgb(1,0,0,1/2),rgb(0,1,0,1/2))) +box(which='plot') +par(new=T) +counts = counts/(100/nvics) +barplot(counts, + ylim=c(0,max(counts)*1.1), + col=c(rgb(0,0,1,0),rgb(1,0,0,0),rgb(0,1,0,0)), + beside=TRUE) +axis(side = 4) +mtext(side = 4, line = 3, "Number of Victims Predicted") + +popsizes = c(0.1, 0.5, 1) +plot(popsizes,counts[1,],type='l',ylim=c(0,max(counts))) +lines(popsizes,counts[2,]) +lines(popsizes,counts[3,]) +lines(c(0,1),c(0,1)) + +#### Precision-Recall Curve +plot(ecdf(correct_rank1),col='red',xlim=c(0,vcount(lcc)),lwd=2) +plot(ecdf(correct_rank2),col='darkblue',lwd=2,add=T) +plot(ecdf(correct_rank3),col='darkgreen',lwd=2,add=T) +legend("bottomright", inset=0.05, + c("Demographics Model", "Cascade Model", "Combined Model"), + fill=c('red','darkblue','darkgreen')) +lines(c(0,vcount(lcc)),c(0,1)) diff --git a/R Scripts/predict-victims.R b/R Scripts/predict-victims.R new file mode 100644 index 0000000..470815d --- /dev/null +++ b/R Scripts/predict-victims.R @@ -0,0 +1,67 @@ +library(igraph) +setwd('~/Documents/Cascade Project') +load('Raw Data/lcc.RData') +load('Results/hyper-lcc.RData') +load('Results/dag_dat_all.RData') +source('criminal_cascades/R Scripts/temporal.R') +source('criminal_cascades/R Scripts/structural.R') + +##### Initialize data +formula = vic ~ sex + race + age + gang.member + gang.name +lcc_verts$sex = as.factor(lcc_verts$sex) +lcc_verts$race = as.factor(lcc_verts$race) +lcc_verts$age = as.numeric(lcc_verts$age) +lcc_verts$gang.name = as.factor(lcc_verts$gang.name) +# sum(hyp_lcc_verts$vic)/length(days) + +alpha = 0.0028 +delta = 0.06 +days = sort(unique(hyp_lcc_verts$vic.day)) # 70:max(hyp_lcc_verts$vic.day, na.rm=T) +lambdas = c(0,1)#c(0, exp(seq(log(0.0000001), log(.0005), length.out=150)), 1) +nvics = sum(lcc_verts$vic)#sum(hyp_lcc_verts$vic.day %in% days) +correct_rank = matrix(nrow=nvics, ncol=length(lambdas)) +edges_all = dag_dat_all + +##### Loop through days +ptm = proc.time() +for (day in days){ + if (which(day==days) %% 100 == 0) print(day) + + ##### Demographics model + vics = match(unique(hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day<day)]),lcc_verts$name) + victims = lcc_verts[,c('vic','sex','race','age','gang.member','gang.name')] + victims$vic[vics] = TRUE + victims$vic[-vics] = FALSE +# glm.fit = glm(formula, data=victims, family=binomial) + glm.fit = lm(formula, data=victims) + glm.probs = predict(glm.fit, newdata=lcc_verts, type='response') + + ##### Cascade Model + edges = edges_all[which(edges_all$t1<day),] + f = temporal(edges$t1, day, alpha) + h = structural(delta,edges$dist) + weights = f*h + ids = edges$to + irs = hyp_lcc_verts$ir_no[ids] + risk = data.frame(id=ids, ir=irs, weight=weights) + risk = risk[order(weights, decreasing=T),] + risk = risk[match(unique(risk$ir),risk$ir),] +# maybe need to change this to reflect new algorithm that accounts for \tilde{p} + + ##### Combined Model + combined = data.frame(ir=attr(glm.probs,'name'), dem=as.numeric(glm.probs), cas=0, comb=0) + combined$cas[match(risk$ir, attr(glm.probs,'name'))] = risk$weight + + ##### Gather results + infected_irs = hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day==day)] + for (lambda in lambdas){ + combined$comb = lambda*combined$dem + (1-lambda)*combined$cas + c_idx = which(lambdas==lambda) + r_idx = head(which(is.na(correct_rank[,c_idx])),length(infected_irs)) + # !! order should be first: rank of (3,5,5,7) should be (1,2,2,4), may need to do n-rank + correct_rank[r_idx,c_idx] = match(infected_irs, combined$ir[order(combined$comb, decreasing=T)]) + # maybe should also mark down vic/nonvic status of each? + } + +} +print(proc.time()-ptm) |
