diff options
Diffstat (limited to 'R Scripts')
| -rw-r--r-- | R Scripts/benchmarking.R | 130 | ||||
| -rw-r--r-- | R Scripts/find-cascades.R | 11 | ||||
| -rw-r--r-- | R Scripts/find-parents.R | 40 | ||||
| -rwxr-xr-x | R Scripts/generate-dag-dat.R | 10 | ||||
| -rw-r--r-- | R Scripts/generate-network.R | 79 | ||||
| -rw-r--r-- | R Scripts/predict-victims-plots.R | 61 | ||||
| -rw-r--r-- | R Scripts/predict-victims.R | 67 | ||||
| -rwxr-xr-x | R Scripts/structural.R | 4 |
8 files changed, 260 insertions, 142 deletions
diff --git a/R Scripts/benchmarking.R b/R Scripts/benchmarking.R deleted file mode 100644 index 1cf99e0..0000000 --- a/R Scripts/benchmarking.R +++ /dev/null @@ -1,130 +0,0 @@ -library(igraph) -setwd('~/Documents/Cascade Project') -load('Raw Data/lcc.RData') -load('Results/hyper-lcc.RData') -load('Results/dag_dat_all.RData') -source('Scripts/temporal.R') -source('Scripts/structural.R') - -##### Initialize data -formula = vic ~ sex + race + age + gang.member + gang.name -lcc_verts$sex = as.factor(lcc_verts$sex) -lcc_verts$race = as.factor(lcc_verts$race) -lcc_verts$age = as.numeric(lcc_verts$age) -lcc_verts$gang.name = as.factor(lcc_verts$gang.name) -# sum(hyp_lcc_verts$vic)/length(days) - -##### Loop through days -alpha = 1/100 -gamma = 0.18 -days = 70:max(hyp_lcc_verts$vic.day, na.rm=T) -lambdas = 0#c(0, exp(seq(log(0.0000001), log(.0005), length.out=150)), 1) -nvics = sum(hyp_lcc_verts$vic.day %in% days) -correct_rank = matrix(nrow=nvics, ncol=length(lambdas)) -# correct_rank1 = correct_rank2 = correct_rank3 = c() -edges_all = dag_dat_all[dag_dat_all$dist<2,] -ptm = proc.time() -for (day in days){ - if (day %% 100 == 0) print(day) - - ##### Demographics model -# vics = match(unique(hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day<day)]),lcc_verts$name) -# victims = lcc_verts[,c('vic','sex','race','age','gang.member','gang.name')] -# victims$vic[vics] = TRUE -# victims$vic[-vics] = FALSE -# # glm.fit = glm(formula, data=victims, family=binomial) -# glm.fit = lm(formula, data=victims) -# glm.probs = predict(glm.fit, newdata=lcc_verts, type='response') - - ##### Cascade Model - edges = edges_all[which(edges_all$t1<day),] - f = temporal(edges$t1, day, alpha) - h = structural(gamma,edges$dist) - weights = f*h - ids = edges$to - irs = hyp_lcc_verts$ir_no[ids] - risk = data.frame(id=ids, ir=irs, weight=weights) - risk = risk[order(weights, decreasing=T),] - risk = risk[match(unique(risk$ir),risk$ir),] -# maybe need to change this to reflect new algorithm that accounts for \tilde{p} - - ##### Combined Model -# combined = data.frame(ir=attr(glm.probs,'name'), dem=as.numeric(glm.probs), cas=0, comb=0) - combined$cas[match(risk$ir, attr(glm.probs,'name'))] = risk$weight - - ##### Gather results - infected_irs = hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day==day)] - for (lambda in lambdas){ - combined$comb = combined$cas#lambda*combined$dem + (1-lambda)*(1-combined$dem)*combined$cas - c_idx = which(lambdas==lambda) - r_idx = head(which(is.na(correct_rank[,c_idx])),length(infected_irs)) - # !! order should be first: rank of (3,5,5,7) should be (1,2,2,4), may need to do n-rank - correct_rank[r_idx,c_idx] = match(infected_irs, combined$ir[order(combined$comb, decreasing=T)]) - # maybe should also mark down vic/nonvic status of each? - } - -} -print(proc.time()-ptm) - - -##### Plot results -hist(correct_rank3,150,xlim=c(0,vcount(lcc)),col=rgb(0,0,1,1/8), - xlab='Risk Ranking of Victims',main='') -hist(correct_rank1,150,xlim=c(0,vcount(lcc)),col=rgb(1,0,1,1/8),add=T) -hist(correct_rank2,150,xlim=c(0,vcount(lcc)),col=rgb(1,0,1,1/8),add=T) -legend("topright", c("Demographics Model", "Cascade Model"), - fill=c(rgb(1,0,1,1/8), rgb(0,0,1,1/8))) - -counts = matrix(c(colSums(correct_rank<(vcount(lcc)/1000))*100/nvics, - colSums(correct_rank<(vcount(lcc)/200))*100/nvics, - colSums(correct_rank<(vcount(lcc)/100))*100/nvics), - nrow=3, byrow=T) -plot(lambdas,counts[1,],log='x',type='l') - -correct_rank1 = correct_rank[,length(lambdas)] -correct_rank2 = correct_rank[,1] -correct_rank3 = correct_rank[,which.min(colMeans(correct_rank))] -counts = matrix(c(sum(correct_rank1<(vcount(lcc)*0.001)), - sum(correct_rank1<(vcount(lcc)*0.005)), - sum(correct_rank1<(vcount(lcc)*0.01)), - sum(correct_rank2<(vcount(lcc)*0.001)), - sum(correct_rank2<(vcount(lcc)*0.005)), - sum(correct_rank2<(vcount(lcc)*0.01)), - sum(correct_rank3<(vcount(lcc)*0.001)), - sum(correct_rank3<(vcount(lcc)*0.005)), - sum(correct_rank3<(vcount(lcc)*0.01))), - nrow=3, byrow=T) -counts = counts*100/nvics -barplot(counts, - xlab="Size of High-Risk Population", - ylab="Percent of Victims Predicted", - names.arg=c('0.1%','0.5%','1%'),ylim=c(0,max(counts)*1.1), - col=c(rgb(0,0,1,1/2),rgb(1,0,0,1/2),rgb(0,1,0,1/2)), - beside=TRUE) -legend("topleft", inset=0.05, - c("Demographics Model", "Cascade Model", "Combined Model"), - fill=c(rgb(0,0,1,1/2),rgb(1,0,0,1/2),rgb(0,1,0,1/2))) -box(which='plot') -par(new=T) -counts = counts/(100/nvics) -barplot(counts, - ylim=c(0,max(counts)*1.1), - col=c(rgb(0,0,1,0),rgb(1,0,0,0),rgb(0,1,0,0)), - beside=TRUE) -axis(side = 4) -mtext(side = 4, line = 3, "Number of Victims Predicted") - -popsizes = c(0.1, 0.5, 1) -plot(popsizes,counts[1,],type='l',ylim=c(0,max(counts))) -lines(popsizes,counts[2,]) -lines(popsizes,counts[3,]) -lines(c(0,1),c(0,1)) - -#### Precision-Recall Curve -plot(ecdf(correct_rank1),col='red',xlim=c(0,vcount(lcc)),lwd=2) -plot(ecdf(correct_rank2),col='darkblue',lwd=2,add=T) -plot(ecdf(correct_rank3),col='darkgreen',lwd=2,add=T) -legend("bottomright", inset=0.05, - c("Demographics Model", "Cascade Model", "Combined Model"), - fill=c('red','darkblue','darkgreen')) -lines(c(0,vcount(lcc)),c(0,1)) diff --git a/R Scripts/find-cascades.R b/R Scripts/find-cascades.R index 0a69c5c..7673909 100644 --- a/R Scripts/find-cascades.R +++ b/R Scripts/find-cascades.R @@ -9,14 +9,14 @@ source('criminal_cascades/R Scripts/structural.R') vic_ids = which(V(hyp_lcc)$vic==TRUE) #### Initialize model parameters -alpha = 1/21 +alpha = 0.041 # gamma = 0.3 -delta = 0.001 -beta = 0.00796964464237 +delta = 0.1 +# beta = 0.00796964464237 ##### Get weights # find max n days where infection possible given alpha -edges = dag_dat_vics +edges = dag_dat_test[!is.na(dag_dat_test$t2),] # edges = edges[(edges$t2-edges$t1)<300,] p_t = temporal(edges$t1, edges$t2, alpha) @@ -31,7 +31,7 @@ weights = p/p_tilde # probs = as.numeric(lm.probs) # betas = probs[match(hyp_lcc_verts$ir_no[edges$to],names)] # betas = 0.055 -thresh = beta/(1-beta) +# thresh = beta/(1-beta) realized = c() # edges = edges[weights>thresh,] @@ -51,6 +51,7 @@ for (u in vic_ids){ } realized = c(realized, max_edge) } +edges[realized,c('from','to')] # if (length(Ei)>0){ # max_edge = Ei[which.max(weights[Ei])] # how to deal with ties???? # realized = c(realized, max_edge) diff --git a/R Scripts/find-parents.R b/R Scripts/find-parents.R new file mode 100644 index 0000000..3ec8809 --- /dev/null +++ b/R Scripts/find-parents.R @@ -0,0 +1,40 @@ +# library(igraph) +# setwd("~/Documents/Cascade Project/") +# load('Results/hyper-lcc.RData') +# load('Results/dag_dat_vics.RData') +# source('criminal_cascades/R Scripts/temporal.R') +# source('criminal_cascades/R Scripts/structural.R') + +##### Initialize parameters based on what ml2 found +alpha = 0.061 +delta = 0.082 + +##### Get weights +edges = dag_dat_test[!is.na(dag_dat_test$t2),] + +dt = edges$t2 - edges$t1 +p_t = exp(-alpha*dt) * (exp(alpha)-1) +p_s = structural(delta, edges$dist) +p = p_s * p_t +p_tilde = 1 - p_s + p_s * exp(-alpha*dt) +weights = p/p_tilde +edges$weight = weights + +##### Find most likely parents +parents = data.frame(vic=0,Npars=0,par_rank=0) +vics = setdiff(vic_ids,seeds) +for (u in vics){ + u_parents = edges[edges$to==u,] + u_parents = u_parents[order(u_parents$weight,decreasing=T),] + Nparents = dim(u_parents)[1] + infector = V(g)$infector[u] + infectorID = which(u_parents$from==infector) + parents[which(vics==u),] = c(u, Nparents, infectorID) +} + +##### Get some summary statistics on how well +median(parents$par_rank[parents$Npars>9]) +median(parents$par_rank[parents$Npars>99]) + + + diff --git a/R Scripts/generate-dag-dat.R b/R Scripts/generate-dag-dat.R index b5f2c3a..a2df165 100755 --- a/R Scripts/generate-dag-dat.R +++ b/R Scripts/generate-dag-dat.R @@ -6,7 +6,7 @@ vic_ids = which(V(hyp_lcc)$vic==TRUE) edgeWeights = function(eis){return(c(hyp_lcc_edges$weight[eis],Inf,Inf)[1:3])} -dag_dat_all = data.frame(matrix(nrow=1,ncol=8)) +dag_dat_all = data.frame(matrix(nrow=1,ncol=10)) hyp_lcc2 = remove.edge.attribute(hyp_lcc,'weight') ei = 1 ptm=proc.time() @@ -28,15 +28,15 @@ for (u in vic_ids){ #will be faster to pre-allocate and fill in rather than rbind each time dag_dat_all[ei:(ei+length(nbhd)-1),] = data.frame(rep(u,length(nbhd)), nbhd, rep(tu,length(nbhd)), tvs, dists, - weights, row.names=NULL) + weights, u_spawn, v_spawn, row.names=NULL) ei = ei + length(nbhd) } print(proc.time()-ptm) #3.5 hours -colnames(dag_dat_all) = c('from','to','t1','t2','dist','w1','w2','w3') +colnames(dag_dat_all) = c('from','to','t1','t2','dist','w1','w2','w3','spawn1','spawn2') rownames(dag_dat_all) = NULL -dag_dat_all$spawn1 = hyp_lcc_verts$spawn.date[dag_dat_all$from] -dag_dat_all$spawn2 = hyp_lcc_verts$spawn.date[dag_dat_all$to] +# dag_dat_all$spawn1 = hyp_lcc_verts$spawn.date[dag_dat_all$from] +# dag_dat_all$spawn2 = hyp_lcc_verts$spawn.date[dag_dat_all$to] save(dag_dat_all, file='Results/dag_dat_all.RData') write.csv(dag_dat_all, file='Results/dag_dat_all.csv') diff --git a/R Scripts/generate-network.R b/R Scripts/generate-network.R new file mode 100644 index 0000000..3b40969 --- /dev/null +++ b/R Scripts/generate-network.R @@ -0,0 +1,79 @@ +library(igraph) +setwd("~/Documents/Cascade Project/") +source('criminal_cascades/R Scripts/temporal.R') +source('criminal_cascades/R Scripts/structural.R') + +alpha = 1/100 +beta = 0.02 +delta = 0.15 +# lmbda = 1/10 +t_max = 1000 + +N = 5000 +g = forest.fire.game(nodes=N, fw.prob=0.3, ambs=1, directed=F) +plot(g, vertex.size=5, vertex.label=NA) + +V(g)$seed = runif(vcount(g))<beta +seeds = which(V(g)$seed) +V(g)$vic = V(g)$seed +V(g)$vic.day[V(g)$seed] = sample(1:t_max, sum(V(g)$seed)) +V(g)$spawn.date = 0 +V(g)$infector = NA + +for (day in 1:t_max){ + day.vics = match(day,V(g)$vic.day) + if (is.na(day.vics)) next + for (vic in day.vics){ + neighbors = (unlist(neighborhood(g,3,vic)))[-1] + dists = as.numeric(shortest.paths(g,vic,neighbors)) + infected = neighbors[which(runif(length(neighbors))<structural(delta, dists))] + infected = setdiff(infected,seeds) # don't try to infect seeds + inf.days = day + ceiling(alpha*rexp(length(infected),alpha)) + V(g)$vic[infected] = TRUE + infects = (inf.days <= V(g)$vic.day[infected]) %in% c(NA,T) + V(g)$vic.day[infected[infects]] = inf.days[infects] + V(g)$infector[infected[infects]] = vic + } +} + +vic_ids = which(V(g)$vic) +cols = rep('lightblue',N); cols[V(g)$vic]='red'; cols[V(g)$seed]='darkred' +plot(g, vertex.size=5, vertex.label=NA, vertex.color=cols) + +##### generate dag_dat +dag_dat_test = data.frame(matrix(nrow=1,ncol=10)) +ei = 1 +for (u in vic_ids){ + if ((which(vic_ids==u) %% 1000)==0) print(which(vic_ids==u)) + tu = V(g)$vic.day[u] + u_spawn = V(g)$spawn.date[u] + nbhd = (unlist(neighborhood(g,nodes=u,order=3)))[-1] # get nodes within neighborhood + tvs = V(g)$vic.day[nbhd] + v_spawn = V(g)$spawn.date[nbhd] + nbhd = nbhd[tu>v_spawn & (is.na(tvs) | tu<tvs)] + tvs = V(g)$vic.day[nbhd] + dists = as.numeric(shortest.paths(g,u,nbhd)) + + weights = matrix(1,nrow=length(nbhd),ncol=3) + + #will be faster to pre-allocate and fill in rather than rbind each time + dag_dat_test[ei:(ei+length(nbhd)-1),] = data.frame(rep(u,length(nbhd)), nbhd, + rep(tu,length(nbhd)), tvs, dists, + weights, rep(u_spawn,length(nbhd)), + rep(0,length(nbhd)), row.names=NULL) + ei = ei + length(nbhd) +} +colnames(dag_dat_test) = c('from','to','t1','t2','dist','w1','w2','w3','spawn1','spawn2') +rownames(dag_dat_test) = NULL + +write.csv(dag_dat_test, file='Results/dag_dat_test.csv') + +##### analyze performance of recovery algorithm +recovered = read.csv('Results/infectors.csv',header=F,col.names=c('victim','infector')) +recovered = recovered[order(recovered$victim),] +infectors = cbind(setdiff(vic_ids,seeds), + V(g)$infector[setdiff(vic_ids,seeds)], + recovered$infector[recovered$victim %in% setdiff(vic_ids,seeds)]) +mean(infectors[,2]==infectors[,3]) + +dag_dat_test[dag_dat_test$to==4984,]
\ No newline at end of file diff --git a/R Scripts/predict-victims-plots.R b/R Scripts/predict-victims-plots.R new file mode 100644 index 0000000..8a93667 --- /dev/null +++ b/R Scripts/predict-victims-plots.R @@ -0,0 +1,61 @@ +##### Plot results +hist(correct_rank3,150,xlim=c(0,vcount(lcc)),col=rgb(0,0,1,1/8), + xlab='Risk Ranking of Victims',main='') +hist(correct_rank1,150,xlim=c(0,vcount(lcc)),col=rgb(1,0,1,1/8),add=T) +hist(correct_rank2,150,xlim=c(0,vcount(lcc)),col=rgb(1,0,1,1/8),add=T) +legend("topright", c("Demographics Model", "Cascade Model"), + fill=c(rgb(1,0,1,1/8), rgb(0,0,1,1/8))) + +counts = matrix(c(colSums(correct_rank<(vcount(lcc)/1000))*100/nvics, + colSums(correct_rank<(vcount(lcc)/200))*100/nvics, + colSums(correct_rank<(vcount(lcc)/100))*100/nvics), + nrow=3, byrow=T) +plot(lambdas,counts[1,],log='x',type='l') + +correct_rank1 = correct_rank[,length(lambdas)] +correct_rank2 = correct_rank[,1] +correct_rank3 = correct_rank[,which.min(colMeans(correct_rank))] +counts = matrix(c(sum(correct_rank1<(vcount(lcc)*0.001)), + sum(correct_rank1<(vcount(lcc)*0.005)), + sum(correct_rank1<(vcount(lcc)*0.01)), + sum(correct_rank2<(vcount(lcc)*0.001)), + sum(correct_rank2<(vcount(lcc)*0.005)), + sum(correct_rank2<(vcount(lcc)*0.01)), + sum(correct_rank3<(vcount(lcc)*0.001)), + sum(correct_rank3<(vcount(lcc)*0.005)), + sum(correct_rank3<(vcount(lcc)*0.01))), + nrow=3, byrow=T) +counts = counts*100/nvics +barplot(counts, + xlab="Size of High-Risk Population", + ylab="Percent of Victims Predicted", + names.arg=c('0.1%','0.5%','1%'),ylim=c(0,max(counts)*1.1), + col=c(rgb(0,0,1,1/2),rgb(1,0,0,1/2),rgb(0,1,0,1/2)), + beside=TRUE) +legend("topleft", inset=0.05, + c("Demographics Model", "Cascade Model", "Combined Model"), + fill=c(rgb(0,0,1,1/2),rgb(1,0,0,1/2),rgb(0,1,0,1/2))) +box(which='plot') +par(new=T) +counts = counts/(100/nvics) +barplot(counts, + ylim=c(0,max(counts)*1.1), + col=c(rgb(0,0,1,0),rgb(1,0,0,0),rgb(0,1,0,0)), + beside=TRUE) +axis(side = 4) +mtext(side = 4, line = 3, "Number of Victims Predicted") + +popsizes = c(0.1, 0.5, 1) +plot(popsizes,counts[1,],type='l',ylim=c(0,max(counts))) +lines(popsizes,counts[2,]) +lines(popsizes,counts[3,]) +lines(c(0,1),c(0,1)) + +#### Precision-Recall Curve +plot(ecdf(correct_rank1),col='red',xlim=c(0,vcount(lcc)),lwd=2) +plot(ecdf(correct_rank2),col='darkblue',lwd=2,add=T) +plot(ecdf(correct_rank3),col='darkgreen',lwd=2,add=T) +legend("bottomright", inset=0.05, + c("Demographics Model", "Cascade Model", "Combined Model"), + fill=c('red','darkblue','darkgreen')) +lines(c(0,vcount(lcc)),c(0,1)) diff --git a/R Scripts/predict-victims.R b/R Scripts/predict-victims.R new file mode 100644 index 0000000..470815d --- /dev/null +++ b/R Scripts/predict-victims.R @@ -0,0 +1,67 @@ +library(igraph) +setwd('~/Documents/Cascade Project') +load('Raw Data/lcc.RData') +load('Results/hyper-lcc.RData') +load('Results/dag_dat_all.RData') +source('criminal_cascades/R Scripts/temporal.R') +source('criminal_cascades/R Scripts/structural.R') + +##### Initialize data +formula = vic ~ sex + race + age + gang.member + gang.name +lcc_verts$sex = as.factor(lcc_verts$sex) +lcc_verts$race = as.factor(lcc_verts$race) +lcc_verts$age = as.numeric(lcc_verts$age) +lcc_verts$gang.name = as.factor(lcc_verts$gang.name) +# sum(hyp_lcc_verts$vic)/length(days) + +alpha = 0.0028 +delta = 0.06 +days = sort(unique(hyp_lcc_verts$vic.day)) # 70:max(hyp_lcc_verts$vic.day, na.rm=T) +lambdas = c(0,1)#c(0, exp(seq(log(0.0000001), log(.0005), length.out=150)), 1) +nvics = sum(lcc_verts$vic)#sum(hyp_lcc_verts$vic.day %in% days) +correct_rank = matrix(nrow=nvics, ncol=length(lambdas)) +edges_all = dag_dat_all + +##### Loop through days +ptm = proc.time() +for (day in days){ + if (which(day==days) %% 100 == 0) print(day) + + ##### Demographics model + vics = match(unique(hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day<day)]),lcc_verts$name) + victims = lcc_verts[,c('vic','sex','race','age','gang.member','gang.name')] + victims$vic[vics] = TRUE + victims$vic[-vics] = FALSE +# glm.fit = glm(formula, data=victims, family=binomial) + glm.fit = lm(formula, data=victims) + glm.probs = predict(glm.fit, newdata=lcc_verts, type='response') + + ##### Cascade Model + edges = edges_all[which(edges_all$t1<day),] + f = temporal(edges$t1, day, alpha) + h = structural(delta,edges$dist) + weights = f*h + ids = edges$to + irs = hyp_lcc_verts$ir_no[ids] + risk = data.frame(id=ids, ir=irs, weight=weights) + risk = risk[order(weights, decreasing=T),] + risk = risk[match(unique(risk$ir),risk$ir),] +# maybe need to change this to reflect new algorithm that accounts for \tilde{p} + + ##### Combined Model + combined = data.frame(ir=attr(glm.probs,'name'), dem=as.numeric(glm.probs), cas=0, comb=0) + combined$cas[match(risk$ir, attr(glm.probs,'name'))] = risk$weight + + ##### Gather results + infected_irs = hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day==day)] + for (lambda in lambdas){ + combined$comb = lambda*combined$dem + (1-lambda)*combined$cas + c_idx = which(lambdas==lambda) + r_idx = head(which(is.na(correct_rank[,c_idx])),length(infected_irs)) + # !! order should be first: rank of (3,5,5,7) should be (1,2,2,4), may need to do n-rank + correct_rank[r_idx,c_idx] = match(infected_irs, combined$ir[order(combined$comb, decreasing=T)]) + # maybe should also mark down vic/nonvic status of each? + } + +} +print(proc.time()-ptm) diff --git a/R Scripts/structural.R b/R Scripts/structural.R index 132aa6c..b68711c 100755 --- a/R Scripts/structural.R +++ b/R Scripts/structural.R @@ -1,5 +1,5 @@ -# structural = function(w1,w2,w3,gamma){ -# return (plogis(w1,scale=gamma) * plogis(w2,scale=gamma) * plogis(w3,scale=gamma)) +# structural = function(delta,lmbda,w1,w2,w3){ +# return (delta/(1. + 1./(w1*lmbda) + 1./(w2*lmbda) + 1./(w3*lmbda))) # } structural = function(delta,dist){ |
