diff options
| -rwxr-xr-x | R Scripts/data-prep.R | 24 | ||||
| -rwxr-xr-x | R Scripts/generate-dag-dat.R | 88 | ||||
| -rw-r--r-- | R Scripts/predict-victims-plots.R | 10 | ||||
| -rwxr-xr-x | R Scripts/sim-analysis.R | 94 |
4 files changed, 120 insertions, 96 deletions
diff --git a/R Scripts/data-prep.R b/R Scripts/data-prep.R index 3104ea2..2994575 100755 --- a/R Scripts/data-prep.R +++ b/R Scripts/data-prep.R @@ -1,5 +1,5 @@ library(igraph) -setwd('~/Documents/Cascade Project/Raw Data/') +setwd('~/Documents/Violence Cascades/Raw Data/') #================ # (1) load data @@ -99,14 +99,6 @@ start_date = as.Date("2005-12-31") ## Get first arrest date in the study period for each person sub.arrests$dates = as.Date(arrests$arrest_date,format='%m/%d/%Y') sub.arrests = sub.arrests[order(sub.arrests$dates),] -sub.arrests = sub.arrests[match(unique(sub.arrests$individuals),sub.arrests$individuals),] -arrest.dates = as.Date(sub.arrests$dates,format='%m/%d/%y') -arrest.days = as.numeric(arrest.dates-start_date) -V(person)$arrest.day = arrest.days[match(V(person)$name, sub.arrests$individuals)] - -V(person)$age.arrest = floor(difftime(arrest.dates[match(V(person)$name, sub.arrests$individuals)], - V(person)$dob, - units='days')/365.25) #=================================================================== @@ -192,22 +184,30 @@ V(person)$faction.name <- as.character(gangs$FACTION_NAME[match_vector]) #=================================================================== # create id number -V(person)$id = rank(V(person)$name) # save data # person = remove.edge.attribute(person,'weight') # person_data = get.data.frame(person,'both') -save(person, file="chi-19mar2015.RData") +save(person, file="chi-19aug2015.RData") #=================================================================== # get LCC of the network lcc = induced.subgraph(person,which(clusters(person)$membership==which.max(clusters(person)$csize))) +V(lcc)$id = rank(V(lcc)$name) V(lcc)$ir_no = V(lcc)$name +V(lcc)$name = V(lcc)$id vic_ids = which(V(lcc)$vic) +lcc_edges = as_data_frame(lcc,'edges') + +# update lcc_verts lcc_verts = get.data.frame(lcc,'vertices') -lcc_edges = get.data.frame(lcc,'edges') +lcc_verts = lcc_verts[,c(1,23,24,2:22)] + +# save file save(lcc, lcc_verts, lcc_edges, vic_ids, file="lcc.RData") +##### +# old stuff lcc_data = get.data.frame(lcc,'both') lcc = set.vertex.attribute(graph=lcc, name='name', value=V(lcc)$id) row.names(lcc_data$vertices) = lcc_data$vertices$id diff --git a/R Scripts/generate-dag-dat.R b/R Scripts/generate-dag-dat.R index a2df165..adef5f6 100755 --- a/R Scripts/generate-dag-dat.R +++ b/R Scripts/generate-dag-dat.R @@ -1,46 +1,60 @@ library(igraph) -setwd("~/Documents/Cascade Project/") -load('Results/hyper-lcc.RData') +setwd("~/Documents/Violence Cascades/") +load('Raw Data/lcc.RData') -vic_ids = which(V(hyp_lcc)$vic==TRUE) +library(foreach) +library(doMC) +registerDoMC(cores=4) -edgeWeights = function(eis){return(c(hyp_lcc_edges$weight[eis],Inf,Inf)[1:3])} +edgeWeights = function(eis){return(c(lcc_edges$weight[eis],Inf,Inf)[1:3])} +lcc2 = remove.edge.attribute(lcc,'weight') -dag_dat_all = data.frame(matrix(nrow=1,ncol=10)) -hyp_lcc2 = remove.edge.attribute(hyp_lcc,'weight') -ei = 1 -ptm=proc.time() -for (u in vic_ids){ - if ((which(vic_ids==u) %% 1000)==0) print(which(vic_ids==u)) - tu = hyp_lcc_verts$vic.day[u] - u_spawn = hyp_lcc_verts$spawn.date[u] - nbhd = unlist(neighborhood(hyp_lcc,nodes=u,order=3)) # get nodes within neighborhood - nbhd = nbhd[-1] # don't want to include u in the neighborhood - tvs = hyp_lcc_verts$vic.day[nbhd] - v_spawn = hyp_lcc_verts$spawn.date[nbhd] - nbhd = nbhd[tu>v_spawn & (is.na(tvs) | tu<tvs)] - tvs = hyp_lcc_verts$vic.day[nbhd] - dists = as.numeric(shortest.paths(hyp_lcc2,u,nbhd)) - - es = get.shortest.paths(hyp_lcc2,u,nbhd,output='epath')$epath - weights = matrix(unlist(lapply(es,edgeWeights),use.names = F),ncol=3,byrow=T) +vics = split(vic_ids, ceiling(seq_along(vic_ids)/98)) +dag_dat_lcc = c() +for(i in 1:length(vics)){ + ptm = proc.time() + print(c(i,length(vics))) + vic_ids = unlist(vics[i], use.names=F) + + ddl = foreach (u = vic_ids, .combine=rbind) %dopar% { + if ((which(vic_ids==u) %% 100)==0) print(which(vic_ids==u)) + + nbhd = unlist(neighborhood(lcc,nodes=u,order=1)) # get nodes within neighborhood + nbhd = nbhd[-1] # don't want to include u in the neighborhood + + dists = as.numeric(shortest.paths(lcc2,u,nbhd)) + es = get.shortest.paths(lcc2,u,nbhd,output='epath')$epath + weights = matrix(unlist(lapply(es,edgeWeights),use.names = F),ncol=3,byrow=T) + + # make edge for every infection + ddlu = data.frame(matrix(nrow=1,ncol=7)) + ei = 1 + for (j in c(17:21,16)){ + tu = lcc_verts[u,j] + if (is.na(tu)) next + + ddlu[ei:(ei+length(nbhd)-1),] = data.frame(rep(u,length(nbhd)), nbhd, + rep(tu,length(nbhd)), dists, + weights, row.names=NULL) + ei = ei + length(nbhd) + } + + return(ddlu) + } - #will be faster to pre-allocate and fill in rather than rbind each time - dag_dat_all[ei:(ei+length(nbhd)-1),] = data.frame(rep(u,length(nbhd)), nbhd, - rep(tu,length(nbhd)), tvs, dists, - weights, u_spawn, v_spawn, row.names=NULL) - ei = ei + length(nbhd) + dag_dat_lcc = rbind(dag_dat_lcc,ddl) + print(proc.time()-ptm) } -print(proc.time()-ptm) #3.5 hours -colnames(dag_dat_all) = c('from','to','t1','t2','dist','w1','w2','w3','spawn1','spawn2') -rownames(dag_dat_all) = NULL -# dag_dat_all$spawn1 = hyp_lcc_verts$spawn.date[dag_dat_all$from] -# dag_dat_all$spawn2 = hyp_lcc_verts$spawn.date[dag_dat_all$to] +colnames(dag_dat_lcc) = c('from','to','t1','dist','w1','w2','w3') +rownames(dag_dat_lcc) = NULL + +save(dag_dat_lcc, file='Results/dag_dat_lcc.RData') +write.csv(dag_dat_lcc, file='Results/dag_dat_lcc.csv') -save(dag_dat_all, file='Results/dag_dat_all.RData') -write.csv(dag_dat_all, file='Results/dag_dat_all.csv') +# dag_dat_vics = dag_dat_lcc[!is.na(dag_dat_lcc$t2),] +# save(dag_dat_lcc_vics, file='Results/dag_dat_lcc_vics.RData') +# write.csv(dag_dat_lcc_vics, file='Results/dag_dat_lcc_vics.csv') -dag_dat_vics = dag_dat_all[!is.na(dag_dat_all$t2),] -save(dag_dat_vics, file='Results/dag_dat_vics.RData') -write.csv(dag_dat_vics, file='Results/dag_dat_vics.csv') +#### +# create lcc_vic_times diff --git a/R Scripts/predict-victims-plots.R b/R Scripts/predict-victims-plots.R index b872201..87b0a25 100644 --- a/R Scripts/predict-victims-plots.R +++ b/R Scripts/predict-victims-plots.R @@ -15,14 +15,14 @@ plot(lambdas,counts[1,],log='x',type='l') correct_rank1 = correct_rank[,length(lambdas)] # demographics model correct_rank2 = correct_rank[,1] # cascade model correct_rank3 = correct_rank[,which.min(colMeans(correct_rank))] # best combined model -counts = matrix(c( sum(correct_rank1<(vcount(lcc)*0.001)), - sum(correct_rank1<(vcount(lcc)*0.005)), +counts = matrix(c( sum(correct_rank1<(vcount(lcc)*0.0005)), + sum(correct_rank1<(vcount(lcc)*0.001)), sum(correct_rank1<(vcount(lcc)*0.01)), + sum(correct_rank2<(vcount(lcc)*0.0005)), sum(correct_rank2<(vcount(lcc)*0.001)), - sum(correct_rank2<(vcount(lcc)*0.005)), sum(correct_rank2<(vcount(lcc)*0.01)), + sum(correct_rank3<(vcount(lcc)*0.0005)), sum(correct_rank3<(vcount(lcc)*0.001)), - sum(correct_rank3<(vcount(lcc)*0.005)), sum(correct_rank3<(vcount(lcc)*0.01))), nrow=3, byrow=T) counts = counts*100/nvics @@ -53,7 +53,7 @@ lines(popsizes,counts[3,]) lines(c(0,1),c(0,1)) #### Precision-Recall Curve -plot(ecdf(correct_rank1),col='red',xlim=c(0,vcount(lcc)),lwd=2) +plot(ecdf(correct_rank1),col='red',xlim=c(0,50),lwd=2) plot(ecdf(correct_rank2),col='darkblue',lwd=2,add=T) plot(ecdf(correct_rank3),col='darkgreen',lwd=2,add=T) legend("bottomright", inset=0.05, diff --git a/R Scripts/sim-analysis.R b/R Scripts/sim-analysis.R index 3960578..5010485 100755 --- a/R Scripts/sim-analysis.R +++ b/R Scripts/sim-analysis.R @@ -1,5 +1,5 @@ library(igraph) -setwd("~/Documents/Cascade Project/") +setwd("~/Documents/Violence Cascades/") load('Results/hyper-lcc.RData') vic_ids = which(V(hyp_lcc)$vic==1) @@ -7,56 +7,66 @@ n.infections = length(vic_ids) n.days = max(hyp_lcc_verts$vic.day,na.rm=T) - min(hyp_lcc_verts$vic.day,na.rm=T) inf.dates = hyp_lcc_verts$vic.day[vic_ids] +birthyears = as.numeric(format(as.Date(hyp_lcc_verts$dob[vic_ids]),'%Y')) + +# nbrs = neighborhood(graph, nodes=vic_ids, order=1) +load('Results/vic-nbrs.RData') + +n = 150 +mean.time = matrix(0,1,n) +med.time = matrix(0,1,n) +mean.50 = matrix(0,1,n) +mean.100 = matrix(0,1,n) +n.vicpairs = matrix(0,1,n) + +hyp_lcc = upgrade_graph(hyp_lcc) + ptm = proc.time() -n = 1500 -mean.time = matrix(0,3,n) -med.time = matrix(0,3,n) -mean.50 = matrix(0,3,n) -mean.100 = matrix(0,3,n) -n.vicpairs = matrix(0,3,n) -for(sim in 1:3){ - print(paste('sim:',sim)) - for(q in 1:n){ - if (q%%250==0) print(paste('run:',q)) - graph = hyp_lcc - if (sim<3) sim.dates = sample(n.days, n.infections, replace=TRUE) # sims 1 + 2 - if (sim==3) sim.dates = sample(inf.dates) # sim 3 - if (sim==1) vics = sample(vcount(hyp_lcc), n.infections, replace=FALSE) # sim 1 - if (sim>1) vics = vic_ids # sims 2 + 3 - if (sim==0) {vics = vic_ids; sim.dates = inf.dates} # data - - vic.time = c() - for (i in 1:n.infections){ - u = vics[i] - nbhd = unlist(neighborhood(graph, nodes=u, order=1)) - nbhd = intersect(vic_ids,nbhd) - nbhd = setdiff(nbhd,u) - nbhd = nbhd[u<nbhd] # only looks at pairs with u<v to avoid double-counting - nbhd = nbhd[hyp_lcc_verts$ir_no[u] != hyp_lcc_verts$ir_no[nbhd]] # don't count infections of the same person - tu = sim.dates[i] - tvs = sim.dates[match(nbhd,vic_ids)] - vic.time = c(vic.time,abs(tu-tvs)) - } - n.vicpairs[sim,q] = length(vic.time) - mean.time[sim,q] = mean(vic.time) - med.time[sim,q] = median(vic.time) - mean.50[sim,q] = mean(vic.time<=50) - mean.100[sim,q] = mean(vic.time<=100) - # decay.rate[sim,q] = getDecayRate(vic.time) #get lm of histogram. or maybe just cor? +for(q in 1:n){ + if (q%%250==0) print(paste('run:',q)) + graph = hyp_lcc + vics = vic_ids + + sim.dates = rep(0,length(vic_ids)) + for(y in unique(birthyears)){ + year_ids = which(birthyears==y) + if (length(year_ids)>1) sim.dates[year_ids] = sample(inf.dates[year_ids]) + } +# {vics = vic_ids; sim.dates = inf.dates} # data + + vic.time = rep(NA,14885) + idx = 1 + for (i in 1:n.infections){ + u = vics[i] + nbhd = nbrs[[i]] + nbhd = intersect(vic_ids,nbhd) + nbhd = nbhd[u<nbhd] # only looks at pairs with u<v to avoid double-counting + nbhd = setdiff(nbhd,u) + nbhd = nbhd[hyp_lcc_verts$ir_no[u] != hyp_lcc_verts$ir_no[nbhd]] # don't count infections of the same person + if (length(nbhd)==0) next + tu = sim.dates[i] + tvs = sim.dates[match(nbhd,vic_ids)] + vic.time[idx:(idx+length(nbhd)-1)] = abs(tu-tvs) + idx = idx + length(nbhd) } + + n.vicpairs[q] = length(vic.time) + mean.time[q] = mean(vic.time) + med.time[q] = median(vic.time) + mean.50[q] = mean(vic.time<=50) + mean.100[q] = mean(vic.time<=100) + # decay.rate[sim,q] = getDecayRate(vic.time) #get lm of histogram. or maybe just cor? } print(proc.time()-ptm) ####### plot simulation data ####### -data = 0.1176774 # 774.5483, 634, 0.07252472, 0.1176774 -simdata = mean.100 +data = 634 # 774.5483, 634, 0.07252472, 0.1176774 +simdata = med.time xl = c(0.9*min(min(simdata),data),1.1*max(max(simdata),data)) -yl = c(0,max(c(hist(simdata[1,],15)$density,hist(simdata[2,],15)$density,hist(simdata[3,],15)$density))) -hist(simdata[1,],15,xlim=xl,ylim=yl,col=rgb(0,1,0,1/4),freq=F, +yl = c(0,max(hist(simdata,15)$density)) +hist(simdata,15,xlim=xl,ylim=yl,col=rgb(0,1,0,1/4),freq=F, xlab='Neighbors Infected Within 100 Days', main=NULL) -hist(simdata[2,],15,col=rgb(0,0,1,1/4),freq=F,add=TRUE) -hist(simdata[3,],15,col=rgb(1,0,1,1/4),freq=F,add=TRUE) abline(v=data,lwd=4,col='red') |
