4 files changed, 120 insertions, 96 deletions
diff --git a/R Scripts/data-prep.R b/R Scripts/data-prep.R
index 3104ea2..2994575 100755
--- a/R Scripts/data-prep.R
+++ b/R Scripts/data-prep.R
@@ -1,5 +1,5 @@
 library(igraph)
-setwd('~/Documents/Cascade Project/Raw Data/')
+setwd('~/Documents/Violence Cascades/Raw Data/')
 
 #================
 # (1) load data
@@ -99,14 +99,6 @@ start_date = as.Date("2005-12-31")
 ## Get first arrest date in the study period for each person
 sub.arrests$dates = as.Date(arrests$arrest_date,format='%m/%d/%Y')
 sub.arrests = sub.arrests[order(sub.arrests$dates),]
-sub.arrests = sub.arrests[match(unique(sub.arrests$individuals),sub.arrests$individuals),]
-arrest.dates = as.Date(sub.arrests$dates,format='%m/%d/%y')
-arrest.days = as.numeric(arrest.dates-start_date)
-V(person)$arrest.day = arrest.days[match(V(person)$name, sub.arrests$individuals)]
-
-V(person)$age.arrest = floor(difftime(arrest.dates[match(V(person)$name, sub.arrests$individuals)],
-                                      V(person)$dob,
-                                      units='days')/365.25)
 
 #===================================================================
 
@@ -192,22 +184,30 @@ V(person)$faction.name <- as.character(gangs$FACTION_NAME[match_vector])
 
 #===================================================================
 # create id number
-V(person)$id = rank(V(person)$name)
 
 # save data
 # person = remove.edge.attribute(person,'weight')
 # person_data = get.data.frame(person,'both')
-save(person, file="chi-19mar2015.RData")
+save(person, file="chi-19aug2015.RData")
 
 #===================================================================
 # get LCC of the network
 lcc = induced.subgraph(person,which(clusters(person)$membership==which.max(clusters(person)$csize)))
+V(lcc)$id = rank(V(lcc)$name)
 V(lcc)$ir_no = V(lcc)$name
+V(lcc)$name = V(lcc)$id
 vic_ids = which(V(lcc)$vic)
+lcc_edges = as_data_frame(lcc,'edges')
+
+# update lcc_verts
 lcc_verts = get.data.frame(lcc,'vertices')
-lcc_edges = get.data.frame(lcc,'edges')
+lcc_verts = lcc_verts[,c(1,23,24,2:22)]
+
+# save file
 save(lcc, lcc_verts, lcc_edges, vic_ids, file="lcc.RData")
 
+#####
+# old stuff
 lcc_data = get.data.frame(lcc,'both')
 lcc = set.vertex.attribute(graph=lcc, name='name', value=V(lcc)$id)
 row.names(lcc_data$vertices) = lcc_data$vertices$id
diff --git a/R Scripts/generate-dag-dat.R b/R Scripts/generate-dag-dat.R
index a2df165..adef5f6 100755
--- a/R Scripts/generate-dag-dat.R
+++ b/R Scripts/generate-dag-dat.R
@@ -1,46 +1,60 @@
 library(igraph)
-setwd("~/Documents/Cascade Project/")
-load('Results/hyper-lcc.RData')
+setwd("~/Documents/Violence Cascades/")
+load('Raw Data/lcc.RData')
 
-vic_ids = which(V(hyp_lcc)$vic==TRUE)
+library(foreach)
+library(doMC)
+registerDoMC(cores=4)
 
-edgeWeights = function(eis){return(c(hyp_lcc_edges$weight[eis],Inf,Inf)[1:3])}
+edgeWeights = function(eis){return(c(lcc_edges$weight[eis],Inf,Inf)[1:3])}
+lcc2 = remove.edge.attribute(lcc,'weight')
 
-dag_dat_all = data.frame(matrix(nrow=1,ncol=10))
-hyp_lcc2 = remove.edge.attribute(hyp_lcc,'weight')
-ei = 1
-ptm=proc.time()
-for (u in vic_ids){
-  if ((which(vic_ids==u) %% 1000)==0)  print(which(vic_ids==u))
-  tu    = hyp_lcc_verts$vic.day[u]
-  u_spawn = hyp_lcc_verts$spawn.date[u]
-  nbhd  = unlist(neighborhood(hyp_lcc,nodes=u,order=3)) # get nodes within neighborhood
-  nbhd  = nbhd[-1] # don't want to include u in the neighborhood
-  tvs   = hyp_lcc_verts$vic.day[nbhd]
-  v_spawn = hyp_lcc_verts$spawn.date[nbhd]
-  nbhd  = nbhd[tu>v_spawn & (is.na(tvs) | tu<tvs)]
-  tvs   = hyp_lcc_verts$vic.day[nbhd]
-  dists = as.numeric(shortest.paths(hyp_lcc2,u,nbhd))
-  
-  es = get.shortest.paths(hyp_lcc2,u,nbhd,output='epath')$epath
-  weights = matrix(unlist(lapply(es,edgeWeights),use.names = F),ncol=3,byrow=T)
+vics = split(vic_ids, ceiling(seq_along(vic_ids)/98))
+dag_dat_lcc = c()
+for(i in 1:length(vics)){
+  ptm = proc.time()
+  print(c(i,length(vics)))  
+  vic_ids = unlist(vics[i], use.names=F)
+
+  ddl = foreach (u = vic_ids, .combine=rbind) %dopar% {
+    if ((which(vic_ids==u) %% 100)==0)  print(which(vic_ids==u))
+    
+    nbhd  = unlist(neighborhood(lcc,nodes=u,order=1)) # get nodes within neighborhood
+    nbhd  = nbhd[-1] # don't want to include u in the neighborhood
+    
+    dists = as.numeric(shortest.paths(lcc2,u,nbhd))
+    es = get.shortest.paths(lcc2,u,nbhd,output='epath')$epath
+    weights = matrix(unlist(lapply(es,edgeWeights),use.names = F),ncol=3,byrow=T)
+    
+    # make edge for every infection
+    ddlu = data.frame(matrix(nrow=1,ncol=7))
+    ei = 1
+    for (j in c(17:21,16)){
+      tu = lcc_verts[u,j]
+      if (is.na(tu)) next
+      
+      ddlu[ei:(ei+length(nbhd)-1),] = data.frame(rep(u,length(nbhd)), nbhd, 
+                                                rep(tu,length(nbhd)), dists, 
+                                                weights, row.names=NULL)
+      ei = ei + length(nbhd)
+    }
+    
+    return(ddlu)
+  }
   
-  #will be faster to pre-allocate and fill in rather than rbind each time
-  dag_dat_all[ei:(ei+length(nbhd)-1),] = data.frame(rep(u,length(nbhd)), nbhd, 
-                                                    rep(tu,length(nbhd)), tvs, dists, 
-                                                    weights, u_spawn, v_spawn, row.names=NULL)
-  ei = ei + length(nbhd)
+  dag_dat_lcc = rbind(dag_dat_lcc,ddl)
+  print(proc.time()-ptm)
 }
-print(proc.time()-ptm) #3.5 hours
-colnames(dag_dat_all) = c('from','to','t1','t2','dist','w1','w2','w3','spawn1','spawn2')
-rownames(dag_dat_all) = NULL
 
-# dag_dat_all$spawn1 = hyp_lcc_verts$spawn.date[dag_dat_all$from]
-# dag_dat_all$spawn2 = hyp_lcc_verts$spawn.date[dag_dat_all$to]
+colnames(dag_dat_lcc) = c('from','to','t1','dist','w1','w2','w3')
+rownames(dag_dat_lcc) = NULL
+
+save(dag_dat_lcc, file='Results/dag_dat_lcc.RData')
+write.csv(dag_dat_lcc, file='Results/dag_dat_lcc.csv')
 
-save(dag_dat_all, file='Results/dag_dat_all.RData')
-write.csv(dag_dat_all, file='Results/dag_dat_all.csv')
+# dag_dat_vics = dag_dat_lcc[!is.na(dag_dat_lcc$t2),]
+# save(dag_dat_lcc_vics, file='Results/dag_dat_lcc_vics.RData')
+# write.csv(dag_dat_lcc_vics, file='Results/dag_dat_lcc_vics.csv')
 
-dag_dat_vics = dag_dat_all[!is.na(dag_dat_all$t2),]
-save(dag_dat_vics, file='Results/dag_dat_vics.RData')
-write.csv(dag_dat_vics, file='Results/dag_dat_vics.csv')
+####
+# create lcc_vic_times
diff --git a/R Scripts/predict-victims-plots.R b/R Scripts/predict-victims-plots.R
index b872201..87b0a25 100644
--- a/R Scripts/predict-victims-plots.R
+++ b/R Scripts/predict-victims-plots.R
@@ -15,14 +15,14 @@ plot(lambdas,counts[1,],log='x',type='l')
 correct_rank1 = correct_rank[,length(lambdas)] # demographics model
 correct_rank2 = correct_rank[,1] # cascade model
 correct_rank3 = correct_rank[,which.min(colMeans(correct_rank))] # best combined model
-counts = matrix(c( sum(correct_rank1<(vcount(lcc)*0.001)),
-                  sum(correct_rank1<(vcount(lcc)*0.005)),
+counts = matrix(c( sum(correct_rank1<(vcount(lcc)*0.0005)),
+                  sum(correct_rank1<(vcount(lcc)*0.001)),
                   sum(correct_rank1<(vcount(lcc)*0.01)),
+                  sum(correct_rank2<(vcount(lcc)*0.0005)),
                   sum(correct_rank2<(vcount(lcc)*0.001)),
-                  sum(correct_rank2<(vcount(lcc)*0.005)),
                   sum(correct_rank2<(vcount(lcc)*0.01)),
+                  sum(correct_rank3<(vcount(lcc)*0.0005)),
                   sum(correct_rank3<(vcount(lcc)*0.001)),
-                  sum(correct_rank3<(vcount(lcc)*0.005)),
                   sum(correct_rank3<(vcount(lcc)*0.01))),
                 nrow=3, byrow=T)
 counts = counts*100/nvics
@@ -53,7 +53,7 @@ lines(popsizes,counts[3,])
 lines(c(0,1),c(0,1))
 
 #### Precision-Recall Curve
-plot(ecdf(correct_rank1),col='red',xlim=c(0,vcount(lcc)),lwd=2)
+plot(ecdf(correct_rank1),col='red',xlim=c(0,50),lwd=2)
 plot(ecdf(correct_rank2),col='darkblue',lwd=2,add=T)
 plot(ecdf(correct_rank3),col='darkgreen',lwd=2,add=T)
 legend("bottomright", inset=0.05, 
diff --git a/R Scripts/sim-analysis.R b/R Scripts/sim-analysis.R
index 3960578..5010485 100755
--- a/R Scripts/sim-analysis.R
+++ b/R Scripts/sim-analysis.R
@@ -1,5 +1,5 @@
 library(igraph)
-setwd("~/Documents/Cascade Project/")
+setwd("~/Documents/Violence Cascades/")
 
 load('Results/hyper-lcc.RData')
 vic_ids = which(V(hyp_lcc)$vic==1)
@@ -7,56 +7,66 @@ n.infections = length(vic_ids)
 n.days = max(hyp_lcc_verts$vic.day,na.rm=T) - min(hyp_lcc_verts$vic.day,na.rm=T)
 inf.dates = hyp_lcc_verts$vic.day[vic_ids]
 
+birthyears = as.numeric(format(as.Date(hyp_lcc_verts$dob[vic_ids]),'%Y'))
+
+# nbrs = neighborhood(graph, nodes=vic_ids, order=1)
+load('Results/vic-nbrs.RData')
+
+n = 150
+mean.time = matrix(0,1,n)
+med.time = matrix(0,1,n)
+mean.50 = matrix(0,1,n)
+mean.100 = matrix(0,1,n)
+n.vicpairs = matrix(0,1,n)
+
+hyp_lcc = upgrade_graph(hyp_lcc)
+
 ptm = proc.time()
-n = 1500
-mean.time = matrix(0,3,n)
-med.time = matrix(0,3,n)
-mean.50 = matrix(0,3,n)
-mean.100 = matrix(0,3,n)
-n.vicpairs = matrix(0,3,n)
-for(sim in 1:3){
-  print(paste('sim:',sim))
-  for(q in 1:n){
-    if (q%%250==0) print(paste('run:',q))
-    graph = hyp_lcc
-    if (sim<3) sim.dates = sample(n.days, n.infections, replace=TRUE) # sims 1 + 2
-    if (sim==3) sim.dates = sample(inf.dates) # sim 3
-    if (sim==1) vics = sample(vcount(hyp_lcc), n.infections, replace=FALSE) # sim 1
-    if (sim>1) vics = vic_ids # sims 2 + 3
-    if (sim==0) {vics = vic_ids; sim.dates = inf.dates} # data
-    
-    vic.time = c()
-    for (i in 1:n.infections){
-      u = vics[i]
-      nbhd = unlist(neighborhood(graph, nodes=u, order=1))
-      nbhd = intersect(vic_ids,nbhd)
-      nbhd = setdiff(nbhd,u)
-      nbhd = nbhd[u<nbhd] # only looks at pairs with u<v to avoid double-counting
-      nbhd = nbhd[hyp_lcc_verts$ir_no[u] != hyp_lcc_verts$ir_no[nbhd]] # don't count infections of the same person
-      tu = sim.dates[i]
-      tvs = sim.dates[match(nbhd,vic_ids)]
-      vic.time = c(vic.time,abs(tu-tvs))
-    }
-    n.vicpairs[sim,q] = length(vic.time)
-    mean.time[sim,q] = mean(vic.time)
-    med.time[sim,q] = median(vic.time)
-    mean.50[sim,q] = mean(vic.time<=50)
-    mean.100[sim,q] = mean(vic.time<=100)
-    #     decay.rate[sim,q] = getDecayRate(vic.time) #get lm of histogram. or maybe just cor?
+for(q in 1:n){
+  if (q%%250==0) print(paste('run:',q))
+  graph = hyp_lcc
+  vics = vic_ids
+  
+  sim.dates = rep(0,length(vic_ids))
+  for(y in unique(birthyears)){
+    year_ids = which(birthyears==y)
+    if (length(year_ids)>1) sim.dates[year_ids] = sample(inf.dates[year_ids])
+  }
+#   {vics = vic_ids; sim.dates = inf.dates} # data
+  
+  vic.time = rep(NA,14885)
+  idx = 1
+  for (i in 1:n.infections){
+    u = vics[i]
+    nbhd = nbrs[[i]]
+    nbhd = intersect(vic_ids,nbhd)
+    nbhd = nbhd[u<nbhd] # only looks at pairs with u<v to avoid double-counting
+    nbhd = setdiff(nbhd,u)
+    nbhd = nbhd[hyp_lcc_verts$ir_no[u] != hyp_lcc_verts$ir_no[nbhd]] # don't count infections of the same person
+    if (length(nbhd)==0) next
+    tu = sim.dates[i]
+    tvs = sim.dates[match(nbhd,vic_ids)]
+    vic.time[idx:(idx+length(nbhd)-1)] = abs(tu-tvs)
+    idx = idx + length(nbhd)
   }
+
+  n.vicpairs[q] = length(vic.time)
+  mean.time[q] = mean(vic.time)
+  med.time[q] = median(vic.time)
+  mean.50[q] = mean(vic.time<=50)
+  mean.100[q] = mean(vic.time<=100)
+  #     decay.rate[sim,q] = getDecayRate(vic.time) #get lm of histogram. or maybe just cor?
 }
 print(proc.time()-ptm)
 
 
 ####### plot simulation data #######
-data = 0.1176774 # 774.5483, 634, 0.07252472, 0.1176774
-simdata = mean.100
+data = 634 # 774.5483, 634, 0.07252472, 0.1176774
+simdata = med.time
 xl = c(0.9*min(min(simdata),data),1.1*max(max(simdata),data))
-yl = c(0,max(c(hist(simdata[1,],15)$density,hist(simdata[2,],15)$density,hist(simdata[3,],15)$density)))
-hist(simdata[1,],15,xlim=xl,ylim=yl,col=rgb(0,1,0,1/4),freq=F,
+yl = c(0,max(hist(simdata,15)$density))
+hist(simdata,15,xlim=xl,ylim=yl,col=rgb(0,1,0,1/4),freq=F,
      xlab='Neighbors Infected Within 100 Days', main=NULL)
-hist(simdata[2,],15,col=rgb(0,0,1,1/4),freq=F,add=TRUE)
-hist(simdata[3,],15,col=rgb(1,0,1,1/4),freq=F,add=TRUE)
 abline(v=data,lwd=4,col='red')