added my R scripts

author: Ben Green <bgreen@g.harvard.edu> 2015-06-08 15:21:51 -0400
committer: Ben Green <bgreen@g.harvard.edu> 2015-06-08 15:21:51 -0400
commit: 1739e9f5706bb8a73de5dbf0b467de49ea040898 (patch)
tree: 6f1d0f166986c5f0757be9b40d8eeb3409ab022c /R Scripts
parent: e5dada202c34521618bf82a086093c342841e5e8 (diff)
download: criminal_cascades-1739e9f5706bb8a73de5dbf0b467de49ea040898.tar.gz
20 files changed, 1134 insertions, 0 deletions
diff --git a/R Scripts/-compare-sim.R b/R Scripts/-compare-sim.R
new file mode 100755
index 0000000..d96bd81
--- /dev/null
+++ b/R Scripts/-compare-sim.R
@@ -0,0 +1,34 @@
+
+vics = vic_ids
+graph = lcc
+
+n.infections = length(vics)
+vic.neighbors = rep(0,n.infections)
+for (i in 1:n.infections){
+  u = vics[i]
+  nbhd = unlist(neighborhood(graph, nodes=u, order=3))
+  nbhd = intersect(vic_ids,nbhd)
+  vic.neighbors[i] = sum(nbhd %in% vics)
+}
+
+
+n.infections = length(vics)
+vic.time = rep(0,n.infections)
+for (i in 1:n.infections){
+  if (i%%1000==0) print(i)
+  u = vics[i]
+  nbhd = unlist(neighborhood(graph, nodes=u, order=1))
+  nbhd = intersect(vic_ids,nbhd)
+  nbhd = setdiff(nbhd,u)
+  tu = as.numeric(V(graph)$vic_date[u])
+  tvs = as.numeric(V(graph)$vic_date[nbhd])
+  tvs = tvs[tvs<tu]
+  if (length(tvs)>0){
+    vic.time[i] = min(tu-tvs)
+  }
+}
+vic.time = vic.time[vic.time>0]
+mean(vic.time)
+median(vic.time)
+mean(vic.time<100)
+sum(vic.time>0)
diff --git a/R Scripts/-generate-dag-dat-sim.R b/R Scripts/-generate-dag-dat-sim.R
new file mode 100755
index 0000000..a6d115d
--- /dev/null
+++ b/R Scripts/-generate-dag-dat-sim.R
@@ -0,0 +1,50 @@
+library(igraph)
+setwd("/Users/Ben/Documents/Harvard/Fall 2014/CS 284r Social Data Mining/Cascade Project/")
+load('Data/lcc_sim2a.RData')
+
+# d = remove.edge.attribute(person,'weight')
+# lcc = induced.subgraph(d,which(clusters(d)$membership==which.max(clusters(d)$csize)))
+# vic_ids = which(V(lcc)$vic==TRUE)
+# dag = graph.empty(vcount(lcc))
+# dag = set.edge.attribute(dag,'weight',value=0)
+
+lcc = lcc.sim
+vic_ids = vic_ids.sim
+ei = 1
+
+dag_dat = matrix(0,253096,5)
+for (1 in 1:length(vic_ids)){
+  if (i %% 100)==0)  print(which(vic_ids==u))
+
+  u = vic_ids[i]
+  nbhd = unlist(neighborhood(lcc, nodes=u, order=3)) # get nodes within neighborhood
+  nbhd = intersect(vic_ids,nbhd) # only want victim nodes
+  nbhd = setdiff(nbhd,u) # don't want to include u in the neighborhood
+  
+  tu = as.numeric(V(lcc)$vic_date[u])
+  tvs = as.numeric(V(lcc)$vic_date[nbhd])
+  Es = tu<tvs
+  tvs = tvs[Es]
+
+  if (sum(Es)>0){
+    nbhd = nbhd[Es]
+    dists = as.numeric(shortest.paths(lcc,u,nbhd))  
+    
+    for (j in 1:sum(Es)){
+      v = nbhd[j]
+      d = dists[j]
+      tv = tvs[j]
+#       dag_dat[ei,] = c(u,v,d,tu,tv)
+      dag_dat = rbind(dag_dat,c(u,v,d,tu,tv))
+#       f = time(tu,tv)
+#       h = structural(d)
+#       weight[ei] = f*h
+      ei = ei+1
+    }
+  }
+}
+
+dag_dat = dag_dat[rowSums(dag_dat)>0,]
+dag_dat = as.data.frame(dag_dat)
+colnames(dag_dat) = c('from','to','dist','t1','t2')
+# save(dag_dat,file='Data/dag_dat_sim2a.RData')
diff --git a/R Scripts/-recover-data.R b/R Scripts/-recover-data.R
new file mode 100755
index 0000000..b34ffc6
--- /dev/null
+++ b/R Scripts/-recover-data.R
@@ -0,0 +1,27 @@
+library(igraph)
+setwd("/Users/Ben/Documents/Harvard/Fall 2014/CS 284r Social Data Mining/Cascade Project/")
+
+el = read.csv('Data/dag.csv')
+lcc = induced.subgraph(d,which(clusters(d)$membership==which.max(clusters(d)$csize)))
+vic_ids = which(V(lcc)$vic==TRUE)
+
+from = vic_ids[el$from]
+to = vic_ids[el$to]
+
+t1 = as.Date(V(lcc)$vic_date[from],format='%m/%d/%y')
+t2 = as.Date(V(lcc)$vic_date[to],format='%m/%d/%y')
+
+uf = unique(from)
+dist = rep(0,length(from))
+for (i in 1:length(uf)){
+  if (i%%1000==0) print(i)
+  f = uf[i]
+  fi = which(from==f)
+  ds = as.numeric(shortest.paths(lcc,v=f,to=to[fi]))
+  dist[fi] = ds
+}
+
+dag_dat = data.frame(from=el$from,to=el$to,dist,t1,t2)
+
+save(dag_dat,file='Data/dag_dat.RData')
+write.csv(dag_dat,'Data/dag_dat.csv')
diff --git a/R Scripts/-simulate-infections.R b/R Scripts/-simulate-infections.R
new file mode 100755
index 0000000..f1409af
--- /dev/null
+++ b/R Scripts/-simulate-infections.R
@@ -0,0 +1,34 @@
+library(igraph)
+setwd("/Users/Ben/Documents/Harvard/Fall 2014/CS 284r Social Data Mining/Cascade Project/")
+
+load('Data/dag_dat.RData')
+load('Data/lcc.RData')
+
+n.infections = length(vic_ids)
+n.nodes = (vcount(lcc))
+n.days = as.numeric(max(dag_dat$t2) - min(dag_dat$t1))
+gamma = n.infections/(n.nodes*n.days)
+
+vic = rep(FALSE,n.nodes)
+for(t in 1:n.days){
+  if (t%%1000==0) print(t)
+  infections = which(runif(n.nodes)<gamma)
+  new.infections = infections[vic[infections]==0]
+  vic[new.infections] = t
+}
+
+lcc.sim = lcc
+V(lcc.sim)$vic = vic>0
+V(lcc.sim)$vic_date = vic
+vic_ids.sim =  which(vic>0)
+# save(lcc.sim,vic_ids.sim, file='Data/lcc_sim1b.RData')
+
+#### simulation method 2 ####
+load('Data/dag_dat.RData')
+load('Data/lcc.RData')
+n.infections = length(vic_ids)
+n.days = as.numeric(max(dag_dat$t2) - min(dag_dat$t1))
+lcc.sim = lcc
+V(lcc.sim)$vic_date[vic_ids] = sample(n.days, n.infections, replace=TRUE)
+vic_ids.sim =  which(V(lcc.sim)$vic)
+# save(lcc.sim, vic_ids.sim, file='Data/lcc_sim2a.RData')
diff --git a/R Scripts/.Rhistory b/R Scripts/.Rhistory
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/R Scripts/.Rhistory
diff --git a/R Scripts/README.rtf b/R Scripts/README.rtf
new file mode 100644
index 0000000..48e0353
--- /dev/null
+++ b/R Scripts/README.rtf
@@ -0,0 +1,22 @@
+{\rtf1\ansi\ansicpg1252\cocoartf1344\cocoasubrtf720
+{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
+{\colortbl;\red255\green255\blue255;}
+\margl1440\margr1440\vieww10800\viewh8400\viewkind0
+\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural
+
+\f0\fs24 \cf0 PIPELINE:\
+data-prep \'97 generate co-offending network and lcc\
+create-hyper-lcc \'97 generate new network with a unique node for each infection\
+generate-dag-dat \'97 generate a dag data frame based on possible infection edges\
+find-cascades \'97 calculate edge weights and find transmission edges, yielding a forest of cascades\
+\
+OTHER FILES:\
+analyze-cascades: simple code to analyze cascades the model generates\
+compare-sim:\
+data-exploration:\
+lm: linear model to predict victims based on demographic and other features\
+plot-graph:\
+sim-analysis:\
+simulate-infections:\
+structural: function to calculate the structural component of the infection model\
+temporal: function to calculate the temporal component of the infection model}
+\ No newline at end of file
diff --git a/R Scripts/analyze-cascades.R b/R Scripts/analyze-cascades.R
new file mode 100755
index 0000000..463de18
--- /dev/null
+++ b/R Scripts/analyze-cascades.R
@@ -0,0 +1,58 @@
+# library(igraph)
+# setwd("~/Documents/Cascade Project/")
+# 
+# load('Results/dag_dat_all.RData')
+# load('Results/weight-12-1-14.RData')
+# load('Results/hyper-lcc.RData')
+
+# dag = graph.edgelist(as.matrix(dag_dat[,1:2]))
+# dag = set.edge.attribute(dag,'weight',value=weight)
+# dag_dat = dag_dat[which(E(dag)$weight>=0.4),]
+# dag = delete.edges(dag, which(E(dag)$weight<0.4))
+
+table(clusters(dag)$csize)
+
+clusters = clusters(dag)
+membership = clusters$membership
+csize = clusters$csize
+order = rev(order(csize))
+
+#use table not hist
+plot(sizes,counts,log='xy',type='o',lwd=3,
+     xlab='Size of Cascade', ylab='Number of Cascades', main='Distribution of Cascade Sizes')
+
+i = 14
+V = which(clusters(dag)$membership==order[i]) # get all nodes in cluster
+cc = induced.subgraph(dag,V)
+Vi = vic_ids[V]
+Ei = intersect(which(dag_dat_vics$from[realized] %in% Vi),which(dag_dat_vics$to[realized] %in% Vi))
+cc_dat = (dag_dat_vics[realized,])[Ei,]
+
+### plot cascade ###
+cols = rep('lightblue',vcount(cc))
+seed = which(degree(cc,mode='in')==0)
+cols[seed] = 'red'
+plot(cc,vertex.size=10,edge.arrow.size=0.5,vertex.color=cols,vertex.label.cex=1,
+     edge.width=E(cc)$weight*20/max(E(cc)$weight),layout=layout.reingold.tilford(cc,root=seed),
+     vertex.label=V(hyp_lcc)$vic.day[Vi])
+# vertex.label=as.Date(V(hyp_lcc)$vic.day[Vi], format ='%m/%d/%y'))
+#      vertex.label=V(lcc)$vic_date[Vi])
+
+### basic graph statistics
+trl = mean(transitivity(cc,type='local',isolates='zero'))
+apl = average.path.length(cc)
+indeg = degree(cc,mode='in')
+outdeg = degree(cc,mode='out')
+ds = mean(cc_dat$dist)
+
+### node demographic statistics
+from = vic_ids[cc_dat$from]
+to = vic_ids[cc_dat$to]
+V(lcc)$sex[from] == V(lcc)$sex[to]
+V(lcc)$sex[Vi]
+V(lcc)$race[Vi]
+as.numeric(V(lcc)$age[Vi])
+V(lcc)$gang.member[Vi]
+V(lcc)$gang.name[Vi]
+V(lcc)$faction.name[Vi]
+
diff --git a/R Scripts/arrest-vs-infection-times.R b/R Scripts/arrest-vs-infection-times.R
new file mode 100644
index 0000000..222be3e
--- /dev/null
+++ b/R Scripts/arrest-vs-infection-times.R
@@ -0,0 +1,43 @@
+library(igraph)
+setwd('~/Documents/Cascade Project/Raw Data/')
+
+##### Load arrest data
+arrests <- read.csv("2006to2014arrests2.csv", header=T, colClass=c("character"))
+arrests$ir2 <- paste("ir", arrests$ir_no)
+sub.arrests <- subset(arrests, select=c(as.character("rd_no"), as.character('ir2'), 
+                                        as.character('arrest_date')))
+colnames(sub.arrests) <- c("events", "individuals", "dates")
+sub.arrests$dates = as.Date(sub.arrests$dates,format='%m/%d/%Y')
+sub.arrests = sub.arrests[order(sub.arrests$dates),]
+# individuals <- unique(sub.arrests$individuals)
+# events <- unique(sub.arrests$events)
+
+##### Look at nonfatal victims
+shootings <- read.csv("shooting-data-withdate2.csv", header = T)
+victims = shootings[shootings$INV_PARTY_TYPE_CD=="VIC",]
+victims = victims[!is.na(victims$IR_NO),]
+victims$ir2 <- paste("ir", victims$IR_NO)
+victims$INCIDENT_DATE = as.Date(victims$INCIDENT_DATE,format='%m/%d/%y')
+victims = victims[order(victims$INCIDENT_DATE),] # sort so match gets first infection
+victims = victims[victims$ir2 %in% arrests$ir2,] # only look at vics who were arrested
+
+vic_dates = victims$INCIDENT_DATE[match(unique(victims$ir2),victims$ir2)]
+arrest_dates = sub.arrests$dates[match(unique(victims$ir2), sub.arrests$individuals)]
+sum(vic_dates==arrest_dates)
+# the number of vics in arrest table doesn't match number of nonfatal vics in person?
+
+##### Look at fatal victims
+murders = read.csv("murder-victims-13nov.csv", header=T)
+murders = murders[!is.na(murders$VICTIM_IR_NO),]
+murders = murders[murders$INJURY_DESCR=="SHOT",]
+murders = murders[match(unique(murders$VICTIM_IR_NO),murders$VICTIM_IR_NO),]
+murders$ir2 = paste("ir", murders$VICTIM_IR_NO)
+start_date = as.Date("2005-12-31")
+murders$INJURY_DATE = as.Date(murders$INJURY_DATE,format='%m/%d/%y')
+murders = murders[murders$INJURY_DATE>=start_date,]
+murders = murders[order(murders$INJURY_DATE),]
+murders = murders[murders$ir2 %in% arrests$ir2,]
+
+mur_dates = murders$INJURY_DATE
+arrest_dates = sub.arrests$dates[match(murders$ir2, sub.arrests$individuals)]
+sum(mur_dates==arrest_dates)
diff --git a/R Scripts/benchmarking.R b/R Scripts/benchmarking.R
new file mode 100644
index 0000000..1cf99e0
--- /dev/null
+++ b/R Scripts/benchmarking.R
@@ -0,0 +1,130 @@
+library(igraph)
+setwd('~/Documents/Cascade Project')
+load('Raw Data/lcc.RData')
+load('Results/hyper-lcc.RData')
+load('Results/dag_dat_all.RData')
+source('Scripts/temporal.R')
+source('Scripts/structural.R')
+
+##### Initialize data
+formula = vic ~ sex + race + age + gang.member + gang.name
+lcc_verts$sex = as.factor(lcc_verts$sex)
+lcc_verts$race = as.factor(lcc_verts$race)
+lcc_verts$age = as.numeric(lcc_verts$age)
+lcc_verts$gang.name = as.factor(lcc_verts$gang.name)
+# sum(hyp_lcc_verts$vic)/length(days)
+
+##### Loop through days
+alpha = 1/100
+gamma = 0.18
+days = 70:max(hyp_lcc_verts$vic.day, na.rm=T)
+lambdas = 0#c(0, exp(seq(log(0.0000001), log(.0005), length.out=150)), 1)
+nvics = sum(hyp_lcc_verts$vic.day %in% days)
+correct_rank = matrix(nrow=nvics, ncol=length(lambdas))
+# correct_rank1 = correct_rank2 = correct_rank3 = c()
+edges_all = dag_dat_all[dag_dat_all$dist<2,]
+ptm = proc.time()
+for (day in days){
+  if (day %% 100 == 0) print(day)
+  
+  ##### Demographics model
+#   vics = match(unique(hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day<day)]),lcc_verts$name)
+#   victims = lcc_verts[,c('vic','sex','race','age','gang.member','gang.name')]
+#   victims$vic[vics] = TRUE
+#   victims$vic[-vics] = FALSE
+# #   glm.fit = glm(formula, data=victims, family=binomial)
+#   glm.fit = lm(formula, data=victims)
+#   glm.probs = predict(glm.fit, newdata=lcc_verts, type='response')
+  
+  ##### Cascade Model
+  edges = edges_all[which(edges_all$t1<day),]
+  f = temporal(edges$t1, day, alpha)
+  h = structural(gamma,edges$dist)
+  weights = f*h
+  ids = edges$to
+  irs = hyp_lcc_verts$ir_no[ids]
+  risk = data.frame(id=ids, ir=irs, weight=weights)
+  risk = risk[order(weights, decreasing=T),]
+  risk = risk[match(unique(risk$ir),risk$ir),]
+# maybe need to change this to reflect new algorithm that accounts for \tilde{p}
+
+  ##### Combined Model
+#   combined = data.frame(ir=attr(glm.probs,'name'), dem=as.numeric(glm.probs), cas=0, comb=0)
+  combined$cas[match(risk$ir, attr(glm.probs,'name'))] = risk$weight
+  
+  ##### Gather results
+  infected_irs = hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day==day)]
+  for (lambda in lambdas){
+    combined$comb = combined$cas#lambda*combined$dem + (1-lambda)*(1-combined$dem)*combined$cas
+    c_idx = which(lambdas==lambda)
+    r_idx = head(which(is.na(correct_rank[,c_idx])),length(infected_irs))
+    # !! order should be first: rank of (3,5,5,7) should be (1,2,2,4), may need to do n-rank
+    correct_rank[r_idx,c_idx] = match(infected_irs, combined$ir[order(combined$comb, decreasing=T)])
+    # maybe should also mark down vic/nonvic status of each?
+  }
+  
+}
+print(proc.time()-ptm)
+
+
+##### Plot results
+hist(correct_rank3,150,xlim=c(0,vcount(lcc)),col=rgb(0,0,1,1/8),
+     xlab='Risk Ranking of Victims',main='')
+hist(correct_rank1,150,xlim=c(0,vcount(lcc)),col=rgb(1,0,1,1/8),add=T)
+hist(correct_rank2,150,xlim=c(0,vcount(lcc)),col=rgb(1,0,1,1/8),add=T)
+legend("topright", c("Demographics Model", "Cascade Model"), 
+       fill=c(rgb(1,0,1,1/8), rgb(0,0,1,1/8)))
+
+counts = matrix(c(colSums(correct_rank<(vcount(lcc)/1000))*100/nvics,
+                  colSums(correct_rank<(vcount(lcc)/200))*100/nvics,
+                  colSums(correct_rank<(vcount(lcc)/100))*100/nvics),
+                nrow=3, byrow=T)
+plot(lambdas,counts[1,],log='x',type='l')
+
+correct_rank1 = correct_rank[,length(lambdas)]
+correct_rank2 = correct_rank[,1]
+correct_rank3 = correct_rank[,which.min(colMeans(correct_rank))]
+counts = matrix(c(sum(correct_rank1<(vcount(lcc)*0.001)),
+               sum(correct_rank1<(vcount(lcc)*0.005)),
+               sum(correct_rank1<(vcount(lcc)*0.01)),
+               sum(correct_rank2<(vcount(lcc)*0.001)),
+               sum(correct_rank2<(vcount(lcc)*0.005)),
+               sum(correct_rank2<(vcount(lcc)*0.01)),
+               sum(correct_rank3<(vcount(lcc)*0.001)),
+               sum(correct_rank3<(vcount(lcc)*0.005)),
+               sum(correct_rank3<(vcount(lcc)*0.01))),
+               nrow=3, byrow=T)
+counts = counts*100/nvics
+barplot(counts, 
+        xlab="Size of High-Risk Population",
+        ylab="Percent of Victims Predicted",
+        names.arg=c('0.1%','0.5%','1%'),ylim=c(0,max(counts)*1.1),
+        col=c(rgb(0,0,1,1/2),rgb(1,0,0,1/2),rgb(0,1,0,1/2)),
+        beside=TRUE)
+legend("topleft", inset=0.05, 
+       c("Demographics Model", "Cascade Model", "Combined Model"), 
+       fill=c(rgb(0,0,1,1/2),rgb(1,0,0,1/2),rgb(0,1,0,1/2)))
+box(which='plot')
+par(new=T)
+counts = counts/(100/nvics)
+barplot(counts, 
+        ylim=c(0,max(counts)*1.1),
+        col=c(rgb(0,0,1,0),rgb(1,0,0,0),rgb(0,1,0,0)),
+        beside=TRUE)
+axis(side = 4)
+mtext(side = 4, line = 3, "Number of Victims Predicted")
+
+popsizes = c(0.1, 0.5, 1)
+plot(popsizes,counts[1,],type='l',ylim=c(0,max(counts)))
+lines(popsizes,counts[2,])
+lines(popsizes,counts[3,])
+lines(c(0,1),c(0,1))
+
+#### Precision-Recall Curve
+plot(ecdf(correct_rank1),col='red',xlim=c(0,vcount(lcc)),lwd=2)
+plot(ecdf(correct_rank2),col='darkblue',lwd=2,add=T)
+plot(ecdf(correct_rank3),col='darkgreen',lwd=2,add=T)
+legend("bottomright", inset=0.05, 
+       c("Demographics Model", "Cascade Model", "Combined Model"), 
+       fill=c('red','darkblue','darkgreen'))
+lines(c(0,vcount(lcc)),c(0,1))
diff --git a/R Scripts/create-hyper-lcc.R b/R Scripts/create-hyper-lcc.R
new file mode 100644
index 0000000..786b694
--- /dev/null
+++ b/R Scripts/create-hyper-lcc.R
@@ -0,0 +1,126 @@
+library(igraph)
+setwd("~/Documents/Cascade Project/")
+load('Raw Data/lcc.RData')
+
+lcc_verts = get.data.frame(lcc,'vertices')
+lcc_edges = get.data.frame(lcc,'edges')
+
+
+##### Create new vertices dataframe
+cols = c('id','ir_no','sex','race','dob','age.arrest','arrest.day',
+         'gang.member','gang.name','faction.name')
+hyp_lcc_verts = data.frame(matrix(ncol=length(cols)+1,nrow=0))
+colnames(hyp_lcc_verts) = c(cols,'spawn.date')
+  
+ptm = proc.time()
+ri = 1
+for (i in 1:dim(lcc_verts)[1]){
+  if (i%%10000==0) print(i)
+  if (lcc_verts$vic[i]){
+    if (lcc_verts$vic.nonfatal[i]>0){
+      # create nodes for each nonfatal shooting
+      for (nf in 1:lcc_verts$vic.nonfatal[i]){
+        hyp_lcc_verts[ri,cols] = lcc_verts[i,cols]
+        hyp_lcc_verts$id[ri] = ri
+        hyp_lcc_verts$vic[ri] = T
+        hyp_lcc_verts$vic.type[ri] = 'nonfatal'
+        if(nf==1){
+          hyp_lcc_verts$vic.day[ri] = lcc_verts$nonfatal_day_1[i]
+          hyp_lcc_verts$spawn.date[ri] = 0
+        } else if(nf==2){
+          hyp_lcc_verts$vic.day[ri] = lcc_verts$nonfatal_day_2[i]
+          hyp_lcc_verts$spawn.date[ri] = lcc_verts$nonfatal_day_1[i]
+        } else if(nf==3){
+          hyp_lcc_verts$vic.day[ri] = lcc_verts$nonfatal_day_3[i]
+          hyp_lcc_verts$spawn.date[ri] = lcc_verts$nonfatal_day_2[i]
+        } else if(nf==4){
+          hyp_lcc_verts$vic.day[ri] = lcc_verts$nonfatal_day_4[i]
+          hyp_lcc_verts$spawn.date[ri] = lcc_verts$nonfatal_day_3[i]
+        } else if(nf==5){
+          hyp_lcc_verts$vic.day[ri] = lcc_verts$nonfatal_day_5[i]
+          hyp_lcc_verts$spawn.date[ri] = lcc_verts$nonfatal_day_4[i]
+        }
+        ri = ri+1
+      }
+      # if no fatal infection, create uninfected duplicate
+      if (!lcc_verts$vic.fatal[i]){
+        hyp_lcc_verts[ri,cols] = lcc_verts[i,cols]
+        hyp_lcc_verts$id[ri] = ri
+        hyp_lcc_verts$vic[ri] = F
+        hyp_lcc_verts$vic.type[ri] = NA
+        hyp_lcc_verts$vic.day[ri] = NA
+        hyp_lcc_verts$spawn.date[ri] = max(lcc_verts$nonfatal_day_1[i],lcc_verts$nonfatal_day_2[i],
+                                           lcc_verts$nonfatal_day_3[i],lcc_verts$nonfatal_day_4[i],
+                                           lcc_verts$nonfatal_day_5[i],na.rm=T)
+        ri = ri+1
+      }
+    }
+    # create a node for each fatal shooting
+    # if also nonfatal shootings, spawn at last nonfatal shooting
+    if (lcc_verts$vic.fatal[i]){
+      hyp_lcc_verts[ri,cols] = lcc_verts[i,cols]
+      hyp_lcc_verts$id[ri] = ri
+      hyp_lcc_verts$vic[ri] = T
+      hyp_lcc_verts$vic.type[ri] = 'fatal'
+      hyp_lcc_verts$vic.day[ri] = lcc_verts$fatal_day[i] 
+      if (lcc_verts$vic.nonfatal[i]>0){
+        hyp_lcc_verts$spawn.date[ri] = max(lcc_verts$nonfatal_day_1[i],lcc_verts$nonfatal_day_2[i],
+                                           lcc_verts$nonfatal_day_3[i],lcc_verts$nonfatal_day_4[i],
+                                           lcc_verts$nonfatal_day_5[i],na.rm=T)
+      } else {
+        hyp_lcc_verts$spawn.date[ri] = 0
+      }
+      ri = ri+1
+    }
+  } 
+  # create an uninfected node for each uninfected person
+  else{
+    hyp_lcc_verts[ri,cols] = lcc_verts[i,cols]
+    hyp_lcc_verts$id[ri] = ri
+    hyp_lcc_verts$vic[ri] = F
+    hyp_lcc_verts$vic.type[ri] = NA
+    hyp_lcc_verts$vic.day[ri] = NA
+    hyp_lcc_verts$spawn.date[ri] = 0
+    ri = ri+1
+  }
+}
+print(proc.time()-ptm) # 1.5 hrs
+row.names(hyp_lcc_verts) = NULL
+n.nodes = sum(sum(V(lcc)$vic.fatal),sum(V(lcc)$vic.nonfatal),
+              sum(V(lcc)$vic.nonfatal>0 & !V(lcc)$vic.fatal),sum(V(lcc)$vic==FALSE))
+stopifnot(dim(hyp_lcc_verts)[1] == n.nodes)
+
+##### Create new edgelist
+print('Edges')
+hyp_lcc_edges = data.frame(from=0, to=0, weight=0)
+ei = 1
+ptm = proc.time()
+for(i in 1:dim(hyp_lcc_verts)[1]){
+  if (i%%10000==0) print(i)
+  ego_id = hyp_lcc_verts$id[i]
+  ego_ir = hyp_lcc_verts$ir_no[i]
+  alter_irs = union(lcc_edges$from[which(lcc_edges$to==ego_ir)],
+                    lcc_edges$to[which(lcc_edges$from==ego_ir)])
+#   alter_irs = union(ego_ir, alter_irs) # add edges to other infected selves
+  alter_ids = hyp_lcc_verts$id[which(hyp_lcc_verts$ir_no %in% alter_irs)]
+  alter_ids = alter_ids[ego_id<alter_ids] # avoid double-counting edges
+  for(alter_id in alter_ids){
+    weight=Inf
+    alter_ir = hyp_lcc_verts$ir_no[alter_id]
+    if(ego_ir!=alter_ir){
+      edge_id = which((lcc_edges$from %in% c(ego_ir,alter_ir) + lcc_edges$to %in% c(ego_ir,alter_ir))==2)
+      weight = lcc_edges$weight[edge_id]
+    }
+    hyp_lcc_edges[ei,] = c(ego_id, alter_id, weight)
+    ei = ei + 1
+  }
+}
+print(proc.time()-ptm) #10 hrs
+
+##### Create new graph
+hyp_lcc = graph.data.frame(hyp_lcc_edges, directed=FALSE, vertices=hyp_lcc_verts)
+
+save(hyp_lcc_edges, hyp_lcc_verts, hyp_lcc, file='Results/hyper-lcc.RData')
+write.csv(hyp_lcc_edges, file='Results/hyp_lcc_edges.csv')
+write.csv(hyp_lcc_verts[,c('id','spawn.date','vic','vic.type','vic.day')], file='Results/hyp_lcc_verts.csv')
+
diff --git a/R Scripts/data-exploration.R b/R Scripts/data-exploration.R
new file mode 100755
index 0000000..404ce9e
--- /dev/null
+++ b/R Scripts/data-exploration.R
@@ -0,0 +1,40 @@
+library(igraph)
+setwd("Documents/Cascade Project/Raw Data/")
+load('chi-19mar2015.RData')
+
+d = remove.edge.attribute(person,'weight')
+lcc = induced.subgraph(d,which(clusters(d)$membership==which.max(clusters(d)$csize)))
+
+##### Small-World Analysis
+trl = mean(transitivity(lcc,type='local',isolates='zero'))
+apl = average.path.length(lcc)
+cat('Local Transitivity =', trl);cat('\nAverage Path Length =', apl)
+
+nsim = 5
+ER_sim = data.frame(trl=rep(0,nsim),apl=0)
+for(i in 1:nsim){
+  print(i)
+  erg = erdos.renyi.game(n=vcount(lcc),p.or.m=ecount(lcc),type='gnm')
+  erg = induced.subgraph(erg,which(clusters(erg)$membership==which.max(clusters(erg)$csize)))
+  ER_sim[i,1] = mean(transitivity(erg,type='local',isolates='zero'))
+  ER_sim[i,2] = average.path.length(erg)
+}
+
+S = data.frame(C_dat = trl,
+               L_dat = apl,
+               C_ER=mean(ER_sim$trl),
+               L_ER=mean(ER_sim$apl),
+               S_ER=mean((trl/ER_sim$trl)/(apl/ER_sim$apl)))
+S
+
+##### Degree Distribution
+plot(degree.distribution(lcc)*vcount(lcc),log='xy',type='l',col='red',lwd=2,
+      xlab='Degree', ylab='Number of Vertices', main='Degree Distribution')
+
+
+##### Victims
+vic_ids = which(V(lcc)$vic==TRUE)
+non_vic_ids = which(V(lcc)$vic==FALSE)
+hist(as.numeric(V(lcc)$vic_date[vic_ids]),100,col='lightblue',
+     xlab='Day of Study Period',main='Infections During the Study Period')
+
diff --git a/R Scripts/data-prep.R b/R Scripts/data-prep.R
new file mode 100755
index 0000000..3104ea2
--- /dev/null
+++ b/R Scripts/data-prep.R
@@ -0,0 +1,223 @@
+library(igraph)
+setwd('~/Documents/Cascade Project/Raw Data/')
+
+#================
+# (1) load data
+#================
+
+#load all three sets of data
+arrests <- read.csv("2006to2014arrests2.csv", header=T, colClass=c("character"))
+
+#I need to add the "ir" for this to make sense when I "project"
+arrests$ir2 <- paste("ir", arrests$ir_no)
+
+## Match arrests based on date, time, and location
+a = arrests[arrests$rd_no=='',]
+dtab = table(a$arrest_date)
+dates = attr(dtab,'name')[dtab>1]
+for (date in dates){
+  if (which(date==dates)%%10000==0) print(which(date==dates))
+  ids = which(a$arrest_date==date)
+  grp = a[ids,]
+  stab = table(grp$street_nme)
+  streets = attr(stab,'name')[stab>1]
+  for (street in streets){
+    arr_ids = as.numeric(rownames(grp[grp$street_nme==street,]))
+    arrests$rd_no[arr_ids] = paste('rd',arr_ids[1])
+  }
+}
+# now make unique rd_nos for the other people arrested alone
+null_rds = which(arrests$rd_no=='')
+arrests$rd_no[null_rds] = paste('rd',null_rds)
+save(arrests,file='arrests.RData')
+
+#=====================
+# (2) Structure Data
+#=====================
+
+#get the fields we need for all three: incidents, mni, and "type"
+sub.arrests <- subset(arrests, select=c(as.character("rd_no"), as.character('ir2')))
+colnames(sub.arrests) <- c("events", "individuals")
+
+#=============================
+# (3) Prep for making graphs
+#=============================
+
+individuals <- unique(sub.arrests$individuals)
+
+events <- unique(sub.arrests$events)
+
+if (any(individuals %in% events))
+  
+  stop('vertex name collision')
+
+vertices <- data.frame(name=c(events, individuals),
+                       type=c(rep(FALSE, length(events)),
+                              rep(TRUE, length(individuals))),
+                       stringsAsFactors=FALSE)
+
+#===================================================================
+# (4) Make the GRAPH file
+#===================================================================
+g <- graph.data.frame(sub.arrests, vertices=vertices)
+
+
+#===================================================================
+# Sanity check the resulting igraph object
+stopifnot(ecount(g) == nrow(sub.arrests))
+stopifnot(vcount(g) == nrow(vertices))
+
+names <- V(g)$name
+stopifnot(isTRUE(identical(sort(names), sort(vertices$name))))
+
+inames <- V(g)[V(g)$type]$name
+stopifnot(isTRUE(identical(sort(inames), sort(individuals))))
+
+enames <- V(g)[! V(g)$type]$name
+stopifnot(isTRUE(identical(sort(enames), sort(events))))
+
+#===================================================================
+#now do the converstion into a single network 
+net1 <- bipartite.projection(g)
+person <- net1[[2]]
+
+#===================================================================
+# (5) Define attributes on nodes
+#===================================================================
+# set attributes from arrests file
+
+attribs <- arrests
+match_vector = match(V(person)$name, attribs$ir2)
+V(person)$sex <- as.character(attribs$sex_code_cd[match_vector])
+V(person)$race <- as.character(attribs$race_code_cd[match_vector])
+# V(person)$age <- as.character(attribs$age[match_vector])
+V(person)$dob <- as.character(as.Date(attribs$birth_date[match_vector],format='%m/%d/%Y'))
+
+# January 1, 2006 is Day 1 of the study period
+start_date = as.Date("2005-12-31")
+
+## Get first arrest date in the study period for each person
+sub.arrests$dates = as.Date(arrests$arrest_date,format='%m/%d/%Y')
+sub.arrests = sub.arrests[order(sub.arrests$dates),]
+sub.arrests = sub.arrests[match(unique(sub.arrests$individuals),sub.arrests$individuals),]
+arrest.dates = as.Date(sub.arrests$dates,format='%m/%d/%y')
+arrest.days = as.numeric(arrest.dates-start_date)
+V(person)$arrest.day = arrest.days[match(V(person)$name, sub.arrests$individuals)]
+
+V(person)$age.arrest = floor(difftime(arrest.dates[match(V(person)$name, sub.arrests$individuals)],
+                                      V(person)$dob,
+                                      units='days')/365.25)
+
+#===================================================================
+
+
+# get victim attributes
+shootings <- read.csv("shooting-data-withdate2.csv", header = T)
+victims = shootings[shootings$INV_PARTY_TYPE_CD=="VIC",]
+victims = victims[!is.na(victims$IR_NO),]
+victims$ir2 <- paste("ir", victims$IR_NO)
+
+# get murder victim attributes
+murders = read.csv("murder-victims-13nov.csv", header=T)
+murders = murders[!is.na(murders$VICTIM_IR_NO),]
+murders = murders[murders$INJURY_DESCR=="SHOT",]
+murders = murders[match(unique(murders$VICTIM_IR_NO),murders$VICTIM_IR_NO),]
+murders = murders[as.Date(murders$INJURY_DATE,format='%m/%d/%y')>=start_date,]
+murders$ir2 = paste("ir", murders$VICTIM_IR_NO)
+
+# set victim data in network
+vtab = as.data.frame(table(victims$ir2))
+match_vector = match(V(person)$name,vtab$Var1)
+V(person)$vic.nonfatal = vtab$Freq[match_vector]
+V(person)$vic.nonfatal[is.na(V(person)$vic.nonfatal)] = 0
+V(person)$vic.fatal = V(person)$name %in% murders$ir2
+V(person)$vic = V(person)$name %in% union(victims$ir2,murders$ir2)
+
+# add fatal shooting dates to the network
+match_vector = match(V(person)$name, murders$ir2)
+fatal_dates = murders$INJURY_DATE[match_vector]
+fatal_dates = as.character(as.Date(fatal_dates,format='%m/%d/%y'))
+V(person)$fatal_date = fatal_dates
+
+# add nonfatal shooting dates to the network
+match_vector = match(victims$ir2,V(person)$name)
+vics = which(V(person)$vic.nonfatal>0)
+nfd1 = nfd2 = nfd3 = nfd4 = nfd5 = rep(0,length(vics))
+for(i in 1:length(vics)){
+  if (i%%3000==0) print(i)
+  name = vics[i]
+  ids = which(match_vector==name)
+  dates = sort(as.Date(victims$INCIDENT_DATE[ids],format='%m/%d/%y'))
+  nfd1[i] = as.character(dates[1])
+  nfd2[i] = as.character(dates[2])
+  nfd3[i] = as.character(dates[3])
+  nfd4[i] = as.character(dates[4])
+  nfd5[i] = as.character(dates[5])
+}
+V(person)$nonfatal_date_1 = NA
+V(person)$nonfatal_date_2 = NA
+V(person)$nonfatal_date_3 = NA
+V(person)$nonfatal_date_4 = NA
+V(person)$nonfatal_date_5 = NA
+V(person)$nonfatal_date_1[vics] = nfd1
+V(person)$nonfatal_date_2[vics] = nfd2
+V(person)$nonfatal_date_3[vics] = nfd3
+V(person)$nonfatal_date_4[vics] = nfd4
+V(person)$nonfatal_date_5[vics] = nfd5
+
+# convert dates into numeric values ("days")
+start_date
+V(person)$fatal_day = as.numeric(as.Date(V(person)$fatal_date)-start_date)
+V(person)$nonfatal_day_1 = as.numeric(as.Date(V(person)$nonfatal_date_1)-start_date)
+V(person)$nonfatal_day_2 = as.numeric(as.Date(V(person)$nonfatal_date_2)-start_date)
+V(person)$nonfatal_day_3 = as.numeric(as.Date(V(person)$nonfatal_date_3)-start_date)
+V(person)$nonfatal_day_4 = as.numeric(as.Date(V(person)$nonfatal_date_4)-start_date)
+V(person)$nonfatal_day_5 = as.numeric(as.Date(V(person)$nonfatal_date_5)-start_date)
+
+#===================================================================
+# set gang attributes
+gangs <- read.csv("Sept2014-ganglist.csv", header=T)
+gangs = gangs[match(unique(gangs$IR_NO),gangs$IR_NO),]
+gangs$ir2 <- paste("ir", gangs$IR_NO)
+
+V(person)$gang.member <- V(person)$name %in% gangs$ir2
+
+match_vector = match(V(person)$name, gangs$ir2)
+gnames = gangs$GANG_NAME[match_vector]
+gnames = as.character(gnames)
+gnames[V(person)$gang.member==''] = 'Unknown'
+gnames[V(person)$gang.member==F] = 'None'
+V(person)$gang.name <- as.character(gnames)
+V(person)$faction.name <- as.character(gangs$FACTION_NAME[match_vector])
+
+#===================================================================
+# create id number
+V(person)$id = rank(V(person)$name)
+
+# save data
+# person = remove.edge.attribute(person,'weight')
+# person_data = get.data.frame(person,'both')
+save(person, file="chi-19mar2015.RData")
+
+#===================================================================
+# get LCC of the network
+lcc = induced.subgraph(person,which(clusters(person)$membership==which.max(clusters(person)$csize)))
+V(lcc)$ir_no = V(lcc)$name
+vic_ids = which(V(lcc)$vic)
+lcc_verts = get.data.frame(lcc,'vertices')
+lcc_edges = get.data.frame(lcc,'edges')
+save(lcc, lcc_verts, lcc_edges, vic_ids, file="lcc.RData")
+
+lcc_data = get.data.frame(lcc,'both')
+lcc = set.vertex.attribute(graph=lcc, name='name', value=V(lcc)$id)
+row.names(lcc_data$vertices) = lcc_data$vertices$id
+vertices = lcc_data$vertices[c('name','vic','vic.fatal','vic.nonfatal',
+                               'fatal_day','nonfatal_day_1','nonfatal_day_2',
+                               'nonfatal_day_3','nonfatal_day_4','nonfatal_day_5')]
+
+write.csv(vertices,file='lcc_vertices.csv')
+write.csv(lcc_data$edges,file='lcc_edges.csv')
+
+lcc = graph.data.frame(lcc_edges, directed=FALSE, 
+                       vertices=lcc_verts[,c('name','vic','vic.fatal','vic.nonfatal')])
+write.graph(lcc,'lcc.gml','gml')
diff --git a/R Scripts/find-cascades.R b/R Scripts/find-cascades.R
new file mode 100644
index 0000000..ecd5c83
--- /dev/null
+++ b/R Scripts/find-cascades.R
@@ -0,0 +1,73 @@
+library(igraph)
+setwd("~/Documents/Cascade Project/")
+load('Results/hyper-lcc.RData')
+load('Results/dag_dat_vics.RData')
+source('Scripts/temporal.R')
+# source('Scripts/homophily.R')
+source('Scripts/structural.R')
+
+vic_ids = which(V(hyp_lcc)$vic==TRUE)
+
+#### Initialize model parameters
+alpha = 1/5
+# gamma = 0.3
+delta = 0.1
+beta = 9.955713e-6
+
+##### Get weights
+# find max n days where infection possible given alpha
+edges = dag_dat_vics
+# edges = edges[(edges$t2-edges$t1)<300,]
+
+p_t = temporal(edges$t1, edges$t2, alpha)
+p_s = structural(delta, edges$dist)
+p = p_s * p_t
+p_tilde = 1 - p_s + p_s * temporal(edges$t1, edges$t2+1, alpha)
+weights = p/p_tilde
+
+##### Find transmission edges
+# load('Results/lm-probs.RData')
+# names = attr(lm.probs,'names')
+# probs = as.numeric(lm.probs)
+# betas = probs[match(hyp_lcc_verts$ir_no[edges$to],names)]
+# betas = 0.055
+thresh = beta/(1-beta)
+
+realized = c()
+# edges = edges[weights>thresh,]
+# weights = weights[weights>thresh]
+for (u in vic_ids){
+  max_prob = -1
+  max_edge = NULL
+  Ei = which(edges$to==u)
+  for(i in Ei){
+    if(weights[i]>thresh){
+      prob = p[i] * prod(p_tilde[setdiff(Ei,i)])
+      if(prob>max_prob){
+        max_prob = prob
+        max_edge = i
+      }
+    }
+  }
+  realized = c(realized, max_edge)
+}
+#   if (length(Ei)>0){
+#     max_edge = Ei[which.max(weights[Ei])] # how to deal with ties????
+#     realized = c(realized, max_edge)
+#   }
+
+##### Create forest of cascades
+# dag is all of the victim nodes but with only transmission edges, or don't index by vic_ids
+dag = graph.data.frame(edges[realized,1:2], directed=TRUE, vertices=hyp_lcc_verts[vic_ids,])
+dag = set.edge.attribute(dag,'weight',value=weights[realized])
+table(clusters(dag)$csize)
+
+t=table(clusters(dag)$csize)
+sizes = as.numeric(attr(t,'name'))
+counts = as.numeric(t)
+plot(sizes,counts,log='xy',type='o',lwd=3)
+n.seeds = sum(counts)
+n.nodes = sum(sizes*counts) # for vics case is just length(vic_ids)
+n.infections = n.nodes-n.seeds
+
+# dag = graph.edgelist(as.matrix(edges[realized,1:2]))
diff --git a/R Scripts/generate-dag-dat.R b/R Scripts/generate-dag-dat.R
new file mode 100755
index 0000000..660a503
--- /dev/null
+++ b/R Scripts/generate-dag-dat.R
@@ -0,0 +1,59 @@
+library(igraph)
+setwd("~/Documents/Cascade Project/")
+load('Results/hyper-lcc.RData')
+
+vic_ids = which(V(hyp_lcc)$vic==TRUE)
+
+edgeWeights = function(eis){return(c(hyp_lcc_edges$weight[eis],Inf,Inf)[1:3])}
+
+dag_dat_all = data.frame(matrix(nrow=1,ncol=8))
+hyp_lcc2 = remove.edge.attribute(hyp_lcc,'weight')
+ei = 1
+ptm=proc.time()
+for (u in vic_ids){
+  if ((which(vic_ids==u) %% 1000)==0)  print(which(vic_ids==u))
+  tu    = hyp_lcc_verts$vic.day[u]
+  u_spawn = hyp_lcc_verts$spawn.date[u]
+  nbhd  = unlist(neighborhood(hyp_lcc,nodes=u,order=3)) # get nodes within neighborhood
+  nbhd  = nbhd[-1] # don't want to include u in the neighborhood
+  tvs   = hyp_lcc_verts$vic.day[nbhd]
+  v_spawn = hyp_lcc_verts$spawn.date[nbhd]
+  nbhd  = nbhd[tu>v_spawn & (is.na(tvs) | tu<tvs)]
+  tvs   = hyp_lcc_verts$vic.day[nbhd]
+  dists = as.numeric(shortest.paths(hyp_lcc2,u,nbhd))
+  
+  es = get.shortest.paths(hyp_lcc2,u,nbhd,output='epath')$epath
+  weights = matrix(unlist(lapply(es,edgeWeights),use.names = F),ncol=3,byrow=T)
+  
+  #will be faster to pre-allocate and fill in rather than rbind each time
+  dag_dat_all[ei:(ei+length(nbhd)-1),] = data.frame(rep(u,length(nbhd)), nbhd, 
+                                                    rep(tu,length(nbhd)), tvs, dists, 
+                                                    weights, row.names=NULL)
+  ei = ei + length(nbhd)
+}
+print(proc.time()-ptm) #3.5 hours
+colnames(dag_dat_all) = c('from','to','t1','t2','dist','w1','w2','w3')
+rownames(dag_dat_all) = NULL
+
+save(dag_dat_all, file='Results/dag_dat_all.RData')
+write.csv(dag_dat_all, file='Results/dag_dat_all.csv')
+
+dag_dat_vics = dag_dat_all[!is.na(dag_dat_all$t2),]
+save(dag_dat_vics, file='Results/dag_dat_vics.RData')
+write.csv(dag_dat_vics, file='Results/dag_dat_vics.csv')
+
+# analyze min possible infection time
+i = 1
+min_time = 0#rep(Inf,length(unique(dag_dat_vics$to)))
+min_time_dist = 0#rep(Inf,length(unique(dag_dat_vics$to)))
+for(to in unique(dag_dat_vics$to)){
+  rows = which(dag_dat_vics$to==to & dag_dat_vics$dist<2)
+  if(length(rows)>0){
+    min_time[i] = min(dag_dat_vics$t2[rows]-dag_dat_vics$t1[rows])
+    min_time_dist[i] = dag_dat_vics$dist[rows[which.min(dag_dat_vics$t2[rows]-dag_dat_vics$t1[rows])]]
+    i = i + 1
+  }
+}
+median(min_time)
+mean(min_time<100)
+save(min_time_1,min_time_2,min_time_3,file='Results/min_inf_time.RData')
diff --git a/R Scripts/likelihood.R b/R Scripts/likelihood.R
new file mode 100644
index 0000000..a4e5254
--- /dev/null
+++ b/R Scripts/likelihood.R
@@ -0,0 +1,30 @@
+# run after find-cascades
+
+load('Results/dag_dat_all.RData')
+
+edges_all = dag_dat_all
+edges_all$t2[is.na(edges_all$t2)] = max(hyp_lcc_verts$vic.day,na.rm=T)+1
+realized_all = as.numeric(rownames(dag_dat_vics)[realized])
+
+p_t = temporal(edges_all$t1, edges_all$t2, alpha)
+p_s = structural(delta, edges_all$dist)
+p = p_s * p_t
+p_tilde = 1 - p_s + p_s * temporal(edges_all$t1, edges_all$t2+1, alpha)
+
+vic.days = hyp_lcc_verts$vic.day
+vic.days[is.na(vic.days)] = max(hyp_lcc_verts$vic.day,na.rm=T)+1
+ages = vic.days-hyp_lcc_verts$spawn.date
+
+# realized edges
+prob_realized = sum(log(p[realized_all]))
+
+# non-realized edges
+prob_not_realized = sum(log(p_tilde[-realized_all]))
+
+# seeds
+prob_beta = n.seeds * log(beta)
+
+# non-seeds
+prob_beta_tilde = sum(ages) * log(1-beta)
+
+ll = prob_beta + prob_beta_tilde + prob_realized + prob_not_realized
diff --git a/R Scripts/lm.R b/R Scripts/lm.R
new file mode 100755
index 0000000..f1bbae0
--- /dev/null
+++ b/R Scripts/lm.R
@@ -0,0 +1,74 @@
+library(pROC)
+library(igraph)
+setwd('~/Documents/Cascade Project')
+load('Raw Data/lcc.RData')
+load('Results/lcc_bw.RData')
+
+is_vic = V(lcc)$vic
+vics = which(V(lcc)$vic==T)
+nonvics = which(V(lcc)$vic==F)
+thresh = length(vics)/length(union(vics,nonvics))
+
+nbhdVics = function(neighbs){return(sum(is_vic[neighbs[-1]]))}
+#setdiff(is_vic order-(order-1) to get only those at the given order
+n1s = neighborhood(lcc,1,V(lcc))
+n2s = neighborhood(lcc,2,V(lcc))
+n1vics = unlist(lapply(n1s,nbhdVics))
+n2vics = unlist(lapply(n2s,nbhdVics))
+
+data = data.frame(vic         = V(lcc)$vic,
+                  sex         = V(lcc)$sex,
+                  race        = V(lcc)$race,
+                  age         = V(lcc)$age,
+                  gang.member = V(lcc)$gang.member,
+                  gang.name   = V(lcc)$gang.name,
+#                   fname       = V(lcc)$faction.name,
+                  deg         = degree(lcc),
+                  bw          = bw,
+                  n1v         = n1vics,
+                  n2v         = n2vics
+  )
+data$age = as.numeric(data$age)
+data$gname = as.factor(data$gname)
+
+train = 1:dim(data)[1]; test = 1:dim(data)[1]
+# train = sort(union(sample(vics,length(vics)*0.8),sample(nonvics,length(nonvics)*0.8)))
+# test = sort((1:length(is_vic))[-train])
+
+formula = vic ~ sex + race + age + gmember + gname + deg + bw + n1v + n2v
+formula = vic ~ sex + race + age + gmember + gname
+formula = vic ~ deg + bw
+formula = vic ~ n1v + n2v
+
+glm.fit = glm(formula, data=data, family=binomial)
+glm.probs = predict(glm.fit, newdata=data, type='response')
+roc = roc(data$vic[test],glm.probs,ci=TRUE)
+plot(roc,print.auc=TRUE)
+roc$auc
+
+lm.fit = lm(formula, data=data)
+lm.probs = predict(lm.fit, newdata=data, type='response')
+p = 1/(1+exp(-lm.probs))
+z = log(lm.probs/(1-lm.probs))
+logit = 1/(1+exp(-z))
+
+save(glm.probs, file='Results/lm-probs.RData')
+
+##########
+lm.pred = lm.probs>thresh
+tab = table(lm.pred,data$vic)
+tab
+(tab[1,1]+tab[2,2])/sum(tab)
+
+propensity = as.numeric(pred)
+similarity = 1 - abs(propensity[u]-propensity[v])
+
+###############
+# Random forest approach -- not as good
+library(randomForest)
+data$vic = factor(data$vic)
+rf.fit = randomForest(formula,data=data[train,],do.trace=50,ntree=20)
+rf.probs = (predict(rf.fit,newdata=data[test,],type='prob'))[,1]
+roc = roc(data$vic[test],rf.probs,ci=TRUE)
+plot(roc,print.auc=TRUE)
+
diff --git a/R Scripts/plot-graph.R b/R Scripts/plot-graph.R
new file mode 100755
index 0000000..b6f9341
--- /dev/null
+++ b/R Scripts/plot-graph.R
@@ -0,0 +1,22 @@
+#### plot the victims and non-victims of the network
+cols = rep('lightgrey',vcount(g)) #number of nodes in g
+cols[V(g)$vic==1] = 'red'
+shapes = rep('circle',vcount(g))
+shapes[V(g)$vic==1] = 'square'
+plot(g, layout=layout.kamada.kawai, vertex.label=NA, vertex.color=cols, vertex.shape=shapes,vertex.size=2)
+legend('topleft', legend=c("non-victim", "victim"), pch=c(16,15), col=c("lightgrey", "red"), inset=0.1)
+
+#### plot the neighborhood of a node
+v = 10
+v1 = unlist(neighborhood(lcc,nodes=v,order=1))
+v2 = unlist(neighborhood(lcc,nodes=v,order=2))
+v3 = unlist(neighborhood(lcc,nodes=v,order=3))
+v3 = setdiff(v3,v2)
+v2 = setdiff(v2,v1)
+v1 = setdiff(v1,v)
+cols = rep('lightgrey',vcount(g))
+cols[v] = 'red'
+cols[v1] = 'blue'
+cols[v2] = 'green'
+cols[v3] = 'black'
+plot(g, layout=layout.kamada.kawai, vertex.label=NA, vertex.color=cols,vertex.size=2)
diff --git a/R Scripts/sim-analysis.R b/R Scripts/sim-analysis.R
new file mode 100755
index 0000000..3960578
--- /dev/null
+++ b/R Scripts/sim-analysis.R
@@ -0,0 +1,72 @@
+library(igraph)
+setwd("~/Documents/Cascade Project/")
+
+load('Results/hyper-lcc.RData')
+vic_ids = which(V(hyp_lcc)$vic==1)
+n.infections = length(vic_ids)
+n.days = max(hyp_lcc_verts$vic.day,na.rm=T) - min(hyp_lcc_verts$vic.day,na.rm=T)
+inf.dates = hyp_lcc_verts$vic.day[vic_ids]
+
+ptm = proc.time()
+n = 1500
+mean.time = matrix(0,3,n)
+med.time = matrix(0,3,n)
+mean.50 = matrix(0,3,n)
+mean.100 = matrix(0,3,n)
+n.vicpairs = matrix(0,3,n)
+for(sim in 1:3){
+  print(paste('sim:',sim))
+  for(q in 1:n){
+    if (q%%250==0) print(paste('run:',q))
+    graph = hyp_lcc
+    if (sim<3) sim.dates = sample(n.days, n.infections, replace=TRUE) # sims 1 + 2
+    if (sim==3) sim.dates = sample(inf.dates) # sim 3
+    if (sim==1) vics = sample(vcount(hyp_lcc), n.infections, replace=FALSE) # sim 1
+    if (sim>1) vics = vic_ids # sims 2 + 3
+    if (sim==0) {vics = vic_ids; sim.dates = inf.dates} # data
+    
+    vic.time = c()
+    for (i in 1:n.infections){
+      u = vics[i]
+      nbhd = unlist(neighborhood(graph, nodes=u, order=1))
+      nbhd = intersect(vic_ids,nbhd)
+      nbhd = setdiff(nbhd,u)
+      nbhd = nbhd[u<nbhd] # only looks at pairs with u<v to avoid double-counting
+      nbhd = nbhd[hyp_lcc_verts$ir_no[u] != hyp_lcc_verts$ir_no[nbhd]] # don't count infections of the same person
+      tu = sim.dates[i]
+      tvs = sim.dates[match(nbhd,vic_ids)]
+      vic.time = c(vic.time,abs(tu-tvs))
+    }
+    n.vicpairs[sim,q] = length(vic.time)
+    mean.time[sim,q] = mean(vic.time)
+    med.time[sim,q] = median(vic.time)
+    mean.50[sim,q] = mean(vic.time<=50)
+    mean.100[sim,q] = mean(vic.time<=100)
+    #     decay.rate[sim,q] = getDecayRate(vic.time) #get lm of histogram. or maybe just cor?
+  }
+}
+print(proc.time()-ptm)
+
+
+####### plot simulation data #######
+data = 0.1176774 # 774.5483, 634, 0.07252472, 0.1176774
+simdata = mean.100
+xl = c(0.9*min(min(simdata),data),1.1*max(max(simdata),data))
+yl = c(0,max(c(hist(simdata[1,],15)$density,hist(simdata[2,],15)$density,hist(simdata[3,],15)$density)))
+hist(simdata[1,],15,xlim=xl,ylim=yl,col=rgb(0,1,0,1/4),freq=F,
+     xlab='Neighbors Infected Within 100 Days', main=NULL)
+hist(simdata[2,],15,col=rgb(0,0,1,1/4),freq=F,add=TRUE)
+hist(simdata[3,],15,col=rgb(1,0,1,1/4),freq=F,add=TRUE)
+abline(v=data,lwd=4,col='red')
+
+
+##### inf dates histogram
+dates = c(V(lcc)$fatal_date[!is.na(V(lcc)$fatal_date)],
+          V(lcc)$nonfatal_date_1[!is.na(V(lcc)$nonfatal_date_1)],
+          V(lcc)$nonfatal_date_2[!is.na(V(lcc)$nonfatal_date_2)],
+          V(lcc)$nonfatal_date_3[!is.na(V(lcc)$nonfatal_date_3)],
+          V(lcc)$nonfatal_date_4[!is.na(V(lcc)$nonfatal_date_4)],
+          V(lcc)$nonfatal_date_5[!is.na(V(lcc)$nonfatal_date_5)])
+dates = as.Date(dates)
+hist(dates, breaks='months',freq=T,col='lightblue',format='%m/%Y',
+     xlab='Month',ylab='Number of Shootings',main='')
diff --git a/R Scripts/structural.R b/R Scripts/structural.R
new file mode 100755
index 0000000..132aa6c
--- /dev/null
+++ b/R Scripts/structural.R
@@ -0,0 +1,7 @@
+# structural = function(w1,w2,w3,gamma){
+#   return (plogis(w1,scale=gamma) * plogis(w2,scale=gamma) * plogis(w3,scale=gamma))
+# }
+
+structural = function(delta,dist){
+  return (delta^dist)
+}
+\ No newline at end of file
diff --git a/R Scripts/temporal.R b/R Scripts/temporal.R
new file mode 100755
index 0000000..13194a4
--- /dev/null
+++ b/R Scripts/temporal.R
@@ -0,0 +1,10 @@
+
+temporal = function(tu,tv,alpha){
+  dt = tv - tu - 1
+  if (sum(dt<0)==0) {
+    return (alpha*exp(-alpha*dt))
+#     return ((0.01/alpha)*(1+dt/alpha)^(-1.01))
+  } else {
+    return (NA)
+  }
+}
+\ No newline at end of file
author	Ben Green <bgreen@g.harvard.edu>	2015-06-08 15:21:51 -0400
committer	Ben Green <bgreen@g.harvard.edu>	2015-06-08 15:21:51 -0400
commit	1739e9f5706bb8a73de5dbf0b467de49ea040898 (patch)
tree	6f1d0f166986c5f0757be9b40d8eeb3409ab022c /R Scripts
parent	e5dada202c34521618bf82a086093c342841e5e8 (diff)
download	criminal_cascades-1739e9f5706bb8a73de5dbf0b467de49ea040898.tar.gz