diff options
| author | Ben Green <bgreen@g.harvard.edu> | 2015-06-08 15:21:51 -0400 |
|---|---|---|
| committer | Ben Green <bgreen@g.harvard.edu> | 2015-06-08 15:21:51 -0400 |
| commit | 1739e9f5706bb8a73de5dbf0b467de49ea040898 (patch) | |
| tree | 6f1d0f166986c5f0757be9b40d8eeb3409ab022c /R Scripts | |
| parent | e5dada202c34521618bf82a086093c342841e5e8 (diff) | |
| download | criminal_cascades-1739e9f5706bb8a73de5dbf0b467de49ea040898.tar.gz | |
added my R scripts
Diffstat (limited to 'R Scripts')
| -rwxr-xr-x | R Scripts/-compare-sim.R | 34 | ||||
| -rwxr-xr-x | R Scripts/-generate-dag-dat-sim.R | 50 | ||||
| -rwxr-xr-x | R Scripts/-recover-data.R | 27 | ||||
| -rwxr-xr-x | R Scripts/-simulate-infections.R | 34 | ||||
| -rw-r--r-- | R Scripts/.Rhistory | 0 | ||||
| -rw-r--r-- | R Scripts/README.rtf | 22 | ||||
| -rwxr-xr-x | R Scripts/analyze-cascades.R | 58 | ||||
| -rw-r--r-- | R Scripts/arrest-vs-infection-times.R | 43 | ||||
| -rw-r--r-- | R Scripts/benchmarking.R | 130 | ||||
| -rw-r--r-- | R Scripts/create-hyper-lcc.R | 126 | ||||
| -rwxr-xr-x | R Scripts/data-exploration.R | 40 | ||||
| -rwxr-xr-x | R Scripts/data-prep.R | 223 | ||||
| -rw-r--r-- | R Scripts/find-cascades.R | 73 | ||||
| -rwxr-xr-x | R Scripts/generate-dag-dat.R | 59 | ||||
| -rw-r--r-- | R Scripts/likelihood.R | 30 | ||||
| -rwxr-xr-x | R Scripts/lm.R | 74 | ||||
| -rwxr-xr-x | R Scripts/plot-graph.R | 22 | ||||
| -rwxr-xr-x | R Scripts/sim-analysis.R | 72 | ||||
| -rwxr-xr-x | R Scripts/structural.R | 7 | ||||
| -rwxr-xr-x | R Scripts/temporal.R | 10 |
20 files changed, 1134 insertions, 0 deletions
diff --git a/R Scripts/-compare-sim.R b/R Scripts/-compare-sim.R new file mode 100755 index 0000000..d96bd81 --- /dev/null +++ b/R Scripts/-compare-sim.R @@ -0,0 +1,34 @@ + +vics = vic_ids +graph = lcc + +n.infections = length(vics) +vic.neighbors = rep(0,n.infections) +for (i in 1:n.infections){ + u = vics[i] + nbhd = unlist(neighborhood(graph, nodes=u, order=3)) + nbhd = intersect(vic_ids,nbhd) + vic.neighbors[i] = sum(nbhd %in% vics) +} + + +n.infections = length(vics) +vic.time = rep(0,n.infections) +for (i in 1:n.infections){ + if (i%%1000==0) print(i) + u = vics[i] + nbhd = unlist(neighborhood(graph, nodes=u, order=1)) + nbhd = intersect(vic_ids,nbhd) + nbhd = setdiff(nbhd,u) + tu = as.numeric(V(graph)$vic_date[u]) + tvs = as.numeric(V(graph)$vic_date[nbhd]) + tvs = tvs[tvs<tu] + if (length(tvs)>0){ + vic.time[i] = min(tu-tvs) + } +} +vic.time = vic.time[vic.time>0] +mean(vic.time) +median(vic.time) +mean(vic.time<100) +sum(vic.time>0) diff --git a/R Scripts/-generate-dag-dat-sim.R b/R Scripts/-generate-dag-dat-sim.R new file mode 100755 index 0000000..a6d115d --- /dev/null +++ b/R Scripts/-generate-dag-dat-sim.R @@ -0,0 +1,50 @@ +library(igraph) +setwd("/Users/Ben/Documents/Harvard/Fall 2014/CS 284r Social Data Mining/Cascade Project/") +load('Data/lcc_sim2a.RData') + +# d = remove.edge.attribute(person,'weight') +# lcc = induced.subgraph(d,which(clusters(d)$membership==which.max(clusters(d)$csize))) +# vic_ids = which(V(lcc)$vic==TRUE) +# dag = graph.empty(vcount(lcc)) +# dag = set.edge.attribute(dag,'weight',value=0) + +lcc = lcc.sim +vic_ids = vic_ids.sim +ei = 1 + +dag_dat = matrix(0,253096,5) +for (1 in 1:length(vic_ids)){ + if (i %% 100)==0) print(which(vic_ids==u)) + + u = vic_ids[i] + nbhd = unlist(neighborhood(lcc, nodes=u, order=3)) # get nodes within neighborhood + nbhd = intersect(vic_ids,nbhd) # only want victim nodes + nbhd = setdiff(nbhd,u) # don't want to include u in the neighborhood + + tu = as.numeric(V(lcc)$vic_date[u]) + tvs = as.numeric(V(lcc)$vic_date[nbhd]) + Es = tu<tvs + tvs = tvs[Es] + + if (sum(Es)>0){ + nbhd = nbhd[Es] + dists = as.numeric(shortest.paths(lcc,u,nbhd)) + + for (j in 1:sum(Es)){ + v = nbhd[j] + d = dists[j] + tv = tvs[j] +# dag_dat[ei,] = c(u,v,d,tu,tv) + dag_dat = rbind(dag_dat,c(u,v,d,tu,tv)) +# f = time(tu,tv) +# h = structural(d) +# weight[ei] = f*h + ei = ei+1 + } + } +} + +dag_dat = dag_dat[rowSums(dag_dat)>0,] +dag_dat = as.data.frame(dag_dat) +colnames(dag_dat) = c('from','to','dist','t1','t2') +# save(dag_dat,file='Data/dag_dat_sim2a.RData') diff --git a/R Scripts/-recover-data.R b/R Scripts/-recover-data.R new file mode 100755 index 0000000..b34ffc6 --- /dev/null +++ b/R Scripts/-recover-data.R @@ -0,0 +1,27 @@ +library(igraph) +setwd("/Users/Ben/Documents/Harvard/Fall 2014/CS 284r Social Data Mining/Cascade Project/") + +el = read.csv('Data/dag.csv') +lcc = induced.subgraph(d,which(clusters(d)$membership==which.max(clusters(d)$csize))) +vic_ids = which(V(lcc)$vic==TRUE) + +from = vic_ids[el$from] +to = vic_ids[el$to] + +t1 = as.Date(V(lcc)$vic_date[from],format='%m/%d/%y') +t2 = as.Date(V(lcc)$vic_date[to],format='%m/%d/%y') + +uf = unique(from) +dist = rep(0,length(from)) +for (i in 1:length(uf)){ + if (i%%1000==0) print(i) + f = uf[i] + fi = which(from==f) + ds = as.numeric(shortest.paths(lcc,v=f,to=to[fi])) + dist[fi] = ds +} + +dag_dat = data.frame(from=el$from,to=el$to,dist,t1,t2) + +save(dag_dat,file='Data/dag_dat.RData') +write.csv(dag_dat,'Data/dag_dat.csv') diff --git a/R Scripts/-simulate-infections.R b/R Scripts/-simulate-infections.R new file mode 100755 index 0000000..f1409af --- /dev/null +++ b/R Scripts/-simulate-infections.R @@ -0,0 +1,34 @@ +library(igraph) +setwd("/Users/Ben/Documents/Harvard/Fall 2014/CS 284r Social Data Mining/Cascade Project/") + +load('Data/dag_dat.RData') +load('Data/lcc.RData') + +n.infections = length(vic_ids) +n.nodes = (vcount(lcc)) +n.days = as.numeric(max(dag_dat$t2) - min(dag_dat$t1)) +gamma = n.infections/(n.nodes*n.days) + +vic = rep(FALSE,n.nodes) +for(t in 1:n.days){ + if (t%%1000==0) print(t) + infections = which(runif(n.nodes)<gamma) + new.infections = infections[vic[infections]==0] + vic[new.infections] = t +} + +lcc.sim = lcc +V(lcc.sim)$vic = vic>0 +V(lcc.sim)$vic_date = vic +vic_ids.sim = which(vic>0) +# save(lcc.sim,vic_ids.sim, file='Data/lcc_sim1b.RData') + +#### simulation method 2 #### +load('Data/dag_dat.RData') +load('Data/lcc.RData') +n.infections = length(vic_ids) +n.days = as.numeric(max(dag_dat$t2) - min(dag_dat$t1)) +lcc.sim = lcc +V(lcc.sim)$vic_date[vic_ids] = sample(n.days, n.infections, replace=TRUE) +vic_ids.sim = which(V(lcc.sim)$vic) +# save(lcc.sim, vic_ids.sim, file='Data/lcc_sim2a.RData') diff --git a/R Scripts/.Rhistory b/R Scripts/.Rhistory new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/R Scripts/.Rhistory diff --git a/R Scripts/README.rtf b/R Scripts/README.rtf new file mode 100644 index 0000000..48e0353 --- /dev/null +++ b/R Scripts/README.rtf @@ -0,0 +1,22 @@ +{\rtf1\ansi\ansicpg1252\cocoartf1344\cocoasubrtf720 +{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\margl1440\margr1440\vieww10800\viewh8400\viewkind0 +\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural + +\f0\fs24 \cf0 PIPELINE:\ +data-prep \'97 generate co-offending network and lcc\ +create-hyper-lcc \'97 generate new network with a unique node for each infection\ +generate-dag-dat \'97 generate a dag data frame based on possible infection edges\ +find-cascades \'97 calculate edge weights and find transmission edges, yielding a forest of cascades\ +\ +OTHER FILES:\ +analyze-cascades: simple code to analyze cascades the model generates\ +compare-sim:\ +data-exploration:\ +lm: linear model to predict victims based on demographic and other features\ +plot-graph:\ +sim-analysis:\ +simulate-infections:\ +structural: function to calculate the structural component of the infection model\ +temporal: function to calculate the temporal component of the infection model}
\ No newline at end of file diff --git a/R Scripts/analyze-cascades.R b/R Scripts/analyze-cascades.R new file mode 100755 index 0000000..463de18 --- /dev/null +++ b/R Scripts/analyze-cascades.R @@ -0,0 +1,58 @@ +# library(igraph) +# setwd("~/Documents/Cascade Project/") +# +# load('Results/dag_dat_all.RData') +# load('Results/weight-12-1-14.RData') +# load('Results/hyper-lcc.RData') + +# dag = graph.edgelist(as.matrix(dag_dat[,1:2])) +# dag = set.edge.attribute(dag,'weight',value=weight) +# dag_dat = dag_dat[which(E(dag)$weight>=0.4),] +# dag = delete.edges(dag, which(E(dag)$weight<0.4)) + +table(clusters(dag)$csize) + +clusters = clusters(dag) +membership = clusters$membership +csize = clusters$csize +order = rev(order(csize)) + +#use table not hist +plot(sizes,counts,log='xy',type='o',lwd=3, + xlab='Size of Cascade', ylab='Number of Cascades', main='Distribution of Cascade Sizes') + +i = 14 +V = which(clusters(dag)$membership==order[i]) # get all nodes in cluster +cc = induced.subgraph(dag,V) +Vi = vic_ids[V] +Ei = intersect(which(dag_dat_vics$from[realized] %in% Vi),which(dag_dat_vics$to[realized] %in% Vi)) +cc_dat = (dag_dat_vics[realized,])[Ei,] + +### plot cascade ### +cols = rep('lightblue',vcount(cc)) +seed = which(degree(cc,mode='in')==0) +cols[seed] = 'red' +plot(cc,vertex.size=10,edge.arrow.size=0.5,vertex.color=cols,vertex.label.cex=1, + edge.width=E(cc)$weight*20/max(E(cc)$weight),layout=layout.reingold.tilford(cc,root=seed), + vertex.label=V(hyp_lcc)$vic.day[Vi]) +# vertex.label=as.Date(V(hyp_lcc)$vic.day[Vi], format ='%m/%d/%y')) +# vertex.label=V(lcc)$vic_date[Vi]) + +### basic graph statistics +trl = mean(transitivity(cc,type='local',isolates='zero')) +apl = average.path.length(cc) +indeg = degree(cc,mode='in') +outdeg = degree(cc,mode='out') +ds = mean(cc_dat$dist) + +### node demographic statistics +from = vic_ids[cc_dat$from] +to = vic_ids[cc_dat$to] +V(lcc)$sex[from] == V(lcc)$sex[to] +V(lcc)$sex[Vi] +V(lcc)$race[Vi] +as.numeric(V(lcc)$age[Vi]) +V(lcc)$gang.member[Vi] +V(lcc)$gang.name[Vi] +V(lcc)$faction.name[Vi] + diff --git a/R Scripts/arrest-vs-infection-times.R b/R Scripts/arrest-vs-infection-times.R new file mode 100644 index 0000000..222be3e --- /dev/null +++ b/R Scripts/arrest-vs-infection-times.R @@ -0,0 +1,43 @@ +library(igraph) +setwd('~/Documents/Cascade Project/Raw Data/') + +##### Load arrest data +arrests <- read.csv("2006to2014arrests2.csv", header=T, colClass=c("character")) +arrests$ir2 <- paste("ir", arrests$ir_no) +sub.arrests <- subset(arrests, select=c(as.character("rd_no"), as.character('ir2'), + as.character('arrest_date'))) +colnames(sub.arrests) <- c("events", "individuals", "dates") +sub.arrests$dates = as.Date(sub.arrests$dates,format='%m/%d/%Y') +sub.arrests = sub.arrests[order(sub.arrests$dates),] +# individuals <- unique(sub.arrests$individuals) +# events <- unique(sub.arrests$events) + +##### Look at nonfatal victims +shootings <- read.csv("shooting-data-withdate2.csv", header = T) +victims = shootings[shootings$INV_PARTY_TYPE_CD=="VIC",] +victims = victims[!is.na(victims$IR_NO),] +victims$ir2 <- paste("ir", victims$IR_NO) +victims$INCIDENT_DATE = as.Date(victims$INCIDENT_DATE,format='%m/%d/%y') +victims = victims[order(victims$INCIDENT_DATE),] # sort so match gets first infection +victims = victims[victims$ir2 %in% arrests$ir2,] # only look at vics who were arrested + +vic_dates = victims$INCIDENT_DATE[match(unique(victims$ir2),victims$ir2)] +arrest_dates = sub.arrests$dates[match(unique(victims$ir2), sub.arrests$individuals)] +sum(vic_dates==arrest_dates) +# the number of vics in arrest table doesn't match number of nonfatal vics in person? + +##### Look at fatal victims +murders = read.csv("murder-victims-13nov.csv", header=T) +murders = murders[!is.na(murders$VICTIM_IR_NO),] +murders = murders[murders$INJURY_DESCR=="SHOT",] +murders = murders[match(unique(murders$VICTIM_IR_NO),murders$VICTIM_IR_NO),] +murders$ir2 = paste("ir", murders$VICTIM_IR_NO) +start_date = as.Date("2005-12-31") +murders$INJURY_DATE = as.Date(murders$INJURY_DATE,format='%m/%d/%y') +murders = murders[murders$INJURY_DATE>=start_date,] +murders = murders[order(murders$INJURY_DATE),] +murders = murders[murders$ir2 %in% arrests$ir2,] + +mur_dates = murders$INJURY_DATE +arrest_dates = sub.arrests$dates[match(murders$ir2, sub.arrests$individuals)] +sum(mur_dates==arrest_dates) diff --git a/R Scripts/benchmarking.R b/R Scripts/benchmarking.R new file mode 100644 index 0000000..1cf99e0 --- /dev/null +++ b/R Scripts/benchmarking.R @@ -0,0 +1,130 @@ +library(igraph) +setwd('~/Documents/Cascade Project') +load('Raw Data/lcc.RData') +load('Results/hyper-lcc.RData') +load('Results/dag_dat_all.RData') +source('Scripts/temporal.R') +source('Scripts/structural.R') + +##### Initialize data +formula = vic ~ sex + race + age + gang.member + gang.name +lcc_verts$sex = as.factor(lcc_verts$sex) +lcc_verts$race = as.factor(lcc_verts$race) +lcc_verts$age = as.numeric(lcc_verts$age) +lcc_verts$gang.name = as.factor(lcc_verts$gang.name) +# sum(hyp_lcc_verts$vic)/length(days) + +##### Loop through days +alpha = 1/100 +gamma = 0.18 +days = 70:max(hyp_lcc_verts$vic.day, na.rm=T) +lambdas = 0#c(0, exp(seq(log(0.0000001), log(.0005), length.out=150)), 1) +nvics = sum(hyp_lcc_verts$vic.day %in% days) +correct_rank = matrix(nrow=nvics, ncol=length(lambdas)) +# correct_rank1 = correct_rank2 = correct_rank3 = c() +edges_all = dag_dat_all[dag_dat_all$dist<2,] +ptm = proc.time() +for (day in days){ + if (day %% 100 == 0) print(day) + + ##### Demographics model +# vics = match(unique(hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day<day)]),lcc_verts$name) +# victims = lcc_verts[,c('vic','sex','race','age','gang.member','gang.name')] +# victims$vic[vics] = TRUE +# victims$vic[-vics] = FALSE +# # glm.fit = glm(formula, data=victims, family=binomial) +# glm.fit = lm(formula, data=victims) +# glm.probs = predict(glm.fit, newdata=lcc_verts, type='response') + + ##### Cascade Model + edges = edges_all[which(edges_all$t1<day),] + f = temporal(edges$t1, day, alpha) + h = structural(gamma,edges$dist) + weights = f*h + ids = edges$to + irs = hyp_lcc_verts$ir_no[ids] + risk = data.frame(id=ids, ir=irs, weight=weights) + risk = risk[order(weights, decreasing=T),] + risk = risk[match(unique(risk$ir),risk$ir),] +# maybe need to change this to reflect new algorithm that accounts for \tilde{p} + + ##### Combined Model +# combined = data.frame(ir=attr(glm.probs,'name'), dem=as.numeric(glm.probs), cas=0, comb=0) + combined$cas[match(risk$ir, attr(glm.probs,'name'))] = risk$weight + + ##### Gather results + infected_irs = hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day==day)] + for (lambda in lambdas){ + combined$comb = combined$cas#lambda*combined$dem + (1-lambda)*(1-combined$dem)*combined$cas + c_idx = which(lambdas==lambda) + r_idx = head(which(is.na(correct_rank[,c_idx])),length(infected_irs)) + # !! order should be first: rank of (3,5,5,7) should be (1,2,2,4), may need to do n-rank + correct_rank[r_idx,c_idx] = match(infected_irs, combined$ir[order(combined$comb, decreasing=T)]) + # maybe should also mark down vic/nonvic status of each? + } + +} +print(proc.time()-ptm) + + +##### Plot results +hist(correct_rank3,150,xlim=c(0,vcount(lcc)),col=rgb(0,0,1,1/8), + xlab='Risk Ranking of Victims',main='') +hist(correct_rank1,150,xlim=c(0,vcount(lcc)),col=rgb(1,0,1,1/8),add=T) +hist(correct_rank2,150,xlim=c(0,vcount(lcc)),col=rgb(1,0,1,1/8),add=T) +legend("topright", c("Demographics Model", "Cascade Model"), + fill=c(rgb(1,0,1,1/8), rgb(0,0,1,1/8))) + +counts = matrix(c(colSums(correct_rank<(vcount(lcc)/1000))*100/nvics, + colSums(correct_rank<(vcount(lcc)/200))*100/nvics, + colSums(correct_rank<(vcount(lcc)/100))*100/nvics), + nrow=3, byrow=T) +plot(lambdas,counts[1,],log='x',type='l') + +correct_rank1 = correct_rank[,length(lambdas)] +correct_rank2 = correct_rank[,1] +correct_rank3 = correct_rank[,which.min(colMeans(correct_rank))] +counts = matrix(c(sum(correct_rank1<(vcount(lcc)*0.001)), + sum(correct_rank1<(vcount(lcc)*0.005)), + sum(correct_rank1<(vcount(lcc)*0.01)), + sum(correct_rank2<(vcount(lcc)*0.001)), + sum(correct_rank2<(vcount(lcc)*0.005)), + sum(correct_rank2<(vcount(lcc)*0.01)), + sum(correct_rank3<(vcount(lcc)*0.001)), + sum(correct_rank3<(vcount(lcc)*0.005)), + sum(correct_rank3<(vcount(lcc)*0.01))), + nrow=3, byrow=T) +counts = counts*100/nvics +barplot(counts, + xlab="Size of High-Risk Population", + ylab="Percent of Victims Predicted", + names.arg=c('0.1%','0.5%','1%'),ylim=c(0,max(counts)*1.1), + col=c(rgb(0,0,1,1/2),rgb(1,0,0,1/2),rgb(0,1,0,1/2)), + beside=TRUE) +legend("topleft", inset=0.05, + c("Demographics Model", "Cascade Model", "Combined Model"), + fill=c(rgb(0,0,1,1/2),rgb(1,0,0,1/2),rgb(0,1,0,1/2))) +box(which='plot') +par(new=T) +counts = counts/(100/nvics) +barplot(counts, + ylim=c(0,max(counts)*1.1), + col=c(rgb(0,0,1,0),rgb(1,0,0,0),rgb(0,1,0,0)), + beside=TRUE) +axis(side = 4) +mtext(side = 4, line = 3, "Number of Victims Predicted") + +popsizes = c(0.1, 0.5, 1) +plot(popsizes,counts[1,],type='l',ylim=c(0,max(counts))) +lines(popsizes,counts[2,]) +lines(popsizes,counts[3,]) +lines(c(0,1),c(0,1)) + +#### Precision-Recall Curve +plot(ecdf(correct_rank1),col='red',xlim=c(0,vcount(lcc)),lwd=2) +plot(ecdf(correct_rank2),col='darkblue',lwd=2,add=T) +plot(ecdf(correct_rank3),col='darkgreen',lwd=2,add=T) +legend("bottomright", inset=0.05, + c("Demographics Model", "Cascade Model", "Combined Model"), + fill=c('red','darkblue','darkgreen')) +lines(c(0,vcount(lcc)),c(0,1)) diff --git a/R Scripts/create-hyper-lcc.R b/R Scripts/create-hyper-lcc.R new file mode 100644 index 0000000..786b694 --- /dev/null +++ b/R Scripts/create-hyper-lcc.R @@ -0,0 +1,126 @@ +library(igraph) +setwd("~/Documents/Cascade Project/") +load('Raw Data/lcc.RData') + +lcc_verts = get.data.frame(lcc,'vertices') +lcc_edges = get.data.frame(lcc,'edges') + + +##### Create new vertices dataframe +cols = c('id','ir_no','sex','race','dob','age.arrest','arrest.day', + 'gang.member','gang.name','faction.name') +hyp_lcc_verts = data.frame(matrix(ncol=length(cols)+1,nrow=0)) +colnames(hyp_lcc_verts) = c(cols,'spawn.date') + +ptm = proc.time() +ri = 1 +for (i in 1:dim(lcc_verts)[1]){ + if (i%%10000==0) print(i) + if (lcc_verts$vic[i]){ + if (lcc_verts$vic.nonfatal[i]>0){ + # create nodes for each nonfatal shooting + for (nf in 1:lcc_verts$vic.nonfatal[i]){ + hyp_lcc_verts[ri,cols] = lcc_verts[i,cols] + hyp_lcc_verts$id[ri] = ri + hyp_lcc_verts$vic[ri] = T + hyp_lcc_verts$vic.type[ri] = 'nonfatal' + if(nf==1){ + hyp_lcc_verts$vic.day[ri] = lcc_verts$nonfatal_day_1[i] + hyp_lcc_verts$spawn.date[ri] = 0 + } else if(nf==2){ + hyp_lcc_verts$vic.day[ri] = lcc_verts$nonfatal_day_2[i] + hyp_lcc_verts$spawn.date[ri] = lcc_verts$nonfatal_day_1[i] + } else if(nf==3){ + hyp_lcc_verts$vic.day[ri] = lcc_verts$nonfatal_day_3[i] + hyp_lcc_verts$spawn.date[ri] = lcc_verts$nonfatal_day_2[i] + } else if(nf==4){ + hyp_lcc_verts$vic.day[ri] = lcc_verts$nonfatal_day_4[i] + hyp_lcc_verts$spawn.date[ri] = lcc_verts$nonfatal_day_3[i] + } else if(nf==5){ + hyp_lcc_verts$vic.day[ri] = lcc_verts$nonfatal_day_5[i] + hyp_lcc_verts$spawn.date[ri] = lcc_verts$nonfatal_day_4[i] + } + ri = ri+1 + } + # if no fatal infection, create uninfected duplicate + if (!lcc_verts$vic.fatal[i]){ + hyp_lcc_verts[ri,cols] = lcc_verts[i,cols] + hyp_lcc_verts$id[ri] = ri + hyp_lcc_verts$vic[ri] = F + hyp_lcc_verts$vic.type[ri] = NA + hyp_lcc_verts$vic.day[ri] = NA + hyp_lcc_verts$spawn.date[ri] = max(lcc_verts$nonfatal_day_1[i],lcc_verts$nonfatal_day_2[i], + lcc_verts$nonfatal_day_3[i],lcc_verts$nonfatal_day_4[i], + lcc_verts$nonfatal_day_5[i],na.rm=T) + ri = ri+1 + } + } + # create a node for each fatal shooting + # if also nonfatal shootings, spawn at last nonfatal shooting + if (lcc_verts$vic.fatal[i]){ + hyp_lcc_verts[ri,cols] = lcc_verts[i,cols] + hyp_lcc_verts$id[ri] = ri + hyp_lcc_verts$vic[ri] = T + hyp_lcc_verts$vic.type[ri] = 'fatal' + hyp_lcc_verts$vic.day[ri] = lcc_verts$fatal_day[i] + if (lcc_verts$vic.nonfatal[i]>0){ + hyp_lcc_verts$spawn.date[ri] = max(lcc_verts$nonfatal_day_1[i],lcc_verts$nonfatal_day_2[i], + lcc_verts$nonfatal_day_3[i],lcc_verts$nonfatal_day_4[i], + lcc_verts$nonfatal_day_5[i],na.rm=T) + } else { + hyp_lcc_verts$spawn.date[ri] = 0 + } + ri = ri+1 + } + } + # create an uninfected node for each uninfected person + else{ + hyp_lcc_verts[ri,cols] = lcc_verts[i,cols] + hyp_lcc_verts$id[ri] = ri + hyp_lcc_verts$vic[ri] = F + hyp_lcc_verts$vic.type[ri] = NA + hyp_lcc_verts$vic.day[ri] = NA + hyp_lcc_verts$spawn.date[ri] = 0 + ri = ri+1 + } +} +print(proc.time()-ptm) # 1.5 hrs +row.names(hyp_lcc_verts) = NULL +n.nodes = sum(sum(V(lcc)$vic.fatal),sum(V(lcc)$vic.nonfatal), + sum(V(lcc)$vic.nonfatal>0 & !V(lcc)$vic.fatal),sum(V(lcc)$vic==FALSE)) +stopifnot(dim(hyp_lcc_verts)[1] == n.nodes) + +##### Create new edgelist +print('Edges') +hyp_lcc_edges = data.frame(from=0, to=0, weight=0) +ei = 1 +ptm = proc.time() +for(i in 1:dim(hyp_lcc_verts)[1]){ + if (i%%10000==0) print(i) + ego_id = hyp_lcc_verts$id[i] + ego_ir = hyp_lcc_verts$ir_no[i] + alter_irs = union(lcc_edges$from[which(lcc_edges$to==ego_ir)], + lcc_edges$to[which(lcc_edges$from==ego_ir)]) +# alter_irs = union(ego_ir, alter_irs) # add edges to other infected selves + alter_ids = hyp_lcc_verts$id[which(hyp_lcc_verts$ir_no %in% alter_irs)] + alter_ids = alter_ids[ego_id<alter_ids] # avoid double-counting edges + for(alter_id in alter_ids){ + weight=Inf + alter_ir = hyp_lcc_verts$ir_no[alter_id] + if(ego_ir!=alter_ir){ + edge_id = which((lcc_edges$from %in% c(ego_ir,alter_ir) + lcc_edges$to %in% c(ego_ir,alter_ir))==2) + weight = lcc_edges$weight[edge_id] + } + hyp_lcc_edges[ei,] = c(ego_id, alter_id, weight) + ei = ei + 1 + } +} +print(proc.time()-ptm) #10 hrs + +##### Create new graph +hyp_lcc = graph.data.frame(hyp_lcc_edges, directed=FALSE, vertices=hyp_lcc_verts) + +save(hyp_lcc_edges, hyp_lcc_verts, hyp_lcc, file='Results/hyper-lcc.RData') +write.csv(hyp_lcc_edges, file='Results/hyp_lcc_edges.csv') +write.csv(hyp_lcc_verts[,c('id','spawn.date','vic','vic.type','vic.day')], file='Results/hyp_lcc_verts.csv') + diff --git a/R Scripts/data-exploration.R b/R Scripts/data-exploration.R new file mode 100755 index 0000000..404ce9e --- /dev/null +++ b/R Scripts/data-exploration.R @@ -0,0 +1,40 @@ +library(igraph) +setwd("Documents/Cascade Project/Raw Data/") +load('chi-19mar2015.RData') + +d = remove.edge.attribute(person,'weight') +lcc = induced.subgraph(d,which(clusters(d)$membership==which.max(clusters(d)$csize))) + +##### Small-World Analysis +trl = mean(transitivity(lcc,type='local',isolates='zero')) +apl = average.path.length(lcc) +cat('Local Transitivity =', trl);cat('\nAverage Path Length =', apl) + +nsim = 5 +ER_sim = data.frame(trl=rep(0,nsim),apl=0) +for(i in 1:nsim){ + print(i) + erg = erdos.renyi.game(n=vcount(lcc),p.or.m=ecount(lcc),type='gnm') + erg = induced.subgraph(erg,which(clusters(erg)$membership==which.max(clusters(erg)$csize))) + ER_sim[i,1] = mean(transitivity(erg,type='local',isolates='zero')) + ER_sim[i,2] = average.path.length(erg) +} + +S = data.frame(C_dat = trl, + L_dat = apl, + C_ER=mean(ER_sim$trl), + L_ER=mean(ER_sim$apl), + S_ER=mean((trl/ER_sim$trl)/(apl/ER_sim$apl))) +S + +##### Degree Distribution +plot(degree.distribution(lcc)*vcount(lcc),log='xy',type='l',col='red',lwd=2, + xlab='Degree', ylab='Number of Vertices', main='Degree Distribution') + + +##### Victims +vic_ids = which(V(lcc)$vic==TRUE) +non_vic_ids = which(V(lcc)$vic==FALSE) +hist(as.numeric(V(lcc)$vic_date[vic_ids]),100,col='lightblue', + xlab='Day of Study Period',main='Infections During the Study Period') + diff --git a/R Scripts/data-prep.R b/R Scripts/data-prep.R new file mode 100755 index 0000000..3104ea2 --- /dev/null +++ b/R Scripts/data-prep.R @@ -0,0 +1,223 @@ +library(igraph) +setwd('~/Documents/Cascade Project/Raw Data/') + +#================ +# (1) load data +#================ + +#load all three sets of data +arrests <- read.csv("2006to2014arrests2.csv", header=T, colClass=c("character")) + +#I need to add the "ir" for this to make sense when I "project" +arrests$ir2 <- paste("ir", arrests$ir_no) + +## Match arrests based on date, time, and location +a = arrests[arrests$rd_no=='',] +dtab = table(a$arrest_date) +dates = attr(dtab,'name')[dtab>1] +for (date in dates){ + if (which(date==dates)%%10000==0) print(which(date==dates)) + ids = which(a$arrest_date==date) + grp = a[ids,] + stab = table(grp$street_nme) + streets = attr(stab,'name')[stab>1] + for (street in streets){ + arr_ids = as.numeric(rownames(grp[grp$street_nme==street,])) + arrests$rd_no[arr_ids] = paste('rd',arr_ids[1]) + } +} +# now make unique rd_nos for the other people arrested alone +null_rds = which(arrests$rd_no=='') +arrests$rd_no[null_rds] = paste('rd',null_rds) +save(arrests,file='arrests.RData') + +#===================== +# (2) Structure Data +#===================== + +#get the fields we need for all three: incidents, mni, and "type" +sub.arrests <- subset(arrests, select=c(as.character("rd_no"), as.character('ir2'))) +colnames(sub.arrests) <- c("events", "individuals") + +#============================= +# (3) Prep for making graphs +#============================= + +individuals <- unique(sub.arrests$individuals) + +events <- unique(sub.arrests$events) + +if (any(individuals %in% events)) + + stop('vertex name collision') + +vertices <- data.frame(name=c(events, individuals), + type=c(rep(FALSE, length(events)), + rep(TRUE, length(individuals))), + stringsAsFactors=FALSE) + +#=================================================================== +# (4) Make the GRAPH file +#=================================================================== +g <- graph.data.frame(sub.arrests, vertices=vertices) + + +#=================================================================== +# Sanity check the resulting igraph object +stopifnot(ecount(g) == nrow(sub.arrests)) +stopifnot(vcount(g) == nrow(vertices)) + +names <- V(g)$name +stopifnot(isTRUE(identical(sort(names), sort(vertices$name)))) + +inames <- V(g)[V(g)$type]$name +stopifnot(isTRUE(identical(sort(inames), sort(individuals)))) + +enames <- V(g)[! V(g)$type]$name +stopifnot(isTRUE(identical(sort(enames), sort(events)))) + +#=================================================================== +#now do the converstion into a single network +net1 <- bipartite.projection(g) +person <- net1[[2]] + +#=================================================================== +# (5) Define attributes on nodes +#=================================================================== +# set attributes from arrests file + +attribs <- arrests +match_vector = match(V(person)$name, attribs$ir2) +V(person)$sex <- as.character(attribs$sex_code_cd[match_vector]) +V(person)$race <- as.character(attribs$race_code_cd[match_vector]) +# V(person)$age <- as.character(attribs$age[match_vector]) +V(person)$dob <- as.character(as.Date(attribs$birth_date[match_vector],format='%m/%d/%Y')) + +# January 1, 2006 is Day 1 of the study period +start_date = as.Date("2005-12-31") + +## Get first arrest date in the study period for each person +sub.arrests$dates = as.Date(arrests$arrest_date,format='%m/%d/%Y') +sub.arrests = sub.arrests[order(sub.arrests$dates),] +sub.arrests = sub.arrests[match(unique(sub.arrests$individuals),sub.arrests$individuals),] +arrest.dates = as.Date(sub.arrests$dates,format='%m/%d/%y') +arrest.days = as.numeric(arrest.dates-start_date) +V(person)$arrest.day = arrest.days[match(V(person)$name, sub.arrests$individuals)] + +V(person)$age.arrest = floor(difftime(arrest.dates[match(V(person)$name, sub.arrests$individuals)], + V(person)$dob, + units='days')/365.25) + +#=================================================================== + + +# get victim attributes +shootings <- read.csv("shooting-data-withdate2.csv", header = T) +victims = shootings[shootings$INV_PARTY_TYPE_CD=="VIC",] +victims = victims[!is.na(victims$IR_NO),] +victims$ir2 <- paste("ir", victims$IR_NO) + +# get murder victim attributes +murders = read.csv("murder-victims-13nov.csv", header=T) +murders = murders[!is.na(murders$VICTIM_IR_NO),] +murders = murders[murders$INJURY_DESCR=="SHOT",] +murders = murders[match(unique(murders$VICTIM_IR_NO),murders$VICTIM_IR_NO),] +murders = murders[as.Date(murders$INJURY_DATE,format='%m/%d/%y')>=start_date,] +murders$ir2 = paste("ir", murders$VICTIM_IR_NO) + +# set victim data in network +vtab = as.data.frame(table(victims$ir2)) +match_vector = match(V(person)$name,vtab$Var1) +V(person)$vic.nonfatal = vtab$Freq[match_vector] +V(person)$vic.nonfatal[is.na(V(person)$vic.nonfatal)] = 0 +V(person)$vic.fatal = V(person)$name %in% murders$ir2 +V(person)$vic = V(person)$name %in% union(victims$ir2,murders$ir2) + +# add fatal shooting dates to the network +match_vector = match(V(person)$name, murders$ir2) +fatal_dates = murders$INJURY_DATE[match_vector] +fatal_dates = as.character(as.Date(fatal_dates,format='%m/%d/%y')) +V(person)$fatal_date = fatal_dates + +# add nonfatal shooting dates to the network +match_vector = match(victims$ir2,V(person)$name) +vics = which(V(person)$vic.nonfatal>0) +nfd1 = nfd2 = nfd3 = nfd4 = nfd5 = rep(0,length(vics)) +for(i in 1:length(vics)){ + if (i%%3000==0) print(i) + name = vics[i] + ids = which(match_vector==name) + dates = sort(as.Date(victims$INCIDENT_DATE[ids],format='%m/%d/%y')) + nfd1[i] = as.character(dates[1]) + nfd2[i] = as.character(dates[2]) + nfd3[i] = as.character(dates[3]) + nfd4[i] = as.character(dates[4]) + nfd5[i] = as.character(dates[5]) +} +V(person)$nonfatal_date_1 = NA +V(person)$nonfatal_date_2 = NA +V(person)$nonfatal_date_3 = NA +V(person)$nonfatal_date_4 = NA +V(person)$nonfatal_date_5 = NA +V(person)$nonfatal_date_1[vics] = nfd1 +V(person)$nonfatal_date_2[vics] = nfd2 +V(person)$nonfatal_date_3[vics] = nfd3 +V(person)$nonfatal_date_4[vics] = nfd4 +V(person)$nonfatal_date_5[vics] = nfd5 + +# convert dates into numeric values ("days") +start_date +V(person)$fatal_day = as.numeric(as.Date(V(person)$fatal_date)-start_date) +V(person)$nonfatal_day_1 = as.numeric(as.Date(V(person)$nonfatal_date_1)-start_date) +V(person)$nonfatal_day_2 = as.numeric(as.Date(V(person)$nonfatal_date_2)-start_date) +V(person)$nonfatal_day_3 = as.numeric(as.Date(V(person)$nonfatal_date_3)-start_date) +V(person)$nonfatal_day_4 = as.numeric(as.Date(V(person)$nonfatal_date_4)-start_date) +V(person)$nonfatal_day_5 = as.numeric(as.Date(V(person)$nonfatal_date_5)-start_date) + +#=================================================================== +# set gang attributes +gangs <- read.csv("Sept2014-ganglist.csv", header=T) +gangs = gangs[match(unique(gangs$IR_NO),gangs$IR_NO),] +gangs$ir2 <- paste("ir", gangs$IR_NO) + +V(person)$gang.member <- V(person)$name %in% gangs$ir2 + +match_vector = match(V(person)$name, gangs$ir2) +gnames = gangs$GANG_NAME[match_vector] +gnames = as.character(gnames) +gnames[V(person)$gang.member==''] = 'Unknown' +gnames[V(person)$gang.member==F] = 'None' +V(person)$gang.name <- as.character(gnames) +V(person)$faction.name <- as.character(gangs$FACTION_NAME[match_vector]) + +#=================================================================== +# create id number +V(person)$id = rank(V(person)$name) + +# save data +# person = remove.edge.attribute(person,'weight') +# person_data = get.data.frame(person,'both') +save(person, file="chi-19mar2015.RData") + +#=================================================================== +# get LCC of the network +lcc = induced.subgraph(person,which(clusters(person)$membership==which.max(clusters(person)$csize))) +V(lcc)$ir_no = V(lcc)$name +vic_ids = which(V(lcc)$vic) +lcc_verts = get.data.frame(lcc,'vertices') +lcc_edges = get.data.frame(lcc,'edges') +save(lcc, lcc_verts, lcc_edges, vic_ids, file="lcc.RData") + +lcc_data = get.data.frame(lcc,'both') +lcc = set.vertex.attribute(graph=lcc, name='name', value=V(lcc)$id) +row.names(lcc_data$vertices) = lcc_data$vertices$id +vertices = lcc_data$vertices[c('name','vic','vic.fatal','vic.nonfatal', + 'fatal_day','nonfatal_day_1','nonfatal_day_2', + 'nonfatal_day_3','nonfatal_day_4','nonfatal_day_5')] + +write.csv(vertices,file='lcc_vertices.csv') +write.csv(lcc_data$edges,file='lcc_edges.csv') + +lcc = graph.data.frame(lcc_edges, directed=FALSE, + vertices=lcc_verts[,c('name','vic','vic.fatal','vic.nonfatal')]) +write.graph(lcc,'lcc.gml','gml') diff --git a/R Scripts/find-cascades.R b/R Scripts/find-cascades.R new file mode 100644 index 0000000..ecd5c83 --- /dev/null +++ b/R Scripts/find-cascades.R @@ -0,0 +1,73 @@ +library(igraph) +setwd("~/Documents/Cascade Project/") +load('Results/hyper-lcc.RData') +load('Results/dag_dat_vics.RData') +source('Scripts/temporal.R') +# source('Scripts/homophily.R') +source('Scripts/structural.R') + +vic_ids = which(V(hyp_lcc)$vic==TRUE) + +#### Initialize model parameters +alpha = 1/5 +# gamma = 0.3 +delta = 0.1 +beta = 9.955713e-6 + +##### Get weights +# find max n days where infection possible given alpha +edges = dag_dat_vics +# edges = edges[(edges$t2-edges$t1)<300,] + +p_t = temporal(edges$t1, edges$t2, alpha) +p_s = structural(delta, edges$dist) +p = p_s * p_t +p_tilde = 1 - p_s + p_s * temporal(edges$t1, edges$t2+1, alpha) +weights = p/p_tilde + +##### Find transmission edges +# load('Results/lm-probs.RData') +# names = attr(lm.probs,'names') +# probs = as.numeric(lm.probs) +# betas = probs[match(hyp_lcc_verts$ir_no[edges$to],names)] +# betas = 0.055 +thresh = beta/(1-beta) + +realized = c() +# edges = edges[weights>thresh,] +# weights = weights[weights>thresh] +for (u in vic_ids){ + max_prob = -1 + max_edge = NULL + Ei = which(edges$to==u) + for(i in Ei){ + if(weights[i]>thresh){ + prob = p[i] * prod(p_tilde[setdiff(Ei,i)]) + if(prob>max_prob){ + max_prob = prob + max_edge = i + } + } + } + realized = c(realized, max_edge) +} +# if (length(Ei)>0){ +# max_edge = Ei[which.max(weights[Ei])] # how to deal with ties???? +# realized = c(realized, max_edge) +# } + +##### Create forest of cascades +# dag is all of the victim nodes but with only transmission edges, or don't index by vic_ids +dag = graph.data.frame(edges[realized,1:2], directed=TRUE, vertices=hyp_lcc_verts[vic_ids,]) +dag = set.edge.attribute(dag,'weight',value=weights[realized]) +table(clusters(dag)$csize) + +t=table(clusters(dag)$csize) +sizes = as.numeric(attr(t,'name')) +counts = as.numeric(t) +plot(sizes,counts,log='xy',type='o',lwd=3) +n.seeds = sum(counts) +n.nodes = sum(sizes*counts) # for vics case is just length(vic_ids) +n.infections = n.nodes-n.seeds + +# dag = graph.edgelist(as.matrix(edges[realized,1:2])) diff --git a/R Scripts/generate-dag-dat.R b/R Scripts/generate-dag-dat.R new file mode 100755 index 0000000..660a503 --- /dev/null +++ b/R Scripts/generate-dag-dat.R @@ -0,0 +1,59 @@ +library(igraph) +setwd("~/Documents/Cascade Project/") +load('Results/hyper-lcc.RData') + +vic_ids = which(V(hyp_lcc)$vic==TRUE) + +edgeWeights = function(eis){return(c(hyp_lcc_edges$weight[eis],Inf,Inf)[1:3])} + +dag_dat_all = data.frame(matrix(nrow=1,ncol=8)) +hyp_lcc2 = remove.edge.attribute(hyp_lcc,'weight') +ei = 1 +ptm=proc.time() +for (u in vic_ids){ + if ((which(vic_ids==u) %% 1000)==0) print(which(vic_ids==u)) + tu = hyp_lcc_verts$vic.day[u] + u_spawn = hyp_lcc_verts$spawn.date[u] + nbhd = unlist(neighborhood(hyp_lcc,nodes=u,order=3)) # get nodes within neighborhood + nbhd = nbhd[-1] # don't want to include u in the neighborhood + tvs = hyp_lcc_verts$vic.day[nbhd] + v_spawn = hyp_lcc_verts$spawn.date[nbhd] + nbhd = nbhd[tu>v_spawn & (is.na(tvs) | tu<tvs)] + tvs = hyp_lcc_verts$vic.day[nbhd] + dists = as.numeric(shortest.paths(hyp_lcc2,u,nbhd)) + + es = get.shortest.paths(hyp_lcc2,u,nbhd,output='epath')$epath + weights = matrix(unlist(lapply(es,edgeWeights),use.names = F),ncol=3,byrow=T) + + #will be faster to pre-allocate and fill in rather than rbind each time + dag_dat_all[ei:(ei+length(nbhd)-1),] = data.frame(rep(u,length(nbhd)), nbhd, + rep(tu,length(nbhd)), tvs, dists, + weights, row.names=NULL) + ei = ei + length(nbhd) +} +print(proc.time()-ptm) #3.5 hours +colnames(dag_dat_all) = c('from','to','t1','t2','dist','w1','w2','w3') +rownames(dag_dat_all) = NULL + +save(dag_dat_all, file='Results/dag_dat_all.RData') +write.csv(dag_dat_all, file='Results/dag_dat_all.csv') + +dag_dat_vics = dag_dat_all[!is.na(dag_dat_all$t2),] +save(dag_dat_vics, file='Results/dag_dat_vics.RData') +write.csv(dag_dat_vics, file='Results/dag_dat_vics.csv') + +# analyze min possible infection time +i = 1 +min_time = 0#rep(Inf,length(unique(dag_dat_vics$to))) +min_time_dist = 0#rep(Inf,length(unique(dag_dat_vics$to))) +for(to in unique(dag_dat_vics$to)){ + rows = which(dag_dat_vics$to==to & dag_dat_vics$dist<2) + if(length(rows)>0){ + min_time[i] = min(dag_dat_vics$t2[rows]-dag_dat_vics$t1[rows]) + min_time_dist[i] = dag_dat_vics$dist[rows[which.min(dag_dat_vics$t2[rows]-dag_dat_vics$t1[rows])]] + i = i + 1 + } +} +median(min_time) +mean(min_time<100) +save(min_time_1,min_time_2,min_time_3,file='Results/min_inf_time.RData') diff --git a/R Scripts/likelihood.R b/R Scripts/likelihood.R new file mode 100644 index 0000000..a4e5254 --- /dev/null +++ b/R Scripts/likelihood.R @@ -0,0 +1,30 @@ +# run after find-cascades + +load('Results/dag_dat_all.RData') + +edges_all = dag_dat_all +edges_all$t2[is.na(edges_all$t2)] = max(hyp_lcc_verts$vic.day,na.rm=T)+1 +realized_all = as.numeric(rownames(dag_dat_vics)[realized]) + +p_t = temporal(edges_all$t1, edges_all$t2, alpha) +p_s = structural(delta, edges_all$dist) +p = p_s * p_t +p_tilde = 1 - p_s + p_s * temporal(edges_all$t1, edges_all$t2+1, alpha) + +vic.days = hyp_lcc_verts$vic.day +vic.days[is.na(vic.days)] = max(hyp_lcc_verts$vic.day,na.rm=T)+1 +ages = vic.days-hyp_lcc_verts$spawn.date + +# realized edges +prob_realized = sum(log(p[realized_all])) + +# non-realized edges +prob_not_realized = sum(log(p_tilde[-realized_all])) + +# seeds +prob_beta = n.seeds * log(beta) + +# non-seeds +prob_beta_tilde = sum(ages) * log(1-beta) + +ll = prob_beta + prob_beta_tilde + prob_realized + prob_not_realized diff --git a/R Scripts/lm.R b/R Scripts/lm.R new file mode 100755 index 0000000..f1bbae0 --- /dev/null +++ b/R Scripts/lm.R @@ -0,0 +1,74 @@ +library(pROC) +library(igraph) +setwd('~/Documents/Cascade Project') +load('Raw Data/lcc.RData') +load('Results/lcc_bw.RData') + +is_vic = V(lcc)$vic +vics = which(V(lcc)$vic==T) +nonvics = which(V(lcc)$vic==F) +thresh = length(vics)/length(union(vics,nonvics)) + +nbhdVics = function(neighbs){return(sum(is_vic[neighbs[-1]]))} +#setdiff(is_vic order-(order-1) to get only those at the given order +n1s = neighborhood(lcc,1,V(lcc)) +n2s = neighborhood(lcc,2,V(lcc)) +n1vics = unlist(lapply(n1s,nbhdVics)) +n2vics = unlist(lapply(n2s,nbhdVics)) + +data = data.frame(vic = V(lcc)$vic, + sex = V(lcc)$sex, + race = V(lcc)$race, + age = V(lcc)$age, + gang.member = V(lcc)$gang.member, + gang.name = V(lcc)$gang.name, +# fname = V(lcc)$faction.name, + deg = degree(lcc), + bw = bw, + n1v = n1vics, + n2v = n2vics + ) +data$age = as.numeric(data$age) +data$gname = as.factor(data$gname) + +train = 1:dim(data)[1]; test = 1:dim(data)[1] +# train = sort(union(sample(vics,length(vics)*0.8),sample(nonvics,length(nonvics)*0.8))) +# test = sort((1:length(is_vic))[-train]) + +formula = vic ~ sex + race + age + gmember + gname + deg + bw + n1v + n2v +formula = vic ~ sex + race + age + gmember + gname +formula = vic ~ deg + bw +formula = vic ~ n1v + n2v + +glm.fit = glm(formula, data=data, family=binomial) +glm.probs = predict(glm.fit, newdata=data, type='response') +roc = roc(data$vic[test],glm.probs,ci=TRUE) +plot(roc,print.auc=TRUE) +roc$auc + +lm.fit = lm(formula, data=data) +lm.probs = predict(lm.fit, newdata=data, type='response') +p = 1/(1+exp(-lm.probs)) +z = log(lm.probs/(1-lm.probs)) +logit = 1/(1+exp(-z)) + +save(glm.probs, file='Results/lm-probs.RData') + +########## +lm.pred = lm.probs>thresh +tab = table(lm.pred,data$vic) +tab +(tab[1,1]+tab[2,2])/sum(tab) + +propensity = as.numeric(pred) +similarity = 1 - abs(propensity[u]-propensity[v]) + +############### +# Random forest approach -- not as good +library(randomForest) +data$vic = factor(data$vic) +rf.fit = randomForest(formula,data=data[train,],do.trace=50,ntree=20) +rf.probs = (predict(rf.fit,newdata=data[test,],type='prob'))[,1] +roc = roc(data$vic[test],rf.probs,ci=TRUE) +plot(roc,print.auc=TRUE) + diff --git a/R Scripts/plot-graph.R b/R Scripts/plot-graph.R new file mode 100755 index 0000000..b6f9341 --- /dev/null +++ b/R Scripts/plot-graph.R @@ -0,0 +1,22 @@ +#### plot the victims and non-victims of the network +cols = rep('lightgrey',vcount(g)) #number of nodes in g +cols[V(g)$vic==1] = 'red' +shapes = rep('circle',vcount(g)) +shapes[V(g)$vic==1] = 'square' +plot(g, layout=layout.kamada.kawai, vertex.label=NA, vertex.color=cols, vertex.shape=shapes,vertex.size=2) +legend('topleft', legend=c("non-victim", "victim"), pch=c(16,15), col=c("lightgrey", "red"), inset=0.1) + +#### plot the neighborhood of a node +v = 10 +v1 = unlist(neighborhood(lcc,nodes=v,order=1)) +v2 = unlist(neighborhood(lcc,nodes=v,order=2)) +v3 = unlist(neighborhood(lcc,nodes=v,order=3)) +v3 = setdiff(v3,v2) +v2 = setdiff(v2,v1) +v1 = setdiff(v1,v) +cols = rep('lightgrey',vcount(g)) +cols[v] = 'red' +cols[v1] = 'blue' +cols[v2] = 'green' +cols[v3] = 'black' +plot(g, layout=layout.kamada.kawai, vertex.label=NA, vertex.color=cols,vertex.size=2) diff --git a/R Scripts/sim-analysis.R b/R Scripts/sim-analysis.R new file mode 100755 index 0000000..3960578 --- /dev/null +++ b/R Scripts/sim-analysis.R @@ -0,0 +1,72 @@ +library(igraph) +setwd("~/Documents/Cascade Project/") + +load('Results/hyper-lcc.RData') +vic_ids = which(V(hyp_lcc)$vic==1) +n.infections = length(vic_ids) +n.days = max(hyp_lcc_verts$vic.day,na.rm=T) - min(hyp_lcc_verts$vic.day,na.rm=T) +inf.dates = hyp_lcc_verts$vic.day[vic_ids] + +ptm = proc.time() +n = 1500 +mean.time = matrix(0,3,n) +med.time = matrix(0,3,n) +mean.50 = matrix(0,3,n) +mean.100 = matrix(0,3,n) +n.vicpairs = matrix(0,3,n) +for(sim in 1:3){ + print(paste('sim:',sim)) + for(q in 1:n){ + if (q%%250==0) print(paste('run:',q)) + graph = hyp_lcc + if (sim<3) sim.dates = sample(n.days, n.infections, replace=TRUE) # sims 1 + 2 + if (sim==3) sim.dates = sample(inf.dates) # sim 3 + if (sim==1) vics = sample(vcount(hyp_lcc), n.infections, replace=FALSE) # sim 1 + if (sim>1) vics = vic_ids # sims 2 + 3 + if (sim==0) {vics = vic_ids; sim.dates = inf.dates} # data + + vic.time = c() + for (i in 1:n.infections){ + u = vics[i] + nbhd = unlist(neighborhood(graph, nodes=u, order=1)) + nbhd = intersect(vic_ids,nbhd) + nbhd = setdiff(nbhd,u) + nbhd = nbhd[u<nbhd] # only looks at pairs with u<v to avoid double-counting + nbhd = nbhd[hyp_lcc_verts$ir_no[u] != hyp_lcc_verts$ir_no[nbhd]] # don't count infections of the same person + tu = sim.dates[i] + tvs = sim.dates[match(nbhd,vic_ids)] + vic.time = c(vic.time,abs(tu-tvs)) + } + n.vicpairs[sim,q] = length(vic.time) + mean.time[sim,q] = mean(vic.time) + med.time[sim,q] = median(vic.time) + mean.50[sim,q] = mean(vic.time<=50) + mean.100[sim,q] = mean(vic.time<=100) + # decay.rate[sim,q] = getDecayRate(vic.time) #get lm of histogram. or maybe just cor? + } +} +print(proc.time()-ptm) + + +####### plot simulation data ####### +data = 0.1176774 # 774.5483, 634, 0.07252472, 0.1176774 +simdata = mean.100 +xl = c(0.9*min(min(simdata),data),1.1*max(max(simdata),data)) +yl = c(0,max(c(hist(simdata[1,],15)$density,hist(simdata[2,],15)$density,hist(simdata[3,],15)$density))) +hist(simdata[1,],15,xlim=xl,ylim=yl,col=rgb(0,1,0,1/4),freq=F, + xlab='Neighbors Infected Within 100 Days', main=NULL) +hist(simdata[2,],15,col=rgb(0,0,1,1/4),freq=F,add=TRUE) +hist(simdata[3,],15,col=rgb(1,0,1,1/4),freq=F,add=TRUE) +abline(v=data,lwd=4,col='red') + + +##### inf dates histogram +dates = c(V(lcc)$fatal_date[!is.na(V(lcc)$fatal_date)], + V(lcc)$nonfatal_date_1[!is.na(V(lcc)$nonfatal_date_1)], + V(lcc)$nonfatal_date_2[!is.na(V(lcc)$nonfatal_date_2)], + V(lcc)$nonfatal_date_3[!is.na(V(lcc)$nonfatal_date_3)], + V(lcc)$nonfatal_date_4[!is.na(V(lcc)$nonfatal_date_4)], + V(lcc)$nonfatal_date_5[!is.na(V(lcc)$nonfatal_date_5)]) +dates = as.Date(dates) +hist(dates, breaks='months',freq=T,col='lightblue',format='%m/%Y', + xlab='Month',ylab='Number of Shootings',main='') diff --git a/R Scripts/structural.R b/R Scripts/structural.R new file mode 100755 index 0000000..132aa6c --- /dev/null +++ b/R Scripts/structural.R @@ -0,0 +1,7 @@ +# structural = function(w1,w2,w3,gamma){ +# return (plogis(w1,scale=gamma) * plogis(w2,scale=gamma) * plogis(w3,scale=gamma)) +# } + +structural = function(delta,dist){ + return (delta^dist) +}
\ No newline at end of file diff --git a/R Scripts/temporal.R b/R Scripts/temporal.R new file mode 100755 index 0000000..13194a4 --- /dev/null +++ b/R Scripts/temporal.R @@ -0,0 +1,10 @@ + +temporal = function(tu,tv,alpha){ + dt = tv - tu - 1 + if (sum(dt<0)==0) { + return (alpha*exp(-alpha*dt)) +# return ((0.01/alpha)*(1+dt/alpha)^(-1.01)) + } else { + return (NA) + } +}
\ No newline at end of file |
