From 1739e9f5706bb8a73de5dbf0b467de49ea040898 Mon Sep 17 00:00:00 2001 From: Ben Green Date: Mon, 8 Jun 2015 15:21:51 -0400 Subject: added my R scripts --- R Scripts/-compare-sim.R | 34 ++++++ R Scripts/-generate-dag-dat-sim.R | 50 ++++++++ R Scripts/-recover-data.R | 27 ++++ R Scripts/-simulate-infections.R | 34 ++++++ R Scripts/.Rhistory | 0 R Scripts/README.rtf | 22 ++++ R Scripts/analyze-cascades.R | 58 +++++++++ R Scripts/arrest-vs-infection-times.R | 43 +++++++ R Scripts/benchmarking.R | 130 ++++++++++++++++++++ R Scripts/create-hyper-lcc.R | 126 +++++++++++++++++++ R Scripts/data-exploration.R | 40 ++++++ R Scripts/data-prep.R | 223 ++++++++++++++++++++++++++++++++++ R Scripts/find-cascades.R | 73 +++++++++++ R Scripts/generate-dag-dat.R | 59 +++++++++ R Scripts/likelihood.R | 30 +++++ R Scripts/lm.R | 74 +++++++++++ R Scripts/plot-graph.R | 22 ++++ R Scripts/sim-analysis.R | 72 +++++++++++ R Scripts/structural.R | 7 ++ R Scripts/temporal.R | 10 ++ 20 files changed, 1134 insertions(+) create mode 100755 R Scripts/-compare-sim.R create mode 100755 R Scripts/-generate-dag-dat-sim.R create mode 100755 R Scripts/-recover-data.R create mode 100755 R Scripts/-simulate-infections.R create mode 100644 R Scripts/.Rhistory create mode 100644 R Scripts/README.rtf create mode 100755 R Scripts/analyze-cascades.R create mode 100644 R Scripts/arrest-vs-infection-times.R create mode 100644 R Scripts/benchmarking.R create mode 100644 R Scripts/create-hyper-lcc.R create mode 100755 R Scripts/data-exploration.R create mode 100755 R Scripts/data-prep.R create mode 100644 R Scripts/find-cascades.R create mode 100755 R Scripts/generate-dag-dat.R create mode 100644 R Scripts/likelihood.R create mode 100755 R Scripts/lm.R create mode 100755 R Scripts/plot-graph.R create mode 100755 R Scripts/sim-analysis.R create mode 100755 R Scripts/structural.R create mode 100755 R Scripts/temporal.R diff --git a/R Scripts/-compare-sim.R b/R Scripts/-compare-sim.R new file mode 100755 index 0000000..d96bd81 --- /dev/null +++ b/R Scripts/-compare-sim.R @@ -0,0 +1,34 @@ + +vics = vic_ids +graph = lcc + +n.infections = length(vics) +vic.neighbors = rep(0,n.infections) +for (i in 1:n.infections){ + u = vics[i] + nbhd = unlist(neighborhood(graph, nodes=u, order=3)) + nbhd = intersect(vic_ids,nbhd) + vic.neighbors[i] = sum(nbhd %in% vics) +} + + +n.infections = length(vics) +vic.time = rep(0,n.infections) +for (i in 1:n.infections){ + if (i%%1000==0) print(i) + u = vics[i] + nbhd = unlist(neighborhood(graph, nodes=u, order=1)) + nbhd = intersect(vic_ids,nbhd) + nbhd = setdiff(nbhd,u) + tu = as.numeric(V(graph)$vic_date[u]) + tvs = as.numeric(V(graph)$vic_date[nbhd]) + tvs = tvs[tvs0){ + vic.time[i] = min(tu-tvs) + } +} +vic.time = vic.time[vic.time>0] +mean(vic.time) +median(vic.time) +mean(vic.time<100) +sum(vic.time>0) diff --git a/R Scripts/-generate-dag-dat-sim.R b/R Scripts/-generate-dag-dat-sim.R new file mode 100755 index 0000000..a6d115d --- /dev/null +++ b/R Scripts/-generate-dag-dat-sim.R @@ -0,0 +1,50 @@ +library(igraph) +setwd("/Users/Ben/Documents/Harvard/Fall 2014/CS 284r Social Data Mining/Cascade Project/") +load('Data/lcc_sim2a.RData') + +# d = remove.edge.attribute(person,'weight') +# lcc = induced.subgraph(d,which(clusters(d)$membership==which.max(clusters(d)$csize))) +# vic_ids = which(V(lcc)$vic==TRUE) +# dag = graph.empty(vcount(lcc)) +# dag = set.edge.attribute(dag,'weight',value=0) + +lcc = lcc.sim +vic_ids = vic_ids.sim +ei = 1 + +dag_dat = matrix(0,253096,5) +for (1 in 1:length(vic_ids)){ + if (i %% 100)==0) print(which(vic_ids==u)) + + u = vic_ids[i] + nbhd = unlist(neighborhood(lcc, nodes=u, order=3)) # get nodes within neighborhood + nbhd = intersect(vic_ids,nbhd) # only want victim nodes + nbhd = setdiff(nbhd,u) # don't want to include u in the neighborhood + + tu = as.numeric(V(lcc)$vic_date[u]) + tvs = as.numeric(V(lcc)$vic_date[nbhd]) + Es = tu0){ + nbhd = nbhd[Es] + dists = as.numeric(shortest.paths(lcc,u,nbhd)) + + for (j in 1:sum(Es)){ + v = nbhd[j] + d = dists[j] + tv = tvs[j] +# dag_dat[ei,] = c(u,v,d,tu,tv) + dag_dat = rbind(dag_dat,c(u,v,d,tu,tv)) +# f = time(tu,tv) +# h = structural(d) +# weight[ei] = f*h + ei = ei+1 + } + } +} + +dag_dat = dag_dat[rowSums(dag_dat)>0,] +dag_dat = as.data.frame(dag_dat) +colnames(dag_dat) = c('from','to','dist','t1','t2') +# save(dag_dat,file='Data/dag_dat_sim2a.RData') diff --git a/R Scripts/-recover-data.R b/R Scripts/-recover-data.R new file mode 100755 index 0000000..b34ffc6 --- /dev/null +++ b/R Scripts/-recover-data.R @@ -0,0 +1,27 @@ +library(igraph) +setwd("/Users/Ben/Documents/Harvard/Fall 2014/CS 284r Social Data Mining/Cascade Project/") + +el = read.csv('Data/dag.csv') +lcc = induced.subgraph(d,which(clusters(d)$membership==which.max(clusters(d)$csize))) +vic_ids = which(V(lcc)$vic==TRUE) + +from = vic_ids[el$from] +to = vic_ids[el$to] + +t1 = as.Date(V(lcc)$vic_date[from],format='%m/%d/%y') +t2 = as.Date(V(lcc)$vic_date[to],format='%m/%d/%y') + +uf = unique(from) +dist = rep(0,length(from)) +for (i in 1:length(uf)){ + if (i%%1000==0) print(i) + f = uf[i] + fi = which(from==f) + ds = as.numeric(shortest.paths(lcc,v=f,to=to[fi])) + dist[fi] = ds +} + +dag_dat = data.frame(from=el$from,to=el$to,dist,t1,t2) + +save(dag_dat,file='Data/dag_dat.RData') +write.csv(dag_dat,'Data/dag_dat.csv') diff --git a/R Scripts/-simulate-infections.R b/R Scripts/-simulate-infections.R new file mode 100755 index 0000000..f1409af --- /dev/null +++ b/R Scripts/-simulate-infections.R @@ -0,0 +1,34 @@ +library(igraph) +setwd("/Users/Ben/Documents/Harvard/Fall 2014/CS 284r Social Data Mining/Cascade Project/") + +load('Data/dag_dat.RData') +load('Data/lcc.RData') + +n.infections = length(vic_ids) +n.nodes = (vcount(lcc)) +n.days = as.numeric(max(dag_dat$t2) - min(dag_dat$t1)) +gamma = n.infections/(n.nodes*n.days) + +vic = rep(FALSE,n.nodes) +for(t in 1:n.days){ + if (t%%1000==0) print(t) + infections = which(runif(n.nodes)0 +V(lcc.sim)$vic_date = vic +vic_ids.sim = which(vic>0) +# save(lcc.sim,vic_ids.sim, file='Data/lcc_sim1b.RData') + +#### simulation method 2 #### +load('Data/dag_dat.RData') +load('Data/lcc.RData') +n.infections = length(vic_ids) +n.days = as.numeric(max(dag_dat$t2) - min(dag_dat$t1)) +lcc.sim = lcc +V(lcc.sim)$vic_date[vic_ids] = sample(n.days, n.infections, replace=TRUE) +vic_ids.sim = which(V(lcc.sim)$vic) +# save(lcc.sim, vic_ids.sim, file='Data/lcc_sim2a.RData') diff --git a/R Scripts/.Rhistory b/R Scripts/.Rhistory new file mode 100644 index 0000000..e69de29 diff --git a/R Scripts/README.rtf b/R Scripts/README.rtf new file mode 100644 index 0000000..48e0353 --- /dev/null +++ b/R Scripts/README.rtf @@ -0,0 +1,22 @@ +{\rtf1\ansi\ansicpg1252\cocoartf1344\cocoasubrtf720 +{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\margl1440\margr1440\vieww10800\viewh8400\viewkind0 +\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural + +\f0\fs24 \cf0 PIPELINE:\ +data-prep \'97 generate co-offending network and lcc\ +create-hyper-lcc \'97 generate new network with a unique node for each infection\ +generate-dag-dat \'97 generate a dag data frame based on possible infection edges\ +find-cascades \'97 calculate edge weights and find transmission edges, yielding a forest of cascades\ +\ +OTHER FILES:\ +analyze-cascades: simple code to analyze cascades the model generates\ +compare-sim:\ +data-exploration:\ +lm: linear model to predict victims based on demographic and other features\ +plot-graph:\ +sim-analysis:\ +simulate-infections:\ +structural: function to calculate the structural component of the infection model\ +temporal: function to calculate the temporal component of the infection model} \ No newline at end of file diff --git a/R Scripts/analyze-cascades.R b/R Scripts/analyze-cascades.R new file mode 100755 index 0000000..463de18 --- /dev/null +++ b/R Scripts/analyze-cascades.R @@ -0,0 +1,58 @@ +# library(igraph) +# setwd("~/Documents/Cascade Project/") +# +# load('Results/dag_dat_all.RData') +# load('Results/weight-12-1-14.RData') +# load('Results/hyper-lcc.RData') + +# dag = graph.edgelist(as.matrix(dag_dat[,1:2])) +# dag = set.edge.attribute(dag,'weight',value=weight) +# dag_dat = dag_dat[which(E(dag)$weight>=0.4),] +# dag = delete.edges(dag, which(E(dag)$weight<0.4)) + +table(clusters(dag)$csize) + +clusters = clusters(dag) +membership = clusters$membership +csize = clusters$csize +order = rev(order(csize)) + +#use table not hist +plot(sizes,counts,log='xy',type='o',lwd=3, + xlab='Size of Cascade', ylab='Number of Cascades', main='Distribution of Cascade Sizes') + +i = 14 +V = which(clusters(dag)$membership==order[i]) # get all nodes in cluster +cc = induced.subgraph(dag,V) +Vi = vic_ids[V] +Ei = intersect(which(dag_dat_vics$from[realized] %in% Vi),which(dag_dat_vics$to[realized] %in% Vi)) +cc_dat = (dag_dat_vics[realized,])[Ei,] + +### plot cascade ### +cols = rep('lightblue',vcount(cc)) +seed = which(degree(cc,mode='in')==0) +cols[seed] = 'red' +plot(cc,vertex.size=10,edge.arrow.size=0.5,vertex.color=cols,vertex.label.cex=1, + edge.width=E(cc)$weight*20/max(E(cc)$weight),layout=layout.reingold.tilford(cc,root=seed), + vertex.label=V(hyp_lcc)$vic.day[Vi]) +# vertex.label=as.Date(V(hyp_lcc)$vic.day[Vi], format ='%m/%d/%y')) +# vertex.label=V(lcc)$vic_date[Vi]) + +### basic graph statistics +trl = mean(transitivity(cc,type='local',isolates='zero')) +apl = average.path.length(cc) +indeg = degree(cc,mode='in') +outdeg = degree(cc,mode='out') +ds = mean(cc_dat$dist) + +### node demographic statistics +from = vic_ids[cc_dat$from] +to = vic_ids[cc_dat$to] +V(lcc)$sex[from] == V(lcc)$sex[to] +V(lcc)$sex[Vi] +V(lcc)$race[Vi] +as.numeric(V(lcc)$age[Vi]) +V(lcc)$gang.member[Vi] +V(lcc)$gang.name[Vi] +V(lcc)$faction.name[Vi] + diff --git a/R Scripts/arrest-vs-infection-times.R b/R Scripts/arrest-vs-infection-times.R new file mode 100644 index 0000000..222be3e --- /dev/null +++ b/R Scripts/arrest-vs-infection-times.R @@ -0,0 +1,43 @@ +library(igraph) +setwd('~/Documents/Cascade Project/Raw Data/') + +##### Load arrest data +arrests <- read.csv("2006to2014arrests2.csv", header=T, colClass=c("character")) +arrests$ir2 <- paste("ir", arrests$ir_no) +sub.arrests <- subset(arrests, select=c(as.character("rd_no"), as.character('ir2'), + as.character('arrest_date'))) +colnames(sub.arrests) <- c("events", "individuals", "dates") +sub.arrests$dates = as.Date(sub.arrests$dates,format='%m/%d/%Y') +sub.arrests = sub.arrests[order(sub.arrests$dates),] +# individuals <- unique(sub.arrests$individuals) +# events <- unique(sub.arrests$events) + +##### Look at nonfatal victims +shootings <- read.csv("shooting-data-withdate2.csv", header = T) +victims = shootings[shootings$INV_PARTY_TYPE_CD=="VIC",] +victims = victims[!is.na(victims$IR_NO),] +victims$ir2 <- paste("ir", victims$IR_NO) +victims$INCIDENT_DATE = as.Date(victims$INCIDENT_DATE,format='%m/%d/%y') +victims = victims[order(victims$INCIDENT_DATE),] # sort so match gets first infection +victims = victims[victims$ir2 %in% arrests$ir2,] # only look at vics who were arrested + +vic_dates = victims$INCIDENT_DATE[match(unique(victims$ir2),victims$ir2)] +arrest_dates = sub.arrests$dates[match(unique(victims$ir2), sub.arrests$individuals)] +sum(vic_dates==arrest_dates) +# the number of vics in arrest table doesn't match number of nonfatal vics in person? + +##### Look at fatal victims +murders = read.csv("murder-victims-13nov.csv", header=T) +murders = murders[!is.na(murders$VICTIM_IR_NO),] +murders = murders[murders$INJURY_DESCR=="SHOT",] +murders = murders[match(unique(murders$VICTIM_IR_NO),murders$VICTIM_IR_NO),] +murders$ir2 = paste("ir", murders$VICTIM_IR_NO) +start_date = as.Date("2005-12-31") +murders$INJURY_DATE = as.Date(murders$INJURY_DATE,format='%m/%d/%y') +murders = murders[murders$INJURY_DATE>=start_date,] +murders = murders[order(murders$INJURY_DATE),] +murders = murders[murders$ir2 %in% arrests$ir2,] + +mur_dates = murders$INJURY_DATE +arrest_dates = sub.arrests$dates[match(murders$ir2, sub.arrests$individuals)] +sum(mur_dates==arrest_dates) diff --git a/R Scripts/benchmarking.R b/R Scripts/benchmarking.R new file mode 100644 index 0000000..1cf99e0 --- /dev/null +++ b/R Scripts/benchmarking.R @@ -0,0 +1,130 @@ +library(igraph) +setwd('~/Documents/Cascade Project') +load('Raw Data/lcc.RData') +load('Results/hyper-lcc.RData') +load('Results/dag_dat_all.RData') +source('Scripts/temporal.R') +source('Scripts/structural.R') + +##### Initialize data +formula = vic ~ sex + race + age + gang.member + gang.name +lcc_verts$sex = as.factor(lcc_verts$sex) +lcc_verts$race = as.factor(lcc_verts$race) +lcc_verts$age = as.numeric(lcc_verts$age) +lcc_verts$gang.name = as.factor(lcc_verts$gang.name) +# sum(hyp_lcc_verts$vic)/length(days) + +##### Loop through days +alpha = 1/100 +gamma = 0.18 +days = 70:max(hyp_lcc_verts$vic.day, na.rm=T) +lambdas = 0#c(0, exp(seq(log(0.0000001), log(.0005), length.out=150)), 1) +nvics = sum(hyp_lcc_verts$vic.day %in% days) +correct_rank = matrix(nrow=nvics, ncol=length(lambdas)) +# correct_rank1 = correct_rank2 = correct_rank3 = c() +edges_all = dag_dat_all[dag_dat_all$dist<2,] +ptm = proc.time() +for (day in days){ + if (day %% 100 == 0) print(day) + + ##### Demographics model +# vics = match(unique(hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day0){ + # create nodes for each nonfatal shooting + for (nf in 1:lcc_verts$vic.nonfatal[i]){ + hyp_lcc_verts[ri,cols] = lcc_verts[i,cols] + hyp_lcc_verts$id[ri] = ri + hyp_lcc_verts$vic[ri] = T + hyp_lcc_verts$vic.type[ri] = 'nonfatal' + if(nf==1){ + hyp_lcc_verts$vic.day[ri] = lcc_verts$nonfatal_day_1[i] + hyp_lcc_verts$spawn.date[ri] = 0 + } else if(nf==2){ + hyp_lcc_verts$vic.day[ri] = lcc_verts$nonfatal_day_2[i] + hyp_lcc_verts$spawn.date[ri] = lcc_verts$nonfatal_day_1[i] + } else if(nf==3){ + hyp_lcc_verts$vic.day[ri] = lcc_verts$nonfatal_day_3[i] + hyp_lcc_verts$spawn.date[ri] = lcc_verts$nonfatal_day_2[i] + } else if(nf==4){ + hyp_lcc_verts$vic.day[ri] = lcc_verts$nonfatal_day_4[i] + hyp_lcc_verts$spawn.date[ri] = lcc_verts$nonfatal_day_3[i] + } else if(nf==5){ + hyp_lcc_verts$vic.day[ri] = lcc_verts$nonfatal_day_5[i] + hyp_lcc_verts$spawn.date[ri] = lcc_verts$nonfatal_day_4[i] + } + ri = ri+1 + } + # if no fatal infection, create uninfected duplicate + if (!lcc_verts$vic.fatal[i]){ + hyp_lcc_verts[ri,cols] = lcc_verts[i,cols] + hyp_lcc_verts$id[ri] = ri + hyp_lcc_verts$vic[ri] = F + hyp_lcc_verts$vic.type[ri] = NA + hyp_lcc_verts$vic.day[ri] = NA + hyp_lcc_verts$spawn.date[ri] = max(lcc_verts$nonfatal_day_1[i],lcc_verts$nonfatal_day_2[i], + lcc_verts$nonfatal_day_3[i],lcc_verts$nonfatal_day_4[i], + lcc_verts$nonfatal_day_5[i],na.rm=T) + ri = ri+1 + } + } + # create a node for each fatal shooting + # if also nonfatal shootings, spawn at last nonfatal shooting + if (lcc_verts$vic.fatal[i]){ + hyp_lcc_verts[ri,cols] = lcc_verts[i,cols] + hyp_lcc_verts$id[ri] = ri + hyp_lcc_verts$vic[ri] = T + hyp_lcc_verts$vic.type[ri] = 'fatal' + hyp_lcc_verts$vic.day[ri] = lcc_verts$fatal_day[i] + if (lcc_verts$vic.nonfatal[i]>0){ + hyp_lcc_verts$spawn.date[ri] = max(lcc_verts$nonfatal_day_1[i],lcc_verts$nonfatal_day_2[i], + lcc_verts$nonfatal_day_3[i],lcc_verts$nonfatal_day_4[i], + lcc_verts$nonfatal_day_5[i],na.rm=T) + } else { + hyp_lcc_verts$spawn.date[ri] = 0 + } + ri = ri+1 + } + } + # create an uninfected node for each uninfected person + else{ + hyp_lcc_verts[ri,cols] = lcc_verts[i,cols] + hyp_lcc_verts$id[ri] = ri + hyp_lcc_verts$vic[ri] = F + hyp_lcc_verts$vic.type[ri] = NA + hyp_lcc_verts$vic.day[ri] = NA + hyp_lcc_verts$spawn.date[ri] = 0 + ri = ri+1 + } +} +print(proc.time()-ptm) # 1.5 hrs +row.names(hyp_lcc_verts) = NULL +n.nodes = sum(sum(V(lcc)$vic.fatal),sum(V(lcc)$vic.nonfatal), + sum(V(lcc)$vic.nonfatal>0 & !V(lcc)$vic.fatal),sum(V(lcc)$vic==FALSE)) +stopifnot(dim(hyp_lcc_verts)[1] == n.nodes) + +##### Create new edgelist +print('Edges') +hyp_lcc_edges = data.frame(from=0, to=0, weight=0) +ei = 1 +ptm = proc.time() +for(i in 1:dim(hyp_lcc_verts)[1]){ + if (i%%10000==0) print(i) + ego_id = hyp_lcc_verts$id[i] + ego_ir = hyp_lcc_verts$ir_no[i] + alter_irs = union(lcc_edges$from[which(lcc_edges$to==ego_ir)], + lcc_edges$to[which(lcc_edges$from==ego_ir)]) +# alter_irs = union(ego_ir, alter_irs) # add edges to other infected selves + alter_ids = hyp_lcc_verts$id[which(hyp_lcc_verts$ir_no %in% alter_irs)] + alter_ids = alter_ids[ego_id1] +for (date in dates){ + if (which(date==dates)%%10000==0) print(which(date==dates)) + ids = which(a$arrest_date==date) + grp = a[ids,] + stab = table(grp$street_nme) + streets = attr(stab,'name')[stab>1] + for (street in streets){ + arr_ids = as.numeric(rownames(grp[grp$street_nme==street,])) + arrests$rd_no[arr_ids] = paste('rd',arr_ids[1]) + } +} +# now make unique rd_nos for the other people arrested alone +null_rds = which(arrests$rd_no=='') +arrests$rd_no[null_rds] = paste('rd',null_rds) +save(arrests,file='arrests.RData') + +#===================== +# (2) Structure Data +#===================== + +#get the fields we need for all three: incidents, mni, and "type" +sub.arrests <- subset(arrests, select=c(as.character("rd_no"), as.character('ir2'))) +colnames(sub.arrests) <- c("events", "individuals") + +#============================= +# (3) Prep for making graphs +#============================= + +individuals <- unique(sub.arrests$individuals) + +events <- unique(sub.arrests$events) + +if (any(individuals %in% events)) + + stop('vertex name collision') + +vertices <- data.frame(name=c(events, individuals), + type=c(rep(FALSE, length(events)), + rep(TRUE, length(individuals))), + stringsAsFactors=FALSE) + +#=================================================================== +# (4) Make the GRAPH file +#=================================================================== +g <- graph.data.frame(sub.arrests, vertices=vertices) + + +#=================================================================== +# Sanity check the resulting igraph object +stopifnot(ecount(g) == nrow(sub.arrests)) +stopifnot(vcount(g) == nrow(vertices)) + +names <- V(g)$name +stopifnot(isTRUE(identical(sort(names), sort(vertices$name)))) + +inames <- V(g)[V(g)$type]$name +stopifnot(isTRUE(identical(sort(inames), sort(individuals)))) + +enames <- V(g)[! V(g)$type]$name +stopifnot(isTRUE(identical(sort(enames), sort(events)))) + +#=================================================================== +#now do the converstion into a single network +net1 <- bipartite.projection(g) +person <- net1[[2]] + +#=================================================================== +# (5) Define attributes on nodes +#=================================================================== +# set attributes from arrests file + +attribs <- arrests +match_vector = match(V(person)$name, attribs$ir2) +V(person)$sex <- as.character(attribs$sex_code_cd[match_vector]) +V(person)$race <- as.character(attribs$race_code_cd[match_vector]) +# V(person)$age <- as.character(attribs$age[match_vector]) +V(person)$dob <- as.character(as.Date(attribs$birth_date[match_vector],format='%m/%d/%Y')) + +# January 1, 2006 is Day 1 of the study period +start_date = as.Date("2005-12-31") + +## Get first arrest date in the study period for each person +sub.arrests$dates = as.Date(arrests$arrest_date,format='%m/%d/%Y') +sub.arrests = sub.arrests[order(sub.arrests$dates),] +sub.arrests = sub.arrests[match(unique(sub.arrests$individuals),sub.arrests$individuals),] +arrest.dates = as.Date(sub.arrests$dates,format='%m/%d/%y') +arrest.days = as.numeric(arrest.dates-start_date) +V(person)$arrest.day = arrest.days[match(V(person)$name, sub.arrests$individuals)] + +V(person)$age.arrest = floor(difftime(arrest.dates[match(V(person)$name, sub.arrests$individuals)], + V(person)$dob, + units='days')/365.25) + +#=================================================================== + + +# get victim attributes +shootings <- read.csv("shooting-data-withdate2.csv", header = T) +victims = shootings[shootings$INV_PARTY_TYPE_CD=="VIC",] +victims = victims[!is.na(victims$IR_NO),] +victims$ir2 <- paste("ir", victims$IR_NO) + +# get murder victim attributes +murders = read.csv("murder-victims-13nov.csv", header=T) +murders = murders[!is.na(murders$VICTIM_IR_NO),] +murders = murders[murders$INJURY_DESCR=="SHOT",] +murders = murders[match(unique(murders$VICTIM_IR_NO),murders$VICTIM_IR_NO),] +murders = murders[as.Date(murders$INJURY_DATE,format='%m/%d/%y')>=start_date,] +murders$ir2 = paste("ir", murders$VICTIM_IR_NO) + +# set victim data in network +vtab = as.data.frame(table(victims$ir2)) +match_vector = match(V(person)$name,vtab$Var1) +V(person)$vic.nonfatal = vtab$Freq[match_vector] +V(person)$vic.nonfatal[is.na(V(person)$vic.nonfatal)] = 0 +V(person)$vic.fatal = V(person)$name %in% murders$ir2 +V(person)$vic = V(person)$name %in% union(victims$ir2,murders$ir2) + +# add fatal shooting dates to the network +match_vector = match(V(person)$name, murders$ir2) +fatal_dates = murders$INJURY_DATE[match_vector] +fatal_dates = as.character(as.Date(fatal_dates,format='%m/%d/%y')) +V(person)$fatal_date = fatal_dates + +# add nonfatal shooting dates to the network +match_vector = match(victims$ir2,V(person)$name) +vics = which(V(person)$vic.nonfatal>0) +nfd1 = nfd2 = nfd3 = nfd4 = nfd5 = rep(0,length(vics)) +for(i in 1:length(vics)){ + if (i%%3000==0) print(i) + name = vics[i] + ids = which(match_vector==name) + dates = sort(as.Date(victims$INCIDENT_DATE[ids],format='%m/%d/%y')) + nfd1[i] = as.character(dates[1]) + nfd2[i] = as.character(dates[2]) + nfd3[i] = as.character(dates[3]) + nfd4[i] = as.character(dates[4]) + nfd5[i] = as.character(dates[5]) +} +V(person)$nonfatal_date_1 = NA +V(person)$nonfatal_date_2 = NA +V(person)$nonfatal_date_3 = NA +V(person)$nonfatal_date_4 = NA +V(person)$nonfatal_date_5 = NA +V(person)$nonfatal_date_1[vics] = nfd1 +V(person)$nonfatal_date_2[vics] = nfd2 +V(person)$nonfatal_date_3[vics] = nfd3 +V(person)$nonfatal_date_4[vics] = nfd4 +V(person)$nonfatal_date_5[vics] = nfd5 + +# convert dates into numeric values ("days") +start_date +V(person)$fatal_day = as.numeric(as.Date(V(person)$fatal_date)-start_date) +V(person)$nonfatal_day_1 = as.numeric(as.Date(V(person)$nonfatal_date_1)-start_date) +V(person)$nonfatal_day_2 = as.numeric(as.Date(V(person)$nonfatal_date_2)-start_date) +V(person)$nonfatal_day_3 = as.numeric(as.Date(V(person)$nonfatal_date_3)-start_date) +V(person)$nonfatal_day_4 = as.numeric(as.Date(V(person)$nonfatal_date_4)-start_date) +V(person)$nonfatal_day_5 = as.numeric(as.Date(V(person)$nonfatal_date_5)-start_date) + +#=================================================================== +# set gang attributes +gangs <- read.csv("Sept2014-ganglist.csv", header=T) +gangs = gangs[match(unique(gangs$IR_NO),gangs$IR_NO),] +gangs$ir2 <- paste("ir", gangs$IR_NO) + +V(person)$gang.member <- V(person)$name %in% gangs$ir2 + +match_vector = match(V(person)$name, gangs$ir2) +gnames = gangs$GANG_NAME[match_vector] +gnames = as.character(gnames) +gnames[V(person)$gang.member==''] = 'Unknown' +gnames[V(person)$gang.member==F] = 'None' +V(person)$gang.name <- as.character(gnames) +V(person)$faction.name <- as.character(gangs$FACTION_NAME[match_vector]) + +#=================================================================== +# create id number +V(person)$id = rank(V(person)$name) + +# save data +# person = remove.edge.attribute(person,'weight') +# person_data = get.data.frame(person,'both') +save(person, file="chi-19mar2015.RData") + +#=================================================================== +# get LCC of the network +lcc = induced.subgraph(person,which(clusters(person)$membership==which.max(clusters(person)$csize))) +V(lcc)$ir_no = V(lcc)$name +vic_ids = which(V(lcc)$vic) +lcc_verts = get.data.frame(lcc,'vertices') +lcc_edges = get.data.frame(lcc,'edges') +save(lcc, lcc_verts, lcc_edges, vic_ids, file="lcc.RData") + +lcc_data = get.data.frame(lcc,'both') +lcc = set.vertex.attribute(graph=lcc, name='name', value=V(lcc)$id) +row.names(lcc_data$vertices) = lcc_data$vertices$id +vertices = lcc_data$vertices[c('name','vic','vic.fatal','vic.nonfatal', + 'fatal_day','nonfatal_day_1','nonfatal_day_2', + 'nonfatal_day_3','nonfatal_day_4','nonfatal_day_5')] + +write.csv(vertices,file='lcc_vertices.csv') +write.csv(lcc_data$edges,file='lcc_edges.csv') + +lcc = graph.data.frame(lcc_edges, directed=FALSE, + vertices=lcc_verts[,c('name','vic','vic.fatal','vic.nonfatal')]) +write.graph(lcc,'lcc.gml','gml') diff --git a/R Scripts/find-cascades.R b/R Scripts/find-cascades.R new file mode 100644 index 0000000..ecd5c83 --- /dev/null +++ b/R Scripts/find-cascades.R @@ -0,0 +1,73 @@ +library(igraph) +setwd("~/Documents/Cascade Project/") +load('Results/hyper-lcc.RData') +load('Results/dag_dat_vics.RData') +source('Scripts/temporal.R') +# source('Scripts/homophily.R') +source('Scripts/structural.R') + +vic_ids = which(V(hyp_lcc)$vic==TRUE) + +#### Initialize model parameters +alpha = 1/5 +# gamma = 0.3 +delta = 0.1 +beta = 9.955713e-6 + +##### Get weights +# find max n days where infection possible given alpha +edges = dag_dat_vics +# edges = edges[(edges$t2-edges$t1)<300,] + +p_t = temporal(edges$t1, edges$t2, alpha) +p_s = structural(delta, edges$dist) +p = p_s * p_t +p_tilde = 1 - p_s + p_s * temporal(edges$t1, edges$t2+1, alpha) +weights = p/p_tilde + +##### Find transmission edges +# load('Results/lm-probs.RData') +# names = attr(lm.probs,'names') +# probs = as.numeric(lm.probs) +# betas = probs[match(hyp_lcc_verts$ir_no[edges$to],names)] +# betas = 0.055 +thresh = beta/(1-beta) + +realized = c() +# edges = edges[weights>thresh,] +# weights = weights[weights>thresh] +for (u in vic_ids){ + max_prob = -1 + max_edge = NULL + Ei = which(edges$to==u) + for(i in Ei){ + if(weights[i]>thresh){ + prob = p[i] * prod(p_tilde[setdiff(Ei,i)]) + if(prob>max_prob){ + max_prob = prob + max_edge = i + } + } + } + realized = c(realized, max_edge) +} +# if (length(Ei)>0){ +# max_edge = Ei[which.max(weights[Ei])] # how to deal with ties???? +# realized = c(realized, max_edge) +# } + +##### Create forest of cascades +# dag is all of the victim nodes but with only transmission edges, or don't index by vic_ids +dag = graph.data.frame(edges[realized,1:2], directed=TRUE, vertices=hyp_lcc_verts[vic_ids,]) +dag = set.edge.attribute(dag,'weight',value=weights[realized]) +table(clusters(dag)$csize) + +t=table(clusters(dag)$csize) +sizes = as.numeric(attr(t,'name')) +counts = as.numeric(t) +plot(sizes,counts,log='xy',type='o',lwd=3) +n.seeds = sum(counts) +n.nodes = sum(sizes*counts) # for vics case is just length(vic_ids) +n.infections = n.nodes-n.seeds + +# dag = graph.edgelist(as.matrix(edges[realized,1:2])) diff --git a/R Scripts/generate-dag-dat.R b/R Scripts/generate-dag-dat.R new file mode 100755 index 0000000..660a503 --- /dev/null +++ b/R Scripts/generate-dag-dat.R @@ -0,0 +1,59 @@ +library(igraph) +setwd("~/Documents/Cascade Project/") +load('Results/hyper-lcc.RData') + +vic_ids = which(V(hyp_lcc)$vic==TRUE) + +edgeWeights = function(eis){return(c(hyp_lcc_edges$weight[eis],Inf,Inf)[1:3])} + +dag_dat_all = data.frame(matrix(nrow=1,ncol=8)) +hyp_lcc2 = remove.edge.attribute(hyp_lcc,'weight') +ei = 1 +ptm=proc.time() +for (u in vic_ids){ + if ((which(vic_ids==u) %% 1000)==0) print(which(vic_ids==u)) + tu = hyp_lcc_verts$vic.day[u] + u_spawn = hyp_lcc_verts$spawn.date[u] + nbhd = unlist(neighborhood(hyp_lcc,nodes=u,order=3)) # get nodes within neighborhood + nbhd = nbhd[-1] # don't want to include u in the neighborhood + tvs = hyp_lcc_verts$vic.day[nbhd] + v_spawn = hyp_lcc_verts$spawn.date[nbhd] + nbhd = nbhd[tu>v_spawn & (is.na(tvs) | tu0){ + min_time[i] = min(dag_dat_vics$t2[rows]-dag_dat_vics$t1[rows]) + min_time_dist[i] = dag_dat_vics$dist[rows[which.min(dag_dat_vics$t2[rows]-dag_dat_vics$t1[rows])]] + i = i + 1 + } +} +median(min_time) +mean(min_time<100) +save(min_time_1,min_time_2,min_time_3,file='Results/min_inf_time.RData') diff --git a/R Scripts/likelihood.R b/R Scripts/likelihood.R new file mode 100644 index 0000000..a4e5254 --- /dev/null +++ b/R Scripts/likelihood.R @@ -0,0 +1,30 @@ +# run after find-cascades + +load('Results/dag_dat_all.RData') + +edges_all = dag_dat_all +edges_all$t2[is.na(edges_all$t2)] = max(hyp_lcc_verts$vic.day,na.rm=T)+1 +realized_all = as.numeric(rownames(dag_dat_vics)[realized]) + +p_t = temporal(edges_all$t1, edges_all$t2, alpha) +p_s = structural(delta, edges_all$dist) +p = p_s * p_t +p_tilde = 1 - p_s + p_s * temporal(edges_all$t1, edges_all$t2+1, alpha) + +vic.days = hyp_lcc_verts$vic.day +vic.days[is.na(vic.days)] = max(hyp_lcc_verts$vic.day,na.rm=T)+1 +ages = vic.days-hyp_lcc_verts$spawn.date + +# realized edges +prob_realized = sum(log(p[realized_all])) + +# non-realized edges +prob_not_realized = sum(log(p_tilde[-realized_all])) + +# seeds +prob_beta = n.seeds * log(beta) + +# non-seeds +prob_beta_tilde = sum(ages) * log(1-beta) + +ll = prob_beta + prob_beta_tilde + prob_realized + prob_not_realized diff --git a/R Scripts/lm.R b/R Scripts/lm.R new file mode 100755 index 0000000..f1bbae0 --- /dev/null +++ b/R Scripts/lm.R @@ -0,0 +1,74 @@ +library(pROC) +library(igraph) +setwd('~/Documents/Cascade Project') +load('Raw Data/lcc.RData') +load('Results/lcc_bw.RData') + +is_vic = V(lcc)$vic +vics = which(V(lcc)$vic==T) +nonvics = which(V(lcc)$vic==F) +thresh = length(vics)/length(union(vics,nonvics)) + +nbhdVics = function(neighbs){return(sum(is_vic[neighbs[-1]]))} +#setdiff(is_vic order-(order-1) to get only those at the given order +n1s = neighborhood(lcc,1,V(lcc)) +n2s = neighborhood(lcc,2,V(lcc)) +n1vics = unlist(lapply(n1s,nbhdVics)) +n2vics = unlist(lapply(n2s,nbhdVics)) + +data = data.frame(vic = V(lcc)$vic, + sex = V(lcc)$sex, + race = V(lcc)$race, + age = V(lcc)$age, + gang.member = V(lcc)$gang.member, + gang.name = V(lcc)$gang.name, +# fname = V(lcc)$faction.name, + deg = degree(lcc), + bw = bw, + n1v = n1vics, + n2v = n2vics + ) +data$age = as.numeric(data$age) +data$gname = as.factor(data$gname) + +train = 1:dim(data)[1]; test = 1:dim(data)[1] +# train = sort(union(sample(vics,length(vics)*0.8),sample(nonvics,length(nonvics)*0.8))) +# test = sort((1:length(is_vic))[-train]) + +formula = vic ~ sex + race + age + gmember + gname + deg + bw + n1v + n2v +formula = vic ~ sex + race + age + gmember + gname +formula = vic ~ deg + bw +formula = vic ~ n1v + n2v + +glm.fit = glm(formula, data=data, family=binomial) +glm.probs = predict(glm.fit, newdata=data, type='response') +roc = roc(data$vic[test],glm.probs,ci=TRUE) +plot(roc,print.auc=TRUE) +roc$auc + +lm.fit = lm(formula, data=data) +lm.probs = predict(lm.fit, newdata=data, type='response') +p = 1/(1+exp(-lm.probs)) +z = log(lm.probs/(1-lm.probs)) +logit = 1/(1+exp(-z)) + +save(glm.probs, file='Results/lm-probs.RData') + +########## +lm.pred = lm.probs>thresh +tab = table(lm.pred,data$vic) +tab +(tab[1,1]+tab[2,2])/sum(tab) + +propensity = as.numeric(pred) +similarity = 1 - abs(propensity[u]-propensity[v]) + +############### +# Random forest approach -- not as good +library(randomForest) +data$vic = factor(data$vic) +rf.fit = randomForest(formula,data=data[train,],do.trace=50,ntree=20) +rf.probs = (predict(rf.fit,newdata=data[test,],type='prob'))[,1] +roc = roc(data$vic[test],rf.probs,ci=TRUE) +plot(roc,print.auc=TRUE) + diff --git a/R Scripts/plot-graph.R b/R Scripts/plot-graph.R new file mode 100755 index 0000000..b6f9341 --- /dev/null +++ b/R Scripts/plot-graph.R @@ -0,0 +1,22 @@ +#### plot the victims and non-victims of the network +cols = rep('lightgrey',vcount(g)) #number of nodes in g +cols[V(g)$vic==1] = 'red' +shapes = rep('circle',vcount(g)) +shapes[V(g)$vic==1] = 'square' +plot(g, layout=layout.kamada.kawai, vertex.label=NA, vertex.color=cols, vertex.shape=shapes,vertex.size=2) +legend('topleft', legend=c("non-victim", "victim"), pch=c(16,15), col=c("lightgrey", "red"), inset=0.1) + +#### plot the neighborhood of a node +v = 10 +v1 = unlist(neighborhood(lcc,nodes=v,order=1)) +v2 = unlist(neighborhood(lcc,nodes=v,order=2)) +v3 = unlist(neighborhood(lcc,nodes=v,order=3)) +v3 = setdiff(v3,v2) +v2 = setdiff(v2,v1) +v1 = setdiff(v1,v) +cols = rep('lightgrey',vcount(g)) +cols[v] = 'red' +cols[v1] = 'blue' +cols[v2] = 'green' +cols[v3] = 'black' +plot(g, layout=layout.kamada.kawai, vertex.label=NA, vertex.color=cols,vertex.size=2) diff --git a/R Scripts/sim-analysis.R b/R Scripts/sim-analysis.R new file mode 100755 index 0000000..3960578 --- /dev/null +++ b/R Scripts/sim-analysis.R @@ -0,0 +1,72 @@ +library(igraph) +setwd("~/Documents/Cascade Project/") + +load('Results/hyper-lcc.RData') +vic_ids = which(V(hyp_lcc)$vic==1) +n.infections = length(vic_ids) +n.days = max(hyp_lcc_verts$vic.day,na.rm=T) - min(hyp_lcc_verts$vic.day,na.rm=T) +inf.dates = hyp_lcc_verts$vic.day[vic_ids] + +ptm = proc.time() +n = 1500 +mean.time = matrix(0,3,n) +med.time = matrix(0,3,n) +mean.50 = matrix(0,3,n) +mean.100 = matrix(0,3,n) +n.vicpairs = matrix(0,3,n) +for(sim in 1:3){ + print(paste('sim:',sim)) + for(q in 1:n){ + if (q%%250==0) print(paste('run:',q)) + graph = hyp_lcc + if (sim<3) sim.dates = sample(n.days, n.infections, replace=TRUE) # sims 1 + 2 + if (sim==3) sim.dates = sample(inf.dates) # sim 3 + if (sim==1) vics = sample(vcount(hyp_lcc), n.infections, replace=FALSE) # sim 1 + if (sim>1) vics = vic_ids # sims 2 + 3 + if (sim==0) {vics = vic_ids; sim.dates = inf.dates} # data + + vic.time = c() + for (i in 1:n.infections){ + u = vics[i] + nbhd = unlist(neighborhood(graph, nodes=u, order=1)) + nbhd = intersect(vic_ids,nbhd) + nbhd = setdiff(nbhd,u) + nbhd = nbhd[u