..

author: Ben Green <bgreen@g.harvard.edu> 2015-10-04 17:18:05 -0400
committer: Ben Green <bgreen@g.harvard.edu> 2015-10-04 17:18:05 -0400
commit: 75868035a6e363c2ad67ff0342440aff3972a970 (patch)
tree: 5bf65d9f3d5fa47dda8c1695ce34aa24e0f811dd /R Scripts
parent: 3969a594c49e58aafe04ff352b02d0d61eb228cf (diff)
download: criminal_cascades-75868035a6e363c2ad67ff0342440aff3972a970.tar.gz
8 files changed, 79 insertions, 35 deletions
diff --git a/R Scripts/analyze-cascades.R b/R Scripts/analyze-cascades.R
index 8bbb598..affd77a 100755
--- a/R Scripts/analyze-cascades.R
+++ b/R Scripts/analyze-cascades.R
@@ -17,6 +17,7 @@ plot(sizes,counts,log='xy',pch=20,col='#1f78b4',xaxt='n',yaxt='n',
 axis(1,at=c(1,5,10,50,100,500),cex.axis=0.6)
 axis(2,at=c(1,10,100,1000),cex.axis=0.6)
 
+mean(rep(sizes,counts))
 sizes_pl = displ$new(counts[counts>1])
 est = estimate_xmin(sizes_pl)
 sizes_pl$setXmin(est)
diff --git a/R Scripts/arrest-vs-infection-times.R b/R Scripts/arrest-vs-infection-times.R
index 7c0bfcf..4d2a81d 100644
--- a/R Scripts/arrest-vs-infection-times.R
+++ b/R Scripts/arrest-vs-infection-times.R
@@ -19,11 +19,13 @@ first_arrests = as.numeric(arr$date2) - offset
 lag = first_vics - first_arrests
 hist(lag,100,col='')
 h = hist(lag,150,col='#1f78b4',border=NA,axes=T,
-         xlab='',ylab='Frequency',main=NULL)
+         xlab='Days between first arrest and first gunshot victimization',
+         ylab='Frequency',main=NULL,cex.axis=0.6,cex.lab=0.6)
 box(lwd=1.1)
 
-
-
+dobs = as.Date(arr$birth_date,format='%m/%d/%Y')
+ages = as.numeric(start_date + first_vics - dobs)
+plot(table(round(ages/365)))
 
 
 
diff --git a/R Scripts/data-exploration.R b/R Scripts/data-exploration.R
index f8f6892..22ec60c 100755
--- a/R Scripts/data-exploration.R
+++ b/R Scripts/data-exploration.R
@@ -33,6 +33,17 @@ plot(1:max(degree(lcc)),degree.distribution(lcc)[-1]*vcount(lcc),
      xlab='Degree', ylab='Number of vertices', main='')
 pl = power.law.fit(degree.distribution(lcc))
 
+##### small-worldness of lcc
+trg = transitivity(lcc,type='global')
+trl = mean(transitivity(lcc,type='local',isolates='zero'))
+apl = average.path.length(lcc)
+
+erg = erdos.renyi.game(n=vcount(lcc),p.or.m=ecount(lcc),type='gnm')
+transitivity(erg,type='global')
+mean(transitivity(erg,type='local',isolates='zero'))
+average.path.length(erg)
+
+
 ##### Victims
 vic_ids = which(V(lcc)$vic==TRUE)
 non_vic_ids = which(V(lcc)$vic==FALSE)
@@ -53,3 +64,6 @@ sum(sum(!is.na(lcc_verts$fatal_day)),
     sum(!is.na(lcc_verts$nonfatal_day_3)),
     sum(!is.na(lcc_verts$nonfatal_day_4)),
     sum(!is.na(lcc_verts$nonfatal_day_5)))
+
+
+
diff --git a/R Scripts/data-prep.R b/R Scripts/data-prep.R
index ca2fdc2..a6c31e6 100755
--- a/R Scripts/data-prep.R
+++ b/R Scripts/data-prep.R
@@ -86,15 +86,30 @@ for (i in 1:dim(a)[1]){
   ir = a$ir_no[i]
   arr = arrests[arrests$ir_no==ir,]
   arr = arr[arr$sex_code_cd != 'X',]
-  if(dim(arr)[1]>0){
+  if(dim(arr)[1]>0){# need to match rownames like districts?
     arrests$sex_code_cd[as.numeric(rownames(a[i,]))] = names(which.max(table(arr$sex_code_cd)))
   }
 } 
 arrests$sex_code_cd[arrests$sex_code_cd=='X'] = 'M'
 
+##### residential districts
+arrests$o_district[arrests$o_district=='31'] = ''
+a = arrests[arrests$o_district=='' & arrests$o_city=='CHICAGO',]
+for (i in 1:dim(a)[1]){
+  if(i%%200==0) print(i)
+  ir = a$ir_no[i]
+  arr = arrests[arrests$ir_no==ir,]
+  arr = arr[arr$o_district != '',]
+  if(dim(arr)[1]>0){
+    arrests$o_district[match(as.numeric(rownames(a[i,])),rownames(arrests))] = names(which.max(table(arr$o_district)))
+  }
+} 
+arrests$o_district[arrests$o_district==''] = 0
+# lcc_verts$district = arrests$o_district[match(lcc_verts$ir_no,arrests$ir2)]
+# V(lcc)$district = arrests$o_district[match(lcc_verts$ir_no,arrests$ir2)]
+
 #I need to add the "ir" for this to make sense when I "project"
 arrests$ir2 <- paste("ir", arrests$ir_no)
-
 # save altered arrests data
 save(arrests,file='arrests.RData')
 
@@ -180,7 +195,7 @@ murders = read.csv("murder-victims-13nov.csv", header=T)
 murders = murders[!is.na(murders$VICTIM_IR_NO),]
 murders = murders[murders$INJURY_DESCR=="SHOT",]
 murders = murders[match(unique(murders$VICTIM_IR_NO),murders$VICTIM_IR_NO),]
-murders = murders[as.Date(murders$INJURY_DATE,format='%m/%d/%y')>=start_date,]
+murders = murders[as.Date(murders$INJURY_DATE,format='%m/%d/%y')>start_date,]
 murders$ir2 = paste("ir", murders$VICTIM_IR_NO)
 
 # clear nonfatals that led to death
diff --git a/R Scripts/plot-crime-data.R b/R Scripts/plot-crime-data.R
index 13800cc..7bb768a 100644
--- a/R Scripts/plot-crime-data.R
+++ b/R Scripts/plot-crime-data.R
@@ -15,17 +15,10 @@ plot(data$Year,data$Homicide.Rate,type='l',col='#1f78b4',lwd=2,
 ###################
 # load shootings data
 # for lcc
-load('lcc.RData')
-vic_dates = as.Date(unlist(lcc_verts[,10:15]))
-vic_dates = vic_dates[!is.na(vic_dates)]
-vdh = hist(vic_dates, breaks='months')
-
-plot(vdh$mids,vdh$counts,type='l',col='#1f78b4',lwd=2,
-     xlab='',ylab='Shootings',xaxt='n',cex.lab=.6,cex.axis=.6)
-axis(1,at=vdh$breaks[seq(1,102,12)],
-     lab=2006:2014,cex.axis=.6)
-
-
+# load('lcc.RData')
+# vic_dates = as.Date(unlist(lcc_verts[,10:15]))
+# vic_dates = vic_dates[!is.na(vic_dates)]
+start_date = as.Date("2005-12-31")
 # for all recorded shootings
 shootings <- read.csv("shooting-data-withdate2.csv", header = T)
 victims = shootings[shootings$INV_PARTY_TYPE_CD=="VIC",]
@@ -37,7 +30,7 @@ murders = read.csv("murder-victims-13nov.csv", header=T)
 murders = murders[!is.na(murders$VICTIM_IR_NO),]
 murders = murders[murders$INJURY_DESCR=="SHOT",]
 murders = murders[match(unique(murders$VICTIM_IR_NO),murders$VICTIM_IR_NO),]
-murders = murders[as.Date(murders$INJURY_DATE,format='%m/%d/%y')>=start_date,]
+murders = murders[as.Date(murders$INJURY_DATE,format='%m/%d/%y')>start_date,]
 murders$ir2 = paste("ir", murders$VICTIM_IR_NO)
 
 # clear nonfatals that led to death
@@ -51,4 +44,14 @@ for(i in 1:dim(v)[1]){
 }
 victims = victims[-rows,]
 
-vic_dates = c(as.Date(murders$INJURY_DATE,format='%m/%d/%y'),as.Date(victims$INCIDENT_DATE,format='%m/%d/%y'))
-\ No newline at end of file
+vic_dates = c(as.Date(murders$INJURY_DATE,format='%m/%d/%y'),as.Date(victims$INCIDENT_DATE,format='%m/%d/%y'))
+
+vdh = hist(vic_dates, breaks='months')
+
+plot(vdh$mids[1:99],vdh$counts[1:99],type='l',col='#1f78b4',lwd=2,ylim=c(11,289),
+     xlab='',ylab='Shootings',xaxt='n',cex.lab=.6,cex.axis=.6)
+axis(1,at=vdh$breaks[seq(1,100,12)],
+     lab=2006:2014,cex.axis=.6)
+
+plot(colMeans(matrix(vdh$counts[1:96],ncol=12,byrow=T)))
+
diff --git a/R Scripts/plot-data-prep.R b/R Scripts/plot-data-prep.R
index 27c2416..b91d8de 100644
--- a/R Scripts/plot-data-prep.R
+++ b/R Scripts/plot-data-prep.R
@@ -28,7 +28,8 @@ plot.new()
 vps <- baseViewports()
 pushViewport(vps$figure)
 vp1 <-plotViewport()
-grid.table(data)
+theme = ttheme_default(core=list(bg_params = list(fill = c("grey95"))))
+grid.table(data,theme = theme)
 
 # (B) bipartite person-event graph
 layout = matrix(c(rep(seq(4,0,-1),2),rep(0,5),rep(1,5)),ncol=2)
@@ -36,7 +37,8 @@ labels = c( 1:5, toupper(letters[1:5]) )
 cols = c( rep('#1a9850',5), rep('#1f78b4',5) )
 plot(g, layout=layout[,2:1], vertex.color=cols,edge.color='black',
      vertex.frame.color=NA,vertex.size=30,vertex.label=labels,
-     vertex.label.color='white',vertex.label.family='sans')
+     vertex.label.color='white',vertex.label.family='sans',
+     vertex.label.font=2,vertex.label.cex=1.25)
 
 # (C) unipartite person-person graph
 par(mar=rep(1,4))
@@ -44,13 +46,13 @@ g2 = bipartite.projection(g)$proj1
 layout.g2 = matrix(c(0,.33,-.75,-.33,0,-1,.75,-.33,0,1),ncol=2,byrow=T)
 plot(g2, vertex.color='#1f78b4', edge.color='black',layout=layout.g2,rescale=F,
      vertex.frame.color=NA,vertex.size=30,vertex.label=toupper(letters[1:5]),
-     vertex.label.color='white',vertex.label.family='sans')
+     vertex.label.color='white',vertex.label.family='sans',
+     vertex.label.font=2,vertex.label.cex=1.25)
 
 ##############################################################################
 ##############################################################################
 #### Hawkes Process Diagram
 layout(matrix(c(1,2,3,3,3,3), ncol=2, byrow = TRUE))
-par(mar=c(1,1,1,1))
 
 vics = data.frame(IC = toupper(letters[1:5]), 
                   Victim = c('TRUE','FALSE','FALSE','TRUE','FALSE'),
@@ -62,15 +64,17 @@ plot.new()
 vps <- baseViewports()
 pushViewport(vps$figure)
 vp1 <-plotViewport()
-grid.table(vics)
+grid.table(vics,theme=theme)
 
 cols = rep('#1f78b4',vcount(g2))
 cols[vics$Victim==T] = '#e41a1c'
 
 # B
+# par(mar=c(1,1,1,1))
 plot(g2, vertex.color=cols, edge.color='black',layout=layout.g2,rescale=F,
      vertex.frame.color=NA,vertex.size=30,vertex.label=toupper(letters[1:5]),
-     vertex.label.color='white',vertex.label.family='sans',axes=F)
+     vertex.label.color='white',vertex.label.family='sans',axes=F,
+     vertex.label.font=2,vertex.label.cex=1.25)
 
 
 # C 
diff --git a/R Scripts/predict-victims.R b/R Scripts/predict-victims.R
index d17ef83..3cffcbe 100644
--- a/R Scripts/predict-victims.R
+++ b/R Scripts/predict-victims.R
@@ -22,7 +22,8 @@ edges_all = data.table(dag_dat_lcc)
 vic_ids = which(lcc_verts$vic==T)
 lcc_verts = lcc_verts[,c('id','ir_no','nonfatal_day_1','nonfatal_day_2',
                          'nonfatal_day_3','nonfatal_day_4','nonfatal_day_5',
-                         'fatal_day','sex','race','dob','gang.member','gang.name')]
+                         'fatal_day','sex','race','dob','gang.member',
+                         'gang.name','district')]
 lcc_verts$vic.day = NA
 lcc_verts$vic.day[vic_ids] = as.numeric(apply(vic_times_lcc[vic_ids,2:7],1,min,na.rm=T))
 lcc_verts$age = FALSE
@@ -35,10 +36,11 @@ vic_times_lcc$ir_no = lcc_verts$ir_no
 rownames(vic_times_lcc) = vic_times_lcc$ir_no
 
 ##### Initialize data
-formula = vic ~ sex + race + age + gang.member + gang.name + arrests
+formula = vic ~ sex + race + age + gang.member + gang.name + arrests + district
 lcc_verts$sex = as.factor(lcc_verts$sex)
 lcc_verts$race = as.factor(lcc_verts$race)
 lcc_verts$gang.name = as.factor(lcc_verts$gang.name)
+lcc_verts$district = as.factor(lcc_verts$district)
 dt = data.table(ir=lcc_verts$ir_no, dem=0, cas=0, comb=0)
 
 #####
@@ -48,7 +50,7 @@ days = Reduce(union, list(lcc_verts$fatal_day,lcc_verts$nonfatal_day_1,
 days = days[!is.na(days)]
 days = sort(days)
 days = split(days, ceiling(seq_along(days)/92))
-lambdas = c(0, exp(seq(log(0.0001), log(0.01), length.out=100)), 1)
+lambdas = c(0, exp(seq(log(0.00001), log(0.01), length.out=100)), 1)
 
 ##### Loop through days
 correct_rank = c()
diff --git a/R Scripts/sim-analysis.R b/R Scripts/sim-analysis.R
index 51659f5..39fd476 100755
--- a/R Scripts/sim-analysis.R
+++ b/R Scripts/sim-analysis.R
@@ -11,6 +11,7 @@ idxs = which(!is.na(vic_days)) %% vcount(lcc)
 vic_days = vic_days[!is.na(vic_days)]
 n.infected = sum(lcc_verts$vic)
 birthyears = as.numeric(format(as.Date(lcc_verts$dob[idxs]),'%Y'))
+districts = as.numeric(lcc_verts$district[idxs])
 
 # find all pairs of vic neighbors to check
 ## nbrs = neighborhood(lcc,order=1,nodes=vic_ids); save(nbrs,file='Raw Data/vic-nbrs.RData')
@@ -33,16 +34,17 @@ n = 10000
 ptm = proc.time()
 simdata = foreach(q=1:n, .combine=rbind) %dopar% {  
   # shuffle infection dates
-  sim.dates = rep(0,length(vic_days))
+  sim.dates = vic_days # data
   for(y in unique(birthyears)){
-    year_ids = which(birthyears==y)
-    if (length(year_ids)>1) sim.dates[year_ids] = sample(vic_days[year_ids])
+    for(d in unique(districts)){
+      ids = which(birthyears==y & districts==d)
+      if (length(ids)>1) sim.dates[ids] = sample(vic_days[ids])
+    }
   }
-#   sim.dates = vic_days # data
   
   vic.dt = rep(0,dim(vic.nbrs)[1])
   for(i in 1:dim(vic.nbrs)[1]){
-    mindt = 5000
+    mindt = 50000
     tus = sim.dates[which(vic.nbrs[i,1]==idxs)]
     tvs = sim.dates[which(vic.nbrs[i,2]==idxs)]
     for(tu in tus){
@@ -73,16 +75,17 @@ xlabs = c('Mean days between infections',
           'Infections within 30 days',
           'Infections within 100 days')
 xlims = matrix(c(660,765,475,625,0.03,0.08,0.1,0.18),ncol=2,byrow=T)
+yl = c(0,0.11*n)
 par(mfrow=c(2,2))
 for(i in 1:4){
   data = d[i] 
   sdata = simdata[,i]
   xl = xlims[i,]
-  h = hist(sdata,50,xlim=xl,col='#1f78b4',freq=T,border=NA,axes=F,
+  h = hist(sdata,50,xlim=xl,ylim=yl,col='#1f78b4',freq=T,border=NA,axes=F,
        xlab=xlabs[i],ylab='Relative frequency',main=NULL)
   if(i %in% c(1,2)) axis(1,at=pretty(xl,5))
   if(i %in% c(3,4)) axis(1,at=pretty(xl,5),lab=paste(pretty(xl,5)*100,'%',sep=''))
-  axis(2,at=pretty(h$counts,3),lab=pretty(h$counts/n,3))
+  axis(2,at=c(0,0.05,0.1)*n,lab=c(0,0.05,0.1))
   abline(v=data,lwd=4,col='#e41a1c')
   box(lwd=1.1)
 }
author	Ben Green <bgreen@g.harvard.edu>	2015-10-04 17:18:05 -0400
committer	Ben Green <bgreen@g.harvard.edu>	2015-10-04 17:18:05 -0400
commit	75868035a6e363c2ad67ff0342440aff3972a970 (patch)
tree	5bf65d9f3d5fa47dda8c1695ce34aa24e0f811dd /R Scripts
parent	3969a594c49e58aafe04ff352b02d0d61eb228cf (diff)
download	criminal_cascades-75868035a6e363c2ad67ff0342440aff3972a970.tar.gz