summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBen Green <bgreen@g.harvard.edu>2015-10-04 17:18:05 -0400
committerBen Green <bgreen@g.harvard.edu>2015-10-04 17:18:05 -0400
commit75868035a6e363c2ad67ff0342440aff3972a970 (patch)
tree5bf65d9f3d5fa47dda8c1695ce34aa24e0f811dd
parent3969a594c49e58aafe04ff352b02d0d61eb228cf (diff)
downloadcriminal_cascades-75868035a6e363c2ad67ff0342440aff3972a970.tar.gz
..
-rwxr-xr-xR Scripts/analyze-cascades.R1
-rw-r--r--R Scripts/arrest-vs-infection-times.R8
-rwxr-xr-xR Scripts/data-exploration.R14
-rwxr-xr-xR Scripts/data-prep.R21
-rw-r--r--R Scripts/plot-crime-data.R29
-rw-r--r--R Scripts/plot-data-prep.R16
-rw-r--r--R Scripts/predict-victims.R8
-rwxr-xr-xR Scripts/sim-analysis.R17
-rw-r--r--supplements/hawkes-model-sm.docxbin0 -> 1068297 bytes
9 files changed, 79 insertions, 35 deletions
diff --git a/R Scripts/analyze-cascades.R b/R Scripts/analyze-cascades.R
index 8bbb598..affd77a 100755
--- a/R Scripts/analyze-cascades.R
+++ b/R Scripts/analyze-cascades.R
@@ -17,6 +17,7 @@ plot(sizes,counts,log='xy',pch=20,col='#1f78b4',xaxt='n',yaxt='n',
axis(1,at=c(1,5,10,50,100,500),cex.axis=0.6)
axis(2,at=c(1,10,100,1000),cex.axis=0.6)
+mean(rep(sizes,counts))
sizes_pl = displ$new(counts[counts>1])
est = estimate_xmin(sizes_pl)
sizes_pl$setXmin(est)
diff --git a/R Scripts/arrest-vs-infection-times.R b/R Scripts/arrest-vs-infection-times.R
index 7c0bfcf..4d2a81d 100644
--- a/R Scripts/arrest-vs-infection-times.R
+++ b/R Scripts/arrest-vs-infection-times.R
@@ -19,11 +19,13 @@ first_arrests = as.numeric(arr$date2) - offset
lag = first_vics - first_arrests
hist(lag,100,col='')
h = hist(lag,150,col='#1f78b4',border=NA,axes=T,
- xlab='',ylab='Frequency',main=NULL)
+ xlab='Days between first arrest and first gunshot victimization',
+ ylab='Frequency',main=NULL,cex.axis=0.6,cex.lab=0.6)
box(lwd=1.1)
-
-
+dobs = as.Date(arr$birth_date,format='%m/%d/%Y')
+ages = as.numeric(start_date + first_vics - dobs)
+plot(table(round(ages/365)))
diff --git a/R Scripts/data-exploration.R b/R Scripts/data-exploration.R
index f8f6892..22ec60c 100755
--- a/R Scripts/data-exploration.R
+++ b/R Scripts/data-exploration.R
@@ -33,6 +33,17 @@ plot(1:max(degree(lcc)),degree.distribution(lcc)[-1]*vcount(lcc),
xlab='Degree', ylab='Number of vertices', main='')
pl = power.law.fit(degree.distribution(lcc))
+##### small-worldness of lcc
+trg = transitivity(lcc,type='global')
+trl = mean(transitivity(lcc,type='local',isolates='zero'))
+apl = average.path.length(lcc)
+
+erg = erdos.renyi.game(n=vcount(lcc),p.or.m=ecount(lcc),type='gnm')
+transitivity(erg,type='global')
+mean(transitivity(erg,type='local',isolates='zero'))
+average.path.length(erg)
+
+
##### Victims
vic_ids = which(V(lcc)$vic==TRUE)
non_vic_ids = which(V(lcc)$vic==FALSE)
@@ -53,3 +64,6 @@ sum(sum(!is.na(lcc_verts$fatal_day)),
sum(!is.na(lcc_verts$nonfatal_day_3)),
sum(!is.na(lcc_verts$nonfatal_day_4)),
sum(!is.na(lcc_verts$nonfatal_day_5)))
+
+
+
diff --git a/R Scripts/data-prep.R b/R Scripts/data-prep.R
index ca2fdc2..a6c31e6 100755
--- a/R Scripts/data-prep.R
+++ b/R Scripts/data-prep.R
@@ -86,15 +86,30 @@ for (i in 1:dim(a)[1]){
ir = a$ir_no[i]
arr = arrests[arrests$ir_no==ir,]
arr = arr[arr$sex_code_cd != 'X',]
- if(dim(arr)[1]>0){
+ if(dim(arr)[1]>0){# need to match rownames like districts?
arrests$sex_code_cd[as.numeric(rownames(a[i,]))] = names(which.max(table(arr$sex_code_cd)))
}
}
arrests$sex_code_cd[arrests$sex_code_cd=='X'] = 'M'
+##### residential districts
+arrests$o_district[arrests$o_district=='31'] = ''
+a = arrests[arrests$o_district=='' & arrests$o_city=='CHICAGO',]
+for (i in 1:dim(a)[1]){
+ if(i%%200==0) print(i)
+ ir = a$ir_no[i]
+ arr = arrests[arrests$ir_no==ir,]
+ arr = arr[arr$o_district != '',]
+ if(dim(arr)[1]>0){
+ arrests$o_district[match(as.numeric(rownames(a[i,])),rownames(arrests))] = names(which.max(table(arr$o_district)))
+ }
+}
+arrests$o_district[arrests$o_district==''] = 0
+# lcc_verts$district = arrests$o_district[match(lcc_verts$ir_no,arrests$ir2)]
+# V(lcc)$district = arrests$o_district[match(lcc_verts$ir_no,arrests$ir2)]
+
#I need to add the "ir" for this to make sense when I "project"
arrests$ir2 <- paste("ir", arrests$ir_no)
-
# save altered arrests data
save(arrests,file='arrests.RData')
@@ -180,7 +195,7 @@ murders = read.csv("murder-victims-13nov.csv", header=T)
murders = murders[!is.na(murders$VICTIM_IR_NO),]
murders = murders[murders$INJURY_DESCR=="SHOT",]
murders = murders[match(unique(murders$VICTIM_IR_NO),murders$VICTIM_IR_NO),]
-murders = murders[as.Date(murders$INJURY_DATE,format='%m/%d/%y')>=start_date,]
+murders = murders[as.Date(murders$INJURY_DATE,format='%m/%d/%y')>start_date,]
murders$ir2 = paste("ir", murders$VICTIM_IR_NO)
# clear nonfatals that led to death
diff --git a/R Scripts/plot-crime-data.R b/R Scripts/plot-crime-data.R
index 13800cc..7bb768a 100644
--- a/R Scripts/plot-crime-data.R
+++ b/R Scripts/plot-crime-data.R
@@ -15,17 +15,10 @@ plot(data$Year,data$Homicide.Rate,type='l',col='#1f78b4',lwd=2,
###################
# load shootings data
# for lcc
-load('lcc.RData')
-vic_dates = as.Date(unlist(lcc_verts[,10:15]))
-vic_dates = vic_dates[!is.na(vic_dates)]
-vdh = hist(vic_dates, breaks='months')
-
-plot(vdh$mids,vdh$counts,type='l',col='#1f78b4',lwd=2,
- xlab='',ylab='Shootings',xaxt='n',cex.lab=.6,cex.axis=.6)
-axis(1,at=vdh$breaks[seq(1,102,12)],
- lab=2006:2014,cex.axis=.6)
-
-
+# load('lcc.RData')
+# vic_dates = as.Date(unlist(lcc_verts[,10:15]))
+# vic_dates = vic_dates[!is.na(vic_dates)]
+start_date = as.Date("2005-12-31")
# for all recorded shootings
shootings <- read.csv("shooting-data-withdate2.csv", header = T)
victims = shootings[shootings$INV_PARTY_TYPE_CD=="VIC",]
@@ -37,7 +30,7 @@ murders = read.csv("murder-victims-13nov.csv", header=T)
murders = murders[!is.na(murders$VICTIM_IR_NO),]
murders = murders[murders$INJURY_DESCR=="SHOT",]
murders = murders[match(unique(murders$VICTIM_IR_NO),murders$VICTIM_IR_NO),]
-murders = murders[as.Date(murders$INJURY_DATE,format='%m/%d/%y')>=start_date,]
+murders = murders[as.Date(murders$INJURY_DATE,format='%m/%d/%y')>start_date,]
murders$ir2 = paste("ir", murders$VICTIM_IR_NO)
# clear nonfatals that led to death
@@ -51,4 +44,14 @@ for(i in 1:dim(v)[1]){
}
victims = victims[-rows,]
-vic_dates = c(as.Date(murders$INJURY_DATE,format='%m/%d/%y'),as.Date(victims$INCIDENT_DATE,format='%m/%d/%y')) \ No newline at end of file
+vic_dates = c(as.Date(murders$INJURY_DATE,format='%m/%d/%y'),as.Date(victims$INCIDENT_DATE,format='%m/%d/%y'))
+
+vdh = hist(vic_dates, breaks='months')
+
+plot(vdh$mids[1:99],vdh$counts[1:99],type='l',col='#1f78b4',lwd=2,ylim=c(11,289),
+ xlab='',ylab='Shootings',xaxt='n',cex.lab=.6,cex.axis=.6)
+axis(1,at=vdh$breaks[seq(1,100,12)],
+ lab=2006:2014,cex.axis=.6)
+
+plot(colMeans(matrix(vdh$counts[1:96],ncol=12,byrow=T)))
+
diff --git a/R Scripts/plot-data-prep.R b/R Scripts/plot-data-prep.R
index 27c2416..b91d8de 100644
--- a/R Scripts/plot-data-prep.R
+++ b/R Scripts/plot-data-prep.R
@@ -28,7 +28,8 @@ plot.new()
vps <- baseViewports()
pushViewport(vps$figure)
vp1 <-plotViewport()
-grid.table(data)
+theme = ttheme_default(core=list(bg_params = list(fill = c("grey95"))))
+grid.table(data,theme = theme)
# (B) bipartite person-event graph
layout = matrix(c(rep(seq(4,0,-1),2),rep(0,5),rep(1,5)),ncol=2)
@@ -36,7 +37,8 @@ labels = c( 1:5, toupper(letters[1:5]) )
cols = c( rep('#1a9850',5), rep('#1f78b4',5) )
plot(g, layout=layout[,2:1], vertex.color=cols,edge.color='black',
vertex.frame.color=NA,vertex.size=30,vertex.label=labels,
- vertex.label.color='white',vertex.label.family='sans')
+ vertex.label.color='white',vertex.label.family='sans',
+ vertex.label.font=2,vertex.label.cex=1.25)
# (C) unipartite person-person graph
par(mar=rep(1,4))
@@ -44,13 +46,13 @@ g2 = bipartite.projection(g)$proj1
layout.g2 = matrix(c(0,.33,-.75,-.33,0,-1,.75,-.33,0,1),ncol=2,byrow=T)
plot(g2, vertex.color='#1f78b4', edge.color='black',layout=layout.g2,rescale=F,
vertex.frame.color=NA,vertex.size=30,vertex.label=toupper(letters[1:5]),
- vertex.label.color='white',vertex.label.family='sans')
+ vertex.label.color='white',vertex.label.family='sans',
+ vertex.label.font=2,vertex.label.cex=1.25)
##############################################################################
##############################################################################
#### Hawkes Process Diagram
layout(matrix(c(1,2,3,3,3,3), ncol=2, byrow = TRUE))
-par(mar=c(1,1,1,1))
vics = data.frame(IC = toupper(letters[1:5]),
Victim = c('TRUE','FALSE','FALSE','TRUE','FALSE'),
@@ -62,15 +64,17 @@ plot.new()
vps <- baseViewports()
pushViewport(vps$figure)
vp1 <-plotViewport()
-grid.table(vics)
+grid.table(vics,theme=theme)
cols = rep('#1f78b4',vcount(g2))
cols[vics$Victim==T] = '#e41a1c'
# B
+# par(mar=c(1,1,1,1))
plot(g2, vertex.color=cols, edge.color='black',layout=layout.g2,rescale=F,
vertex.frame.color=NA,vertex.size=30,vertex.label=toupper(letters[1:5]),
- vertex.label.color='white',vertex.label.family='sans',axes=F)
+ vertex.label.color='white',vertex.label.family='sans',axes=F,
+ vertex.label.font=2,vertex.label.cex=1.25)
# C
diff --git a/R Scripts/predict-victims.R b/R Scripts/predict-victims.R
index d17ef83..3cffcbe 100644
--- a/R Scripts/predict-victims.R
+++ b/R Scripts/predict-victims.R
@@ -22,7 +22,8 @@ edges_all = data.table(dag_dat_lcc)
vic_ids = which(lcc_verts$vic==T)
lcc_verts = lcc_verts[,c('id','ir_no','nonfatal_day_1','nonfatal_day_2',
'nonfatal_day_3','nonfatal_day_4','nonfatal_day_5',
- 'fatal_day','sex','race','dob','gang.member','gang.name')]
+ 'fatal_day','sex','race','dob','gang.member',
+ 'gang.name','district')]
lcc_verts$vic.day = NA
lcc_verts$vic.day[vic_ids] = as.numeric(apply(vic_times_lcc[vic_ids,2:7],1,min,na.rm=T))
lcc_verts$age = FALSE
@@ -35,10 +36,11 @@ vic_times_lcc$ir_no = lcc_verts$ir_no
rownames(vic_times_lcc) = vic_times_lcc$ir_no
##### Initialize data
-formula = vic ~ sex + race + age + gang.member + gang.name + arrests
+formula = vic ~ sex + race + age + gang.member + gang.name + arrests + district
lcc_verts$sex = as.factor(lcc_verts$sex)
lcc_verts$race = as.factor(lcc_verts$race)
lcc_verts$gang.name = as.factor(lcc_verts$gang.name)
+lcc_verts$district = as.factor(lcc_verts$district)
dt = data.table(ir=lcc_verts$ir_no, dem=0, cas=0, comb=0)
#####
@@ -48,7 +50,7 @@ days = Reduce(union, list(lcc_verts$fatal_day,lcc_verts$nonfatal_day_1,
days = days[!is.na(days)]
days = sort(days)
days = split(days, ceiling(seq_along(days)/92))
-lambdas = c(0, exp(seq(log(0.0001), log(0.01), length.out=100)), 1)
+lambdas = c(0, exp(seq(log(0.00001), log(0.01), length.out=100)), 1)
##### Loop through days
correct_rank = c()
diff --git a/R Scripts/sim-analysis.R b/R Scripts/sim-analysis.R
index 51659f5..39fd476 100755
--- a/R Scripts/sim-analysis.R
+++ b/R Scripts/sim-analysis.R
@@ -11,6 +11,7 @@ idxs = which(!is.na(vic_days)) %% vcount(lcc)
vic_days = vic_days[!is.na(vic_days)]
n.infected = sum(lcc_verts$vic)
birthyears = as.numeric(format(as.Date(lcc_verts$dob[idxs]),'%Y'))
+districts = as.numeric(lcc_verts$district[idxs])
# find all pairs of vic neighbors to check
## nbrs = neighborhood(lcc,order=1,nodes=vic_ids); save(nbrs,file='Raw Data/vic-nbrs.RData')
@@ -33,16 +34,17 @@ n = 10000
ptm = proc.time()
simdata = foreach(q=1:n, .combine=rbind) %dopar% {
# shuffle infection dates
- sim.dates = rep(0,length(vic_days))
+ sim.dates = vic_days # data
for(y in unique(birthyears)){
- year_ids = which(birthyears==y)
- if (length(year_ids)>1) sim.dates[year_ids] = sample(vic_days[year_ids])
+ for(d in unique(districts)){
+ ids = which(birthyears==y & districts==d)
+ if (length(ids)>1) sim.dates[ids] = sample(vic_days[ids])
+ }
}
-# sim.dates = vic_days # data
vic.dt = rep(0,dim(vic.nbrs)[1])
for(i in 1:dim(vic.nbrs)[1]){
- mindt = 5000
+ mindt = 50000
tus = sim.dates[which(vic.nbrs[i,1]==idxs)]
tvs = sim.dates[which(vic.nbrs[i,2]==idxs)]
for(tu in tus){
@@ -73,16 +75,17 @@ xlabs = c('Mean days between infections',
'Infections within 30 days',
'Infections within 100 days')
xlims = matrix(c(660,765,475,625,0.03,0.08,0.1,0.18),ncol=2,byrow=T)
+yl = c(0,0.11*n)
par(mfrow=c(2,2))
for(i in 1:4){
data = d[i]
sdata = simdata[,i]
xl = xlims[i,]
- h = hist(sdata,50,xlim=xl,col='#1f78b4',freq=T,border=NA,axes=F,
+ h = hist(sdata,50,xlim=xl,ylim=yl,col='#1f78b4',freq=T,border=NA,axes=F,
xlab=xlabs[i],ylab='Relative frequency',main=NULL)
if(i %in% c(1,2)) axis(1,at=pretty(xl,5))
if(i %in% c(3,4)) axis(1,at=pretty(xl,5),lab=paste(pretty(xl,5)*100,'%',sep=''))
- axis(2,at=pretty(h$counts,3),lab=pretty(h$counts/n,3))
+ axis(2,at=c(0,0.05,0.1)*n,lab=c(0,0.05,0.1))
abline(v=data,lwd=4,col='#e41a1c')
box(lwd=1.1)
}
diff --git a/supplements/hawkes-model-sm.docx b/supplements/hawkes-model-sm.docx
new file mode 100644
index 0000000..be2257e
--- /dev/null
+++ b/supplements/hawkes-model-sm.docx
Binary files differ