summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBen Green <bgreen@g.harvard.edu>2015-09-14 23:19:34 -0400
committerBen Green <bgreen@g.harvard.edu>2015-09-14 23:19:37 -0400
commit58faa01748fe0e6f6d040d1296266d17bd7a3543 (patch)
treeb1a2bf0709ec3d4c252d90c4dba8e42b3057c91b
parentab0b1f3cefedb35327a19ec1b6afd560bfdf802d (diff)
downloadcriminal_cascades-58faa01748fe0e6f6d040d1296266d17bd7a3543.tar.gz
prediction and plotting cascades
-rwxr-xr-xR Scripts/analyze-cascades.R7
-rwxr-xr-xR Scripts/data-prep.R56
-rw-r--r--R Scripts/predict-victims-plots.R35
-rw-r--r--R Scripts/predict-victims.R8
-rw-r--r--R Scripts/prior-arrests-by-day.R2
5 files changed, 72 insertions, 36 deletions
diff --git a/R Scripts/analyze-cascades.R b/R Scripts/analyze-cascades.R
index 463de18..e758d60 100755
--- a/R Scripts/analyze-cascades.R
+++ b/R Scripts/analyze-cascades.R
@@ -21,7 +21,7 @@ order = rev(order(csize))
plot(sizes,counts,log='xy',type='o',lwd=3,
xlab='Size of Cascade', ylab='Number of Cascades', main='Distribution of Cascade Sizes')
-i = 14
+i = 4
V = which(clusters(dag)$membership==order[i]) # get all nodes in cluster
cc = induced.subgraph(dag,V)
Vi = vic_ids[V]
@@ -35,8 +35,9 @@ cols[seed] = 'red'
plot(cc,vertex.size=10,edge.arrow.size=0.5,vertex.color=cols,vertex.label.cex=1,
edge.width=E(cc)$weight*20/max(E(cc)$weight),layout=layout.reingold.tilford(cc,root=seed),
vertex.label=V(hyp_lcc)$vic.day[Vi])
-# vertex.label=as.Date(V(hyp_lcc)$vic.day[Vi], format ='%m/%d/%y'))
-# vertex.label=V(lcc)$vic_date[Vi])
+plot(cc,vertex.size=10,edge.arrow.size=0.5,vertex.color=cols,vertex.label.cex=1,
+ layout=layout.reingold.tilford(cc,root=seed))
+
### basic graph statistics
trl = mean(transitivity(cc,type='local',isolates='zero'))
diff --git a/R Scripts/data-prep.R b/R Scripts/data-prep.R
index 127acca..ca2fdc2 100755
--- a/R Scripts/data-prep.R
+++ b/R Scripts/data-prep.R
@@ -80,6 +80,18 @@ for (i in 1:dim(a)[1]){
}
}
+# clean up entries where sex is missing
+a = arrests[arrests$sex_code_cd=='X',]
+for (i in 1:dim(a)[1]){
+ ir = a$ir_no[i]
+ arr = arrests[arrests$ir_no==ir,]
+ arr = arr[arr$sex_code_cd != 'X',]
+ if(dim(arr)[1]>0){
+ arrests$sex_code_cd[as.numeric(rownames(a[i,]))] = names(which.max(table(arr$sex_code_cd)))
+ }
+}
+arrests$sex_code_cd[arrests$sex_code_cd=='X'] = 'M'
+
#I need to add the "ir" for this to make sense when I "project"
arrests$ir2 <- paste("ir", arrests$ir_no)
@@ -171,6 +183,17 @@ murders = murders[match(unique(murders$VICTIM_IR_NO),murders$VICTIM_IR_NO),]
murders = murders[as.Date(murders$INJURY_DATE,format='%m/%d/%y')>=start_date,]
murders$ir2 = paste("ir", murders$VICTIM_IR_NO)
+# clear nonfatals that led to death
+v = victims[victims$IR_NO %in% murders$VICTIM_IR_NO,]
+rows = c()
+for(i in 1:dim(v)[1]){
+ row = which(rownames(victims)==as.numeric(rownames(v[i,])))
+ m = murders[murders$VICTIM_IR_NO==v$IR_NO[i],]
+ dup = as.Date(v$INCIDENT_DATE[i],format='%m/%d/%y') %in% as.Date(m$INJURY_DATE,format='%m/%d/%y')
+ if(dup==T) rows = c(rows,row)
+}
+victims = victims[-rows,]
+
# set victim data in network
vtab = as.data.frame(table(victims$ir2))
match_vector = match(V(person)$name,vtab$Var1)
@@ -193,7 +216,8 @@ for(i in 1:length(vics)){
if (i%%3000==0) print(i)
name = vics[i]
ids = which(match_vector==name)
- dates = sort(as.Date(victims$INCIDENT_DATE[ids],format='%m/%d/%y'))
+ dates = unique(sort(as.Date(victims$INCIDENT_DATE[ids],format='%m/%d/%y')))
+# if(!is.na(V(person)$fatal_date[i])) dates = dates[dates != V(person)$fatal_date[ids]]
nfd1[i] = as.character(dates[1])
nfd2[i] = as.character(dates[2])
nfd3[i] = as.character(dates[3])
@@ -211,6 +235,7 @@ V(person)$nonfatal_date_3[vics] = nfd3
V(person)$nonfatal_date_4[vics] = nfd4
V(person)$nonfatal_date_5[vics] = nfd5
+
# convert dates into numeric values ("days")
start_date
V(person)$fatal_day = as.numeric(as.Date(V(person)$fatal_date)-start_date)
@@ -223,25 +248,40 @@ V(person)$nonfatal_day_5 = as.numeric(as.Date(V(person)$nonfatal_date_5)-start_d
#===================================================================
# set gang attributes
gangs <- read.csv("Sept2014-ganglist.csv", header=T)
-gangs = gangs[match(unique(gangs$IR_NO),gangs$IR_NO),]
gangs$ir2 <- paste("ir", gangs$IR_NO)
+t = table(gangs$IR_NO)
+t = t[t>1]
+irs = as.numeric(attr(t,'name'))
+for(ir in irs){
+ if(which(ir==irs)%%1000==0)print(which(ir==irs))
+ g = gangs[gangs$IR_NO==ir,]
+ gangs$GANG_NAME[as.numeric(rownames(g))] = names(which.max(table(g$GANG_NAME)))
+}
+
+gangs = gangs[match(unique(gangs$IR_NO),gangs$IR_NO),]
+gnames = as.character(gangs$GANG_NAME)
+gnames[is.na(gnames)] = 'Unknown'
+
V(person)$gang.member <- V(person)$name %in% gangs$ir2
match_vector = match(V(person)$name, gangs$ir2)
-gnames = gangs$GANG_NAME[match_vector]
-gnames = as.character(gnames)
-gnames[V(person)$gang.member==''] = 'Unknown'
+gnames = gnames[match_vector]
gnames[V(person)$gang.member==F] = 'None'
V(person)$gang.name <- as.character(gnames)
-V(person)$faction.name <- as.character(gangs$FACTION_NAME[match_vector])
+# V(person)$faction.name <- as.character(gangs$FACTION_NAME[match_vector])
+
+# clean up later to make this fit with process
+t = table(V(person)$gang.name)
+gs = names(t)[t<50]
+V(person)$gang.name[V(person)$gang.name %in% gs] = 'Unknown'
#===================================================================
# save data
person = remove.edge.attribute(person,'weight')
# person_data = get.data.frame(person,'both')
-save(person, file="chi-9sep2015.RData")
+save(person, file="chi-14sep2015.RData")
#===================================================================
# get LCC of the network
@@ -254,7 +294,7 @@ lcc_edges = as_data_frame(lcc,'edges')
# update lcc_verts
lcc_verts = get.data.frame(lcc,'vertices')
-lcc_verts = lcc_verts[,c(1,23,24,2:22)]
+lcc_verts = lcc_verts[,c(1,23,24,2:21)]
# save file
save(lcc, lcc_verts, lcc_edges, vic_ids, file="lcc.RData")
diff --git a/R Scripts/predict-victims-plots.R b/R Scripts/predict-victims-plots.R
index 0655d28..dee3cdc 100644
--- a/R Scripts/predict-victims-plots.R
+++ b/R Scripts/predict-victims-plots.R
@@ -1,27 +1,32 @@
##### Plot results
+# load('Results/correct_rank_91415.RData')
+nvics = dim(correct_rank)[1]
correct_rank1 = correct_rank[,length(lambdas)] # demographics model
correct_rank2 = correct_rank[,1] # cascade model
correct_rank3 = correct_rank[,which.min(colMeans(correct_rank))] # best combined model
-counts = matrix(c( sum(correct_rank1<(vcount(lcc)*0.0005)),
- sum(correct_rank1<(vcount(lcc)*0.001)),
- sum(correct_rank1<(vcount(lcc)*0.01)),
- sum(correct_rank2<(vcount(lcc)*0.0005)),
- sum(correct_rank2<(vcount(lcc)*0.001)),
- sum(correct_rank2<(vcount(lcc)*0.01)),
- sum(correct_rank3<(vcount(lcc)*0.0005)),
- sum(correct_rank3<(vcount(lcc)*0.001)),
- sum(correct_rank3<(vcount(lcc)*0.01))),
+popsizes = c(0.1,0.5,1.0)/100
+vcount(lcc)*popsizes
+counts = matrix(c( sum(correct_rank1<(vcount(lcc)*popsizes[1])),
+ sum(correct_rank1<(vcount(lcc)*popsizes[2])),
+ sum(correct_rank1<(vcount(lcc)*popsizes[3])),
+ sum(correct_rank2<(vcount(lcc)*popsizes[1])),
+ sum(correct_rank2<(vcount(lcc)*popsizes[2])),
+ sum(correct_rank2<(vcount(lcc)*popsizes[3])),
+ sum(correct_rank3<(vcount(lcc)*popsizes[1])),
+ sum(correct_rank3<(vcount(lcc)*popsizes[2])),
+ sum(correct_rank3<(vcount(lcc)*popsizes[3]))),
nrow=3, byrow=T)
counts = counts*100/nvics
barplot(counts,
xlab="Size of High-Risk Population",
ylab="Percent of Victims Predicted",
- names.arg=c('0.1%','0.5%','1%'),ylim=c(0,max(counts)*1.1),
+ names.arg=paste(as.character(popsizes*100),'%',sep=''),
+ ylim=c(0,max(counts)*1.1),
col=c(rgb(0,0,1,1/2),rgb(1,0,0,1/2),rgb(0,1,0,1/2)),
beside=TRUE)
legend("topleft", inset=0.05,
- c("Demographics (Logit)", "Cascades (Sum parent)", "Combined Model"),
+ c("Demographics", "Cascades", "Combined Model"),
fill=c(rgb(0,0,1,1/2),rgb(1,0,0,1/2),rgb(0,1,0,1/2)))
box(which='plot')
@@ -34,18 +39,12 @@ barplot(counts,
axis(side = 4)
mtext(side = 4, line = 3, "Number of Victims Predicted")
-popsizes = c(0.1, 0.5, 1)
-plot(popsizes,counts[1,],type='l',ylim=c(0,max(counts)))
-lines(popsizes,counts[2,])
-lines(popsizes,counts[3,])
-lines(c(0,1),c(0,1))
#### Precision-Recall Curve
-plot(ecdf(correct_rank1),col='red',xlim=c(0,50),lwd=2)
+plot(ecdf(correct_rank1),col='red',lwd=2,xlim=c(1,100))
plot(ecdf(correct_rank2),col='darkblue',lwd=2,add=T)
plot(ecdf(correct_rank3),col='darkgreen',lwd=2,add=T)
legend("bottomright", inset=0.05,
c("Demographics Model", "Cascade Model", "Combined Model"),
fill=c('red','darkblue','darkgreen'))
-lines(c(0,vcount(lcc)),c(0,1))
diff --git a/R Scripts/predict-victims.R b/R Scripts/predict-victims.R
index fc6cb7f..ccf894a 100644
--- a/R Scripts/predict-victims.R
+++ b/R Scripts/predict-victims.R
@@ -47,7 +47,7 @@ days = Reduce(union, list(lcc_verts$fatal_day,lcc_verts$nonfatal_day_1,
lcc_verts$nonfatal_day_4,lcc_verts$nonfatal_day_5))
days = days[!is.na(days)]
days = sort(days)
-days = split(days, ceiling(seq_along(days)/456))
+days = split(days, ceiling(seq_along(days)/92))
lambdas = c(0, exp(seq(log(0.0000001), log(.95), length.out=100)), 1)
##### Loop through days
@@ -60,9 +60,7 @@ for(i in 1:length(days)){
##### Demographics model
victims = lcc_verts
-
vics = which(victims$vic.day<day)
- vic.days = victims$vic.day[vics]
victims$age[-vics] = as.numeric(start_date + day - 1 - as.Date(victims$dob[-vics]))
victims$arrests[-vics] = unlist(lapply(prior_arrests[-vics],nArrests,day=day))
@@ -86,7 +84,7 @@ for(i in 1:length(days)){
irs = lcc_verts$ir_no[ids]
risk = data.table(id=ids, ir=irs, weight=weights)
- if (dim(risk)[1]>0) risk = risk[, list(weight=sum(weight)), by=ir] # max or sum
+ if (dim(risk)[1]>0) risk = risk[, list(weight=sum(weight)), by=ir]
##### Combined Model
combined = dt
@@ -108,4 +106,4 @@ for(i in 1:length(days)){
print(proc.time()-ptm)
}
-# save(correct_rank, file='Results/correct_rank_62815.RData') \ No newline at end of file
+# save(correct_rank, file='Results/correct_rank_91415.RData') \ No newline at end of file
diff --git a/R Scripts/prior-arrests-by-day.R b/R Scripts/prior-arrests-by-day.R
index 30d458c..604d85c 100644
--- a/R Scripts/prior-arrests-by-day.R
+++ b/R Scripts/prior-arrests-by-day.R
@@ -10,8 +10,6 @@ arrests$arrest_date = as.Date(arrests$arrest_date,format='%m/%d/%Y')
arrests$arrest_day = as.numeric(arrests$arrest_date - start_date)
arrests$id = match(arrests$ir2, lcc_verts$ir_no)
-arrest_days = function(arr,i){return(arr$arrest_day[arr$id==i)}
-
prior_arrests = sapply(1:vcount(lcc),function(x) NULL)
for(i in 1:vcount(lcc)){
if(i%%10000==0) print(i)