summaryrefslogtreecommitdiffstats
path: root/R Scripts/predict-victims.R
diff options
context:
space:
mode:
authorBen Green <bgreen@g.harvard.edu>2015-09-13 23:52:40 -0400
committerBen Green <bgreen@g.harvard.edu>2015-09-13 23:52:42 -0400
commitf23e9e5da58d1d19913703e11f7ff96a7adff96b (patch)
treeb9459b3670cdf125f8bdf0888288be3a0a1357a8 /R Scripts/predict-victims.R
parentfffb68b515a5cae198d8b756b562ddea6f2c814c (diff)
downloadcriminal_cascades-f23e9e5da58d1d19913703e11f7ff96a7adff96b.tar.gz
updated prediction code for new model and data format
Diffstat (limited to 'R Scripts/predict-victims.R')
-rw-r--r--R Scripts/predict-victims.R71
1 files changed, 52 insertions, 19 deletions
diff --git a/R Scripts/predict-victims.R b/R Scripts/predict-victims.R
index c662651..fc6cb7f 100644
--- a/R Scripts/predict-victims.R
+++ b/R Scripts/predict-victims.R
@@ -5,26 +5,50 @@ library(doMC)
registerDoMC(cores=4)
setwd('~/Documents/Violence Cascades/')
load('Raw Data/lcc.RData')
-load('Results/hyper-lcc.RData')
-load('Results/dag_dat_all.RData')
+load('Raw Data/dag_dat_lcc.RData')
+load('Raw Data/vic_times_lcc.RData')
+load('Raw Data/prior-arrests.RData')
source('criminal_cascades/R Scripts/temporal.R')
source('criminal_cascades/R Scripts/structural.R')
+nArrests = function(arrests,day){return(sum(arrests<day))}
+
+##### Set parameters
+alpha = 0.00317
+beta = 0.0039
+start_date = as.Date("2005-12-31")
+
+edges_all = data.table(dag_dat_lcc)
+vic_ids = which(lcc_verts$vic==T)
+lcc_verts = lcc_verts[,c('id','ir_no','nonfatal_day_1','nonfatal_day_2',
+ 'nonfatal_day_3','nonfatal_day_4','nonfatal_day_5',
+ 'fatal_day','sex','race','dob','gang.member','gang.name')]
+lcc_verts$vic.day = NA
+lcc_verts$vic.day[vic_ids] = as.numeric(apply(vic_times_lcc[vic_ids,2:7],1,min,na.rm=T))
+lcc_verts$age = FALSE
+lcc_verts$age[vic_ids] = as.numeric(start_date + lcc_verts$vic.day[vic_ids] - as.Date(lcc_verts$dob[vic_ids]))
+lcc_verts$arrests = NA
+lcc_verts$arrests[vic_ids] = unlist(mapply(nArrests,prior_arrests[vic_ids],day=lcc_verts$vic.day[vic_ids]))
+lcc_verts$vic = FALSE
+rownames(lcc_verts) = lcc_verts$ir_no
+vic_times_lcc$ir_no = lcc_verts$ir_no
+rownames(vic_times_lcc) = vic_times_lcc$ir_no
+
##### Initialize data
-formula = vic ~ sex + race + age + gang.member + gang.name
+formula = vic ~ sex + race + age + gang.member + gang.name + arrests
lcc_verts$sex = as.factor(lcc_verts$sex)
lcc_verts$race = as.factor(lcc_verts$race)
-lcc_verts$age = as.numeric(lcc_verts$age)
lcc_verts$gang.name = as.factor(lcc_verts$gang.name)
dt = data.table(ir=lcc_verts$ir_no, dem=0, cas=0, comb=0)
-alpha = 0.0028
-delta = 0.06
-days = sort(unique(hyp_lcc_verts$vic.day)) # 70:max(hyp_lcc_verts$vic.day, na.rm=T)
+#####
+days = Reduce(union, list(lcc_verts$fatal_day,lcc_verts$nonfatal_day_1,
+ lcc_verts$nonfatal_day_2,lcc_verts$nonfatal_day_3,
+ lcc_verts$nonfatal_day_4,lcc_verts$nonfatal_day_5))
+days = days[!is.na(days)]
+days = sort(days)
days = split(days, ceiling(seq_along(days)/456))
-lambdas = c(0, exp(seq(log(0.0000001), log(.95), length.out=50)), 1)
-nvics = sum(hyp_lcc_verts$vic)
-edges_all = data.table(dag_dat_all)
+lambdas = c(0, exp(seq(log(0.0000001), log(.95), length.out=100)), 1)
##### Loop through days
correct_rank = c()
@@ -35,22 +59,31 @@ for(i in 1:length(days)){
cr = foreach (day = ds, .combine=rbind) %dopar% {
##### Demographics model
- vics = match(unique(hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day<day)]),lcc_verts$name)
- victims = lcc_verts[,c('vic','sex','race','age','gang.member','gang.name')]
+ victims = lcc_verts
+
+ vics = which(victims$vic.day<day)
+ vic.days = victims$vic.day[vics]
+
+ victims$age[-vics] = as.numeric(start_date + day - 1 - as.Date(victims$dob[-vics]))
+ victims$arrests[-vics] = unlist(lapply(prior_arrests[-vics],nArrests,day=day))
+
victims$vic[vics] = TRUE
victims$vic[-vics] = FALSE
-# fit = lm(formula, data=victims)
+
fit = glm(formula, data=victims, family=binomial)
- # fit = randomForest(formula, data=victims[,1:5], ntree=100)
- probs = predict(fit, newdata=lcc_verts, type='response')
+
+ lcc_verts_day = victims
+ lcc_verts_day$age[vics] = as.numeric(start_date + day - 1 - as.Date(victims$dob[vics]))
+ lcc_verts_day$arrests[vics] = unlist(lapply(prior_arrests[vics],nArrests,day=day))
+ probs = predict(fit, newdata=lcc_verts_day, type='response')
##### Cascade Model
edges = edges_all[which(edges_all$t1<day),]
- f = temporal(edges$t1, day, alpha)
- h = structural(delta,edges$dist)
+ f = temporal(edges$t1, day, beta)
+ h = structural(alpha,edges$dist)
weights = f*h
ids = edges$to
- irs = hyp_lcc_verts$ir_no[ids]
+ irs = lcc_verts$ir_no[ids]
risk = data.table(id=ids, ir=irs, weight=weights)
if (dim(risk)[1]>0) risk = risk[, list(weight=sum(weight)), by=ir] # max or sum
@@ -61,7 +94,7 @@ for(i in 1:length(days)){
combined$cas[match(risk$ir, dt$ir)] = risk$weight
##### Gather results
- infected_irs = hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day==day)]
+ infected_irs = attr(which(rowSums(vic_times_lcc[,2:7]==day,na.rm=T)==1),'name')
crday = matrix(nrow=length(infected_irs), ncol=length(lambdas))
for (lambda in lambdas){
combined$comb = lambda*combined$dem + (1-lambda)*combined$cas