summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--R Scripts/background.R3
-rwxr-xr-xR Scripts/data-exploration.R14
-rw-r--r--R Scripts/duplicate-shotings.R9
-rw-r--r--R Scripts/find-cascades.R79
-rwxr-xr-xR Scripts/generate-dag-dat-unweighted.R8
-rw-r--r--R Scripts/predict-victims-plots.R12
-rw-r--r--R Scripts/predict-victims.R71
-rw-r--r--R Scripts/prior-arrests-by-day.R31
-rwxr-xr-xR Scripts/structural.R4
-rwxr-xr-xR Scripts/temporal.R7
-rw-r--r--hawkes/README.rtf13
-rw-r--r--hawkes/README.txt7
12 files changed, 165 insertions, 93 deletions
diff --git a/R Scripts/background.R b/R Scripts/background.R
new file mode 100644
index 0000000..0620e16
--- /dev/null
+++ b/R Scripts/background.R
@@ -0,0 +1,3 @@
+background = function(mu0,day){
+ return (mu0 * (1 + 0.43 * sin((2*pi)/(365.24) * day + 4.36)) )
+} \ No newline at end of file
diff --git a/R Scripts/data-exploration.R b/R Scripts/data-exploration.R
index 11e33cf..f9617f9 100755
--- a/R Scripts/data-exploration.R
+++ b/R Scripts/data-exploration.R
@@ -38,3 +38,17 @@ non_vic_ids = which(V(lcc)$vic==FALSE)
hist(as.numeric(V(lcc)$vic_date[vic_ids]),100,col='lightblue',
xlab='Day of Study Period',main='Infections During the Study Period')
+# n infections
+sum(!is.na(lcc_verts$fatal_day))
+sum(!is.na(lcc_verts$nonfatal_day_1))
+sum(!is.na(lcc_verts$nonfatal_day_2))
+sum(!is.na(lcc_verts$nonfatal_day_3))
+sum(!is.na(lcc_verts$nonfatal_day_4))
+sum(!is.na(lcc_verts$nonfatal_day_5))
+
+sum(sum(!is.na(lcc_verts$fatal_day)),
+ sum(!is.na(lcc_verts$nonfatal_day_1)),
+ sum(!is.na(lcc_verts$nonfatal_day_2)),
+ sum(!is.na(lcc_verts$nonfatal_day_3)),
+ sum(!is.na(lcc_verts$nonfatal_day_4)),
+ sum(!is.na(lcc_verts$nonfatal_day_5)))
diff --git a/R Scripts/duplicate-shotings.R b/R Scripts/duplicate-shotings.R
new file mode 100644
index 0000000..0d17a4e
--- /dev/null
+++ b/R Scripts/duplicate-shotings.R
@@ -0,0 +1,9 @@
+vic_times = vic_times_lcc[vic_ids,]
+rows=c()
+for(i in 1:dim(vic_times[1])){
+ row = as.numeric(vic_times[i,2:7])
+ if(max(table(row))>1){
+ rows = c(rows,i)
+ }
+}
+
diff --git a/R Scripts/find-cascades.R b/R Scripts/find-cascades.R
index ded9b9c..77c7b08 100644
--- a/R Scripts/find-cascades.R
+++ b/R Scripts/find-cascades.R
@@ -1,64 +1,57 @@
library(igraph)
-setwd("~/Documents/Cascade Project/")
-load('Results/hyper-lcc.RData')
-load('Results/dag_dat_vics.RData')
+setwd('~/Documents/Violence Cascades/')
+load('Raw Data/lcc.RData')
+load('Raw Data/dag_dat_lcc.RData')
source('criminal_cascades/R Scripts/temporal.R')
source('criminal_cascades/R Scripts/structural.R')
+source('criminal_cascades/R Scripts/background.R')
-vic_ids = which(V(hyp_lcc)$vic==TRUE)
+vic_ids = which(lcc_verts$vic==TRUE)
+edges = dag_dat_lcc
#### Initialize model parameters
-alpha = 0.041
-# gamma = 0.3
-delta = 0.1
-# beta = 0.00796964464237
+mu0 = 1.1845e-05 #background
+alpha = 0.00317 #structural
+beta = 0.0039 #temporal
##### Get weights
-# find max n days where infection possible given alpha
-edges = dag_dat_test[!is.na(dag_dat_test$t2),]
-# edges = edges[(edges$t2-edges$t1)<300,]
-
-p_t = temporal(edges$t1, edges$t2, alpha)
-p_s = structural(delta, edges$dist)
-p = p_s * p_t
-p_tilde = 1 - p_s + p_s * temporal(edges$t1, edges$t2+1, alpha)
-weights = p/p_tilde
+# p_t = temporal(edges$t1, edges$t2, beta)
+# p_s = structural(alpha, edges$dist)
+# p = p_s * p_t
+# background = background(mu0,t)
##### Find transmission edges
-# load('Results/lm-probs.RData')
-# names = attr(lm.probs,'names')
-# probs = as.numeric(lm.probs)
-# betas = probs[match(hyp_lcc_verts$ir_no[edges$to],names)]
-# betas = 0.055
-# thresh = beta/(1-beta)
-
realized = c()
-# edges = edges[weights>thresh,]
-# weights = weights[weights>thresh]
for (u in vic_ids){
- max_prob = -1
- max_edge = NULL
- Ei = which(edges$to==u)
- for(i in Ei){
- if(weights[i]>thresh){
- prob = p[i] * prod(p_tilde[setdiff(Ei,i)])
- if(prob>max_prob){
- max_prob = prob
- max_edge = i
+ days = as.numeric(vic_times[u,2:7])
+ days = days[!is.na(days)]
+ for(day in days){
+ bkgnd = background(mu0,day)
+ Ei = which(edges$to==u & edges$t1<day)
+ p_t = temporal(edges$t1[Ei], day, beta)
+ p_s = structural(alpha, edges$dist[Ei])
+ p = p_s*p_t
+
+ pc = sum(p)
+
+ if (pc>bkgnd){
+ max_prob = -1
+ max_edge = NULL
+ for(i in 1:length(Ei)){
+ if(p[i]>max_prob){
+ max_prob = p[i]
+ max_edge = i
+ }
}
- }
+ }
+ realized = c(realized, max_edge)
}
- realized = c(realized, max_edge)
+ edges[realized,c('from','to')]
}
-edges[realized,c('from','to')]
-# if (length(Ei)>0){
-# max_edge = Ei[which.max(weights[Ei])] # how to deal with ties????
-# realized = c(realized, max_edge)
-# }
##### Create forest of cascades
# dag is all of the victim nodes but with only transmission edges, or don't index by vic_ids
-dag = graph.data.frame(edges[realized,1:2], directed=TRUE, vertices=hyp_lcc_verts[vic_ids,])
+dag = graph.data.frame(edges[realized,1:2], directed=TRUE, vertices=lcc_verts[vic_ids,])
dag = set.edge.attribute(dag,'weight',value=weights[realized])
table(clusters(dag)$csize)
diff --git a/R Scripts/generate-dag-dat-unweighted.R b/R Scripts/generate-dag-dat-unweighted.R
index 06988b9..7b874d6 100755
--- a/R Scripts/generate-dag-dat-unweighted.R
+++ b/R Scripts/generate-dag-dat-unweighted.R
@@ -57,3 +57,11 @@ vic_times_lcc = lcc_verts[,c('name','nonfatal_day_1','nonfatal_day_2',
'nonfatal_day_5','fatal_day')]
save(vic_times_lcc, file='vic_times_lcc.RData')
write.csv(vic_times_lcc, file='vic_times_lcc.csv')
+
+sum(sum(!is.na(vic_times_lcc$fatal_day)),
+ sum(!is.na(vic_times_lcc$nonfatal_day_1)),
+ sum(!is.na(vic_times_lcc$nonfatal_day_2)),
+ sum(!is.na(vic_times_lcc$nonfatal_day_3)),
+ sum(!is.na(vic_times_lcc$nonfatal_day_4)),
+ sum(!is.na(vic_times_lcc$nonfatal_day_5)))
+
diff --git a/R Scripts/predict-victims-plots.R b/R Scripts/predict-victims-plots.R
index 87b0a25..0655d28 100644
--- a/R Scripts/predict-victims-plots.R
+++ b/R Scripts/predict-victims-plots.R
@@ -1,16 +1,4 @@
##### Plot results
-hist(correct_rank2,150,xlim=c(0,vcount(lcc)),col=rgb(0,0,1,1/8),
- xlab='Risk Ranking of Victims',main='')
-hist(correct_rank1,150,xlim=c(0,vcount(lcc)),col=rgb(1,0,1,1/8),add=T)
-hist(correct_rank2,150,xlim=c(0,vcount(lcc)),col=rgb(0,0,1,1/8),add=T)
-legend("topright", c("Demographics Model", "Cascade Model"),
- fill=c(rgb(1,0,1,1/8), rgb(0,0,1,1/8)))
-
-counts = matrix(c(colSums(correct_rank<(vcount(lcc)/1000))*100/nvics,
- colSums(correct_rank<(vcount(lcc)/200))*100/nvics,
- colSums(correct_rank<(vcount(lcc)/100))*100/nvics),
- nrow=3, byrow=T)
-plot(lambdas,counts[1,],log='x',type='l')
correct_rank1 = correct_rank[,length(lambdas)] # demographics model
correct_rank2 = correct_rank[,1] # cascade model
diff --git a/R Scripts/predict-victims.R b/R Scripts/predict-victims.R
index c662651..fc6cb7f 100644
--- a/R Scripts/predict-victims.R
+++ b/R Scripts/predict-victims.R
@@ -5,26 +5,50 @@ library(doMC)
registerDoMC(cores=4)
setwd('~/Documents/Violence Cascades/')
load('Raw Data/lcc.RData')
-load('Results/hyper-lcc.RData')
-load('Results/dag_dat_all.RData')
+load('Raw Data/dag_dat_lcc.RData')
+load('Raw Data/vic_times_lcc.RData')
+load('Raw Data/prior-arrests.RData')
source('criminal_cascades/R Scripts/temporal.R')
source('criminal_cascades/R Scripts/structural.R')
+nArrests = function(arrests,day){return(sum(arrests<day))}
+
+##### Set parameters
+alpha = 0.00317
+beta = 0.0039
+start_date = as.Date("2005-12-31")
+
+edges_all = data.table(dag_dat_lcc)
+vic_ids = which(lcc_verts$vic==T)
+lcc_verts = lcc_verts[,c('id','ir_no','nonfatal_day_1','nonfatal_day_2',
+ 'nonfatal_day_3','nonfatal_day_4','nonfatal_day_5',
+ 'fatal_day','sex','race','dob','gang.member','gang.name')]
+lcc_verts$vic.day = NA
+lcc_verts$vic.day[vic_ids] = as.numeric(apply(vic_times_lcc[vic_ids,2:7],1,min,na.rm=T))
+lcc_verts$age = FALSE
+lcc_verts$age[vic_ids] = as.numeric(start_date + lcc_verts$vic.day[vic_ids] - as.Date(lcc_verts$dob[vic_ids]))
+lcc_verts$arrests = NA
+lcc_verts$arrests[vic_ids] = unlist(mapply(nArrests,prior_arrests[vic_ids],day=lcc_verts$vic.day[vic_ids]))
+lcc_verts$vic = FALSE
+rownames(lcc_verts) = lcc_verts$ir_no
+vic_times_lcc$ir_no = lcc_verts$ir_no
+rownames(vic_times_lcc) = vic_times_lcc$ir_no
+
##### Initialize data
-formula = vic ~ sex + race + age + gang.member + gang.name
+formula = vic ~ sex + race + age + gang.member + gang.name + arrests
lcc_verts$sex = as.factor(lcc_verts$sex)
lcc_verts$race = as.factor(lcc_verts$race)
-lcc_verts$age = as.numeric(lcc_verts$age)
lcc_verts$gang.name = as.factor(lcc_verts$gang.name)
dt = data.table(ir=lcc_verts$ir_no, dem=0, cas=0, comb=0)
-alpha = 0.0028
-delta = 0.06
-days = sort(unique(hyp_lcc_verts$vic.day)) # 70:max(hyp_lcc_verts$vic.day, na.rm=T)
+#####
+days = Reduce(union, list(lcc_verts$fatal_day,lcc_verts$nonfatal_day_1,
+ lcc_verts$nonfatal_day_2,lcc_verts$nonfatal_day_3,
+ lcc_verts$nonfatal_day_4,lcc_verts$nonfatal_day_5))
+days = days[!is.na(days)]
+days = sort(days)
days = split(days, ceiling(seq_along(days)/456))
-lambdas = c(0, exp(seq(log(0.0000001), log(.95), length.out=50)), 1)
-nvics = sum(hyp_lcc_verts$vic)
-edges_all = data.table(dag_dat_all)
+lambdas = c(0, exp(seq(log(0.0000001), log(.95), length.out=100)), 1)
##### Loop through days
correct_rank = c()
@@ -35,22 +59,31 @@ for(i in 1:length(days)){
cr = foreach (day = ds, .combine=rbind) %dopar% {
##### Demographics model
- vics = match(unique(hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day<day)]),lcc_verts$name)
- victims = lcc_verts[,c('vic','sex','race','age','gang.member','gang.name')]
+ victims = lcc_verts
+
+ vics = which(victims$vic.day<day)
+ vic.days = victims$vic.day[vics]
+
+ victims$age[-vics] = as.numeric(start_date + day - 1 - as.Date(victims$dob[-vics]))
+ victims$arrests[-vics] = unlist(lapply(prior_arrests[-vics],nArrests,day=day))
+
victims$vic[vics] = TRUE
victims$vic[-vics] = FALSE
-# fit = lm(formula, data=victims)
+
fit = glm(formula, data=victims, family=binomial)
- # fit = randomForest(formula, data=victims[,1:5], ntree=100)
- probs = predict(fit, newdata=lcc_verts, type='response')
+
+ lcc_verts_day = victims
+ lcc_verts_day$age[vics] = as.numeric(start_date + day - 1 - as.Date(victims$dob[vics]))
+ lcc_verts_day$arrests[vics] = unlist(lapply(prior_arrests[vics],nArrests,day=day))
+ probs = predict(fit, newdata=lcc_verts_day, type='response')
##### Cascade Model
edges = edges_all[which(edges_all$t1<day),]
- f = temporal(edges$t1, day, alpha)
- h = structural(delta,edges$dist)
+ f = temporal(edges$t1, day, beta)
+ h = structural(alpha,edges$dist)
weights = f*h
ids = edges$to
- irs = hyp_lcc_verts$ir_no[ids]
+ irs = lcc_verts$ir_no[ids]
risk = data.table(id=ids, ir=irs, weight=weights)
if (dim(risk)[1]>0) risk = risk[, list(weight=sum(weight)), by=ir] # max or sum
@@ -61,7 +94,7 @@ for(i in 1:length(days)){
combined$cas[match(risk$ir, dt$ir)] = risk$weight
##### Gather results
- infected_irs = hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day==day)]
+ infected_irs = attr(which(rowSums(vic_times_lcc[,2:7]==day,na.rm=T)==1),'name')
crday = matrix(nrow=length(infected_irs), ncol=length(lambdas))
for (lambda in lambdas){
combined$comb = lambda*combined$dem + (1-lambda)*combined$cas
diff --git a/R Scripts/prior-arrests-by-day.R b/R Scripts/prior-arrests-by-day.R
new file mode 100644
index 0000000..30d458c
--- /dev/null
+++ b/R Scripts/prior-arrests-by-day.R
@@ -0,0 +1,31 @@
+library(igraph)
+setwd('~/Documents/Violence Cascades/Raw Data/')
+
+load('arrests.RData')
+load('lcc.RData')
+start_date = as.Date("2005-12-31")
+
+arrests = arrests[arrests$ir2 %in% lcc_verts$ir_no,c('ir2','arrest_date')]
+arrests$arrest_date = as.Date(arrests$arrest_date,format='%m/%d/%Y')
+arrests$arrest_day = as.numeric(arrests$arrest_date - start_date)
+arrests$id = match(arrests$ir2, lcc_verts$ir_no)
+
+arrest_days = function(arr,i){return(arr$arrest_day[arr$id==i)}
+
+prior_arrests = sapply(1:vcount(lcc),function(x) NULL)
+for(i in 1:vcount(lcc)){
+ if(i%%10000==0) print(i)
+ prior_arrests[[i]] = sort(arrests$arrest_day[arrests$id==i])
+}
+save(prior_arrests,file='prior-arrests.RData')
+
+
+#### turn vic_times_lcc into a list
+vic_times = sapply(1:vcount(lcc),function(x) NULL)
+for(i in 1:vcount(lcc)){
+ if(i%%10000==0) print(i)
+ days = as.numeric(vic_times_lcc[i,2:7])
+ days = days[!is.na(days)]
+ vic_times[[i]] = days
+}
+save(vic_times,file='vic_times_list.RData')
diff --git a/R Scripts/structural.R b/R Scripts/structural.R
index b68711c..94c85b4 100755
--- a/R Scripts/structural.R
+++ b/R Scripts/structural.R
@@ -2,6 +2,6 @@
# return (delta/(1. + 1./(w1*lmbda) + 1./(w2*lmbda) + 1./(w3*lmbda)))
# }
-structural = function(delta,dist){
- return (delta^dist)
+structural = function(alpha,dist){
+ return (alpha/dist)
} \ No newline at end of file
diff --git a/R Scripts/temporal.R b/R Scripts/temporal.R
index 13194a4..3502b45 100755
--- a/R Scripts/temporal.R
+++ b/R Scripts/temporal.R
@@ -1,9 +1,8 @@
-temporal = function(tu,tv,alpha){
- dt = tv - tu - 1
+temporal = function(tu,tv,beta){
+ dt = tv - tu
if (sum(dt<0)==0) {
- return (alpha*exp(-alpha*dt))
-# return ((0.01/alpha)*(1+dt/alpha)^(-1.01))
+ return (beta*exp(-beta*dt))
} else {
return (NA)
}
diff --git a/hawkes/README.rtf b/hawkes/README.rtf
deleted file mode 100644
index be4aff3..0000000
--- a/hawkes/README.rtf
+++ /dev/null
@@ -1,13 +0,0 @@
-{\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170
-{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
-{\colortbl;\red255\green255\blue255;}
-\margl1440\margr1440\vieww10800\viewh8400\viewkind0
-\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural
-
-\f0\fs24 \cf0 \
-\
-data/data2 - reformatting the data files\
-\
-main - most of the optimization\
-\
-cause - assign responsibility for each infection} \ No newline at end of file
diff --git a/hawkes/README.txt b/hawkes/README.txt
new file mode 100644
index 0000000..3991d3b
--- /dev/null
+++ b/hawkes/README.txt
@@ -0,0 +1,7 @@
+
+
+data and data2 - reformatting the data files
+
+main - most of the optimization
+
+cause - assign responsibility for each infection \ No newline at end of file