diff options
Diffstat (limited to 'R Scripts')
| -rw-r--r-- | R Scripts/background.R | 3 | ||||
| -rwxr-xr-x | R Scripts/data-exploration.R | 14 | ||||
| -rw-r--r-- | R Scripts/duplicate-shotings.R | 9 | ||||
| -rw-r--r-- | R Scripts/find-cascades.R | 79 | ||||
| -rwxr-xr-x | R Scripts/generate-dag-dat-unweighted.R | 8 | ||||
| -rw-r--r-- | R Scripts/predict-victims-plots.R | 12 | ||||
| -rw-r--r-- | R Scripts/predict-victims.R | 71 | ||||
| -rw-r--r-- | R Scripts/prior-arrests-by-day.R | 31 | ||||
| -rwxr-xr-x | R Scripts/structural.R | 4 | ||||
| -rwxr-xr-x | R Scripts/temporal.R | 7 |
10 files changed, 158 insertions, 80 deletions
diff --git a/R Scripts/background.R b/R Scripts/background.R new file mode 100644 index 0000000..0620e16 --- /dev/null +++ b/R Scripts/background.R @@ -0,0 +1,3 @@ +background = function(mu0,day){ + return (mu0 * (1 + 0.43 * sin((2*pi)/(365.24) * day + 4.36)) ) +}
\ No newline at end of file diff --git a/R Scripts/data-exploration.R b/R Scripts/data-exploration.R index 11e33cf..f9617f9 100755 --- a/R Scripts/data-exploration.R +++ b/R Scripts/data-exploration.R @@ -38,3 +38,17 @@ non_vic_ids = which(V(lcc)$vic==FALSE) hist(as.numeric(V(lcc)$vic_date[vic_ids]),100,col='lightblue', xlab='Day of Study Period',main='Infections During the Study Period') +# n infections +sum(!is.na(lcc_verts$fatal_day)) +sum(!is.na(lcc_verts$nonfatal_day_1)) +sum(!is.na(lcc_verts$nonfatal_day_2)) +sum(!is.na(lcc_verts$nonfatal_day_3)) +sum(!is.na(lcc_verts$nonfatal_day_4)) +sum(!is.na(lcc_verts$nonfatal_day_5)) + +sum(sum(!is.na(lcc_verts$fatal_day)), + sum(!is.na(lcc_verts$nonfatal_day_1)), + sum(!is.na(lcc_verts$nonfatal_day_2)), + sum(!is.na(lcc_verts$nonfatal_day_3)), + sum(!is.na(lcc_verts$nonfatal_day_4)), + sum(!is.na(lcc_verts$nonfatal_day_5))) diff --git a/R Scripts/duplicate-shotings.R b/R Scripts/duplicate-shotings.R new file mode 100644 index 0000000..0d17a4e --- /dev/null +++ b/R Scripts/duplicate-shotings.R @@ -0,0 +1,9 @@ +vic_times = vic_times_lcc[vic_ids,] +rows=c() +for(i in 1:dim(vic_times[1])){ + row = as.numeric(vic_times[i,2:7]) + if(max(table(row))>1){ + rows = c(rows,i) + } +} + diff --git a/R Scripts/find-cascades.R b/R Scripts/find-cascades.R index ded9b9c..77c7b08 100644 --- a/R Scripts/find-cascades.R +++ b/R Scripts/find-cascades.R @@ -1,64 +1,57 @@ library(igraph) -setwd("~/Documents/Cascade Project/") -load('Results/hyper-lcc.RData') -load('Results/dag_dat_vics.RData') +setwd('~/Documents/Violence Cascades/') +load('Raw Data/lcc.RData') +load('Raw Data/dag_dat_lcc.RData') source('criminal_cascades/R Scripts/temporal.R') source('criminal_cascades/R Scripts/structural.R') +source('criminal_cascades/R Scripts/background.R') -vic_ids = which(V(hyp_lcc)$vic==TRUE) +vic_ids = which(lcc_verts$vic==TRUE) +edges = dag_dat_lcc #### Initialize model parameters -alpha = 0.041 -# gamma = 0.3 -delta = 0.1 -# beta = 0.00796964464237 +mu0 = 1.1845e-05 #background +alpha = 0.00317 #structural +beta = 0.0039 #temporal ##### Get weights -# find max n days where infection possible given alpha -edges = dag_dat_test[!is.na(dag_dat_test$t2),] -# edges = edges[(edges$t2-edges$t1)<300,] - -p_t = temporal(edges$t1, edges$t2, alpha) -p_s = structural(delta, edges$dist) -p = p_s * p_t -p_tilde = 1 - p_s + p_s * temporal(edges$t1, edges$t2+1, alpha) -weights = p/p_tilde +# p_t = temporal(edges$t1, edges$t2, beta) +# p_s = structural(alpha, edges$dist) +# p = p_s * p_t +# background = background(mu0,t) ##### Find transmission edges -# load('Results/lm-probs.RData') -# names = attr(lm.probs,'names') -# probs = as.numeric(lm.probs) -# betas = probs[match(hyp_lcc_verts$ir_no[edges$to],names)] -# betas = 0.055 -# thresh = beta/(1-beta) - realized = c() -# edges = edges[weights>thresh,] -# weights = weights[weights>thresh] for (u in vic_ids){ - max_prob = -1 - max_edge = NULL - Ei = which(edges$to==u) - for(i in Ei){ - if(weights[i]>thresh){ - prob = p[i] * prod(p_tilde[setdiff(Ei,i)]) - if(prob>max_prob){ - max_prob = prob - max_edge = i + days = as.numeric(vic_times[u,2:7]) + days = days[!is.na(days)] + for(day in days){ + bkgnd = background(mu0,day) + Ei = which(edges$to==u & edges$t1<day) + p_t = temporal(edges$t1[Ei], day, beta) + p_s = structural(alpha, edges$dist[Ei]) + p = p_s*p_t + + pc = sum(p) + + if (pc>bkgnd){ + max_prob = -1 + max_edge = NULL + for(i in 1:length(Ei)){ + if(p[i]>max_prob){ + max_prob = p[i] + max_edge = i + } } - } + } + realized = c(realized, max_edge) } - realized = c(realized, max_edge) + edges[realized,c('from','to')] } -edges[realized,c('from','to')] -# if (length(Ei)>0){ -# max_edge = Ei[which.max(weights[Ei])] # how to deal with ties???? -# realized = c(realized, max_edge) -# } ##### Create forest of cascades # dag is all of the victim nodes but with only transmission edges, or don't index by vic_ids -dag = graph.data.frame(edges[realized,1:2], directed=TRUE, vertices=hyp_lcc_verts[vic_ids,]) +dag = graph.data.frame(edges[realized,1:2], directed=TRUE, vertices=lcc_verts[vic_ids,]) dag = set.edge.attribute(dag,'weight',value=weights[realized]) table(clusters(dag)$csize) diff --git a/R Scripts/generate-dag-dat-unweighted.R b/R Scripts/generate-dag-dat-unweighted.R index 06988b9..7b874d6 100755 --- a/R Scripts/generate-dag-dat-unweighted.R +++ b/R Scripts/generate-dag-dat-unweighted.R @@ -57,3 +57,11 @@ vic_times_lcc = lcc_verts[,c('name','nonfatal_day_1','nonfatal_day_2', 'nonfatal_day_5','fatal_day')] save(vic_times_lcc, file='vic_times_lcc.RData') write.csv(vic_times_lcc, file='vic_times_lcc.csv') + +sum(sum(!is.na(vic_times_lcc$fatal_day)), + sum(!is.na(vic_times_lcc$nonfatal_day_1)), + sum(!is.na(vic_times_lcc$nonfatal_day_2)), + sum(!is.na(vic_times_lcc$nonfatal_day_3)), + sum(!is.na(vic_times_lcc$nonfatal_day_4)), + sum(!is.na(vic_times_lcc$nonfatal_day_5))) + diff --git a/R Scripts/predict-victims-plots.R b/R Scripts/predict-victims-plots.R index 87b0a25..0655d28 100644 --- a/R Scripts/predict-victims-plots.R +++ b/R Scripts/predict-victims-plots.R @@ -1,16 +1,4 @@ ##### Plot results -hist(correct_rank2,150,xlim=c(0,vcount(lcc)),col=rgb(0,0,1,1/8), - xlab='Risk Ranking of Victims',main='') -hist(correct_rank1,150,xlim=c(0,vcount(lcc)),col=rgb(1,0,1,1/8),add=T) -hist(correct_rank2,150,xlim=c(0,vcount(lcc)),col=rgb(0,0,1,1/8),add=T) -legend("topright", c("Demographics Model", "Cascade Model"), - fill=c(rgb(1,0,1,1/8), rgb(0,0,1,1/8))) - -counts = matrix(c(colSums(correct_rank<(vcount(lcc)/1000))*100/nvics, - colSums(correct_rank<(vcount(lcc)/200))*100/nvics, - colSums(correct_rank<(vcount(lcc)/100))*100/nvics), - nrow=3, byrow=T) -plot(lambdas,counts[1,],log='x',type='l') correct_rank1 = correct_rank[,length(lambdas)] # demographics model correct_rank2 = correct_rank[,1] # cascade model diff --git a/R Scripts/predict-victims.R b/R Scripts/predict-victims.R index c662651..fc6cb7f 100644 --- a/R Scripts/predict-victims.R +++ b/R Scripts/predict-victims.R @@ -5,26 +5,50 @@ library(doMC) registerDoMC(cores=4) setwd('~/Documents/Violence Cascades/') load('Raw Data/lcc.RData') -load('Results/hyper-lcc.RData') -load('Results/dag_dat_all.RData') +load('Raw Data/dag_dat_lcc.RData') +load('Raw Data/vic_times_lcc.RData') +load('Raw Data/prior-arrests.RData') source('criminal_cascades/R Scripts/temporal.R') source('criminal_cascades/R Scripts/structural.R') +nArrests = function(arrests,day){return(sum(arrests<day))} + +##### Set parameters +alpha = 0.00317 +beta = 0.0039 +start_date = as.Date("2005-12-31") + +edges_all = data.table(dag_dat_lcc) +vic_ids = which(lcc_verts$vic==T) +lcc_verts = lcc_verts[,c('id','ir_no','nonfatal_day_1','nonfatal_day_2', + 'nonfatal_day_3','nonfatal_day_4','nonfatal_day_5', + 'fatal_day','sex','race','dob','gang.member','gang.name')] +lcc_verts$vic.day = NA +lcc_verts$vic.day[vic_ids] = as.numeric(apply(vic_times_lcc[vic_ids,2:7],1,min,na.rm=T)) +lcc_verts$age = FALSE +lcc_verts$age[vic_ids] = as.numeric(start_date + lcc_verts$vic.day[vic_ids] - as.Date(lcc_verts$dob[vic_ids])) +lcc_verts$arrests = NA +lcc_verts$arrests[vic_ids] = unlist(mapply(nArrests,prior_arrests[vic_ids],day=lcc_verts$vic.day[vic_ids])) +lcc_verts$vic = FALSE +rownames(lcc_verts) = lcc_verts$ir_no +vic_times_lcc$ir_no = lcc_verts$ir_no +rownames(vic_times_lcc) = vic_times_lcc$ir_no + ##### Initialize data -formula = vic ~ sex + race + age + gang.member + gang.name +formula = vic ~ sex + race + age + gang.member + gang.name + arrests lcc_verts$sex = as.factor(lcc_verts$sex) lcc_verts$race = as.factor(lcc_verts$race) -lcc_verts$age = as.numeric(lcc_verts$age) lcc_verts$gang.name = as.factor(lcc_verts$gang.name) dt = data.table(ir=lcc_verts$ir_no, dem=0, cas=0, comb=0) -alpha = 0.0028 -delta = 0.06 -days = sort(unique(hyp_lcc_verts$vic.day)) # 70:max(hyp_lcc_verts$vic.day, na.rm=T) +##### +days = Reduce(union, list(lcc_verts$fatal_day,lcc_verts$nonfatal_day_1, + lcc_verts$nonfatal_day_2,lcc_verts$nonfatal_day_3, + lcc_verts$nonfatal_day_4,lcc_verts$nonfatal_day_5)) +days = days[!is.na(days)] +days = sort(days) days = split(days, ceiling(seq_along(days)/456)) -lambdas = c(0, exp(seq(log(0.0000001), log(.95), length.out=50)), 1) -nvics = sum(hyp_lcc_verts$vic) -edges_all = data.table(dag_dat_all) +lambdas = c(0, exp(seq(log(0.0000001), log(.95), length.out=100)), 1) ##### Loop through days correct_rank = c() @@ -35,22 +59,31 @@ for(i in 1:length(days)){ cr = foreach (day = ds, .combine=rbind) %dopar% { ##### Demographics model - vics = match(unique(hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day<day)]),lcc_verts$name) - victims = lcc_verts[,c('vic','sex','race','age','gang.member','gang.name')] + victims = lcc_verts + + vics = which(victims$vic.day<day) + vic.days = victims$vic.day[vics] + + victims$age[-vics] = as.numeric(start_date + day - 1 - as.Date(victims$dob[-vics])) + victims$arrests[-vics] = unlist(lapply(prior_arrests[-vics],nArrests,day=day)) + victims$vic[vics] = TRUE victims$vic[-vics] = FALSE -# fit = lm(formula, data=victims) + fit = glm(formula, data=victims, family=binomial) - # fit = randomForest(formula, data=victims[,1:5], ntree=100) - probs = predict(fit, newdata=lcc_verts, type='response') + + lcc_verts_day = victims + lcc_verts_day$age[vics] = as.numeric(start_date + day - 1 - as.Date(victims$dob[vics])) + lcc_verts_day$arrests[vics] = unlist(lapply(prior_arrests[vics],nArrests,day=day)) + probs = predict(fit, newdata=lcc_verts_day, type='response') ##### Cascade Model edges = edges_all[which(edges_all$t1<day),] - f = temporal(edges$t1, day, alpha) - h = structural(delta,edges$dist) + f = temporal(edges$t1, day, beta) + h = structural(alpha,edges$dist) weights = f*h ids = edges$to - irs = hyp_lcc_verts$ir_no[ids] + irs = lcc_verts$ir_no[ids] risk = data.table(id=ids, ir=irs, weight=weights) if (dim(risk)[1]>0) risk = risk[, list(weight=sum(weight)), by=ir] # max or sum @@ -61,7 +94,7 @@ for(i in 1:length(days)){ combined$cas[match(risk$ir, dt$ir)] = risk$weight ##### Gather results - infected_irs = hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day==day)] + infected_irs = attr(which(rowSums(vic_times_lcc[,2:7]==day,na.rm=T)==1),'name') crday = matrix(nrow=length(infected_irs), ncol=length(lambdas)) for (lambda in lambdas){ combined$comb = lambda*combined$dem + (1-lambda)*combined$cas diff --git a/R Scripts/prior-arrests-by-day.R b/R Scripts/prior-arrests-by-day.R new file mode 100644 index 0000000..30d458c --- /dev/null +++ b/R Scripts/prior-arrests-by-day.R @@ -0,0 +1,31 @@ +library(igraph) +setwd('~/Documents/Violence Cascades/Raw Data/') + +load('arrests.RData') +load('lcc.RData') +start_date = as.Date("2005-12-31") + +arrests = arrests[arrests$ir2 %in% lcc_verts$ir_no,c('ir2','arrest_date')] +arrests$arrest_date = as.Date(arrests$arrest_date,format='%m/%d/%Y') +arrests$arrest_day = as.numeric(arrests$arrest_date - start_date) +arrests$id = match(arrests$ir2, lcc_verts$ir_no) + +arrest_days = function(arr,i){return(arr$arrest_day[arr$id==i)} + +prior_arrests = sapply(1:vcount(lcc),function(x) NULL) +for(i in 1:vcount(lcc)){ + if(i%%10000==0) print(i) + prior_arrests[[i]] = sort(arrests$arrest_day[arrests$id==i]) +} +save(prior_arrests,file='prior-arrests.RData') + + +#### turn vic_times_lcc into a list +vic_times = sapply(1:vcount(lcc),function(x) NULL) +for(i in 1:vcount(lcc)){ + if(i%%10000==0) print(i) + days = as.numeric(vic_times_lcc[i,2:7]) + days = days[!is.na(days)] + vic_times[[i]] = days +} +save(vic_times,file='vic_times_list.RData') diff --git a/R Scripts/structural.R b/R Scripts/structural.R index b68711c..94c85b4 100755 --- a/R Scripts/structural.R +++ b/R Scripts/structural.R @@ -2,6 +2,6 @@ # return (delta/(1. + 1./(w1*lmbda) + 1./(w2*lmbda) + 1./(w3*lmbda))) # } -structural = function(delta,dist){ - return (delta^dist) +structural = function(alpha,dist){ + return (alpha/dist) }
\ No newline at end of file diff --git a/R Scripts/temporal.R b/R Scripts/temporal.R index 13194a4..3502b45 100755 --- a/R Scripts/temporal.R +++ b/R Scripts/temporal.R @@ -1,9 +1,8 @@ -temporal = function(tu,tv,alpha){ - dt = tv - tu - 1 +temporal = function(tu,tv,beta){ + dt = tv - tu if (sum(dt<0)==0) { - return (alpha*exp(-alpha*dt)) -# return ((0.01/alpha)*(1+dt/alpha)^(-1.01)) + return (beta*exp(-beta*dt)) } else { return (NA) } |
