summaryrefslogtreecommitdiffstats
path: root/R Scripts
diff options
context:
space:
mode:
authorBen Green <ben@SEASITs-MacBook-Pro.local>2015-06-28 17:38:33 -0400
committerBen Green <ben@SEASITs-MacBook-Pro.local>2015-06-28 17:38:33 -0400
commit6e527bbf612465bf5d739b9652abc0165550993c (patch)
tree9525bed16d9e4568747855afd84a03937090f1cb /R Scripts
parent7167a81cfb8b872dd1547e5a8669004b191417db (diff)
downloadcriminal_cascades-6e527bbf612465bf5d739b9652abc0165550993c.tar.gz
Worked on synthetic data recovery so we can tell how high the actual
infector is ranked among all potential parents. Cleaned up code for the predicting victims benchmarking test.
Diffstat (limited to 'R Scripts')
-rw-r--r--R Scripts/benchmarking.R130
-rw-r--r--R Scripts/find-cascades.R11
-rw-r--r--R Scripts/find-parents.R40
-rw-r--r--R Scripts/generate-network.R5
-rw-r--r--R Scripts/predict-victims-plots.R61
-rw-r--r--R Scripts/predict-victims.R67
6 files changed, 177 insertions, 137 deletions
diff --git a/R Scripts/benchmarking.R b/R Scripts/benchmarking.R
deleted file mode 100644
index 1cf99e0..0000000
--- a/R Scripts/benchmarking.R
+++ /dev/null
@@ -1,130 +0,0 @@
-library(igraph)
-setwd('~/Documents/Cascade Project')
-load('Raw Data/lcc.RData')
-load('Results/hyper-lcc.RData')
-load('Results/dag_dat_all.RData')
-source('Scripts/temporal.R')
-source('Scripts/structural.R')
-
-##### Initialize data
-formula = vic ~ sex + race + age + gang.member + gang.name
-lcc_verts$sex = as.factor(lcc_verts$sex)
-lcc_verts$race = as.factor(lcc_verts$race)
-lcc_verts$age = as.numeric(lcc_verts$age)
-lcc_verts$gang.name = as.factor(lcc_verts$gang.name)
-# sum(hyp_lcc_verts$vic)/length(days)
-
-##### Loop through days
-alpha = 1/100
-gamma = 0.18
-days = 70:max(hyp_lcc_verts$vic.day, na.rm=T)
-lambdas = 0#c(0, exp(seq(log(0.0000001), log(.0005), length.out=150)), 1)
-nvics = sum(hyp_lcc_verts$vic.day %in% days)
-correct_rank = matrix(nrow=nvics, ncol=length(lambdas))
-# correct_rank1 = correct_rank2 = correct_rank3 = c()
-edges_all = dag_dat_all[dag_dat_all$dist<2,]
-ptm = proc.time()
-for (day in days){
- if (day %% 100 == 0) print(day)
-
- ##### Demographics model
-# vics = match(unique(hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day<day)]),lcc_verts$name)
-# victims = lcc_verts[,c('vic','sex','race','age','gang.member','gang.name')]
-# victims$vic[vics] = TRUE
-# victims$vic[-vics] = FALSE
-# # glm.fit = glm(formula, data=victims, family=binomial)
-# glm.fit = lm(formula, data=victims)
-# glm.probs = predict(glm.fit, newdata=lcc_verts, type='response')
-
- ##### Cascade Model
- edges = edges_all[which(edges_all$t1<day),]
- f = temporal(edges$t1, day, alpha)
- h = structural(gamma,edges$dist)
- weights = f*h
- ids = edges$to
- irs = hyp_lcc_verts$ir_no[ids]
- risk = data.frame(id=ids, ir=irs, weight=weights)
- risk = risk[order(weights, decreasing=T),]
- risk = risk[match(unique(risk$ir),risk$ir),]
-# maybe need to change this to reflect new algorithm that accounts for \tilde{p}
-
- ##### Combined Model
-# combined = data.frame(ir=attr(glm.probs,'name'), dem=as.numeric(glm.probs), cas=0, comb=0)
- combined$cas[match(risk$ir, attr(glm.probs,'name'))] = risk$weight
-
- ##### Gather results
- infected_irs = hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day==day)]
- for (lambda in lambdas){
- combined$comb = combined$cas#lambda*combined$dem + (1-lambda)*(1-combined$dem)*combined$cas
- c_idx = which(lambdas==lambda)
- r_idx = head(which(is.na(correct_rank[,c_idx])),length(infected_irs))
- # !! order should be first: rank of (3,5,5,7) should be (1,2,2,4), may need to do n-rank
- correct_rank[r_idx,c_idx] = match(infected_irs, combined$ir[order(combined$comb, decreasing=T)])
- # maybe should also mark down vic/nonvic status of each?
- }
-
-}
-print(proc.time()-ptm)
-
-
-##### Plot results
-hist(correct_rank3,150,xlim=c(0,vcount(lcc)),col=rgb(0,0,1,1/8),
- xlab='Risk Ranking of Victims',main='')
-hist(correct_rank1,150,xlim=c(0,vcount(lcc)),col=rgb(1,0,1,1/8),add=T)
-hist(correct_rank2,150,xlim=c(0,vcount(lcc)),col=rgb(1,0,1,1/8),add=T)
-legend("topright", c("Demographics Model", "Cascade Model"),
- fill=c(rgb(1,0,1,1/8), rgb(0,0,1,1/8)))
-
-counts = matrix(c(colSums(correct_rank<(vcount(lcc)/1000))*100/nvics,
- colSums(correct_rank<(vcount(lcc)/200))*100/nvics,
- colSums(correct_rank<(vcount(lcc)/100))*100/nvics),
- nrow=3, byrow=T)
-plot(lambdas,counts[1,],log='x',type='l')
-
-correct_rank1 = correct_rank[,length(lambdas)]
-correct_rank2 = correct_rank[,1]
-correct_rank3 = correct_rank[,which.min(colMeans(correct_rank))]
-counts = matrix(c(sum(correct_rank1<(vcount(lcc)*0.001)),
- sum(correct_rank1<(vcount(lcc)*0.005)),
- sum(correct_rank1<(vcount(lcc)*0.01)),
- sum(correct_rank2<(vcount(lcc)*0.001)),
- sum(correct_rank2<(vcount(lcc)*0.005)),
- sum(correct_rank2<(vcount(lcc)*0.01)),
- sum(correct_rank3<(vcount(lcc)*0.001)),
- sum(correct_rank3<(vcount(lcc)*0.005)),
- sum(correct_rank3<(vcount(lcc)*0.01))),
- nrow=3, byrow=T)
-counts = counts*100/nvics
-barplot(counts,
- xlab="Size of High-Risk Population",
- ylab="Percent of Victims Predicted",
- names.arg=c('0.1%','0.5%','1%'),ylim=c(0,max(counts)*1.1),
- col=c(rgb(0,0,1,1/2),rgb(1,0,0,1/2),rgb(0,1,0,1/2)),
- beside=TRUE)
-legend("topleft", inset=0.05,
- c("Demographics Model", "Cascade Model", "Combined Model"),
- fill=c(rgb(0,0,1,1/2),rgb(1,0,0,1/2),rgb(0,1,0,1/2)))
-box(which='plot')
-par(new=T)
-counts = counts/(100/nvics)
-barplot(counts,
- ylim=c(0,max(counts)*1.1),
- col=c(rgb(0,0,1,0),rgb(1,0,0,0),rgb(0,1,0,0)),
- beside=TRUE)
-axis(side = 4)
-mtext(side = 4, line = 3, "Number of Victims Predicted")
-
-popsizes = c(0.1, 0.5, 1)
-plot(popsizes,counts[1,],type='l',ylim=c(0,max(counts)))
-lines(popsizes,counts[2,])
-lines(popsizes,counts[3,])
-lines(c(0,1),c(0,1))
-
-#### Precision-Recall Curve
-plot(ecdf(correct_rank1),col='red',xlim=c(0,vcount(lcc)),lwd=2)
-plot(ecdf(correct_rank2),col='darkblue',lwd=2,add=T)
-plot(ecdf(correct_rank3),col='darkgreen',lwd=2,add=T)
-legend("bottomright", inset=0.05,
- c("Demographics Model", "Cascade Model", "Combined Model"),
- fill=c('red','darkblue','darkgreen'))
-lines(c(0,vcount(lcc)),c(0,1))
diff --git a/R Scripts/find-cascades.R b/R Scripts/find-cascades.R
index 0a69c5c..7673909 100644
--- a/R Scripts/find-cascades.R
+++ b/R Scripts/find-cascades.R
@@ -9,14 +9,14 @@ source('criminal_cascades/R Scripts/structural.R')
vic_ids = which(V(hyp_lcc)$vic==TRUE)
#### Initialize model parameters
-alpha = 1/21
+alpha = 0.041
# gamma = 0.3
-delta = 0.001
-beta = 0.00796964464237
+delta = 0.1
+# beta = 0.00796964464237
##### Get weights
# find max n days where infection possible given alpha
-edges = dag_dat_vics
+edges = dag_dat_test[!is.na(dag_dat_test$t2),]
# edges = edges[(edges$t2-edges$t1)<300,]
p_t = temporal(edges$t1, edges$t2, alpha)
@@ -31,7 +31,7 @@ weights = p/p_tilde
# probs = as.numeric(lm.probs)
# betas = probs[match(hyp_lcc_verts$ir_no[edges$to],names)]
# betas = 0.055
-thresh = beta/(1-beta)
+# thresh = beta/(1-beta)
realized = c()
# edges = edges[weights>thresh,]
@@ -51,6 +51,7 @@ for (u in vic_ids){
}
realized = c(realized, max_edge)
}
+edges[realized,c('from','to')]
# if (length(Ei)>0){
# max_edge = Ei[which.max(weights[Ei])] # how to deal with ties????
# realized = c(realized, max_edge)
diff --git a/R Scripts/find-parents.R b/R Scripts/find-parents.R
new file mode 100644
index 0000000..3ec8809
--- /dev/null
+++ b/R Scripts/find-parents.R
@@ -0,0 +1,40 @@
+# library(igraph)
+# setwd("~/Documents/Cascade Project/")
+# load('Results/hyper-lcc.RData')
+# load('Results/dag_dat_vics.RData')
+# source('criminal_cascades/R Scripts/temporal.R')
+# source('criminal_cascades/R Scripts/structural.R')
+
+##### Initialize parameters based on what ml2 found
+alpha = 0.061
+delta = 0.082
+
+##### Get weights
+edges = dag_dat_test[!is.na(dag_dat_test$t2),]
+
+dt = edges$t2 - edges$t1
+p_t = exp(-alpha*dt) * (exp(alpha)-1)
+p_s = structural(delta, edges$dist)
+p = p_s * p_t
+p_tilde = 1 - p_s + p_s * exp(-alpha*dt)
+weights = p/p_tilde
+edges$weight = weights
+
+##### Find most likely parents
+parents = data.frame(vic=0,Npars=0,par_rank=0)
+vics = setdiff(vic_ids,seeds)
+for (u in vics){
+ u_parents = edges[edges$to==u,]
+ u_parents = u_parents[order(u_parents$weight,decreasing=T),]
+ Nparents = dim(u_parents)[1]
+ infector = V(g)$infector[u]
+ infectorID = which(u_parents$from==infector)
+ parents[which(vics==u),] = c(u, Nparents, infectorID)
+}
+
+##### Get some summary statistics on how well
+median(parents$par_rank[parents$Npars>9])
+median(parents$par_rank[parents$Npars>99])
+
+
+
diff --git a/R Scripts/generate-network.R b/R Scripts/generate-network.R
index dc4a4f8..3b40969 100644
--- a/R Scripts/generate-network.R
+++ b/R Scripts/generate-network.R
@@ -9,7 +9,6 @@ delta = 0.15
# lmbda = 1/10
t_max = 1000
-# g = watts.strogatz.game(1, 100, 3, 0.25)
N = 5000
g = forest.fire.game(nodes=N, fw.prob=0.3, ambs=1, directed=F)
plot(g, vertex.size=5, vertex.label=NA)
@@ -31,7 +30,7 @@ for (day in 1:t_max){
infected = setdiff(infected,seeds) # don't try to infect seeds
inf.days = day + ceiling(alpha*rexp(length(infected),alpha))
V(g)$vic[infected] = TRUE
- infects = (inf.days < V(g)$vic.day[infected]) %in% c(NA,T)
+ infects = (inf.days <= V(g)$vic.day[infected]) %in% c(NA,T)
V(g)$vic.day[infected[infects]] = inf.days[infects]
V(g)$infector[infected[infects]] = vic
}
@@ -76,3 +75,5 @@ infectors = cbind(setdiff(vic_ids,seeds),
V(g)$infector[setdiff(vic_ids,seeds)],
recovered$infector[recovered$victim %in% setdiff(vic_ids,seeds)])
mean(infectors[,2]==infectors[,3])
+
+dag_dat_test[dag_dat_test$to==4984,] \ No newline at end of file
diff --git a/R Scripts/predict-victims-plots.R b/R Scripts/predict-victims-plots.R
new file mode 100644
index 0000000..8a93667
--- /dev/null
+++ b/R Scripts/predict-victims-plots.R
@@ -0,0 +1,61 @@
+##### Plot results
+hist(correct_rank3,150,xlim=c(0,vcount(lcc)),col=rgb(0,0,1,1/8),
+ xlab='Risk Ranking of Victims',main='')
+hist(correct_rank1,150,xlim=c(0,vcount(lcc)),col=rgb(1,0,1,1/8),add=T)
+hist(correct_rank2,150,xlim=c(0,vcount(lcc)),col=rgb(1,0,1,1/8),add=T)
+legend("topright", c("Demographics Model", "Cascade Model"),
+ fill=c(rgb(1,0,1,1/8), rgb(0,0,1,1/8)))
+
+counts = matrix(c(colSums(correct_rank<(vcount(lcc)/1000))*100/nvics,
+ colSums(correct_rank<(vcount(lcc)/200))*100/nvics,
+ colSums(correct_rank<(vcount(lcc)/100))*100/nvics),
+ nrow=3, byrow=T)
+plot(lambdas,counts[1,],log='x',type='l')
+
+correct_rank1 = correct_rank[,length(lambdas)]
+correct_rank2 = correct_rank[,1]
+correct_rank3 = correct_rank[,which.min(colMeans(correct_rank))]
+counts = matrix(c(sum(correct_rank1<(vcount(lcc)*0.001)),
+ sum(correct_rank1<(vcount(lcc)*0.005)),
+ sum(correct_rank1<(vcount(lcc)*0.01)),
+ sum(correct_rank2<(vcount(lcc)*0.001)),
+ sum(correct_rank2<(vcount(lcc)*0.005)),
+ sum(correct_rank2<(vcount(lcc)*0.01)),
+ sum(correct_rank3<(vcount(lcc)*0.001)),
+ sum(correct_rank3<(vcount(lcc)*0.005)),
+ sum(correct_rank3<(vcount(lcc)*0.01))),
+ nrow=3, byrow=T)
+counts = counts*100/nvics
+barplot(counts,
+ xlab="Size of High-Risk Population",
+ ylab="Percent of Victims Predicted",
+ names.arg=c('0.1%','0.5%','1%'),ylim=c(0,max(counts)*1.1),
+ col=c(rgb(0,0,1,1/2),rgb(1,0,0,1/2),rgb(0,1,0,1/2)),
+ beside=TRUE)
+legend("topleft", inset=0.05,
+ c("Demographics Model", "Cascade Model", "Combined Model"),
+ fill=c(rgb(0,0,1,1/2),rgb(1,0,0,1/2),rgb(0,1,0,1/2)))
+box(which='plot')
+par(new=T)
+counts = counts/(100/nvics)
+barplot(counts,
+ ylim=c(0,max(counts)*1.1),
+ col=c(rgb(0,0,1,0),rgb(1,0,0,0),rgb(0,1,0,0)),
+ beside=TRUE)
+axis(side = 4)
+mtext(side = 4, line = 3, "Number of Victims Predicted")
+
+popsizes = c(0.1, 0.5, 1)
+plot(popsizes,counts[1,],type='l',ylim=c(0,max(counts)))
+lines(popsizes,counts[2,])
+lines(popsizes,counts[3,])
+lines(c(0,1),c(0,1))
+
+#### Precision-Recall Curve
+plot(ecdf(correct_rank1),col='red',xlim=c(0,vcount(lcc)),lwd=2)
+plot(ecdf(correct_rank2),col='darkblue',lwd=2,add=T)
+plot(ecdf(correct_rank3),col='darkgreen',lwd=2,add=T)
+legend("bottomright", inset=0.05,
+ c("Demographics Model", "Cascade Model", "Combined Model"),
+ fill=c('red','darkblue','darkgreen'))
+lines(c(0,vcount(lcc)),c(0,1))
diff --git a/R Scripts/predict-victims.R b/R Scripts/predict-victims.R
new file mode 100644
index 0000000..470815d
--- /dev/null
+++ b/R Scripts/predict-victims.R
@@ -0,0 +1,67 @@
+library(igraph)
+setwd('~/Documents/Cascade Project')
+load('Raw Data/lcc.RData')
+load('Results/hyper-lcc.RData')
+load('Results/dag_dat_all.RData')
+source('criminal_cascades/R Scripts/temporal.R')
+source('criminal_cascades/R Scripts/structural.R')
+
+##### Initialize data
+formula = vic ~ sex + race + age + gang.member + gang.name
+lcc_verts$sex = as.factor(lcc_verts$sex)
+lcc_verts$race = as.factor(lcc_verts$race)
+lcc_verts$age = as.numeric(lcc_verts$age)
+lcc_verts$gang.name = as.factor(lcc_verts$gang.name)
+# sum(hyp_lcc_verts$vic)/length(days)
+
+alpha = 0.0028
+delta = 0.06
+days = sort(unique(hyp_lcc_verts$vic.day)) # 70:max(hyp_lcc_verts$vic.day, na.rm=T)
+lambdas = c(0,1)#c(0, exp(seq(log(0.0000001), log(.0005), length.out=150)), 1)
+nvics = sum(lcc_verts$vic)#sum(hyp_lcc_verts$vic.day %in% days)
+correct_rank = matrix(nrow=nvics, ncol=length(lambdas))
+edges_all = dag_dat_all
+
+##### Loop through days
+ptm = proc.time()
+for (day in days){
+ if (which(day==days) %% 100 == 0) print(day)
+
+ ##### Demographics model
+ vics = match(unique(hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day<day)]),lcc_verts$name)
+ victims = lcc_verts[,c('vic','sex','race','age','gang.member','gang.name')]
+ victims$vic[vics] = TRUE
+ victims$vic[-vics] = FALSE
+# glm.fit = glm(formula, data=victims, family=binomial)
+ glm.fit = lm(formula, data=victims)
+ glm.probs = predict(glm.fit, newdata=lcc_verts, type='response')
+
+ ##### Cascade Model
+ edges = edges_all[which(edges_all$t1<day),]
+ f = temporal(edges$t1, day, alpha)
+ h = structural(delta,edges$dist)
+ weights = f*h
+ ids = edges$to
+ irs = hyp_lcc_verts$ir_no[ids]
+ risk = data.frame(id=ids, ir=irs, weight=weights)
+ risk = risk[order(weights, decreasing=T),]
+ risk = risk[match(unique(risk$ir),risk$ir),]
+# maybe need to change this to reflect new algorithm that accounts for \tilde{p}
+
+ ##### Combined Model
+ combined = data.frame(ir=attr(glm.probs,'name'), dem=as.numeric(glm.probs), cas=0, comb=0)
+ combined$cas[match(risk$ir, attr(glm.probs,'name'))] = risk$weight
+
+ ##### Gather results
+ infected_irs = hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day==day)]
+ for (lambda in lambdas){
+ combined$comb = lambda*combined$dem + (1-lambda)*combined$cas
+ c_idx = which(lambdas==lambda)
+ r_idx = head(which(is.na(correct_rank[,c_idx])),length(infected_irs))
+ # !! order should be first: rank of (3,5,5,7) should be (1,2,2,4), may need to do n-rank
+ correct_rank[r_idx,c_idx] = match(infected_irs, combined$ir[order(combined$comb, decreasing=T)])
+ # maybe should also mark down vic/nonvic status of each?
+ }
+
+}
+print(proc.time()-ptm)