diff options
| field | value | date |
|---|---|---|
| author | Ben Green <ben@SEASITs-MacBook-Pro.local> | 2015-07-01 00:49:23 -0400 |
| committer | Ben Green <ben@SEASITs-MacBook-Pro.local> | 2015-07-01 00:49:23 -0400 |
| commit | 8e09ca6ca68c71bdab65525b529e2adfa281823c (patch) | |
| tree | d395bfcb0f9f0bc1092072ae1a8a9d3ad9c98a4a /R Scripts | |
| parent | 6e527bbf612465bf5d739b9652abc0165550993c (diff) | |
| download | criminal_cascades-8e09ca6ca68c71bdab65525b529e2adfa281823c.tar.gz | |
Got predict-victims running in parallel, drastically reducing the time
for each test. Also changed how we get the rankings of infected
individuals each day.
Diffstat (limited to 'R Scripts')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | R Scripts/find-parents.R | 15 |
| -rw-r--r-- | R Scripts/generate-network.R | 40 |
| -rw-r--r-- | R Scripts/predict-victims-plots.R | 9 |
| -rw-r--r-- | R Scripts/predict-victims.R | 39 |
4 files changed, 60 insertions, 43 deletions
diff --git a/R Scripts/find-parents.R b/R Scripts/find-parents.R index 3ec8809..023d7ba 100644 --- a/R Scripts/find-parents.R +++ b/R Scripts/find-parents.R @@ -6,8 +6,8 @@ # source('criminal_cascades/R Scripts/structural.R') ##### Initialize parameters based on what ml2 found -alpha = 0.061 -delta = 0.082 +alpha = 0.18 +delta = 0.09 ##### Get weights edges = dag_dat_test[!is.na(dag_dat_test$t2),] @@ -21,20 +21,27 @@ weights = p/p_tilde edges$weight = weights ##### Find most likely parents -parents = data.frame(vic=0,Npars=0,par_rank=0) +parents = data.frame(vic=0,Npars=0,par_rank=0,rand_rank=0) vics = setdiff(vic_ids,seeds) +print(length(vics)) for (u in vics){ + if(which(vics==u) %% 500 == 0) print(which(vics==u)) u_parents = edges[edges$to==u,] u_parents = u_parents[order(u_parents$weight,decreasing=T),] Nparents = dim(u_parents)[1] infector = V(g)$infector[u] infectorID = which(u_parents$from==infector) - parents[which(vics==u),] = c(u, Nparents, infectorID) + randID = sample(1:Nparents,1) + parents[which(vics==u),] = c(u, Nparents, infectorID, randID) } ##### Get some summary statistics on how well +mean(parents$par_rank==1) median(parents$par_rank[parents$Npars>9]) median(parents$par_rank[parents$Npars>99]) +edges[edges$to==2847,] +## baseline alg +# for each vic, find potential parents, pick one at random diff --git a/R Scripts/generate-network.R b/R Scripts/generate-network.R index 3b40969..dab81a4 100644 --- a/R Scripts/generate-network.R +++ b/R Scripts/generate-network.R @@ -3,20 +3,20 @@ setwd("~/Documents/Cascade Project/") source('criminal_cascades/R Scripts/temporal.R') source('criminal_cascades/R Scripts/structural.R') -alpha = 1/100 -beta = 0.02 -delta = 0.15 +alpha = 1/10 +beta = 0.01 +delta = 0.25 # lmbda = 1/10 t_max = 1000 N = 5000 g = forest.fire.game(nodes=N, fw.prob=0.3, ambs=1, directed=F) -plot(g, vertex.size=5, vertex.label=NA) +plot(g, vertex.size=3, vertex.label=NA) V(g)$seed = runif(vcount(g))<beta seeds = which(V(g)$seed) V(g)$vic = 
V(g)$seed -V(g)$vic.day[V(g)$seed] = sample(1:t_max, sum(V(g)$seed)) +V(g)$vic.day[V(g)$seed] = 1#sample(1:t_max, sum(V(g)$seed)) V(g)$spawn.date = 0 V(g)$infector = NA @@ -28,17 +28,19 @@ for (day in 1:t_max){ dists = as.numeric(shortest.paths(g,vic,neighbors)) infected = neighbors[which(runif(length(neighbors))<structural(delta, dists))] infected = setdiff(infected,seeds) # don't try to infect seeds - inf.days = day + ceiling(alpha*rexp(length(infected),alpha)) + inf.days = day + ceiling(rexp(length(infected),alpha)) + realized = ((inf.days <= V(g)$vic.day[infected]) %in% c(NA,T)) & (inf.days<=t_max) + infected = infected[realized] V(g)$vic[infected] = TRUE - infects = (inf.days <= V(g)$vic.day[infected]) %in% c(NA,T) - V(g)$vic.day[infected[infects]] = inf.days[infects] - V(g)$infector[infected[infects]] = vic + V(g)$vic.day[infected] = inf.days[realized] + V(g)$infector[infected] = vic } } vic_ids = which(V(g)$vic) +print(length(vic_ids)) cols = rep('lightblue',N); cols[V(g)$vic]='red'; cols[V(g)$seed]='darkred' -plot(g, vertex.size=5, vertex.label=NA, vertex.color=cols) +plot(g, vertex.size=3, vertex.label=NA, vertex.color=cols) ##### generate dag_dat dag_dat_test = data.frame(matrix(nrow=1,ncol=10)) @@ -68,12 +70,12 @@ rownames(dag_dat_test) = NULL write.csv(dag_dat_test, file='Results/dag_dat_test.csv') -##### analyze performance of recovery algorithm -recovered = read.csv('Results/infectors.csv',header=F,col.names=c('victim','infector')) -recovered = recovered[order(recovered$victim),] -infectors = cbind(setdiff(vic_ids,seeds), - V(g)$infector[setdiff(vic_ids,seeds)], - recovered$infector[recovered$victim %in% setdiff(vic_ids,seeds)]) -mean(infectors[,2]==infectors[,3]) - -dag_dat_test[dag_dat_test$to==4984,]
\ No newline at end of file +##### analyze performance of recovery algorithm ------ +# recovered = read.csv('Results/infectors.csv',header=F,col.names=c('victim','infector')) +# recovered = recovered[order(recovered$victim),] +# infectors = cbind(setdiff(vic_ids,seeds), +# V(g)$infector[setdiff(vic_ids,seeds)], +# recovered$infector[recovered$victim %in% setdiff(vic_ids,seeds)]) +# mean(infectors[,2]==infectors[,3]) +# +# dag_dat_test[dag_dat_test$to==4984,] diff --git a/R Scripts/predict-victims-plots.R b/R Scripts/predict-victims-plots.R index 8a93667..2ac62c8 100644 --- a/R Scripts/predict-victims-plots.R +++ b/R Scripts/predict-victims-plots.R @@ -2,7 +2,7 @@ hist(correct_rank3,150,xlim=c(0,vcount(lcc)),col=rgb(0,0,1,1/8), xlab='Risk Ranking of Victims',main='') hist(correct_rank1,150,xlim=c(0,vcount(lcc)),col=rgb(1,0,1,1/8),add=T) -hist(correct_rank2,150,xlim=c(0,vcount(lcc)),col=rgb(1,0,1,1/8),add=T) +hist(correct_rank2,150,xlim=c(0,vcount(lcc)),col=rgb(0,0,1,1/8),add=T) legend("topright", c("Demographics Model", "Cascade Model"), fill=c(rgb(1,0,1,1/8), rgb(0,0,1,1/8))) @@ -12,9 +12,9 @@ counts = matrix(c(colSums(correct_rank<(vcount(lcc)/1000))*100/nvics, nrow=3, byrow=T) plot(lambdas,counts[1,],log='x',type='l') -correct_rank1 = correct_rank[,length(lambdas)] -correct_rank2 = correct_rank[,1] -correct_rank3 = correct_rank[,which.min(colMeans(correct_rank))] +correct_rank1 = correct_rank[,length(lambdas)] # demographics model +correct_rank2 = correct_rank[,1] # cascade model +correct_rank3 = correct_rank[,which.min(colMeans(correct_rank))] # best combined model counts = matrix(c(sum(correct_rank1<(vcount(lcc)*0.001)), sum(correct_rank1<(vcount(lcc)*0.005)), sum(correct_rank1<(vcount(lcc)*0.01)), @@ -59,3 +59,4 @@ legend("bottomright", inset=0.05, c("Demographics Model", "Cascade Model", "Combined Model"), fill=c('red','darkblue','darkgreen')) lines(c(0,vcount(lcc)),c(0,1)) + diff --git a/R Scripts/predict-victims.R b/R Scripts/predict-victims.R index 
470815d..2bda7e2 100644 --- a/R Scripts/predict-victims.R +++ b/R Scripts/predict-victims.R @@ -1,4 +1,7 @@ library(igraph) +library(foreach) +library(doMC) +registerDoMC(cores=4) setwd('~/Documents/Cascade Project') load('Raw Data/lcc.RData') load('Results/hyper-lcc.RData') @@ -7,34 +10,36 @@ source('criminal_cascades/R Scripts/temporal.R') source('criminal_cascades/R Scripts/structural.R') ##### Initialize data -formula = vic ~ sex + race + age + gang.member + gang.name +formula = vic ~ sex + race + age + gang.member #+ gang.name lcc_verts$sex = as.factor(lcc_verts$sex) lcc_verts$race = as.factor(lcc_verts$race) lcc_verts$age = as.numeric(lcc_verts$age) lcc_verts$gang.name = as.factor(lcc_verts$gang.name) # sum(hyp_lcc_verts$vic)/length(days) +df = data.frame(ir=lcc_verts$ir_no, dem=0, cas=0, comb=0) alpha = 0.0028 delta = 0.06 days = sort(unique(hyp_lcc_verts$vic.day)) # 70:max(hyp_lcc_verts$vic.day, na.rm=T) lambdas = c(0,1)#c(0, exp(seq(log(0.0000001), log(.0005), length.out=150)), 1) -nvics = sum(lcc_verts$vic)#sum(hyp_lcc_verts$vic.day %in% days) -correct_rank = matrix(nrow=nvics, ncol=length(lambdas)) +nvics = sum(hyp_lcc_verts$vic.day %in% days) edges_all = dag_dat_all ##### Loop through days +writeLines(c(""), "Results/log.txt") ptm = proc.time() -for (day in days){ - if (which(day==days) %% 100 == 0) print(day) - +correct_rank = foreach (day = days, .combine=rbind) %dopar% { + if (which(day==days) %% 100 == 0){sink("Results/log.txt", append=TRUE);cat(paste("day:",day,"\n"))} + ##### Demographics model vics = match(unique(hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day<day)]),lcc_verts$name) victims = lcc_verts[,c('vic','sex','race','age','gang.member','gang.name')] victims$vic[vics] = TRUE victims$vic[-vics] = FALSE -# glm.fit = glm(formula, data=victims, family=binomial) - glm.fit = lm(formula, data=victims) - glm.probs = predict(glm.fit, newdata=lcc_verts, type='response') + fit = lm(formula, data=victims) +# fit = glm(formula, data=victims, 
family=binomial) +# fit = randomForest(formula, data=victims[,1:5], ntree=100) + probs = predict(fit, newdata=lcc_verts, type='response') ##### Cascade Model edges = edges_all[which(edges_all$t1<day),] @@ -49,19 +54,21 @@ for (day in days){ # maybe need to change this to reflect new algorithm that accounts for \tilde{p} ##### Combined Model - combined = data.frame(ir=attr(glm.probs,'name'), dem=as.numeric(glm.probs), cas=0, comb=0) - combined$cas[match(risk$ir, attr(glm.probs,'name'))] = risk$weight + combined = df#data.frame(ir=attr(probs,'name'), dem=as.numeric(probs), cas=0, comb=0) + combined$dem[match(attr(probs,'name'), df$ir)] = as.numeric(probs) + combined$cas[match(risk$ir, attr(probs,'name'))] = risk$weight ##### Gather results infected_irs = hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day==day)] + crday = matrix(nrow=length(infected_irs), ncol=length(lambdas)) for (lambda in lambdas){ combined$comb = lambda*combined$dem + (1-lambda)*combined$cas c_idx = which(lambdas==lambda) - r_idx = head(which(is.na(correct_rank[,c_idx])),length(infected_irs)) - # !! order should be first: rank of (3,5,5,7) should be (1,2,2,4), may need to do n-rank - correct_rank[r_idx,c_idx] = match(infected_irs, combined$ir[order(combined$comb, decreasing=T)]) - # maybe should also mark down vic/nonvic status of each? + crday[,c_idx] = rank(-combined$comb,ties.method='average')[match(infected_irs,combined$ir)] } - + + return(crday) } print(proc.time()-ptm) + +# save(correct_rank, file='Results/correct_rank_62815.RData')
\ No newline at end of file
