diff options
Diffstat (limited to 'R Scripts')
| -rwxr-xr-x | R Scripts/data-exploration.R | 4 | ||||
| -rw-r--r-- | R Scripts/demographics.R | 24 | ||||
| -rw-r--r-- | R Scripts/predict-victims-plots.R | 6 | ||||
| -rw-r--r-- | R Scripts/predict-victims.R | 98 |
4 files changed, 80 insertions, 52 deletions
diff --git a/R Scripts/data-exploration.R b/R Scripts/data-exploration.R index 404ce9e..11e33cf 100755 --- a/R Scripts/data-exploration.R +++ b/R Scripts/data-exploration.R @@ -1,6 +1,6 @@ library(igraph) -setwd("Documents/Cascade Project/Raw Data/") -load('chi-19mar2015.RData') +setwd("~/Documents/Violence Cascades/") +load('Results/hyper-lcc.RData') d = remove.edge.attribute(person,'weight') lcc = induced.subgraph(d,which(clusters(d)$membership==which.max(clusters(d)$csize))) diff --git a/R Scripts/demographics.R b/R Scripts/demographics.R new file mode 100644 index 0000000..349eda9 --- /dev/null +++ b/R Scripts/demographics.R @@ -0,0 +1,24 @@ +library(igraph) +setwd('~/Documents/Violence Cascades/') +load('Raw Data/lcc.RData') + +length(vic_ids) + +mean(lcc_verts$sex=="M") +mean(lcc_verts$sex[vic_ids]=="M") + +mean(lcc_verts$race=="BLK") + +mean(lcc_verts$race %in% c("WHI", "WWH")) + +mean(lcc_verts$gang.member) + +d = degree(lcc) +mean(d) + +vic_frac = rep(NA,vcount(lcc)) +for(i in 1:vcount(lcc)){ + if (i %% 10000==0) print(i) + neighbors = unlist(neighborhood(lcc, nodes=i, order=2, mode='all'))[-1] + vic_frac[i] = sum(lcc_verts$vic[neighbors])/length(neighbors) +}
\ No newline at end of file diff --git a/R Scripts/predict-victims-plots.R b/R Scripts/predict-victims-plots.R index 553aa89..b872201 100644 --- a/R Scripts/predict-victims-plots.R +++ b/R Scripts/predict-victims-plots.R @@ -1,5 +1,5 @@ ##### Plot results -hist(correct_rank1,150,xlim=c(0,vcount(lcc)),col=rgb(0,0,1,1/8), +hist(correct_rank2,150,xlim=c(0,vcount(lcc)),col=rgb(0,0,1,1/8), xlab='Risk Ranking of Victims',main='') hist(correct_rank1,150,xlim=c(0,vcount(lcc)),col=rgb(1,0,1,1/8),add=T) hist(correct_rank2,150,xlim=c(0,vcount(lcc)),col=rgb(0,0,1,1/8),add=T) @@ -15,7 +15,7 @@ plot(lambdas,counts[1,],log='x',type='l') correct_rank1 = correct_rank[,length(lambdas)] # demographics model correct_rank2 = correct_rank[,1] # cascade model correct_rank3 = correct_rank[,which.min(colMeans(correct_rank))] # best combined model -counts = matrix(c(sum(correct_rank1<(vcount(lcc)*0.001)), +counts = matrix(c( sum(correct_rank1<(vcount(lcc)*0.001)), sum(correct_rank1<(vcount(lcc)*0.005)), sum(correct_rank1<(vcount(lcc)*0.01)), sum(correct_rank2<(vcount(lcc)*0.001)), @@ -33,7 +33,7 @@ barplot(counts, col=c(rgb(0,0,1,1/2),rgb(1,0,0,1/2),rgb(0,1,0,1/2)), beside=TRUE) legend("topleft", inset=0.05, - c("Demographics Model", "Cascade Model", "Combined Model"), + c("Demographics (Logit)", "Cascades (Sum parent)", "Combined Model"), fill=c(rgb(0,0,1,1/2),rgb(1,0,0,1/2),rgb(0,1,0,1/2))) box(which='plot') diff --git a/R Scripts/predict-victims.R b/R Scripts/predict-victims.R index 2bda7e2..c662651 100644 --- a/R Scripts/predict-victims.R +++ b/R Scripts/predict-victims.R @@ -1,8 +1,9 @@ library(igraph) +library(data.table) library(foreach) library(doMC) registerDoMC(cores=4) -setwd('~/Documents/Cascade Project') +setwd('~/Documents/Violence Cascades/') load('Raw Data/lcc.RData') load('Results/hyper-lcc.RData') load('Results/dag_dat_all.RData') @@ -10,65 +11,68 @@ source('criminal_cascades/R Scripts/temporal.R') source('criminal_cascades/R Scripts/structural.R') ##### Initialize data -formula = vic ~ sex + race + age + gang.member #+ gang.name +formula = vic ~ sex + race + age + gang.member + gang.name lcc_verts$sex = as.factor(lcc_verts$sex) lcc_verts$race = as.factor(lcc_verts$race) lcc_verts$age = as.numeric(lcc_verts$age) lcc_verts$gang.name = as.factor(lcc_verts$gang.name) -# sum(hyp_lcc_verts$vic)/length(days) -df = data.frame(ir=lcc_verts$ir_no, dem=0, cas=0, comb=0) +dt = data.table(ir=lcc_verts$ir_no, dem=0, cas=0, comb=0) alpha = 0.0028 delta = 0.06 days = sort(unique(hyp_lcc_verts$vic.day)) # 70:max(hyp_lcc_verts$vic.day, na.rm=T) -lambdas = c(0,1)#c(0, exp(seq(log(0.0000001), log(.0005), length.out=150)), 1) -nvics = sum(hyp_lcc_verts$vic.day %in% days) -edges_all = dag_dat_all +days = split(days, ceiling(seq_along(days)/456)) +lambdas = c(0, exp(seq(log(0.0000001), log(.95), length.out=50)), 1) +nvics = sum(hyp_lcc_verts$vic) +edges_all = data.table(dag_dat_all) ##### Loop through days -writeLines(c(""), "Results/log.txt") -ptm = proc.time() -correct_rank = foreach (day = days, .combine=rbind) %dopar% { - if (which(day==days) %% 100 == 0){sink("Results/log.txt", append=TRUE);cat(paste("day:",day,"\n"))} - - ##### Demographics model - vics = match(unique(hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day<day)]),lcc_verts$name) - victims = lcc_verts[,c('vic','sex','race','age','gang.member','gang.name')] - victims$vic[vics] = TRUE - victims$vic[-vics] = FALSE - fit = lm(formula, data=victims) -# fit = glm(formula, data=victims, family=binomial) -# fit = randomForest(formula, data=victims[,1:5], ntree=100) - probs = predict(fit, newdata=lcc_verts, type='response') +correct_rank = c() +for(i in 1:length(days)){ + ptm = proc.time() + print(c(i,length(days))) + ds = unlist(days[i], use.names=F) + cr = foreach (day = ds, .combine=rbind) %dopar% { - ##### Cascade Model - edges = edges_all[which(edges_all$t1<day),] - f = temporal(edges$t1, day, alpha) - h = structural(delta,edges$dist) - weights = f*h - ids = edges$to - irs = hyp_lcc_verts$ir_no[ids] - risk = data.frame(id=ids, ir=irs, weight=weights) - risk = risk[order(weights, decreasing=T),] - risk = risk[match(unique(risk$ir),risk$ir),] -# maybe need to change this to reflect new algorithm that accounts for \tilde{p} - - ##### Combined Model - combined = df#data.frame(ir=attr(probs,'name'), dem=as.numeric(probs), cas=0, comb=0) - combined$dem[match(attr(probs,'name'), df$ir)] = as.numeric(probs) - combined$cas[match(risk$ir, attr(probs,'name'))] = risk$weight + ##### Demographics model + vics = match(unique(hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day<day)]),lcc_verts$name) + victims = lcc_verts[,c('vic','sex','race','age','gang.member','gang.name')] + victims$vic[vics] = TRUE + victims$vic[-vics] = FALSE +# fit = lm(formula, data=victims) + fit = glm(formula, data=victims, family=binomial) + # fit = randomForest(formula, data=victims[,1:5], ntree=100) + probs = predict(fit, newdata=lcc_verts, type='response') + + ##### Cascade Model + edges = edges_all[which(edges_all$t1<day),] + f = temporal(edges$t1, day, alpha) + h = structural(delta,edges$dist) + weights = f*h + ids = edges$to + irs = hyp_lcc_verts$ir_no[ids] + + risk = data.table(id=ids, ir=irs, weight=weights) + if (dim(risk)[1]>0) risk = risk[, list(weight=sum(weight)), by=ir] # max or sum - ##### Gather results - infected_irs = hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day==day)] - crday = matrix(nrow=length(infected_irs), ncol=length(lambdas)) - for (lambda in lambdas){ - combined$comb = lambda*combined$dem + (1-lambda)*combined$cas - c_idx = which(lambdas==lambda) - crday[,c_idx] = rank(-combined$comb,ties.method='average')[match(infected_irs,combined$ir)] + ##### Combined Model + combined = dt + combined$dem[match(attr(probs,'name'), dt$ir)] = as.numeric(probs) + combined$cas[match(risk$ir, dt$ir)] = risk$weight + + ##### Gather results + infected_irs = hyp_lcc_verts$ir_no[which(hyp_lcc_verts$vic.day==day)] + crday = matrix(nrow=length(infected_irs), ncol=length(lambdas)) + for (lambda in lambdas){ + combined$comb = lambda*combined$dem + (1-lambda)*combined$cas + c_idx = which(lambdas==lambda) + crday[,c_idx] = rank(-combined$comb,ties.method='average')[match(infected_irs,combined$ir)] + } + + return(crday) } - - return(crday) + correct_rank = rbind(correct_rank,cr) + print(proc.time()-ptm) } -print(proc.time()-ptm) # save(correct_rank, file='Results/correct_rank_62815.RData')
\ No newline at end of file |
