From 58faa01748fe0e6f6d040d1296266d17bd7a3543 Mon Sep 17 00:00:00 2001 From: Ben Green Date: Mon, 14 Sep 2015 23:19:34 -0400 Subject: prediction and plotting cascades --- R Scripts/analyze-cascades.R | 7 ++--- R Scripts/data-prep.R | 56 +++++++++++++++++++++++++++++++++------ R Scripts/predict-victims-plots.R | 35 ++++++++++++------------ R Scripts/predict-victims.R | 8 +++--- R Scripts/prior-arrests-by-day.R | 2 -- 5 files changed, 72 insertions(+), 36 deletions(-) (limited to 'R Scripts') diff --git a/R Scripts/analyze-cascades.R b/R Scripts/analyze-cascades.R index 463de18..e758d60 100755 --- a/R Scripts/analyze-cascades.R +++ b/R Scripts/analyze-cascades.R @@ -21,7 +21,7 @@ order = rev(order(csize)) plot(sizes,counts,log='xy',type='o',lwd=3, xlab='Size of Cascade', ylab='Number of Cascades', main='Distribution of Cascade Sizes') -i = 14 +i = 4 V = which(clusters(dag)$membership==order[i]) # get all nodes in cluster cc = induced.subgraph(dag,V) Vi = vic_ids[V] @@ -35,8 +35,9 @@ cols[seed] = 'red' plot(cc,vertex.size=10,edge.arrow.size=0.5,vertex.color=cols,vertex.label.cex=1, edge.width=E(cc)$weight*20/max(E(cc)$weight),layout=layout.reingold.tilford(cc,root=seed), vertex.label=V(hyp_lcc)$vic.day[Vi]) -# vertex.label=as.Date(V(hyp_lcc)$vic.day[Vi], format ='%m/%d/%y')) -# vertex.label=V(lcc)$vic_date[Vi]) +plot(cc,vertex.size=10,edge.arrow.size=0.5,vertex.color=cols,vertex.label.cex=1, + layout=layout.reingold.tilford(cc,root=seed)) + ### basic graph statistics trl = mean(transitivity(cc,type='local',isolates='zero')) diff --git a/R Scripts/data-prep.R b/R Scripts/data-prep.R index 127acca..ca2fdc2 100755 --- a/R Scripts/data-prep.R +++ b/R Scripts/data-prep.R @@ -80,6 +80,18 @@ for (i in 1:dim(a)[1]){ } } +# clean up entries where sex is missing +a = arrests[arrests$sex_code_cd=='X',] +for (i in 1:dim(a)[1]){ + ir = a$ir_no[i] + arr = arrests[arrests$ir_no==ir,] + arr = arr[arr$sex_code_cd != 'X',] + if(dim(arr)[1]>0){ + arrests$sex_code_cd[as.numeric(rownames(a[i,]))] = names(which.max(table(arr$sex_code_cd))) + } +} +arrests$sex_code_cd[arrests$sex_code_cd=='X'] = 'M' + #I need to add the "ir" for this to make sense when I "project" arrests$ir2 <- paste("ir", arrests$ir_no) @@ -171,6 +183,17 @@ murders = murders[match(unique(murders$VICTIM_IR_NO),murders$VICTIM_IR_NO),] murders = murders[as.Date(murders$INJURY_DATE,format='%m/%d/%y')>=start_date,] murders$ir2 = paste("ir", murders$VICTIM_IR_NO) +# clear nonfatals that led to death +v = victims[victims$IR_NO %in% murders$VICTIM_IR_NO,] +rows = c() +for(i in 1:dim(v)[1]){ + row = which(rownames(victims)==as.numeric(rownames(v[i,]))) + m = murders[murders$VICTIM_IR_NO==v$IR_NO[i],] + dup = as.Date(v$INCIDENT_DATE[i],format='%m/%d/%y') %in% as.Date(m$INJURY_DATE,format='%m/%d/%y') + if(dup==T) rows = c(rows,row) +} +victims = victims[-rows,] + # set victim data in network vtab = as.data.frame(table(victims$ir2)) match_vector = match(V(person)$name,vtab$Var1) @@ -193,7 +216,8 @@ for(i in 1:length(vics)){ if (i%%3000==0) print(i) name = vics[i] ids = which(match_vector==name) - dates = sort(as.Date(victims$INCIDENT_DATE[ids],format='%m/%d/%y')) + dates = unique(sort(as.Date(victims$INCIDENT_DATE[ids],format='%m/%d/%y'))) +# if(!is.na(V(person)$fatal_date[i])) dates = dates[dates != V(person)$fatal_date[ids]] nfd1[i] = as.character(dates[1]) nfd2[i] = as.character(dates[2]) nfd3[i] = as.character(dates[3]) @@ -211,6 +235,7 @@ V(person)$nonfatal_date_3[vics] = nfd3 V(person)$nonfatal_date_4[vics] = nfd4 V(person)$nonfatal_date_5[vics] = nfd5 + # convert dates into numeric values ("days") start_date V(person)$fatal_day = as.numeric(as.Date(V(person)$fatal_date)-start_date) @@ -223,25 +248,40 @@ V(person)$nonfatal_day_5 = as.numeric(as.Date(V(person)$nonfatal_date_5)-start_d #=================================================================== # set gang attributes gangs <- read.csv("Sept2014-ganglist.csv", header=T) -gangs = gangs[match(unique(gangs$IR_NO),gangs$IR_NO),] gangs$ir2 <- paste("ir", gangs$IR_NO) +t = table(gangs$IR_NO) +t = t[t>1] +irs = as.numeric(attr(t,'name')) +for(ir in irs){ + if(which(ir==irs)%%1000==0)print(which(ir==irs)) + g = gangs[gangs$IR_NO==ir,] + gangs$GANG_NAME[as.numeric(rownames(g))] = names(which.max(table(g$GANG_NAME))) +} + +gangs = gangs[match(unique(gangs$IR_NO),gangs$IR_NO),] +gnames = as.character(gangs$GANG_NAME) +gnames[is.na(gnames)] = 'Unknown' + V(person)$gang.member <- V(person)$name %in% gangs$ir2 match_vector = match(V(person)$name, gangs$ir2) -gnames = gangs$GANG_NAME[match_vector] -gnames = as.character(gnames) -gnames[V(person)$gang.member==''] = 'Unknown' +gnames = gnames[match_vector] gnames[V(person)$gang.member==F] = 'None' V(person)$gang.name <- as.character(gnames) -V(person)$faction.name <- as.character(gangs$FACTION_NAME[match_vector]) +# V(person)$faction.name <- as.character(gangs$FACTION_NAME[match_vector]) + +# clean up later to make this fit with process +t = table(V(person)$gang.name) +gs = names(t)[t<50] +V(person)$gang.name[V(person)$gang.name %in% gs] = 'Unknown' #=================================================================== # save data person = remove.edge.attribute(person,'weight') # person_data = get.data.frame(person,'both') -save(person, file="chi-9sep2015.RData") +save(person, file="chi-14sep2015.RData") #=================================================================== # get LCC of the network @@ -254,7 +294,7 @@ lcc_edges = as_data_frame(lcc,'edges') # update lcc_verts lcc_verts = get.data.frame(lcc,'vertices') -lcc_verts = lcc_verts[,c(1,23,24,2:22)] +lcc_verts = lcc_verts[,c(1,23,24,2:21)] # save file save(lcc, lcc_verts, lcc_edges, vic_ids, file="lcc.RData") diff --git a/R Scripts/predict-victims-plots.R b/R Scripts/predict-victims-plots.R index 0655d28..dee3cdc 100644 --- a/R Scripts/predict-victims-plots.R +++ b/R Scripts/predict-victims-plots.R @@ -1,27 +1,32 @@ ##### Plot results +# load('Results/correct_rank_91415.RData') +nvics = dim(correct_rank)[1] correct_rank1 = correct_rank[,length(lambdas)] # demographics model correct_rank2 = correct_rank[,1] # cascade model correct_rank3 = correct_rank[,which.min(colMeans(correct_rank))] # best combined model -counts = matrix(c( sum(correct_rank1<(vcount(lcc)*0.0005)), - sum(correct_rank1<(vcount(lcc)*0.001)), - sum(correct_rank1<(vcount(lcc)*0.01)), - sum(correct_rank2<(vcount(lcc)*0.0005)), - sum(correct_rank2<(vcount(lcc)*0.001)), - sum(correct_rank2<(vcount(lcc)*0.01)), - sum(correct_rank3<(vcount(lcc)*0.0005)), - sum(correct_rank3<(vcount(lcc)*0.001)), - sum(correct_rank3<(vcount(lcc)*0.01))), +popsizes = c(0.1,0.5,1.0)/100 +vcount(lcc)*popsizes +counts = matrix(c( sum(correct_rank1<(vcount(lcc)*popsizes[1])), + sum(correct_rank1<(vcount(lcc)*popsizes[2])), + sum(correct_rank1<(vcount(lcc)*popsizes[3])), + sum(correct_rank2<(vcount(lcc)*popsizes[1])), + sum(correct_rank2<(vcount(lcc)*popsizes[2])), + sum(correct_rank2<(vcount(lcc)*popsizes[3])), + sum(correct_rank3<(vcount(lcc)*popsizes[1])), + sum(correct_rank3<(vcount(lcc)*popsizes[2])), + sum(correct_rank3<(vcount(lcc)*popsizes[3]))), nrow=3, byrow=T) counts = counts*100/nvics barplot(counts, xlab="Size of High-Risk Population", ylab="Percent of Victims Predicted", - names.arg=c('0.1%','0.5%','1%'),ylim=c(0,max(counts)*1.1), + names.arg=paste(as.character(popsizes*100),'%',sep=''), + ylim=c(0,max(counts)*1.1), col=c(rgb(0,0,1,1/2),rgb(1,0,0,1/2),rgb(0,1,0,1/2)), beside=TRUE) legend("topleft", inset=0.05, - c("Demographics (Logit)", "Cascades (Sum parent)", "Combined Model"), + c("Demographics", "Cascades", "Combined Model"), fill=c(rgb(0,0,1,1/2),rgb(1,0,0,1/2),rgb(0,1,0,1/2))) box(which='plot') @@ -34,18 +39,12 @@ barplot(counts, axis(side = 4) mtext(side = 4, line = 3, "Number of Victims Predicted") -popsizes = c(0.1, 0.5, 1) -plot(popsizes,counts[1,],type='l',ylim=c(0,max(counts))) -lines(popsizes,counts[2,]) -lines(popsizes,counts[3,]) -lines(c(0,1),c(0,1)) #### Precision-Recall Curve -plot(ecdf(correct_rank1),col='red',xlim=c(0,50),lwd=2) +plot(ecdf(correct_rank1),col='red',lwd=2,xlim=c(1,100)) plot(ecdf(correct_rank2),col='darkblue',lwd=2,add=T) plot(ecdf(correct_rank3),col='darkgreen',lwd=2,add=T) legend("bottomright", inset=0.05, c("Demographics Model", "Cascade Model", "Combined Model"), fill=c('red','darkblue','darkgreen')) -lines(c(0,vcount(lcc)),c(0,1)) diff --git a/R Scripts/predict-victims.R b/R Scripts/predict-victims.R index fc6cb7f..ccf894a 100644 --- a/R Scripts/predict-victims.R +++ b/R Scripts/predict-victims.R @@ -47,7 +47,7 @@ days = Reduce(union, list(lcc_verts$fatal_day,lcc_verts$nonfatal_day_1, lcc_verts$nonfatal_day_4,lcc_verts$nonfatal_day_5)) days = days[!is.na(days)] days = sort(days) -days = split(days, ceiling(seq_along(days)/456)) +days = split(days, ceiling(seq_along(days)/92)) lambdas = c(0, exp(seq(log(0.0000001), log(.95), length.out=100)), 1) ##### Loop through days @@ -60,9 +60,7 @@ for(i in 1:length(days)){ ##### Demographics model victims = lcc_verts - vics = which(victims$vic.day0) risk = risk[, list(weight=sum(weight)), by=ir] # max or sum + if (dim(risk)[1]>0) risk = risk[, list(weight=sum(weight)), by=ir] ##### Combined Model combined = dt @@ -108,4 +106,4 @@ for(i in 1:length(days)){ print(proc.time()-ptm) } -# save(correct_rank, file='Results/correct_rank_62815.RData') \ No newline at end of file +# save(correct_rank, file='Results/correct_rank_91415.RData') \ No newline at end of file diff --git a/R Scripts/prior-arrests-by-day.R b/R Scripts/prior-arrests-by-day.R index 30d458c..604d85c 100644 --- a/R Scripts/prior-arrests-by-day.R +++ b/R Scripts/prior-arrests-by-day.R @@ -10,8 +10,6 @@ arrests$arrest_date = as.Date(arrests$arrest_date,format='%m/%d/%Y') arrests$arrest_day = as.numeric(arrests$arrest_date - start_date) arrests$id = match(arrests$ir2, lcc_verts$ir_no) -arrest_days = function(arr,i){return(arr$arrest_day[arr$id==i)} - prior_arrests = sapply(1:vcount(lcc),function(x) NULL) for(i in 1:vcount(lcc)){ if(i%%10000==0) print(i) -- cgit v1.2.3-70-g09d2