From 717e7f2edda52100406d17826bd617b7915c5daf Mon Sep 17 00:00:00 2001 From: Ben Green Date: Wed, 9 Sep 2015 20:27:12 -0400 Subject: updated R scripts --- R Scripts/data-prep.R | 69 +++++++++++++++++++++++++++++---- R Scripts/fit-background.R | 10 ++--- R Scripts/generate-dag-dat-unweighted.R | 14 +++---- R Scripts/sim-analysis.R | 11 ++++++ R Scripts/weight-vs-victims.R | 10 +++++ 5 files changed, 94 insertions(+), 20 deletions(-) create mode 100644 R Scripts/weight-vs-victims.R diff --git a/R Scripts/data-prep.R b/R Scripts/data-prep.R index 2994575..127acca 100755 --- a/R Scripts/data-prep.R +++ b/R Scripts/data-prep.R @@ -8,10 +8,7 @@ setwd('~/Documents/Violence Cascades/Raw Data/') #load all three sets of data arrests <- read.csv("2006to2014arrests2.csv", header=T, colClass=c("character")) -#I need to add the "ir" for this to make sense when I "project" -arrests$ir2 <- paste("ir", arrests$ir_no) - -## Match arrests based on date, time, and location +## Match arrest records (RD) based on date, time, and location a = arrests[arrests$rd_no=='',] dtab = table(a$arrest_date) dates = attr(dtab,'name')[dtab>1] @@ -29,6 +26,64 @@ for (date in dates){ # now make unique rd_nos for the other people arrested alone null_rds = which(arrests$rd_no=='') arrests$rd_no[null_rds] = paste('rd',null_rds) + +# clean up entries with null birthdate +null_bdate = "1/1/1900 0:00:00" +a = arrests[arrests$birth_date == null_bdate,] +for (i in 1:dim(a)[1]){ + if(i%%200==0)print(i) + ir = a$ir_no[i] + arr = arrests[arrests$ir_no==ir,] + arr = arr[arr$birth_date != null_bdate,] + if(dim(arr)[1]>0){ + arrests$birth_date[as.numeric(rownames(a[i,]))] = names(which.max(table(arr$birth_date))) + arrests$o_street_nme[as.numeric(rownames(a[i,]))] = names(which.max(table(arr$o_street_nme))) + } +} +arrests = arrests[arrests$birth_date!=null_bdate,] + +# Find individual records (IR) based on birthday, sex, race, address +a = arrests[arrests$ir_no=='',] +for (i in 1:dim(a)[1]){ + if(i%%200==0) print(i) + bdate = a$birth_date[i] + sex = a$sex_code_cd[i] + race = a$race_code_cd[i] + arr = arrests[arrests$birth_date==bdate,] + arr = arr[arr$race_code_cd==race,] + arr = arr[arr$sex_code_cd==sex,] + if (dim(arr)[1]>1){ + street = a$o_street_nme[i] + arr = arr[arr$o_street_nme==street,] + } + arr = arr[arr$ir_no != '',] + if (dim(arr)[1]>0){ + arrests$ir_no[match(rownames(a[i,]),rownames(arrests))] = as.numeric(names(which.max(table(arr$ir_no)))) + } +} +# fill IRs for the rest of people +a = arrests[arrests$ir_no=='',] +for (i in 1:dim(a)[1]){ + if(i%%200==0) print(i) + if (arrests$ir_no[match(rownames(a[i,]),rownames(arrests))]==''){ + bdate = a$birth_date[i] + sex = a$sex_code_cd[i] + race = a$race_code_cd[i] + arr = arrests[arrests$birth_date==bdate,] + arr = arr[arr$race_code_cd==race,] + arr = arr[arr$sex_code_cd==sex,] + if (dim(arr)[1]>1){ + street = a$o_street_nme[i] + arr = arr[arr$o_street_nme==street,] + } + arrests$ir_no[match(rownames(arr),rownames(arrests))] = 10000000+i + } +} + +#I need to add the "ir" for this to make sense when I "project" +arrests$ir2 <- paste("ir", arrests$ir_no) + +# save altered arrests data save(arrests,file='arrests.RData') #===================== @@ -102,7 +157,6 @@ sub.arrests = sub.arrests[order(sub.arrests$dates),] #=================================================================== - # get victim attributes shootings <- read.csv("shooting-data-withdate2.csv", header = T) victims = shootings[shootings$INV_PARTY_TYPE_CD=="VIC",] @@ -183,12 +237,11 @@ V(person)$gang.name <- as.character(gnames) V(person)$faction.name <- as.character(gangs$FACTION_NAME[match_vector]) #=================================================================== -# create id number # save data -# person = remove.edge.attribute(person,'weight') +person = remove.edge.attribute(person,'weight') # person_data = get.data.frame(person,'both') -save(person, file="chi-19aug2015.RData") +save(person, file="chi-9sep2015.RData") #=================================================================== # get LCC of the network diff --git a/R Scripts/fit-background.R b/R Scripts/fit-background.R index d385e7f..d1799cd 100644 --- a/R Scripts/fit-background.R +++ b/R Scripts/fit-background.R @@ -17,16 +17,16 @@ counts = as.vector(t) infs = data.frame(days,counts) # define background function -fit = function(x, mu, A, phi) {mu + A*(sin((pi/365)*x+phi))^2} -fit_form = counts ~ mu + A*(sin((pi/365)*days+phi))^2 +fit = function(x, lambda, A, phi) {lambda + A*(sin((pi/365)*x+phi))^2} +fit_form = counts ~ lambda + A*(sin((pi/365)*days+phi))^2 # explore data plot(days,counts) -curve(fit(x, mu=1, A=12, phi=2.8), add=TRUE ,lwd=2, col="steelblue") +curve(fit(x, lambda=1, A=12, phi=2.8), add=TRUE ,lwd=2, col="steelblue") -res = nls(formula=fit_form, data=infs, start=list(mu=1, A=12, phi=2.8)) +res = nls(formula=fit_form, data=infs, start=list(lambda=1, A=12, phi=2.8)) co = coef(res); co plot(t) plot(days,counts,pch=20,cex=0.4,xlab='Day',ylab='Number of Infections') -curve(fit(x, mu=co["mu"], A=co["A"], phi=co["phi"]), add=TRUE ,lwd=5, col="steelblue") +curve(fit(x, lambda=co["lambda"], A=co["A"], phi=co["phi"]), add=TRUE ,lwd=5, col="steelblue") diff --git a/R Scripts/generate-dag-dat-unweighted.R b/R Scripts/generate-dag-dat-unweighted.R index 61db26c..06988b9 100755 --- a/R Scripts/generate-dag-dat-unweighted.R +++ b/R Scripts/generate-dag-dat-unweighted.R @@ -1,12 +1,12 @@ library(igraph) -setwd("~/Documents/Violence Cascades/") -load('Raw Data/lcc.RData') +setwd("~/Documents/Violence Cascades/Raw Data/") +load('lcc.RData') library(foreach) library(doMC) registerDoMC(cores=4) -lcc2 = remove.edge.attribute(lcc,'weight') +lcc2 = lcc#remove.edge.attribute(lcc,'weight') vics = split(vic_ids, ceiling(seq_along(vic_ids)/500)) dag_dat_lcc = c() for(i in 1:length(vics)){ @@ -43,8 +43,8 @@ for(i in 1:length(vics)){ colnames(dag_dat_lcc) = c('from','to','t1','dist') rownames(dag_dat_lcc) = NULL -save(dag_dat_lcc, file='Results/dag_dat_lcc.RData') -write.csv(dag_dat_lcc, file='Results/dag_dat_lcc.csv') +save(dag_dat_lcc, file='dag_dat_lcc.RData') +write.csv(dag_dat_lcc, file='dag_dat_lcc.csv') # dag_dat_vics = dag_dat_lcc[!is.na(dag_dat_lcc$t2),] # save(dag_dat_lcc_vics, file='Results/dag_dat_lcc_vics.RData') @@ -55,5 +55,5 @@ write.csv(dag_dat_lcc, file='Results/dag_dat_lcc.csv') vic_times_lcc = lcc_verts[,c('name','nonfatal_day_1','nonfatal_day_2', 'nonfatal_day_3','nonfatal_day_4', 'nonfatal_day_5','fatal_day')] -save(vic_times_lcc, file='Results/vic_times_lcc.RData') -write.csv(vic_times_lcc, file='Results/vic_times_lcc.csv') +save(vic_times_lcc, file='vic_times_lcc.RData') +write.csv(vic_times_lcc, file='vic_times_lcc.csv') diff --git a/R Scripts/sim-analysis.R b/R Scripts/sim-analysis.R index af280cc..92c3d26 100755 --- a/R Scripts/sim-analysis.R +++ b/R Scripts/sim-analysis.R @@ -85,3 +85,14 @@ dates = c(V(lcc)$fatal_date[!is.na(V(lcc)$fatal_date)], dates = as.Date(dates) hist(dates, breaks='months',freq=T,col='lightblue',format='%m/%Y', xlab='Month',ylab='Number of Shootings',main='') + +### data vs sim hist comparison +hist(vic.time.data,breaks=seq(0,3000,by=10),col=rgb(0,0,1,1/2),freq=F, + xlim=c(0,max(vic.time,vic.time.data)), + xlab='Days between infections',main='') +hist(vic.time,breaks=seq(0,3000,by=10),col=rgb(1,0,0,1/2),add=T,freq=F) +legend("topright", inset=0.05, + c("Data", "Simulation"), + fill=c(rgb(0,0,1,1/2),rgb(1,0,0,1/2))) +box(which='plot') + diff --git a/R Scripts/weight-vs-victims.R b/R Scripts/weight-vs-victims.R new file mode 100644 index 0000000..da60df2 --- /dev/null +++ b/R Scripts/weight-vs-victims.R @@ -0,0 +1,10 @@ +# testing to see if pairs connected by high weight edges are more likely to both be +# victims than pairs connected by low weight edges. This would provide some evidence +# that high weight edges are important for transmitting infections. + +v = rep(0,max(lcc_edges$weight)) +for(w in 1:max(lcc_edges$weight)){ + es = lcc_edges[lcc_edges$weight==w,1:2] + v[w] = mean(lcc_verts$vic[as.numeric(es$from)] & lcc_verts$vic[as.numeric(es$to)]) +} +plot(v) \ No newline at end of file -- cgit v1.2.3-70-g09d2