From 717e7f2edda52100406d17826bd617b7915c5daf Mon Sep 17 00:00:00 2001
From: Ben Green <bgreen@g.harvard.edu>
Date: Wed, 9 Sep 2015 20:27:12 -0400
Subject: updated R scripts

---
 R Scripts/data-prep.R                   | 69 +++++++++++++++++++++++++++++----
 R Scripts/fit-background.R              | 10 ++---
 R Scripts/generate-dag-dat-unweighted.R | 14 +++----
 R Scripts/sim-analysis.R                | 11 ++++++
 R Scripts/weight-vs-victims.R           | 10 +++++
 5 files changed, 94 insertions(+), 20 deletions(-)
 create mode 100644 R Scripts/weight-vs-victims.R

diff --git a/R Scripts/data-prep.R b/R Scripts/data-prep.R
index 2994575..127acca 100755
--- a/R Scripts/data-prep.R	
+++ b/R Scripts/data-prep.R	
@@ -8,10 +8,7 @@ setwd('~/Documents/Violence Cascades/Raw Data/')
 #load all three sets of data
 arrests <- read.csv("2006to2014arrests2.csv", header=T, colClass=c("character"))
 
-#I need to add the "ir" for this to make sense when I "project"
-arrests$ir2 <- paste("ir", arrests$ir_no)
-
-## Match arrests based on date, time, and location
+## Match arrest records (RD) based on date, time, and location
 a = arrests[arrests$rd_no=='',]
 dtab = table(a$arrest_date)
 dates = attr(dtab,'name')[dtab>1]
@@ -29,6 +26,64 @@ for (date in dates){
 # now make unique rd_nos for the other people arrested alone
 null_rds = which(arrests$rd_no=='')
 arrests$rd_no[null_rds] = paste('rd',null_rds)
+
+# clean up entries with null birthdate
+null_bdate = "1/1/1900 0:00:00"
+a = arrests[arrests$birth_date == null_bdate,]
+for (i in 1:dim(a)[1]){
+  if(i%%200==0)print(i)
+  ir = a$ir_no[i]
+  arr = arrests[arrests$ir_no==ir,]
+  arr = arr[arr$birth_date != null_bdate,]
+  if(dim(arr)[1]>0){
+    arrests$birth_date[as.numeric(rownames(a[i,]))] = names(which.max(table(arr$birth_date)))
+    arrests$o_street_nme[as.numeric(rownames(a[i,]))] = names(which.max(table(arr$o_street_nme)))
+  }
+} 
+arrests = arrests[arrests$birth_date!=null_bdate,]
+
+# Find individual records (IR) based on birthday, sex, race, address
+a = arrests[arrests$ir_no=='',]
+for (i in 1:dim(a)[1]){
+  if(i%%200==0) print(i)
+  bdate = a$birth_date[i]
+  sex = a$sex_code_cd[i]
+  race = a$race_code_cd[i]
+  arr = arrests[arrests$birth_date==bdate,]
+  arr = arr[arr$race_code_cd==race,]
+  arr = arr[arr$sex_code_cd==sex,]
+  if (dim(arr)[1]>1){
+    street = a$o_street_nme[i]
+    arr = arr[arr$o_street_nme==street,]
+  }
+  arr = arr[arr$ir_no != '',]
+  if (dim(arr)[1]>0){
+    arrests$ir_no[match(rownames(a[i,]),rownames(arrests))] = as.numeric(names(which.max(table(arr$ir_no))))
+  }
+}
+# fill IRs for the rest of people 
+a = arrests[arrests$ir_no=='',]
+for (i in 1:dim(a)[1]){
+  if(i%%200==0) print(i)
+  if (arrests$ir_no[match(rownames(a[i,]),rownames(arrests))]==''){
+    bdate = a$birth_date[i]
+    sex = a$sex_code_cd[i]
+    race = a$race_code_cd[i]
+    arr = arrests[arrests$birth_date==bdate,]
+    arr = arr[arr$race_code_cd==race,]
+    arr = arr[arr$sex_code_cd==sex,]
+    if (dim(arr)[1]>1){
+      street = a$o_street_nme[i]
+      arr = arr[arr$o_street_nme==street,]
+    }
+    arrests$ir_no[match(rownames(arr),rownames(arrests))] = 10000000+i
+  }
+}
+
+#I need to add the "ir" for this to make sense when I "project"
+arrests$ir2 <- paste("ir", arrests$ir_no)
+
+# save altered arrests data
 save(arrests,file='arrests.RData')
 
 #=====================
@@ -102,7 +157,6 @@ sub.arrests = sub.arrests[order(sub.arrests$dates),]
 
 #===================================================================
 
-
 # get victim attributes
 shootings <- read.csv("shooting-data-withdate2.csv", header = T)
 victims = shootings[shootings$INV_PARTY_TYPE_CD=="VIC",]
@@ -183,12 +237,11 @@ V(person)$gang.name <- as.character(gnames)
 V(person)$faction.name <- as.character(gangs$FACTION_NAME[match_vector])
 
 #===================================================================
-# create id number
 
 # save data
-# person = remove.edge.attribute(person,'weight')
+person = remove.edge.attribute(person,'weight')
 # person_data = get.data.frame(person,'both')
-save(person, file="chi-19aug2015.RData")
+save(person, file="chi-9sep2015.RData")
 
 #===================================================================
 # get LCC of the network
diff --git a/R Scripts/fit-background.R b/R Scripts/fit-background.R
index d385e7f..d1799cd 100644
--- a/R Scripts/fit-background.R	
+++ b/R Scripts/fit-background.R	
@@ -17,16 +17,16 @@ counts = as.vector(t)
 infs = data.frame(days,counts)
 
 # define background function
-fit = function(x, mu, A, phi) {mu + A*(sin((pi/365)*x+phi))^2}
-fit_form = counts ~ mu + A*(sin((pi/365)*days+phi))^2
+fit = function(x, lambda, A, phi) {lambda + A*(sin((pi/365)*x+phi))^2}
+fit_form = counts ~ lambda + A*(sin((pi/365)*days+phi))^2
 
 # explore data
 plot(days,counts)
-curve(fit(x, mu=1, A=12, phi=2.8), add=TRUE ,lwd=2, col="steelblue")
+curve(fit(x, lambda=1, A=12, phi=2.8), add=TRUE ,lwd=2, col="steelblue")
 
-res = nls(formula=fit_form, data=infs, start=list(mu=1, A=12, phi=2.8))
+res = nls(formula=fit_form, data=infs, start=list(lambda=1, A=12, phi=2.8))
 co = coef(res); co
 
 plot(t)
 plot(days,counts,pch=20,cex=0.4,xlab='Day',ylab='Number of Infections')
-curve(fit(x, mu=co["mu"], A=co["A"], phi=co["phi"]), add=TRUE ,lwd=5, col="steelblue")
+curve(fit(x, lambda=co["lambda"], A=co["A"], phi=co["phi"]), add=TRUE ,lwd=5, col="steelblue")
diff --git a/R Scripts/generate-dag-dat-unweighted.R b/R Scripts/generate-dag-dat-unweighted.R
index 61db26c..06988b9 100755
--- a/R Scripts/generate-dag-dat-unweighted.R	
+++ b/R Scripts/generate-dag-dat-unweighted.R	
@@ -1,12 +1,12 @@
 library(igraph)
-setwd("~/Documents/Violence Cascades/")
-load('Raw Data/lcc.RData')
+setwd("~/Documents/Violence Cascades/Raw Data/")
+load('lcc.RData')
 
 library(foreach)
 library(doMC)
 registerDoMC(cores=4)
 
-lcc2 = remove.edge.attribute(lcc,'weight')
+lcc2 = lcc#remove.edge.attribute(lcc,'weight')
 vics = split(vic_ids, ceiling(seq_along(vic_ids)/500))
 dag_dat_lcc = c()
 for(i in 1:length(vics)){
@@ -43,8 +43,8 @@ for(i in 1:length(vics)){
 colnames(dag_dat_lcc) = c('from','to','t1','dist')
 rownames(dag_dat_lcc) = NULL
 
-save(dag_dat_lcc, file='Results/dag_dat_lcc.RData')
-write.csv(dag_dat_lcc, file='Results/dag_dat_lcc.csv')
+save(dag_dat_lcc, file='dag_dat_lcc.RData')
+write.csv(dag_dat_lcc, file='dag_dat_lcc.csv')
 
 # dag_dat_vics = dag_dat_lcc[!is.na(dag_dat_lcc$t2),]
 # save(dag_dat_lcc_vics, file='Results/dag_dat_lcc_vics.RData')
@@ -55,5 +55,5 @@ write.csv(dag_dat_lcc, file='Results/dag_dat_lcc.csv')
 vic_times_lcc = lcc_verts[,c('name','nonfatal_day_1','nonfatal_day_2',
                              'nonfatal_day_3','nonfatal_day_4',
                              'nonfatal_day_5','fatal_day')]
-save(vic_times_lcc, file='Results/vic_times_lcc.RData')
-write.csv(vic_times_lcc, file='Results/vic_times_lcc.csv')
+save(vic_times_lcc, file='vic_times_lcc.RData')
+write.csv(vic_times_lcc, file='vic_times_lcc.csv')
diff --git a/R Scripts/sim-analysis.R b/R Scripts/sim-analysis.R
index af280cc..92c3d26 100755
--- a/R Scripts/sim-analysis.R	
+++ b/R Scripts/sim-analysis.R	
@@ -85,3 +85,14 @@ dates = c(V(lcc)$fatal_date[!is.na(V(lcc)$fatal_date)],
 dates = as.Date(dates)
 hist(dates, breaks='months',freq=T,col='lightblue',format='%m/%Y',
      xlab='Month',ylab='Number of Shootings',main='')
+
+### data vs sim hist comparison
+hist(vic.time.data,breaks=seq(0,3000,by=10),col=rgb(0,0,1,1/2),freq=F,
+     xlim=c(0,max(vic.time,vic.time.data)),
+     xlab='Days between infections',main='')
+hist(vic.time,breaks=seq(0,3000,by=10),col=rgb(1,0,0,1/2),add=T,freq=F)
+legend("topright", inset=0.05, 
+       c("Data", "Simulation"), 
+       fill=c(rgb(0,0,1,1/2),rgb(1,0,0,1/2)))
+box(which='plot')
+
diff --git a/R Scripts/weight-vs-victims.R b/R Scripts/weight-vs-victims.R
new file mode 100644
index 0000000..da60df2
--- /dev/null
+++ b/R Scripts/weight-vs-victims.R	
@@ -0,0 +1,10 @@
+# testing to see if pairs connected by high weight edges are more likely to both be
+# victims than pairs connected by low weight edges. This would provide some evidence 
+# that high weight edges are important for transmitting infections.
+
+v = rep(0,max(lcc_edges$weight))
+for(w in 1:max(lcc_edges$weight)){
+  es = lcc_edges[lcc_edges$weight==w,1:2]
+  v[w] = mean(lcc_verts$vic[as.numeric(es$from)] & lcc_verts$vic[as.numeric(es$to)])
+}
+plot(v)
\ No newline at end of file
-- 
cgit v1.2.3-70-g09d2