summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rwxr-xr-xR Scripts/data-prep.R69
-rw-r--r--R Scripts/fit-background.R10
-rwxr-xr-xR Scripts/generate-dag-dat-unweighted.R14
-rwxr-xr-xR Scripts/sim-analysis.R11
-rw-r--r--R Scripts/weight-vs-victims.R10
5 files changed, 94 insertions, 20 deletions
diff --git a/R Scripts/data-prep.R b/R Scripts/data-prep.R
index 2994575..127acca 100755
--- a/R Scripts/data-prep.R
+++ b/R Scripts/data-prep.R
@@ -8,10 +8,7 @@ setwd('~/Documents/Violence Cascades/Raw Data/')
#load all three sets of data
arrests <- read.csv("2006to2014arrests2.csv", header=T, colClass=c("character"))
-#I need to add the "ir" for this to make sense when I "project"
-arrests$ir2 <- paste("ir", arrests$ir_no)
-
-## Match arrests based on date, time, and location
+## Match arrest records (RD) based on date, time, and location
a = arrests[arrests$rd_no=='',]
dtab = table(a$arrest_date)
dates = attr(dtab,'name')[dtab>1]
@@ -29,6 +26,64 @@ for (date in dates){
# now make unique rd_nos for the other people arrested alone
null_rds = which(arrests$rd_no=='')
arrests$rd_no[null_rds] = paste('rd',null_rds)
+
+# clean up entries with null birthdate
+null_bdate = "1/1/1900 0:00:00"
+a = arrests[arrests$birth_date == null_bdate,]
+for (i in 1:dim(a)[1]){
+ if(i%%200==0)print(i)
+ ir = a$ir_no[i]
+ arr = arrests[arrests$ir_no==ir,]
+ arr = arr[arr$birth_date != null_bdate,]
+ if(dim(arr)[1]>0){
+ arrests$birth_date[as.numeric(rownames(a[i,]))] = names(which.max(table(arr$birth_date)))
+ arrests$o_street_nme[as.numeric(rownames(a[i,]))] = names(which.max(table(arr$o_street_nme)))
+ }
+}
+arrests = arrests[arrests$birth_date!=null_bdate,]
+
+# Find individual records (IR) based on birthday, sex, race, address
+a = arrests[arrests$ir_no=='',]
+for (i in 1:dim(a)[1]){
+ if(i%%200==0) print(i)
+ bdate = a$birth_date[i]
+ sex = a$sex_code_cd[i]
+ race = a$race_code_cd[i]
+ arr = arrests[arrests$birth_date==bdate,]
+ arr = arr[arr$race_code_cd==race,]
+ arr = arr[arr$sex_code_cd==sex,]
+ if (dim(arr)[1]>1){
+ street = a$o_street_nme[i]
+ arr = arr[arr$o_street_nme==street,]
+ }
+ arr = arr[arr$ir_no != '',]
+ if (dim(arr)[1]>0){
+ arrests$ir_no[match(rownames(a[i,]),rownames(arrests))] = as.numeric(names(which.max(table(arr$ir_no))))
+ }
+}
+# fill IRs for the rest of people
+a = arrests[arrests$ir_no=='',]
+for (i in 1:dim(a)[1]){
+ if(i%%200==0) print(i)
+ if (arrests$ir_no[match(rownames(a[i,]),rownames(arrests))]==''){
+ bdate = a$birth_date[i]
+ sex = a$sex_code_cd[i]
+ race = a$race_code_cd[i]
+ arr = arrests[arrests$birth_date==bdate,]
+ arr = arr[arr$race_code_cd==race,]
+ arr = arr[arr$sex_code_cd==sex,]
+ if (dim(arr)[1]>1){
+ street = a$o_street_nme[i]
+ arr = arr[arr$o_street_nme==street,]
+ }
+ arrests$ir_no[match(rownames(arr),rownames(arrests))] = 10000000+i
+ }
+}
+
+#I need to add the "ir" for this to make sense when I "project"
+arrests$ir2 <- paste("ir", arrests$ir_no)
+
+# save altered arrests data
save(arrests,file='arrests.RData')
#=====================
@@ -102,7 +157,6 @@ sub.arrests = sub.arrests[order(sub.arrests$dates),]
#===================================================================
-
# get victim attributes
shootings <- read.csv("shooting-data-withdate2.csv", header = T)
victims = shootings[shootings$INV_PARTY_TYPE_CD=="VIC",]
@@ -183,12 +237,11 @@ V(person)$gang.name <- as.character(gnames)
V(person)$faction.name <- as.character(gangs$FACTION_NAME[match_vector])
#===================================================================
-# create id number
# save data
-# person = remove.edge.attribute(person,'weight')
+person = remove.edge.attribute(person,'weight')
# person_data = get.data.frame(person,'both')
-save(person, file="chi-19aug2015.RData")
+save(person, file="chi-9sep2015.RData")
#===================================================================
# get LCC of the network
diff --git a/R Scripts/fit-background.R b/R Scripts/fit-background.R
index d385e7f..d1799cd 100644
--- a/R Scripts/fit-background.R
+++ b/R Scripts/fit-background.R
@@ -17,16 +17,16 @@ counts = as.vector(t)
infs = data.frame(days,counts)
# define background function
-fit = function(x, mu, A, phi) {mu + A*(sin((pi/365)*x+phi))^2}
-fit_form = counts ~ mu + A*(sin((pi/365)*days+phi))^2
+fit = function(x, lambda, A, phi) {lambda + A*(sin((pi/365)*x+phi))^2}
+fit_form = counts ~ lambda + A*(sin((pi/365)*days+phi))^2
# explore data
plot(days,counts)
-curve(fit(x, mu=1, A=12, phi=2.8), add=TRUE ,lwd=2, col="steelblue")
+curve(fit(x, lambda=1, A=12, phi=2.8), add=TRUE ,lwd=2, col="steelblue")
-res = nls(formula=fit_form, data=infs, start=list(mu=1, A=12, phi=2.8))
+res = nls(formula=fit_form, data=infs, start=list(lambda=1, A=12, phi=2.8))
co = coef(res); co
plot(t)
plot(days,counts,pch=20,cex=0.4,xlab='Day',ylab='Number of Infections')
-curve(fit(x, mu=co["mu"], A=co["A"], phi=co["phi"]), add=TRUE ,lwd=5, col="steelblue")
+curve(fit(x, lambda=co["lambda"], A=co["A"], phi=co["phi"]), add=TRUE ,lwd=5, col="steelblue")
diff --git a/R Scripts/generate-dag-dat-unweighted.R b/R Scripts/generate-dag-dat-unweighted.R
index 61db26c..06988b9 100755
--- a/R Scripts/generate-dag-dat-unweighted.R
+++ b/R Scripts/generate-dag-dat-unweighted.R
@@ -1,12 +1,12 @@
library(igraph)
-setwd("~/Documents/Violence Cascades/")
-load('Raw Data/lcc.RData')
+setwd("~/Documents/Violence Cascades/Raw Data/")
+load('lcc.RData')
library(foreach)
library(doMC)
registerDoMC(cores=4)
-lcc2 = remove.edge.attribute(lcc,'weight')
+lcc2 = lcc#remove.edge.attribute(lcc,'weight')
vics = split(vic_ids, ceiling(seq_along(vic_ids)/500))
dag_dat_lcc = c()
for(i in 1:length(vics)){
@@ -43,8 +43,8 @@ for(i in 1:length(vics)){
colnames(dag_dat_lcc) = c('from','to','t1','dist')
rownames(dag_dat_lcc) = NULL
-save(dag_dat_lcc, file='Results/dag_dat_lcc.RData')
-write.csv(dag_dat_lcc, file='Results/dag_dat_lcc.csv')
+save(dag_dat_lcc, file='dag_dat_lcc.RData')
+write.csv(dag_dat_lcc, file='dag_dat_lcc.csv')
# dag_dat_vics = dag_dat_lcc[!is.na(dag_dat_lcc$t2),]
# save(dag_dat_lcc_vics, file='Results/dag_dat_lcc_vics.RData')
@@ -55,5 +55,5 @@ write.csv(dag_dat_lcc, file='Results/dag_dat_lcc.csv')
vic_times_lcc = lcc_verts[,c('name','nonfatal_day_1','nonfatal_day_2',
'nonfatal_day_3','nonfatal_day_4',
'nonfatal_day_5','fatal_day')]
-save(vic_times_lcc, file='Results/vic_times_lcc.RData')
-write.csv(vic_times_lcc, file='Results/vic_times_lcc.csv')
+save(vic_times_lcc, file='vic_times_lcc.RData')
+write.csv(vic_times_lcc, file='vic_times_lcc.csv')
diff --git a/R Scripts/sim-analysis.R b/R Scripts/sim-analysis.R
index af280cc..92c3d26 100755
--- a/R Scripts/sim-analysis.R
+++ b/R Scripts/sim-analysis.R
@@ -85,3 +85,14 @@ dates = c(V(lcc)$fatal_date[!is.na(V(lcc)$fatal_date)],
dates = as.Date(dates)
hist(dates, breaks='months',freq=T,col='lightblue',format='%m/%Y',
xlab='Month',ylab='Number of Shootings',main='')
+
+### data vs sim hist comparison
+hist(vic.time.data,breaks=seq(0,3000,by=10),col=rgb(0,0,1,1/2),freq=F,
+ xlim=c(0,max(vic.time,vic.time.data)),
+ xlab='Days between infections',main='')
+hist(vic.time,breaks=seq(0,3000,by=10),col=rgb(1,0,0,1/2),add=T,freq=F)
+legend("topright", inset=0.05,
+ c("Data", "Simulation"),
+ fill=c(rgb(0,0,1,1/2),rgb(1,0,0,1/2)))
+box(which='plot')
+
diff --git a/R Scripts/weight-vs-victims.R b/R Scripts/weight-vs-victims.R
new file mode 100644
index 0000000..da60df2
--- /dev/null
+++ b/R Scripts/weight-vs-victims.R
@@ -0,0 +1,10 @@
+# testing to see if pairs connected by high weight edges are more likely to both be
+# victims than pairs connected by low weight edges. This would provide some evidence
+# that high weight edges are important for transmitting infections.
+
+v = rep(0,max(lcc_edges$weight))
+for(w in 1:max(lcc_edges$weight)){
+ es = lcc_edges[lcc_edges$weight==w,1:2]
+ v[w] = mean(lcc_verts$vic[as.numeric(es$from)] & lcc_verts$vic[as.numeric(es$to)])
+}
+plot(v) \ No newline at end of file