summary refs log tree commit diff stats
path: root/R Scripts/data-prep.R
diff options
context:
space:
mode:
author Ben Green <bgreen@g.harvard.edu> 2015-09-14 23:19:34 -0400
committer Ben Green <bgreen@g.harvard.edu> 2015-09-14 23:19:37 -0400
commit 58faa01748fe0e6f6d040d1296266d17bd7a3543 (patch)
tree b1a2bf0709ec3d4c252d90c4dba8e42b3057c91b /R Scripts/data-prep.R
parent ab0b1f3cefedb35327a19ec1b6afd560bfdf802d (diff)
download criminal_cascades-58faa01748fe0e6f6d040d1296266d17bd7a3543.tar.gz
prediction and plotting cascades
Diffstat (limited to 'R Scripts/data-prep.R')
-rwxr-xr-x R Scripts/data-prep.R 56
1 files changed, 48 insertions, 8 deletions
diff --git a/R Scripts/data-prep.R b/R Scripts/data-prep.R
index 127acca..ca2fdc2 100755
--- a/R Scripts/data-prep.R
+++ b/R Scripts/data-prep.R
@@ -80,6 +80,18 @@ for (i in 1:dim(a)[1]){
}
}
+# clean up entries where sex is missing
+a = arrests[arrests$sex_code_cd=='X',]
+for (i in 1:dim(a)[1]){
+ ir = a$ir_no[i]
+ arr = arrests[arrests$ir_no==ir,]
+ arr = arr[arr$sex_code_cd != 'X',]
+ if(dim(arr)[1]>0){
+ arrests$sex_code_cd[as.numeric(rownames(a[i,]))] = names(which.max(table(arr$sex_code_cd)))
+ }
+}
+arrests$sex_code_cd[arrests$sex_code_cd=='X'] = 'M'
+
#I need to add the "ir" for this to make sense when I "project"
arrests$ir2 <- paste("ir", arrests$ir_no)
@@ -171,6 +183,17 @@ murders = murders[match(unique(murders$VICTIM_IR_NO),murders$VICTIM_IR_NO),]
murders = murders[as.Date(murders$INJURY_DATE,format='%m/%d/%y')>=start_date,]
murders$ir2 = paste("ir", murders$VICTIM_IR_NO)
+# clear nonfatals that led to death
+v = victims[victims$IR_NO %in% murders$VICTIM_IR_NO,]
+rows = c()
+for(i in 1:dim(v)[1]){
+ row = which(rownames(victims)==as.numeric(rownames(v[i,])))
+ m = murders[murders$VICTIM_IR_NO==v$IR_NO[i],]
+ dup = as.Date(v$INCIDENT_DATE[i],format='%m/%d/%y') %in% as.Date(m$INJURY_DATE,format='%m/%d/%y')
+ if(dup==T) rows = c(rows,row)
+}
+victims = victims[-rows,]
+
# set victim data in network
vtab = as.data.frame(table(victims$ir2))
match_vector = match(V(person)$name,vtab$Var1)
@@ -193,7 +216,8 @@ for(i in 1:length(vics)){
if (i%%3000==0) print(i)
name = vics[i]
ids = which(match_vector==name)
- dates = sort(as.Date(victims$INCIDENT_DATE[ids],format='%m/%d/%y'))
+ dates = unique(sort(as.Date(victims$INCIDENT_DATE[ids],format='%m/%d/%y')))
+# if(!is.na(V(person)$fatal_date[i])) dates = dates[dates != V(person)$fatal_date[ids]]
nfd1[i] = as.character(dates[1])
nfd2[i] = as.character(dates[2])
nfd3[i] = as.character(dates[3])
@@ -211,6 +235,7 @@ V(person)$nonfatal_date_3[vics] = nfd3
V(person)$nonfatal_date_4[vics] = nfd4
V(person)$nonfatal_date_5[vics] = nfd5
+
# convert dates into numeric values ("days")
start_date
V(person)$fatal_day = as.numeric(as.Date(V(person)$fatal_date)-start_date)
@@ -223,25 +248,40 @@ V(person)$nonfatal_day_5 = as.numeric(as.Date(V(person)$nonfatal_date_5)-start_d
#===================================================================
# set gang attributes
gangs <- read.csv("Sept2014-ganglist.csv", header=T)
-gangs = gangs[match(unique(gangs$IR_NO),gangs$IR_NO),]
gangs$ir2 <- paste("ir", gangs$IR_NO)
+t = table(gangs$IR_NO)
+t = t[t>1]
+irs = as.numeric(attr(t,'name'))
+for(ir in irs){
+ if(which(ir==irs)%%1000==0)print(which(ir==irs))
+ g = gangs[gangs$IR_NO==ir,]
+ gangs$GANG_NAME[as.numeric(rownames(g))] = names(which.max(table(g$GANG_NAME)))
+}
+
+gangs = gangs[match(unique(gangs$IR_NO),gangs$IR_NO),]
+gnames = as.character(gangs$GANG_NAME)
+gnames[is.na(gnames)] = 'Unknown'
+
V(person)$gang.member <- V(person)$name %in% gangs$ir2
match_vector = match(V(person)$name, gangs$ir2)
-gnames = gangs$GANG_NAME[match_vector]
-gnames = as.character(gnames)
-gnames[V(person)$gang.member==''] = 'Unknown'
+gnames = gnames[match_vector]
gnames[V(person)$gang.member==F] = 'None'
V(person)$gang.name <- as.character(gnames)
-V(person)$faction.name <- as.character(gangs$FACTION_NAME[match_vector])
+# V(person)$faction.name <- as.character(gangs$FACTION_NAME[match_vector])
+
+# clean up later to make this fit with process
+t = table(V(person)$gang.name)
+gs = names(t)[t<50]
+V(person)$gang.name[V(person)$gang.name %in% gs] = 'Unknown'
#===================================================================
# save data
person = remove.edge.attribute(person,'weight')
# person_data = get.data.frame(person,'both')
-save(person, file="chi-9sep2015.RData")
+save(person, file="chi-14sep2015.RData")
#===================================================================
# get LCC of the network
@@ -254,7 +294,7 @@ lcc_edges = as_data_frame(lcc,'edges')
# update lcc_verts
lcc_verts = get.data.frame(lcc,'vertices')
-lcc_verts = lcc_verts[,c(1,23,24,2:22)]
+lcc_verts = lcc_verts[,c(1,23,24,2:21)]
# save file
save(lcc, lcc_verts, lcc_edges, vic_ids, file="lcc.RData")