summary | refs | log | tree | commit | diff | stats
path: root/R Scripts/data-prep.R
diff options
context:
space:
mode:
Diffstat (limited to 'R Scripts/data-prep.R')
-rwxr-xr-x  R Scripts/data-prep.R  69
1 files changed, 61 insertions, 8 deletions
diff --git a/R Scripts/data-prep.R b/R Scripts/data-prep.R
index 2994575..127acca 100755
--- a/R Scripts/data-prep.R
+++ b/R Scripts/data-prep.R
@@ -8,10 +8,7 @@ setwd('~/Documents/Violence Cascades/Raw Data/')
#load all three sets of data
arrests <- read.csv("2006to2014arrests2.csv", header=T, colClass=c("character"))
-#I need to add the "ir" for this to make sense when I "project"
-arrests$ir2 <- paste("ir", arrests$ir_no)
-
-## Match arrests based on date, time, and location
+## Match arrest records (RD) based on date, time, and location
a = arrests[arrests$rd_no=='',]
dtab = table(a$arrest_date)
dates = attr(dtab,'name')[dtab>1]
@@ -29,6 +26,64 @@ for (date in dates){
# now make unique rd_nos for the other people arrested alone
null_rds = which(arrests$rd_no=='')
arrests$rd_no[null_rds] = paste('rd',null_rds)
+
+# Repair placeholder birth dates: give each such arrest the most common valid birth date (and street) recorded under the same IR number, then drop the rows that could not be repaired.
+null_bdate = "1/1/1900 0:00:00"
+null_idx = which(arrests$birth_date == null_bdate)  # row positions, so nothing depends on row names being 1..n
+for (k in seq_along(null_idx)){  # seq_along() is empty-safe, unlike 1:dim(a)[1], which runs over c(1, 0)
+  if(k%%200==0)print(k)
+  ir = arrests$ir_no[null_idx[k]]
+  if(is.na(ir) || ir=='') next  # a blank IR would match every arrest with a blank IR, not one person
+  arr = arrests[which(arrests$ir_no==ir & arrests$birth_date != null_bdate),]
+  if(dim(arr)[1]>0){
+    arrests$birth_date[null_idx[k]] = names(which.max(table(arr$birth_date)))
+    arrests$o_street_nme[null_idx[k]] = names(which.max(table(arr$o_street_nme)))
+  }
+}
+arrests = arrests[arrests$birth_date!=null_bdate,]
+
+# Recover missing individual-record numbers (IR): each arrest with a blank IR takes the most common IR among arrests with the same birth date, race and sex (and the same street when more than one arrest shares those three).
+blank_idx = which(arrests$ir_no=='')  # row positions of the arrests that have no IR yet
+for (k in seq_along(blank_idx)){  # seq_along() is empty-safe, unlike 1:dim(a)[1], which runs over c(1, 0)
+  if(k%%200==0) print(k)
+  row = blank_idx[k]
+  hits = which(arrests$birth_date==arrests$birth_date[row] &
+               arrests$race_code_cd==arrests$race_code_cd[row] &
+               arrests$sex_code_cd==arrests$sex_code_cd[row])
+  if (length(hits)>1){
+    # more than one arrest shares these attributes: also require the same street
+    hits = hits[which(arrests$o_street_nme[hits]==arrests$o_street_nme[row])]
+  }
+  known = arrests$ir_no[hits]
+  known = known[which(known != '')]  # candidates that already carry an IR
+  if (length(known)>0){
+    # keep the IR as text: an as.numeric() round trip rewrites e.g. "100000" as "1e+05" and drops leading zeros
+    arrests$ir_no[row] = names(which.max(table(known)))
+  }
+}
+# Give every arrest that still has a blank IR a synthetic one, shared by all arrests with the same birth date, race and sex (and the same street when more than one arrest shares those three).
+# NOTE(review): synthetic IRs start at 10000001; this presumably lies above every real IR number - confirm against the data.
+blank_idx = which(arrests$ir_no=='')  # row positions of the arrests that still have no IR
+for (k in seq_along(blank_idx)){  # seq_along() is empty-safe, unlike 1:dim(a)[1], which runs over c(1, 0)
+  if(k%%200==0) print(k)
+  row = blank_idx[k]
+  if (arrests$ir_no[row]==''){  # may already have been filled as a member of an earlier group
+    hits = which(arrests$birth_date==arrests$birth_date[row] &
+                 arrests$race_code_cd==arrests$race_code_cd[row] &
+                 arrests$sex_code_cd==arrests$sex_code_cd[row])
+    if (length(hits)>1){
+      hits = hits[which(arrests$o_street_nme[hits]==arrests$o_street_nme[row])]
+    }
+    # build the id as text: implicit coercion of a double can yield scientific notation (11000000 -> "1.1e+07")
+    arrests$ir_no[hits] = sprintf("%d", 10000000L+k)
+  }
+}
+
+
+# Store each IR number with an "ir" prefix in ir2; the later "project" step needs this labelled form
+arrests$ir2 <- paste("ir", arrests$ir_no)
+
+# save altered arrests data
save(arrests,file='arrests.RData')
#=====================
@@ -102,7 +157,6 @@ sub.arrests = sub.arrests[order(sub.arrests$dates),]
#===================================================================
-
# get victim attributes
shootings <- read.csv("shooting-data-withdate2.csv", header = T)
victims = shootings[shootings$INV_PARTY_TYPE_CD=="VIC",]
@@ -183,12 +237,11 @@ V(person)$gang.name <- as.character(gnames)
V(person)$faction.name <- as.character(gangs$FACTION_NAME[match_vector])
#===================================================================
-# create id number
# save data
-# person = remove.edge.attribute(person,'weight')
+person = remove.edge.attribute(person,'weight')
# person_data = get.data.frame(person,'both')
-save(person, file="chi-19aug2015.RData")
+save(person, file="chi-9sep2015.RData")
#===================================================================
# get LCC of the network