diff options
Diffstat (limited to 'R Scripts/data-prep.R')
| -rwxr-xr-x | R Scripts/data-prep.R | 69 |
1 files changed, 61 insertions, 8 deletions
diff --git a/R Scripts/data-prep.R b/R Scripts/data-prep.R index 2994575..127acca 100755 --- a/R Scripts/data-prep.R +++ b/R Scripts/data-prep.R @@ -8,10 +8,7 @@ setwd('~/Documents/Violence Cascades/Raw Data/') #load all three sets of data arrests <- read.csv("2006to2014arrests2.csv", header=T, colClass=c("character")) -#I need to add the "ir" for this to make sense when I "project" -arrests$ir2 <- paste("ir", arrests$ir_no) - -## Match arrests based on date, time, and location +## Match arrest records (RD) based on date, time, and location a = arrests[arrests$rd_no=='',] dtab = table(a$arrest_date) dates = attr(dtab,'name')[dtab>1] @@ -29,6 +26,64 @@ for (date in dates){ # now make unique rd_nos for the other people arrested alone null_rds = which(arrests$rd_no=='') arrests$rd_no[null_rds] = paste('rd',null_rds) + +# clean up entries with null birthdate +null_bdate = "1/1/1900 0:00:00" +a = arrests[arrests$birth_date == null_bdate,] +for (i in 1:dim(a)[1]){ + if(i%%200==0)print(i) + ir = a$ir_no[i] + arr = arrests[arrests$ir_no==ir,] + arr = arr[arr$birth_date != null_bdate,] + if(dim(arr)[1]>0){ + arrests$birth_date[as.numeric(rownames(a[i,]))] = names(which.max(table(arr$birth_date))) + arrests$o_street_nme[as.numeric(rownames(a[i,]))] = names(which.max(table(arr$o_street_nme))) + } +} +arrests = arrests[arrests$birth_date!=null_bdate,] + +# Find individual records (IR) based on birthday, sex, race, address +a = arrests[arrests$ir_no=='',] +for (i in 1:dim(a)[1]){ + if(i%%200==0) print(i) + bdate = a$birth_date[i] + sex = a$sex_code_cd[i] + race = a$race_code_cd[i] + arr = arrests[arrests$birth_date==bdate,] + arr = arr[arr$race_code_cd==race,] + arr = arr[arr$sex_code_cd==sex,] + if (dim(arr)[1]>1){ + street = a$o_street_nme[i] + arr = arr[arr$o_street_nme==street,] + } + arr = arr[arr$ir_no != '',] + if (dim(arr)[1]>0){ + arrests$ir_no[match(rownames(a[i,]),rownames(arrests))] = as.numeric(names(which.max(table(arr$ir_no)))) + } +} +# fill IRs for the rest of people +a = arrests[arrests$ir_no=='',] +for (i in 1:dim(a)[1]){ + if(i%%200==0) print(i) + if (arrests$ir_no[match(rownames(a[i,]),rownames(arrests))]==''){ + bdate = a$birth_date[i] + sex = a$sex_code_cd[i] + race = a$race_code_cd[i] + arr = arrests[arrests$birth_date==bdate,] + arr = arr[arr$race_code_cd==race,] + arr = arr[arr$sex_code_cd==sex,] + if (dim(arr)[1]>1){ + street = a$o_street_nme[i] + arr = arr[arr$o_street_nme==street,] + } + arrests$ir_no[match(rownames(arr),rownames(arrests))] = 10000000+i + } +} + +#I need to add the "ir" for this to make sense when I "project" +arrests$ir2 <- paste("ir", arrests$ir_no) + +# save altered arrests data save(arrests,file='arrests.RData') #===================== @@ -102,7 +157,6 @@ sub.arrests = sub.arrests[order(sub.arrests$dates),] #=================================================================== - # get victim attributes shootings <- read.csv("shooting-data-withdate2.csv", header = T) victims = shootings[shootings$INV_PARTY_TYPE_CD=="VIC",] @@ -183,12 +237,11 @@ V(person)$gang.name <- as.character(gnames) V(person)$faction.name <- as.character(gangs$FACTION_NAME[match_vector]) #=================================================================== -# create id number # save data -# person = remove.edge.attribute(person,'weight') +person = remove.edge.attribute(person,'weight') # person_data = get.data.frame(person,'both') -save(person, file="chi-19aug2015.RData") +save(person, file="chi-9sep2015.RData") #=================================================================== # get LCC of the network |
