diff options
Diffstat (limited to 'R Scripts/data-prep.R')
| -rwxr-xr-x | R Scripts/data-prep.R | 56 |
1 files changed, 48 insertions, 8 deletions
diff --git a/R Scripts/data-prep.R b/R Scripts/data-prep.R index 127acca..ca2fdc2 100755 --- a/R Scripts/data-prep.R +++ b/R Scripts/data-prep.R @@ -80,6 +80,18 @@ for (i in 1:dim(a)[1]){ } } +# clean up entries where sex is missing +a = arrests[arrests$sex_code_cd=='X',] +for (i in 1:dim(a)[1]){ + ir = a$ir_no[i] + arr = arrests[arrests$ir_no==ir,] + arr = arr[arr$sex_code_cd != 'X',] + if(dim(arr)[1]>0){ + arrests$sex_code_cd[as.numeric(rownames(a[i,]))] = names(which.max(table(arr$sex_code_cd))) + } +} +arrests$sex_code_cd[arrests$sex_code_cd=='X'] = 'M' + #I need to add the "ir" for this to make sense when I "project" arrests$ir2 <- paste("ir", arrests$ir_no) @@ -171,6 +183,17 @@ murders = murders[match(unique(murders$VICTIM_IR_NO),murders$VICTIM_IR_NO),] murders = murders[as.Date(murders$INJURY_DATE,format='%m/%d/%y')>=start_date,] murders$ir2 = paste("ir", murders$VICTIM_IR_NO) +# clear nonfatals that led to death +v = victims[victims$IR_NO %in% murders$VICTIM_IR_NO,] +rows = c() +for(i in 1:dim(v)[1]){ + row = which(rownames(victims)==as.numeric(rownames(v[i,]))) + m = murders[murders$VICTIM_IR_NO==v$IR_NO[i],] + dup = as.Date(v$INCIDENT_DATE[i],format='%m/%d/%y') %in% as.Date(m$INJURY_DATE,format='%m/%d/%y') + if(dup==T) rows = c(rows,row) +} +victims = victims[-rows,] + # set victim data in network vtab = as.data.frame(table(victims$ir2)) match_vector = match(V(person)$name,vtab$Var1) @@ -193,7 +216,8 @@ for(i in 1:length(vics)){ if (i%%3000==0) print(i) name = vics[i] ids = which(match_vector==name) - dates = sort(as.Date(victims$INCIDENT_DATE[ids],format='%m/%d/%y')) + dates = unique(sort(as.Date(victims$INCIDENT_DATE[ids],format='%m/%d/%y'))) +# if(!is.na(V(person)$fatal_date[i])) dates = dates[dates != V(person)$fatal_date[ids]] nfd1[i] = as.character(dates[1]) nfd2[i] = as.character(dates[2]) nfd3[i] = as.character(dates[3]) @@ -211,6 +235,7 @@ V(person)$nonfatal_date_3[vics] = nfd3 V(person)$nonfatal_date_4[vics] = nfd4 V(person)$nonfatal_date_5[vics] = nfd5 + # convert dates into numeric values ("days") start_date V(person)$fatal_day = as.numeric(as.Date(V(person)$fatal_date)-start_date) @@ -223,25 +248,40 @@ V(person)$nonfatal_day_5 = as.numeric(as.Date(V(person)$nonfatal_date_5)-start_d #=================================================================== # set gang attributes gangs <- read.csv("Sept2014-ganglist.csv", header=T) -gangs = gangs[match(unique(gangs$IR_NO),gangs$IR_NO),] gangs$ir2 <- paste("ir", gangs$IR_NO) +t = table(gangs$IR_NO) +t = t[t>1] +irs = as.numeric(attr(t,'name')) +for(ir in irs){ + if(which(ir==irs)%%1000==0)print(which(ir==irs)) + g = gangs[gangs$IR_NO==ir,] + gangs$GANG_NAME[as.numeric(rownames(g))] = names(which.max(table(g$GANG_NAME))) +} + +gangs = gangs[match(unique(gangs$IR_NO),gangs$IR_NO),] +gnames = as.character(gangs$GANG_NAME) +gnames[is.na(gnames)] = 'Unknown' + V(person)$gang.member <- V(person)$name %in% gangs$ir2 match_vector = match(V(person)$name, gangs$ir2) -gnames = gangs$GANG_NAME[match_vector] -gnames = as.character(gnames) -gnames[V(person)$gang.member==''] = 'Unknown' +gnames = gnames[match_vector] gnames[V(person)$gang.member==F] = 'None' V(person)$gang.name <- as.character(gnames) -V(person)$faction.name <- as.character(gangs$FACTION_NAME[match_vector]) +# V(person)$faction.name <- as.character(gangs$FACTION_NAME[match_vector]) + +# clean up later to make this fit with process +t = table(V(person)$gang.name) +gs = names(t)[t<50] +V(person)$gang.name[V(person)$gang.name %in% gs] = 'Unknown' #=================================================================== # save data person = remove.edge.attribute(person,'weight') # person_data = get.data.frame(person,'both') -save(person, file="chi-9sep2015.RData") +save(person, file="chi-14sep2015.RData") #=================================================================== # get LCC of the network @@ -254,7 +294,7 @@ lcc_edges = as_data_frame(lcc,'edges') # update lcc_verts lcc_verts = get.data.frame(lcc,'vertices') -lcc_verts = lcc_verts[,c(1,23,24,2:22)] +lcc_verts = lcc_verts[,c(1,23,24,2:21)] # save file save(lcc, lcc_verts, lcc_edges, vic_ids, file="lcc.RData") |
