diff options
| author | Ben Green <bgreen@g.harvard.edu> | 2015-06-08 15:21:51 -0400 |
|---|---|---|
| committer | Ben Green <bgreen@g.harvard.edu> | 2015-06-08 15:21:51 -0400 |
| commit | 1739e9f5706bb8a73de5dbf0b467de49ea040898 (patch) | |
| tree | 6f1d0f166986c5f0757be9b40d8eeb3409ab022c /R Scripts/data-prep.R | |
| parent | e5dada202c34521618bf82a086093c342841e5e8 (diff) | |
| download | criminal_cascades-1739e9f5706bb8a73de5dbf0b467de49ea040898.tar.gz | |
added my R scripts
Diffstat (limited to 'R Scripts/data-prep.R')
| -rwxr-xr-x | R Scripts/data-prep.R | 223 |
1 files changed, 223 insertions, 0 deletions
diff --git a/R Scripts/data-prep.R b/R Scripts/data-prep.R new file mode 100755 index 0000000..3104ea2 --- /dev/null +++ b/R Scripts/data-prep.R @@ -0,0 +1,223 @@ +library(igraph) +setwd('~/Documents/Cascade Project/Raw Data/') + +#================ +# (1) load data +#================ + +#load all three sets of data +arrests <- read.csv("2006to2014arrests2.csv", header=T, colClass=c("character")) + +#I need to add the "ir" for this to make sense when I "project" +arrests$ir2 <- paste("ir", arrests$ir_no) + +## Match arrests based on date, time, and location +a = arrests[arrests$rd_no=='',] +dtab = table(a$arrest_date) +dates = attr(dtab,'name')[dtab>1] +for (date in dates){ + if (which(date==dates)%%10000==0) print(which(date==dates)) + ids = which(a$arrest_date==date) + grp = a[ids,] + stab = table(grp$street_nme) + streets = attr(stab,'name')[stab>1] + for (street in streets){ + arr_ids = as.numeric(rownames(grp[grp$street_nme==street,])) + arrests$rd_no[arr_ids] = paste('rd',arr_ids[1]) + } +} +# now make unique rd_nos for the other people arrested alone +null_rds = which(arrests$rd_no=='') +arrests$rd_no[null_rds] = paste('rd',null_rds) +save(arrests,file='arrests.RData') + +#===================== +# (2) Structure Data +#===================== + +#get the fields we need for all three: incidents, mni, and "type" +sub.arrests <- subset(arrests, select=c(as.character("rd_no"), as.character('ir2'))) +colnames(sub.arrests) <- c("events", "individuals") + +#============================= +# (3) Prep for making graphs +#============================= + +individuals <- unique(sub.arrests$individuals) + +events <- unique(sub.arrests$events) + +if (any(individuals %in% events)) + + stop('vertex name collision') + +vertices <- data.frame(name=c(events, individuals), + type=c(rep(FALSE, length(events)), + rep(TRUE, length(individuals))), + stringsAsFactors=FALSE) + +#=================================================================== +# (4) Make the GRAPH file +#=================================================================== +g <- graph.data.frame(sub.arrests, vertices=vertices) + + +#=================================================================== +# Sanity check the resulting igraph object +stopifnot(ecount(g) == nrow(sub.arrests)) +stopifnot(vcount(g) == nrow(vertices)) + +names <- V(g)$name +stopifnot(isTRUE(identical(sort(names), sort(vertices$name)))) + +inames <- V(g)[V(g)$type]$name +stopifnot(isTRUE(identical(sort(inames), sort(individuals)))) + +enames <- V(g)[! V(g)$type]$name +stopifnot(isTRUE(identical(sort(enames), sort(events)))) + +#=================================================================== +#now do the converstion into a single network +net1 <- bipartite.projection(g) +person <- net1[[2]] + +#=================================================================== +# (5) Define attributes on nodes +#=================================================================== +# set attributes from arrests file + +attribs <- arrests +match_vector = match(V(person)$name, attribs$ir2) +V(person)$sex <- as.character(attribs$sex_code_cd[match_vector]) +V(person)$race <- as.character(attribs$race_code_cd[match_vector]) +# V(person)$age <- as.character(attribs$age[match_vector]) +V(person)$dob <- as.character(as.Date(attribs$birth_date[match_vector],format='%m/%d/%Y')) + +# January 1, 2006 is Day 1 of the study period +start_date = as.Date("2005-12-31") + +## Get first arrest date in the study period for each person +sub.arrests$dates = as.Date(arrests$arrest_date,format='%m/%d/%Y') +sub.arrests = sub.arrests[order(sub.arrests$dates),] +sub.arrests = sub.arrests[match(unique(sub.arrests$individuals),sub.arrests$individuals),] +arrest.dates = as.Date(sub.arrests$dates,format='%m/%d/%y') +arrest.days = as.numeric(arrest.dates-start_date) +V(person)$arrest.day = arrest.days[match(V(person)$name, sub.arrests$individuals)] + +V(person)$age.arrest = floor(difftime(arrest.dates[match(V(person)$name, sub.arrests$individuals)], + V(person)$dob, + units='days')/365.25) + +#=================================================================== + + +# get victim attributes +shootings <- read.csv("shooting-data-withdate2.csv", header = T) +victims = shootings[shootings$INV_PARTY_TYPE_CD=="VIC",] +victims = victims[!is.na(victims$IR_NO),] +victims$ir2 <- paste("ir", victims$IR_NO) + +# get murder victim attributes +murders = read.csv("murder-victims-13nov.csv", header=T) +murders = murders[!is.na(murders$VICTIM_IR_NO),] +murders = murders[murders$INJURY_DESCR=="SHOT",] +murders = murders[match(unique(murders$VICTIM_IR_NO),murders$VICTIM_IR_NO),] +murders = murders[as.Date(murders$INJURY_DATE,format='%m/%d/%y')>=start_date,] +murders$ir2 = paste("ir", murders$VICTIM_IR_NO) + +# set victim data in network +vtab = as.data.frame(table(victims$ir2)) +match_vector = match(V(person)$name,vtab$Var1) +V(person)$vic.nonfatal = vtab$Freq[match_vector] +V(person)$vic.nonfatal[is.na(V(person)$vic.nonfatal)] = 0 +V(person)$vic.fatal = V(person)$name %in% murders$ir2 +V(person)$vic = V(person)$name %in% union(victims$ir2,murders$ir2) + +# add fatal shooting dates to the network +match_vector = match(V(person)$name, murders$ir2) +fatal_dates = murders$INJURY_DATE[match_vector] +fatal_dates = as.character(as.Date(fatal_dates,format='%m/%d/%y')) +V(person)$fatal_date = fatal_dates + +# add nonfatal shooting dates to the network +match_vector = match(victims$ir2,V(person)$name) +vics = which(V(person)$vic.nonfatal>0) +nfd1 = nfd2 = nfd3 = nfd4 = nfd5 = rep(0,length(vics)) +for(i in 1:length(vics)){ + if (i%%3000==0) print(i) + name = vics[i] + ids = which(match_vector==name) + dates = sort(as.Date(victims$INCIDENT_DATE[ids],format='%m/%d/%y')) + nfd1[i] = as.character(dates[1]) + nfd2[i] = as.character(dates[2]) + nfd3[i] = as.character(dates[3]) + nfd4[i] = as.character(dates[4]) + nfd5[i] = as.character(dates[5]) +} +V(person)$nonfatal_date_1 = NA +V(person)$nonfatal_date_2 = NA +V(person)$nonfatal_date_3 = NA +V(person)$nonfatal_date_4 = NA +V(person)$nonfatal_date_5 = NA +V(person)$nonfatal_date_1[vics] = nfd1 +V(person)$nonfatal_date_2[vics] = nfd2 +V(person)$nonfatal_date_3[vics] = nfd3 +V(person)$nonfatal_date_4[vics] = nfd4 +V(person)$nonfatal_date_5[vics] = nfd5 + +# convert dates into numeric values ("days") +start_date +V(person)$fatal_day = as.numeric(as.Date(V(person)$fatal_date)-start_date) +V(person)$nonfatal_day_1 = as.numeric(as.Date(V(person)$nonfatal_date_1)-start_date) +V(person)$nonfatal_day_2 = as.numeric(as.Date(V(person)$nonfatal_date_2)-start_date) +V(person)$nonfatal_day_3 = as.numeric(as.Date(V(person)$nonfatal_date_3)-start_date) +V(person)$nonfatal_day_4 = as.numeric(as.Date(V(person)$nonfatal_date_4)-start_date) +V(person)$nonfatal_day_5 = as.numeric(as.Date(V(person)$nonfatal_date_5)-start_date) + +#=================================================================== +# set gang attributes +gangs <- read.csv("Sept2014-ganglist.csv", header=T) +gangs = gangs[match(unique(gangs$IR_NO),gangs$IR_NO),] +gangs$ir2 <- paste("ir", gangs$IR_NO) + +V(person)$gang.member <- V(person)$name %in% gangs$ir2 + +match_vector = match(V(person)$name, gangs$ir2) +gnames = gangs$GANG_NAME[match_vector] +gnames = as.character(gnames) +gnames[V(person)$gang.member==''] = 'Unknown' +gnames[V(person)$gang.member==F] = 'None' +V(person)$gang.name <- as.character(gnames) +V(person)$faction.name <- as.character(gangs$FACTION_NAME[match_vector]) + +#=================================================================== +# create id number +V(person)$id = rank(V(person)$name) + +# save data +# person = remove.edge.attribute(person,'weight') +# person_data = get.data.frame(person,'both') +save(person, file="chi-19mar2015.RData") + +#=================================================================== +# get LCC of the network +lcc = induced.subgraph(person,which(clusters(person)$membership==which.max(clusters(person)$csize))) +V(lcc)$ir_no = V(lcc)$name +vic_ids = which(V(lcc)$vic) +lcc_verts = get.data.frame(lcc,'vertices') +lcc_edges = get.data.frame(lcc,'edges') +save(lcc, lcc_verts, lcc_edges, vic_ids, file="lcc.RData") + +lcc_data = get.data.frame(lcc,'both') +lcc = set.vertex.attribute(graph=lcc, name='name', value=V(lcc)$id) +row.names(lcc_data$vertices) = lcc_data$vertices$id +vertices = lcc_data$vertices[c('name','vic','vic.fatal','vic.nonfatal', + 'fatal_day','nonfatal_day_1','nonfatal_day_2', + 'nonfatal_day_3','nonfatal_day_4','nonfatal_day_5')] + +write.csv(vertices,file='lcc_vertices.csv') +write.csv(lcc_data$edges,file='lcc_edges.csv') + +lcc = graph.data.frame(lcc_edges, directed=FALSE, + vertices=lcc_verts[,c('name','vic','vic.fatal','vic.nonfatal')]) +write.graph(lcc,'lcc.gml','gml') |
