library(igraph) setwd('~/Documents/Violence Cascades/Raw Data/') #================ # (1) load data #================ #load all three sets of data arrests <- read.csv("2006to2014arrests2.csv", header=T, colClass=c("character")) ## Match arrest records (RD) based on date, time, and location a = arrests[arrests$rd_no=='',] dtab = table(a$arrest_date) dates = attr(dtab,'name')[dtab>1] for (date in dates){ if (which(date==dates)%%10000==0) print(which(date==dates)) ids = which(a$arrest_date==date) grp = a[ids,] stab = table(grp$street_nme) streets = attr(stab,'name')[stab>1] for (street in streets){ arr_ids = as.numeric(rownames(grp[grp$street_nme==street,])) arrests$rd_no[arr_ids] = paste('rd',arr_ids[1]) } } # now make unique rd_nos for the other people arrested alone null_rds = which(arrests$rd_no=='') arrests$rd_no[null_rds] = paste('rd',null_rds) # clean up entries with null birthdate null_bdate = "1/1/1900 0:00:00" a = arrests[arrests$birth_date == null_bdate,] for (i in 1:dim(a)[1]){ if(i%%200==0)print(i) ir = a$ir_no[i] arr = arrests[arrests$ir_no==ir,] arr = arr[arr$birth_date != null_bdate,] if(dim(arr)[1]>0){ arrests$birth_date[as.numeric(rownames(a[i,]))] = names(which.max(table(arr$birth_date))) arrests$o_street_nme[as.numeric(rownames(a[i,]))] = names(which.max(table(arr$o_street_nme))) } } arrests = arrests[arrests$birth_date!=null_bdate,] # Find individual records (IR) based on birthday, sex, race, address a = arrests[arrests$ir_no=='',] for (i in 1:dim(a)[1]){ if(i%%200==0) print(i) bdate = a$birth_date[i] sex = a$sex_code_cd[i] race = a$race_code_cd[i] arr = arrests[arrests$birth_date==bdate,] arr = arr[arr$race_code_cd==race,] arr = arr[arr$sex_code_cd==sex,] if (dim(arr)[1]>1){ street = a$o_street_nme[i] arr = arr[arr$o_street_nme==street,] } arr = arr[arr$ir_no != '',] if (dim(arr)[1]>0){ arrests$ir_no[match(rownames(a[i,]),rownames(arrests))] = as.numeric(names(which.max(table(arr$ir_no)))) } } # fill IRs for the rest of people a = arrests[arrests$ir_no=='',] for (i in 1:dim(a)[1]){ if(i%%200==0) print(i) if (arrests$ir_no[match(rownames(a[i,]),rownames(arrests))]==''){ bdate = a$birth_date[i] sex = a$sex_code_cd[i] race = a$race_code_cd[i] arr = arrests[arrests$birth_date==bdate,] arr = arr[arr$race_code_cd==race,] arr = arr[arr$sex_code_cd==sex,] if (dim(arr)[1]>1){ street = a$o_street_nme[i] arr = arr[arr$o_street_nme==street,] } arrests$ir_no[match(rownames(arr),rownames(arrests))] = 10000000+i } } # clean up entries where sex is missing a = arrests[arrests$sex_code_cd=='X',] for (i in 1:dim(a)[1]){ ir = a$ir_no[i] arr = arrests[arrests$ir_no==ir,] arr = arr[arr$sex_code_cd != 'X',] if(dim(arr)[1]>0){# need to match rownames like districts? arrests$sex_code_cd[as.numeric(rownames(a[i,]))] = names(which.max(table(arr$sex_code_cd))) } } arrests$sex_code_cd[arrests$sex_code_cd=='X'] = 'M' ##### residential districts arrests$o_district[arrests$o_district=='31'] = '' a = arrests[arrests$o_district=='' & arrests$o_city=='CHICAGO',] for (i in 1:dim(a)[1]){ if(i%%200==0) print(i) ir = a$ir_no[i] arr = arrests[arrests$ir_no==ir,] arr = arr[arr$o_district != '',] if(dim(arr)[1]>0){ arrests$o_district[match(as.numeric(rownames(a[i,])),rownames(arrests))] = names(which.max(table(arr$o_district))) } } arrests$o_district[arrests$o_district==''] = 0 # lcc_verts$district = arrests$o_district[match(lcc_verts$ir_no,arrests$ir2)] # V(lcc)$district = arrests$o_district[match(lcc_verts$ir_no,arrests$ir2)] #I need to add the "ir" for this to make sense when I "project" arrests$ir2 <- paste("ir", arrests$ir_no) # save altered arrests data save(arrests,file='arrests.RData') #===================== # (2) Structure Data #===================== #get the fields we need for all three: incidents, mni, and "type" sub.arrests <- subset(arrests, select=c(as.character("rd_no"), as.character('ir2'))) colnames(sub.arrests) <- c("events", "individuals") #============================= # (3) Prep for making graphs #============================= individuals <- unique(sub.arrests$individuals) events <- unique(sub.arrests$events) if (any(individuals %in% events)) stop('vertex name collision') vertices <- data.frame(name=c(events, individuals), type=c(rep(FALSE, length(events)), rep(TRUE, length(individuals))), stringsAsFactors=FALSE) #=================================================================== # (4) Make the GRAPH file #=================================================================== g <- graph.data.frame(sub.arrests, vertices=vertices) #=================================================================== # Sanity check the resulting igraph object stopifnot(ecount(g) == nrow(sub.arrests)) stopifnot(vcount(g) == nrow(vertices)) names <- V(g)$name stopifnot(isTRUE(identical(sort(names), sort(vertices$name)))) inames <- V(g)[V(g)$type]$name stopifnot(isTRUE(identical(sort(inames), sort(individuals)))) enames <- V(g)[! V(g)$type]$name stopifnot(isTRUE(identical(sort(enames), sort(events)))) #=================================================================== #now do the converstion into a single network net1 <- bipartite.projection(g) person <- net1[[2]] #=================================================================== # (5) Define attributes on nodes #=================================================================== # set attributes from arrests file attribs <- arrests match_vector = match(V(person)$name, attribs$ir2) V(person)$sex <- as.character(attribs$sex_code_cd[match_vector]) V(person)$race <- as.character(attribs$race_code_cd[match_vector]) # V(person)$age <- as.character(attribs$age[match_vector]) V(person)$dob <- as.character(as.Date(attribs$birth_date[match_vector],format='%m/%d/%Y')) # January 1, 2006 is Day 1 of the study period start_date = as.Date("2005-12-31") ## Get first arrest date in the study period for each person sub.arrests$dates = as.Date(arrests$arrest_date,format='%m/%d/%Y') sub.arrests = sub.arrests[order(sub.arrests$dates),] #=================================================================== # get victim attributes shootings <- read.csv("shooting-data-withdate2.csv", header = T) victims = shootings[shootings$INV_PARTY_TYPE_CD=="VIC",] victims = victims[!is.na(victims$IR_NO),] victims$ir2 <- paste("ir", victims$IR_NO) # get murder victim attributes murders = read.csv("murder-victims-13nov.csv", header=T) murders = murders[!is.na(murders$VICTIM_IR_NO),] murders = murders[murders$INJURY_DESCR=="SHOT",] murders = murders[match(unique(murders$VICTIM_IR_NO),murders$VICTIM_IR_NO),] murders = murders[as.Date(murders$INJURY_DATE,format='%m/%d/%y')>start_date,] murders$ir2 = paste("ir", murders$VICTIM_IR_NO) # clear nonfatals that led to death v = victims[victims$IR_NO %in% murders$VICTIM_IR_NO,] rows = c() for(i in 1:dim(v)[1]){ row = which(rownames(victims)==as.numeric(rownames(v[i,]))) m = murders[murders$VICTIM_IR_NO==v$IR_NO[i],] dup = as.Date(v$INCIDENT_DATE[i],format='%m/%d/%y') %in% as.Date(m$INJURY_DATE,format='%m/%d/%y') if(dup==T) rows = c(rows,row) } victims = victims[-rows,] # set victim data in network vtab = as.data.frame(table(victims$ir2)) match_vector = match(V(person)$name,vtab$Var1) V(person)$vic.nonfatal = vtab$Freq[match_vector] V(person)$vic.nonfatal[is.na(V(person)$vic.nonfatal)] = 0 V(person)$vic.fatal = V(person)$name %in% murders$ir2 V(person)$vic = V(person)$name %in% union(victims$ir2,murders$ir2) # add fatal shooting dates to the network match_vector = match(V(person)$name, murders$ir2) fatal_dates = murders$INJURY_DATE[match_vector] fatal_dates = as.character(as.Date(fatal_dates,format='%m/%d/%y')) V(person)$fatal_date = fatal_dates # add nonfatal shooting dates to the network match_vector = match(victims$ir2,V(person)$name) vics = which(V(person)$vic.nonfatal>0) nfd1 = nfd2 = nfd3 = nfd4 = nfd5 = rep(0,length(vics)) for(i in 1:length(vics)){ if (i%%3000==0) print(i) name = vics[i] ids = which(match_vector==name) dates = unique(sort(as.Date(victims$INCIDENT_DATE[ids],format='%m/%d/%y'))) # if(!is.na(V(person)$fatal_date[i])) dates = dates[dates != V(person)$fatal_date[ids]] nfd1[i] = as.character(dates[1]) nfd2[i] = as.character(dates[2]) nfd3[i] = as.character(dates[3]) nfd4[i] = as.character(dates[4]) nfd5[i] = as.character(dates[5]) } V(person)$nonfatal_date_1 = NA V(person)$nonfatal_date_2 = NA V(person)$nonfatal_date_3 = NA V(person)$nonfatal_date_4 = NA V(person)$nonfatal_date_5 = NA V(person)$nonfatal_date_1[vics] = nfd1 V(person)$nonfatal_date_2[vics] = nfd2 V(person)$nonfatal_date_3[vics] = nfd3 V(person)$nonfatal_date_4[vics] = nfd4 V(person)$nonfatal_date_5[vics] = nfd5 # convert dates into numeric values ("days") start_date V(person)$fatal_day = as.numeric(as.Date(V(person)$fatal_date)-start_date) V(person)$nonfatal_day_1 = as.numeric(as.Date(V(person)$nonfatal_date_1)-start_date) V(person)$nonfatal_day_2 = as.numeric(as.Date(V(person)$nonfatal_date_2)-start_date) V(person)$nonfatal_day_3 = as.numeric(as.Date(V(person)$nonfatal_date_3)-start_date) V(person)$nonfatal_day_4 = as.numeric(as.Date(V(person)$nonfatal_date_4)-start_date) V(person)$nonfatal_day_5 = as.numeric(as.Date(V(person)$nonfatal_date_5)-start_date) #=================================================================== # set gang attributes gangs <- read.csv("Sept2014-ganglist.csv", header=T) gangs$ir2 <- paste("ir", gangs$IR_NO) t = table(gangs$IR_NO) t = t[t>1] irs = as.numeric(attr(t,'name')) for(ir in irs){ if(which(ir==irs)%%1000==0)print(which(ir==irs)) g = gangs[gangs$IR_NO==ir,] gangs$GANG_NAME[as.numeric(rownames(g))] = names(which.max(table(g$GANG_NAME))) } gangs = gangs[match(unique(gangs$IR_NO),gangs$IR_NO),] gnames = as.character(gangs$GANG_NAME) gnames[is.na(gnames)] = 'Unknown' V(person)$gang.member <- V(person)$name %in% gangs$ir2 match_vector = match(V(person)$name, gangs$ir2) gnames = gnames[match_vector] gnames[V(person)$gang.member==F] = 'None' V(person)$gang.name <- as.character(gnames) # V(person)$faction.name <- as.character(gangs$FACTION_NAME[match_vector]) # clean up later to make this fit with process t = table(V(person)$gang.name) gs = names(t)[t<50] V(person)$gang.name[V(person)$gang.name %in% gs] = 'Unknown' #=================================================================== # save data person = remove.edge.attribute(person,'weight') # person_data = get.data.frame(person,'both') save(person, file="chi-14sep2015.RData") #=================================================================== # get LCC of the network lcc = induced.subgraph(person,which(clusters(person)$membership==which.max(clusters(person)$csize))) V(lcc)$id = rank(V(lcc)$name) V(lcc)$ir_no = V(lcc)$name V(lcc)$name = V(lcc)$id vic_ids = which(V(lcc)$vic) lcc_edges = as_data_frame(lcc,'edges') # update lcc_verts lcc_verts = get.data.frame(lcc,'vertices') lcc_verts = lcc_verts[,c(1,23,24,2:21)] # save file save(lcc, lcc_verts, lcc_edges, vic_ids, file="lcc.RData") ##### # old stuff lcc_data = get.data.frame(lcc,'both') lcc = set.vertex.attribute(graph=lcc, name='name', value=V(lcc)$id) row.names(lcc_data$vertices) = lcc_data$vertices$id vertices = lcc_data$vertices[c('name','vic','vic.fatal','vic.nonfatal', 'fatal_day','nonfatal_day_1','nonfatal_day_2', 'nonfatal_day_3','nonfatal_day_4','nonfatal_day_5')] write.csv(vertices,file='lcc_vertices.csv') write.csv(lcc_data$edges,file='lcc_edges.csv') lcc = graph.data.frame(lcc_edges, directed=FALSE, vertices=lcc_verts[,c('name','vic','vic.fatal','vic.nonfatal')]) write.graph(lcc,'lcc.gml','gml')