library(igraph) setwd('~/Documents/Cascade Project/Raw Data/') #================ # (1) load data #================ #load all three sets of data arrests <- read.csv("2006to2014arrests2.csv", header=T, colClass=c("character")) #I need to add the "ir" for this to make sense when I "project" arrests$ir2 <- paste("ir", arrests$ir_no) ## Match arrests based on date, time, and location a = arrests[arrests$rd_no=='',] dtab = table(a$arrest_date) dates = attr(dtab,'name')[dtab>1] for (date in dates){ if (which(date==dates)%%10000==0) print(which(date==dates)) ids = which(a$arrest_date==date) grp = a[ids,] stab = table(grp$street_nme) streets = attr(stab,'name')[stab>1] for (street in streets){ arr_ids = as.numeric(rownames(grp[grp$street_nme==street,])) arrests$rd_no[arr_ids] = paste('rd',arr_ids[1]) } } # now make unique rd_nos for the other people arrested alone null_rds = which(arrests$rd_no=='') arrests$rd_no[null_rds] = paste('rd',null_rds) save(arrests,file='arrests.RData') #===================== # (2) Structure Data #===================== #get the fields we need for all three: incidents, mni, and "type" sub.arrests <- subset(arrests, select=c(as.character("rd_no"), as.character('ir2'))) colnames(sub.arrests) <- c("events", "individuals") #============================= # (3) Prep for making graphs #============================= individuals <- unique(sub.arrests$individuals) events <- unique(sub.arrests$events) if (any(individuals %in% events)) stop('vertex name collision') vertices <- data.frame(name=c(events, individuals), type=c(rep(FALSE, length(events)), rep(TRUE, length(individuals))), stringsAsFactors=FALSE) #=================================================================== # (4) Make the GRAPH file #=================================================================== g <- graph.data.frame(sub.arrests, vertices=vertices) #=================================================================== # Sanity check the resulting igraph object stopifnot(ecount(g) == nrow(sub.arrests)) stopifnot(vcount(g) == nrow(vertices)) names <- V(g)$name stopifnot(isTRUE(identical(sort(names), sort(vertices$name)))) inames <- V(g)[V(g)$type]$name stopifnot(isTRUE(identical(sort(inames), sort(individuals)))) enames <- V(g)[! V(g)$type]$name stopifnot(isTRUE(identical(sort(enames), sort(events)))) #=================================================================== #now do the converstion into a single network net1 <- bipartite.projection(g) person <- net1[[2]] #=================================================================== # (5) Define attributes on nodes #=================================================================== # set attributes from arrests file attribs <- arrests match_vector = match(V(person)$name, attribs$ir2) V(person)$sex <- as.character(attribs$sex_code_cd[match_vector]) V(person)$race <- as.character(attribs$race_code_cd[match_vector]) # V(person)$age <- as.character(attribs$age[match_vector]) V(person)$dob <- as.character(as.Date(attribs$birth_date[match_vector],format='%m/%d/%Y')) # January 1, 2006 is Day 1 of the study period start_date = as.Date("2005-12-31") ## Get first arrest date in the study period for each person sub.arrests$dates = as.Date(arrests$arrest_date,format='%m/%d/%Y') sub.arrests = sub.arrests[order(sub.arrests$dates),] sub.arrests = sub.arrests[match(unique(sub.arrests$individuals),sub.arrests$individuals),] arrest.dates = as.Date(sub.arrests$dates,format='%m/%d/%y') arrest.days = as.numeric(arrest.dates-start_date) V(person)$arrest.day = arrest.days[match(V(person)$name, sub.arrests$individuals)] V(person)$age.arrest = floor(difftime(arrest.dates[match(V(person)$name, sub.arrests$individuals)], V(person)$dob, units='days')/365.25) #=================================================================== # get victim attributes shootings <- read.csv("shooting-data-withdate2.csv", header = T) victims = shootings[shootings$INV_PARTY_TYPE_CD=="VIC",] victims = victims[!is.na(victims$IR_NO),] victims$ir2 <- paste("ir", victims$IR_NO) # get murder victim attributes murders = read.csv("murder-victims-13nov.csv", header=T) murders = murders[!is.na(murders$VICTIM_IR_NO),] murders = murders[murders$INJURY_DESCR=="SHOT",] murders = murders[match(unique(murders$VICTIM_IR_NO),murders$VICTIM_IR_NO),] murders = murders[as.Date(murders$INJURY_DATE,format='%m/%d/%y')>=start_date,] murders$ir2 = paste("ir", murders$VICTIM_IR_NO) # set victim data in network vtab = as.data.frame(table(victims$ir2)) match_vector = match(V(person)$name,vtab$Var1) V(person)$vic.nonfatal = vtab$Freq[match_vector] V(person)$vic.nonfatal[is.na(V(person)$vic.nonfatal)] = 0 V(person)$vic.fatal = V(person)$name %in% murders$ir2 V(person)$vic = V(person)$name %in% union(victims$ir2,murders$ir2) # add fatal shooting dates to the network match_vector = match(V(person)$name, murders$ir2) fatal_dates = murders$INJURY_DATE[match_vector] fatal_dates = as.character(as.Date(fatal_dates,format='%m/%d/%y')) V(person)$fatal_date = fatal_dates # add nonfatal shooting dates to the network match_vector = match(victims$ir2,V(person)$name) vics = which(V(person)$vic.nonfatal>0) nfd1 = nfd2 = nfd3 = nfd4 = nfd5 = rep(0,length(vics)) for(i in 1:length(vics)){ if (i%%3000==0) print(i) name = vics[i] ids = which(match_vector==name) dates = sort(as.Date(victims$INCIDENT_DATE[ids],format='%m/%d/%y')) nfd1[i] = as.character(dates[1]) nfd2[i] = as.character(dates[2]) nfd3[i] = as.character(dates[3]) nfd4[i] = as.character(dates[4]) nfd5[i] = as.character(dates[5]) } V(person)$nonfatal_date_1 = NA V(person)$nonfatal_date_2 = NA V(person)$nonfatal_date_3 = NA V(person)$nonfatal_date_4 = NA V(person)$nonfatal_date_5 = NA V(person)$nonfatal_date_1[vics] = nfd1 V(person)$nonfatal_date_2[vics] = nfd2 V(person)$nonfatal_date_3[vics] = nfd3 V(person)$nonfatal_date_4[vics] = nfd4 V(person)$nonfatal_date_5[vics] = nfd5 # convert dates into numeric values ("days") start_date V(person)$fatal_day = as.numeric(as.Date(V(person)$fatal_date)-start_date) V(person)$nonfatal_day_1 = as.numeric(as.Date(V(person)$nonfatal_date_1)-start_date) V(person)$nonfatal_day_2 = as.numeric(as.Date(V(person)$nonfatal_date_2)-start_date) V(person)$nonfatal_day_3 = as.numeric(as.Date(V(person)$nonfatal_date_3)-start_date) V(person)$nonfatal_day_4 = as.numeric(as.Date(V(person)$nonfatal_date_4)-start_date) V(person)$nonfatal_day_5 = as.numeric(as.Date(V(person)$nonfatal_date_5)-start_date) #=================================================================== # set gang attributes gangs <- read.csv("Sept2014-ganglist.csv", header=T) gangs = gangs[match(unique(gangs$IR_NO),gangs$IR_NO),] gangs$ir2 <- paste("ir", gangs$IR_NO) V(person)$gang.member <- V(person)$name %in% gangs$ir2 match_vector = match(V(person)$name, gangs$ir2) gnames = gangs$GANG_NAME[match_vector] gnames = as.character(gnames) gnames[V(person)$gang.member==''] = 'Unknown' gnames[V(person)$gang.member==F] = 'None' V(person)$gang.name <- as.character(gnames) V(person)$faction.name <- as.character(gangs$FACTION_NAME[match_vector]) #=================================================================== # create id number V(person)$id = rank(V(person)$name) # save data # person = remove.edge.attribute(person,'weight') # person_data = get.data.frame(person,'both') save(person, file="chi-19mar2015.RData") #=================================================================== # get LCC of the network lcc = induced.subgraph(person,which(clusters(person)$membership==which.max(clusters(person)$csize))) V(lcc)$ir_no = V(lcc)$name vic_ids = which(V(lcc)$vic) lcc_verts = get.data.frame(lcc,'vertices') lcc_edges = get.data.frame(lcc,'edges') save(lcc, lcc_verts, lcc_edges, vic_ids, file="lcc.RData") lcc_data = get.data.frame(lcc,'both') lcc = set.vertex.attribute(graph=lcc, name='name', value=V(lcc)$id) row.names(lcc_data$vertices) = lcc_data$vertices$id vertices = lcc_data$vertices[c('name','vic','vic.fatal','vic.nonfatal', 'fatal_day','nonfatal_day_1','nonfatal_day_2', 'nonfatal_day_3','nonfatal_day_4','nonfatal_day_5')] write.csv(vertices,file='lcc_vertices.csv') write.csv(lcc_data$edges,file='lcc_edges.csv') lcc = graph.data.frame(lcc_edges, directed=FALSE, vertices=lcc_verts[,c('name','vic','vic.fatal','vic.nonfatal')]) write.graph(lcc,'lcc.gml','gml')