library(igraph)
setwd('~/Documents/Violence Cascades/Raw Data/')

#================
# (1) load data
#================

#load all three sets of data
arrests <- read.csv("2006to2014arrests2.csv", header=T, colClass=c("character"))

## Match arrest records (RD) based on date, time, and location
a = arrests[arrests$rd_no=='',]
dtab = table(a$arrest_date)
dates = attr(dtab,'name')[dtab>1]
for (date in dates){
  if (which(date==dates)%%10000==0) print(which(date==dates))
  ids = which(a$arrest_date==date)
  grp = a[ids,]
  stab = table(grp$street_nme)
  streets = attr(stab,'name')[stab>1]
  for (street in streets){
    arr_ids = as.numeric(rownames(grp[grp$street_nme==street,]))
    arrests$rd_no[arr_ids] = paste('rd',arr_ids[1])
  }
}
# now make unique rd_nos for the other people arrested alone
null_rds = which(arrests$rd_no=='')
arrests$rd_no[null_rds] = paste('rd',null_rds)

# clean up entries with null birthdate
null_bdate = "1/1/1900 0:00:00"
a = arrests[arrests$birth_date == null_bdate,]
for (i in 1:dim(a)[1]){
  if(i%%200==0)print(i)
  ir = a$ir_no[i]
  arr = arrests[arrests$ir_no==ir,]
  arr = arr[arr$birth_date != null_bdate,]
  if(dim(arr)[1]>0){
    arrests$birth_date[as.numeric(rownames(a[i,]))] = names(which.max(table(arr$birth_date)))
    arrests$o_street_nme[as.numeric(rownames(a[i,]))] = names(which.max(table(arr$o_street_nme)))
  }
} 
arrests = arrests[arrests$birth_date!=null_bdate,]

# Find individual records (IR) based on birthday, sex, race, address
a = arrests[arrests$ir_no=='',]
for (i in 1:dim(a)[1]){
  if(i%%200==0) print(i)
  bdate = a$birth_date[i]
  sex = a$sex_code_cd[i]
  race = a$race_code_cd[i]
  arr = arrests[arrests$birth_date==bdate,]
  arr = arr[arr$race_code_cd==race,]
  arr = arr[arr$sex_code_cd==sex,]
  if (dim(arr)[1]>1){
    street = a$o_street_nme[i]
    arr = arr[arr$o_street_nme==street,]
  }
  arr = arr[arr$ir_no != '',]
  if (dim(arr)[1]>0){
    arrests$ir_no[match(rownames(a[i,]),rownames(arrests))] = as.numeric(names(which.max(table(arr$ir_no))))
  }
}
# fill IRs for the rest of people 
a = arrests[arrests$ir_no=='',]
for (i in 1:dim(a)[1]){
  if(i%%200==0) print(i)
  if (arrests$ir_no[match(rownames(a[i,]),rownames(arrests))]==''){
    bdate = a$birth_date[i]
    sex = a$sex_code_cd[i]
    race = a$race_code_cd[i]
    arr = arrests[arrests$birth_date==bdate,]
    arr = arr[arr$race_code_cd==race,]
    arr = arr[arr$sex_code_cd==sex,]
    if (dim(arr)[1]>1){
      street = a$o_street_nme[i]
      arr = arr[arr$o_street_nme==street,]
    }
    arrests$ir_no[match(rownames(arr),rownames(arrests))] = 10000000+i
  }
}

# clean up entries where sex is missing
a = arrests[arrests$sex_code_cd=='X',]
for (i in 1:dim(a)[1]){
  ir = a$ir_no[i]
  arr = arrests[arrests$ir_no==ir,]
  arr = arr[arr$sex_code_cd != 'X',]
  if(dim(arr)[1]>0){# need to match rownames like districts?
    arrests$sex_code_cd[as.numeric(rownames(a[i,]))] = names(which.max(table(arr$sex_code_cd)))
  }
} 
arrests$sex_code_cd[arrests$sex_code_cd=='X'] = 'M'

##### residential districts
arrests$o_district[arrests$o_district=='31'] = ''
a = arrests[arrests$o_district=='' & arrests$o_city=='CHICAGO',]
for (i in 1:dim(a)[1]){
  if(i%%200==0) print(i)
  ir = a$ir_no[i]
  arr = arrests[arrests$ir_no==ir,]
  arr = arr[arr$o_district != '',]
  if(dim(arr)[1]>0){
    arrests$o_district[match(as.numeric(rownames(a[i,])),rownames(arrests))] = names(which.max(table(arr$o_district)))
  }
} 
arrests$o_district[arrests$o_district==''] = 0
# lcc_verts$district = arrests$o_district[match(lcc_verts$ir_no,arrests$ir2)]
# V(lcc)$district = arrests$o_district[match(lcc_verts$ir_no,arrests$ir2)]

#I need to add the "ir" for this to make sense when I "project"
arrests$ir2 <- paste("ir", arrests$ir_no)
# save altered arrests data
save(arrests,file='arrests.RData')

#=====================
# (2) Structure Data
#=====================

#get the fields we need for all three: incidents, mni, and "type"
sub.arrests <- subset(arrests, select=c(as.character("rd_no"), as.character('ir2')))
colnames(sub.arrests) <- c("events", "individuals")

#=============================
# (3) Prep for making graphs
#=============================

individuals <- unique(sub.arrests$individuals)

events <- unique(sub.arrests$events)

if (any(individuals %in% events))
  
  stop('vertex name collision')

vertices <- data.frame(name=c(events, individuals),
                       type=c(rep(FALSE, length(events)),
                              rep(TRUE, length(individuals))),
                       stringsAsFactors=FALSE)

#===================================================================
# (4) Make the GRAPH file
#===================================================================
g <- graph.data.frame(sub.arrests, vertices=vertices)


#===================================================================
# Sanity check the resulting igraph object
stopifnot(ecount(g) == nrow(sub.arrests))
stopifnot(vcount(g) == nrow(vertices))

names <- V(g)$name
stopifnot(isTRUE(identical(sort(names), sort(vertices$name))))

inames <- V(g)[V(g)$type]$name
stopifnot(isTRUE(identical(sort(inames), sort(individuals))))

enames <- V(g)[! V(g)$type]$name
stopifnot(isTRUE(identical(sort(enames), sort(events))))

#===================================================================
#now do the converstion into a single network 
net1 <- bipartite.projection(g)
person <- net1[[2]]

#===================================================================
# (5) Define attributes on nodes
#===================================================================
# set attributes from arrests file

attribs <- arrests
match_vector = match(V(person)$name, attribs$ir2)
V(person)$sex <- as.character(attribs$sex_code_cd[match_vector])
V(person)$race <- as.character(attribs$race_code_cd[match_vector])
# V(person)$age <- as.character(attribs$age[match_vector])
V(person)$dob <- as.character(as.Date(attribs$birth_date[match_vector],format='%m/%d/%Y'))

# January 1, 2006 is Day 1 of the study period
start_date = as.Date("2005-12-31")

## Get first arrest date in the study period for each person
sub.arrests$dates = as.Date(arrests$arrest_date,format='%m/%d/%Y')
sub.arrests = sub.arrests[order(sub.arrests$dates),]

#===================================================================

# get victim attributes
shootings <- read.csv("shooting-data-withdate2.csv", header = T)
victims = shootings[shootings$INV_PARTY_TYPE_CD=="VIC",]
victims = victims[!is.na(victims$IR_NO),]
victims$ir2 <- paste("ir", victims$IR_NO)

# get murder victim attributes
murders = read.csv("murder-victims-13nov.csv", header=T)
murders = murders[!is.na(murders$VICTIM_IR_NO),]
murders = murders[murders$INJURY_DESCR=="SHOT",]
murders = murders[match(unique(murders$VICTIM_IR_NO),murders$VICTIM_IR_NO),]
murders = murders[as.Date(murders$INJURY_DATE,format='%m/%d/%y')>start_date,]
murders$ir2 = paste("ir", murders$VICTIM_IR_NO)

# clear nonfatals that led to death
v = victims[victims$IR_NO %in% murders$VICTIM_IR_NO,]
rows = c()
for(i in 1:dim(v)[1]){
  row = which(rownames(victims)==as.numeric(rownames(v[i,])))
  m = murders[murders$VICTIM_IR_NO==v$IR_NO[i],]
  dup = as.Date(v$INCIDENT_DATE[i],format='%m/%d/%y') %in% as.Date(m$INJURY_DATE,format='%m/%d/%y')
  if(dup==T) rows = c(rows,row)
}
victims = victims[-rows,]

# set victim data in network
vtab = as.data.frame(table(victims$ir2))
match_vector = match(V(person)$name,vtab$Var1)
V(person)$vic.nonfatal = vtab$Freq[match_vector]
V(person)$vic.nonfatal[is.na(V(person)$vic.nonfatal)] = 0
V(person)$vic.fatal = V(person)$name %in% murders$ir2
V(person)$vic = V(person)$name %in% union(victims$ir2,murders$ir2)

# add fatal shooting dates to the network
match_vector = match(V(person)$name, murders$ir2)
fatal_dates = murders$INJURY_DATE[match_vector]
fatal_dates = as.character(as.Date(fatal_dates,format='%m/%d/%y'))
V(person)$fatal_date = fatal_dates

# add nonfatal shooting dates to the network
match_vector = match(victims$ir2,V(person)$name)
vics = which(V(person)$vic.nonfatal>0)
nfd1 = nfd2 = nfd3 = nfd4 = nfd5 = rep(0,length(vics))
for(i in 1:length(vics)){
  if (i%%3000==0) print(i)
  name = vics[i]
  ids = which(match_vector==name)
  dates = unique(sort(as.Date(victims$INCIDENT_DATE[ids],format='%m/%d/%y')))
#   if(!is.na(V(person)$fatal_date[i])) dates = dates[dates != V(person)$fatal_date[ids]]
  nfd1[i] = as.character(dates[1])
  nfd2[i] = as.character(dates[2])
  nfd3[i] = as.character(dates[3])
  nfd4[i] = as.character(dates[4])
  nfd5[i] = as.character(dates[5])
}
V(person)$nonfatal_date_1 = NA
V(person)$nonfatal_date_2 = NA
V(person)$nonfatal_date_3 = NA
V(person)$nonfatal_date_4 = NA
V(person)$nonfatal_date_5 = NA
V(person)$nonfatal_date_1[vics] = nfd1
V(person)$nonfatal_date_2[vics] = nfd2
V(person)$nonfatal_date_3[vics] = nfd3
V(person)$nonfatal_date_4[vics] = nfd4
V(person)$nonfatal_date_5[vics] = nfd5


# convert dates into numeric values ("days")
start_date
V(person)$fatal_day = as.numeric(as.Date(V(person)$fatal_date)-start_date)
V(person)$nonfatal_day_1 = as.numeric(as.Date(V(person)$nonfatal_date_1)-start_date)
V(person)$nonfatal_day_2 = as.numeric(as.Date(V(person)$nonfatal_date_2)-start_date)
V(person)$nonfatal_day_3 = as.numeric(as.Date(V(person)$nonfatal_date_3)-start_date)
V(person)$nonfatal_day_4 = as.numeric(as.Date(V(person)$nonfatal_date_4)-start_date)
V(person)$nonfatal_day_5 = as.numeric(as.Date(V(person)$nonfatal_date_5)-start_date)

#===================================================================
# set gang attributes
gangs <- read.csv("Sept2014-ganglist.csv", header=T)
gangs$ir2 <- paste("ir", gangs$IR_NO)

t = table(gangs$IR_NO)
t = t[t>1]
irs = as.numeric(attr(t,'name'))
for(ir in irs){
  if(which(ir==irs)%%1000==0)print(which(ir==irs))
  g = gangs[gangs$IR_NO==ir,]
  gangs$GANG_NAME[as.numeric(rownames(g))] = names(which.max(table(g$GANG_NAME)))
}

gangs = gangs[match(unique(gangs$IR_NO),gangs$IR_NO),]
gnames = as.character(gangs$GANG_NAME)
gnames[is.na(gnames)] = 'Unknown'

V(person)$gang.member <- V(person)$name %in% gangs$ir2

match_vector = match(V(person)$name, gangs$ir2)
gnames = gnames[match_vector]
gnames[V(person)$gang.member==F] = 'None'
V(person)$gang.name <- as.character(gnames)
# V(person)$faction.name <- as.character(gangs$FACTION_NAME[match_vector])

# clean up later to make this fit with process
t = table(V(person)$gang.name)
gs = names(t)[t<50]
V(person)$gang.name[V(person)$gang.name %in% gs] = 'Unknown'

#===================================================================

# save data
person = remove.edge.attribute(person,'weight')
# person_data = get.data.frame(person,'both')
save(person, file="chi-14sep2015.RData")

#===================================================================
# get LCC of the network
lcc = induced.subgraph(person,which(clusters(person)$membership==which.max(clusters(person)$csize)))
V(lcc)$id = rank(V(lcc)$name)
V(lcc)$ir_no = V(lcc)$name
V(lcc)$name = V(lcc)$id
vic_ids = which(V(lcc)$vic)
lcc_edges = as_data_frame(lcc,'edges')

# update lcc_verts
lcc_verts = get.data.frame(lcc,'vertices')
lcc_verts = lcc_verts[,c(1,23,24,2:21)]

# save file
save(lcc, lcc_verts, lcc_edges, vic_ids, file="lcc.RData")

#####
# old stuff
lcc_data = get.data.frame(lcc,'both')
lcc = set.vertex.attribute(graph=lcc, name='name', value=V(lcc)$id)
row.names(lcc_data$vertices) = lcc_data$vertices$id
vertices = lcc_data$vertices[c('name','vic','vic.fatal','vic.nonfatal',
                               'fatal_day','nonfatal_day_1','nonfatal_day_2',
                               'nonfatal_day_3','nonfatal_day_4','nonfatal_day_5')]

write.csv(vertices,file='lcc_vertices.csv')
write.csv(lcc_data$edges,file='lcc_edges.csv')

lcc = graph.data.frame(lcc_edges, directed=FALSE, 
                       vertices=lcc_verts[,c('name','vic','vic.fatal','vic.nonfatal')])
write.graph(lcc,'lcc.gml','gml')