summaryrefslogtreecommitdiffstats
path: root/R Scripts/lm.R
blob: f1bbae09d388c0d6215d55f393e6070c9ec573a1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
library(pROC)
library(igraph)
setwd('~/Documents/Cascade Project')
load('Raw Data/lcc.RData')
load('Results/lcc_bw.RData')

is_vic = V(lcc)$vic
vics = which(V(lcc)$vic==T)
nonvics = which(V(lcc)$vic==F)
thresh = length(vics)/length(union(vics,nonvics))

nbhdVics = function(neighbs){return(sum(is_vic[neighbs[-1]]))}
#setdiff(is_vic order-(order-1) to get only those at the given order
n1s = neighborhood(lcc,1,V(lcc))
n2s = neighborhood(lcc,2,V(lcc))
n1vics = unlist(lapply(n1s,nbhdVics))
n2vics = unlist(lapply(n2s,nbhdVics))

data = data.frame(vic         = V(lcc)$vic,
                  sex         = V(lcc)$sex,
                  race        = V(lcc)$race,
                  age         = V(lcc)$age,
                  gang.member = V(lcc)$gang.member,
                  gang.name   = V(lcc)$gang.name,
#                   fname       = V(lcc)$faction.name,
                  deg         = degree(lcc),
                  bw          = bw,
                  n1v         = n1vics,
                  n2v         = n2vics
  )
data$age = as.numeric(data$age)
data$gname = as.factor(data$gname)

train = 1:dim(data)[1]; test = 1:dim(data)[1]
# train = sort(union(sample(vics,length(vics)*0.8),sample(nonvics,length(nonvics)*0.8)))
# test = sort((1:length(is_vic))[-train])

formula = vic ~ sex + race + age + gmember + gname + deg + bw + n1v + n2v
formula = vic ~ sex + race + age + gmember + gname
formula = vic ~ deg + bw
formula = vic ~ n1v + n2v

glm.fit = glm(formula, data=data, family=binomial)
glm.probs = predict(glm.fit, newdata=data, type='response')
roc = roc(data$vic[test],glm.probs,ci=TRUE)
plot(roc,print.auc=TRUE)
roc$auc

lm.fit = lm(formula, data=data)
lm.probs = predict(lm.fit, newdata=data, type='response')
p = 1/(1+exp(-lm.probs))
z = log(lm.probs/(1-lm.probs))
logit = 1/(1+exp(-z))

save(glm.probs, file='Results/lm-probs.RData')

##########
lm.pred = lm.probs>thresh
tab = table(lm.pred,data$vic)
tab
(tab[1,1]+tab[2,2])/sum(tab)

propensity = as.numeric(pred)
similarity = 1 - abs(propensity[u]-propensity[v])

###############
# Random forest approach -- not as good
library(randomForest)
data$vic = factor(data$vic)
rf.fit = randomForest(formula,data=data[train,],do.trace=50,ntree=20)
rf.probs = (predict(rf.fit,newdata=data[test,],type='prob'))[,1]
roc = roc(data$vic[test],rf.probs,ci=TRUE)
plot(roc,print.auc=TRUE)