diff options
| author | Ben Green <bgreen@g.harvard.edu> | 2015-06-08 15:21:51 -0400 |
|---|---|---|
| committer | Ben Green <bgreen@g.harvard.edu> | 2015-06-08 15:21:51 -0400 |
| commit | 1739e9f5706bb8a73de5dbf0b467de49ea040898 (patch) | |
| tree | 6f1d0f166986c5f0757be9b40d8eeb3409ab022c /R Scripts/lm.R | |
| parent | e5dada202c34521618bf82a086093c342841e5e8 (diff) | |
| download | criminal_cascades-1739e9f5706bb8a73de5dbf0b467de49ea040898.tar.gz | |
added my R scripts
Diffstat (limited to 'R Scripts/lm.R')
| -rwxr-xr-x | R Scripts/lm.R | 74 |
1 files changed, 74 insertions, 0 deletions
diff --git a/R Scripts/lm.R b/R Scripts/lm.R new file mode 100755 index 0000000..f1bbae0 --- /dev/null +++ b/R Scripts/lm.R @@ -0,0 +1,74 @@ +library(pROC) +library(igraph) +setwd('~/Documents/Cascade Project') +load('Raw Data/lcc.RData') +load('Results/lcc_bw.RData') + +is_vic = V(lcc)$vic +vics = which(V(lcc)$vic==T) +nonvics = which(V(lcc)$vic==F) +thresh = length(vics)/length(union(vics,nonvics)) + +nbhdVics = function(neighbs){return(sum(is_vic[neighbs[-1]]))} +#setdiff(is_vic order-(order-1) to get only those at the given order +n1s = neighborhood(lcc,1,V(lcc)) +n2s = neighborhood(lcc,2,V(lcc)) +n1vics = unlist(lapply(n1s,nbhdVics)) +n2vics = unlist(lapply(n2s,nbhdVics)) + +data = data.frame(vic = V(lcc)$vic, + sex = V(lcc)$sex, + race = V(lcc)$race, + age = V(lcc)$age, + gang.member = V(lcc)$gang.member, + gang.name = V(lcc)$gang.name, +# fname = V(lcc)$faction.name, + deg = degree(lcc), + bw = bw, + n1v = n1vics, + n2v = n2vics + ) +data$age = as.numeric(data$age) +data$gname = as.factor(data$gname) + +train = 1:dim(data)[1]; test = 1:dim(data)[1] +# train = sort(union(sample(vics,length(vics)*0.8),sample(nonvics,length(nonvics)*0.8))) +# test = sort((1:length(is_vic))[-train]) + +formula = vic ~ sex + race + age + gmember + gname + deg + bw + n1v + n2v +formula = vic ~ sex + race + age + gmember + gname +formula = vic ~ deg + bw +formula = vic ~ n1v + n2v + +glm.fit = glm(formula, data=data, family=binomial) +glm.probs = predict(glm.fit, newdata=data, type='response') +roc = roc(data$vic[test],glm.probs,ci=TRUE) +plot(roc,print.auc=TRUE) +roc$auc + +lm.fit = lm(formula, data=data) +lm.probs = predict(lm.fit, newdata=data, type='response') +p = 1/(1+exp(-lm.probs)) +z = log(lm.probs/(1-lm.probs)) +logit = 1/(1+exp(-z)) + +save(glm.probs, file='Results/lm-probs.RData') + +########## +lm.pred = lm.probs>thresh +tab = table(lm.pred,data$vic) +tab +(tab[1,1]+tab[2,2])/sum(tab) + +propensity = as.numeric(pred) +similarity = 1 - abs(propensity[u]-propensity[v]) + +############### +# Random forest approach -- not as good +library(randomForest) +data$vic = factor(data$vic) +rf.fit = randomForest(formula,data=data[train,],do.trace=50,ntree=20) +rf.probs = (predict(rf.fit,newdata=data[test,],type='prob'))[,1] +roc = roc(data$vic[test],rf.probs,ci=TRUE) +plot(roc,print.auc=TRUE) + |
