library(tidyverse)
library(sf)
library(QuantPsyc)
library(RSocrata)
library(viridis)
library(caret)
library(spatstat)
library(spdep)
library(FNN)
library(grid)
library(gridExtra)
library(knitr)
library(kableExtra)
library(tidycensus)

mapTheme <- function(base_size = 12) {
  theme(
    text = element_text( color = "black"),
    plot.title = element_text(size = 14,colour = "black"),
    plot.subtitle=element_text(face="italic"),
    plot.caption=element_text(hjust=0),
    axis.ticks = element_blank(),
    panel.background = element_blank(),axis.title = element_blank(),
    axis.text = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.minor = element_blank()
  )
}
chicagoBoundary <- 
  st_read("C:/Users/Xiaoran Wang/Desktop/MUSA507/riskPrediction_data/chicagoBoundary.shp") %>%
  st_transform(crs=102271) 

wv <- 
  read.socrata("https://data.cityofchicago.org/Public-Safety/Crimes-2017/d62x-nvdr") %>% 
  filter(Primary.Type == "WEAPONS VIOLATION" & 
         Description == "UNLAWFUL POSS OF HANDGUN") %>%
  mutate(x = gsub("[()]", "", Location)) %>%
  separate(x,into= c("Y","X"), sep=",") %>%
  mutate(X = as.numeric(X),
         Y = as.numeric(Y)) %>% 
  na.omit %>%
  st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant")%>%
  st_transform(102271) %>% 
  distinct()

wv
fishnet <- 
  st_make_grid(chicagoBoundary, cellsize = 500) %>%
  st_sf()

fishnet <- 
  fishnet[chicagoBoundary,] %>%
  mutate(uniqueID = rownames(.)) %>%
  dplyr::select(uniqueID)

1. Outcome of Interest

The outcome of interest is weapon violation and the type is “unlawful poss of handgun”. Handguns can easily be concealed, so police would spend extra time and attention on searching. The over-policing and the uneven allocation of policing resources may cause certain areas of the city to have a higher rate of weapon violation. In addition, weapon violation tends to appear more in unsafe neighborhoods. These two factors may potentially introduce selection bias to the model.

ggplot() + 
  geom_sf(data = chicagoBoundary) +
  geom_sf(data = wv, colour="#47A897", size=0.1, show.legend = "point") +
  labs(title= "Weapon Violation, Chicago - 2017") +
  mapTheme()

2. Outcome of Interest in Fishnet

The grids with the brighter color represent more weapon violations. The map indicates the weapon violations mainly cluster around West-side Chicago, South Chicago and Far-south Chicago.

crime_net <- 
  wv %>% 
  dplyr::select() %>% 
  mutate(countwv = 1) %>% 
  aggregate(., fishnet, sum) %>%
  mutate(countwv = ifelse(is.na(countwv), 0, countwv),
         uniqueID = rownames(.),
         cvID = sample(round(nrow(fishnet) / 24), size=nrow(fishnet), replace = TRUE))
ggplot() +
  geom_sf(data = crime_net, aes(fill = countwv)) +
  scale_fill_viridis() +
  labs(title = "Count of weapons violations for the fishnet") +
  mapTheme()

3. Risk Factors

3.1 Types of risk factors

abandonCars <- 
  read.socrata("https://data.cityofchicago.org/Service-Requests/311-Service-Requests-Abandoned-Vehicles/3c9v-pnva") %>%
  mutate(year = substr(creation_date,1,4)) %>%
  filter(year == "2017") %>%
  dplyr::select(Y = latitude, X = longitude) %>%
  na.omit() %>%
  st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
  st_transform(st_crs(fishnet)) %>%
  mutate(Legend = "Abandoned_Cars")

abandonBuildings <- 
  read.socrata("https://data.cityofchicago.org/Service-Requests/311-Service-Requests-Vacant-and-Abandoned-Building/7nii-7srd") %>%
    mutate(year = substr(date_service_request_was_received,1,4)) %>%
    filter(year == "2017") %>%
    dplyr::select(Y = latitude, X = longitude) %>%
  na.omit() %>%
  st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
  st_transform(st_crs(fishnet)) %>%
  mutate(Legend = "Abandoned_Buildings")

graffiti <- 
  read.socrata("https://data.cityofchicago.org/Service-Requests/311-Service-Requests-Graffiti-Removal-Historical/hec5-y4x5") %>%
    mutate(year = substr(creation_date,1,4)) %>%
    filter(year == "2017") %>%
    filter(where_is_the_graffiti_located_ == "Front" |
           where_is_the_graffiti_located_ == "Rear" | where_is_the_graffiti_located_ == "Side") %>%
    dplyr::select(Y = latitude, X = longitude) %>%
  na.omit() %>%
  st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
  st_transform(st_crs(fishnet)) %>%
  mutate(Legend = "Graffiti")

streetLightsOut <- 
  read.socrata("https://data.cityofchicago.org/Service-Requests/311-Service-Requests-Street-Lights-All-Out/zuxi-7xem") %>%
    mutate(year = substr(creation_date,1,4)) %>%
    filter(year == "2017") %>%
    dplyr::select(Y = latitude, X = longitude) %>%
  na.omit() %>%
  st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
  st_transform(st_crs(fishnet)) %>%
  mutate(Legend = "Street_Lights_Out")

sanitation <-
  read.socrata("https://data.cityofchicago.org/Service-Requests/311-Service-Requests-Sanitation-Code-Complaints-Hi/me59-5fac") %>%
    mutate(year = substr(creation_date,1,4)) %>%
    filter(year == "2017") %>%
    dplyr::select(Y = latitude, X = longitude) %>%
  na.omit() %>%
  st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
  st_transform(st_crs(fishnet)) %>%
  mutate(Legend = "Sanitation")

liquorRetail <- 
  read.socrata("https://data.cityofchicago.org/Community-Economic-Development/Business-Licenses-Cur   rent-Liquor-and-Public-Places/nrmj-3kcf") %>%
  filter(BUSINESS.ACTIVITY == "Retail Sales of Packaged Liquor") %>%
  dplyr::select(Y = LATITUDE, X = LONGITUDE) %>%
  na.omit() %>%
  st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
  st_transform(st_crs(fishnet)) %>%
  mutate(Legend = "Liquor_Retail")

neighborhoods <- 
  st_read("https://raw.githubusercontent.com/blackmad/neighborhoods/master/chicago.geojson") %>%
  st_transform(st_crs(fishnet)) 

# busstop <- 
#   read.socrata("https://data.cityofchicago.org/Transportation/CTA-System-Information-Bus-Stop-Locations-in-Digit/5cq6-qygt/data") %>%
#   dplyr::select(Y = latitude, X = longitude) %>%
#   na.omit() %>%
#   st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
#   st_transform(st_crs(fishnet)) %>%
#   mutate(Legend = "Bus_Stop")

AffordableRentalHousing <-
  read.socrata("https://data.cityofchicago.org/Community-Economic-Development/Affordable-Rental-Housing-Developments/s6ha-ppgi/data") %>%
  dplyr::select(Y = latitude, X = longitude) %>%
  na.omit() %>%
  st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
  st_transform(st_crs(fishnet)) %>%
  mutate(Legend = "ARH")

policeStation <- 
   read.socrata("https://data.cityofchicago.org/Public-Safety/Police-Stations/z8bn-74gv/data") %>%
  dplyr::select(Y = latitude, X = longitude) %>%
  na.omit() %>%
  st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
  st_transform(st_crs(fishnet)) %>%
  mutate(Legend = "Police Station")

# metrostation <- 
#   st_read("C:/Users/Xiaoran Wang/Desktop/MUSA507/riskPrediction_data/MetraStations.shp") %>%
#   st_as_sf(coords = geometry, crs = 4326, agr = "constant") %>%
#   st_transform(st_crs(fishnet))%>%
#   mutate(Legend = "Metro_Stop")

# df_metro <- subset(metrostation, select=c("geometry", "Legend")) %>%
#   st_as_sf(coords = "geometry", crs = 4326, agr = "constant") %>%
#   st_transform(st_crs(fishnet))

There are total 8 risk factors in this model: - Abandoned buildings - Abandoned cars - Affordable rental housing - Graffiti - Liquor retail - Police station - Sanitation - Street lights out

It assumes that the distribution of police station would have negative correlation with the distribution of weapon violations. The other 7 factors would have positive correlations with the case of weapon violations.

ggplot() +
  geom_sf(data=chicagoBoundary) +
  geom_sf(data = rbind(abandonCars,streetLightsOut,
                       abandonBuildings,liquorRetail, 
                       graffiti, sanitation,
                       policeStation, AffordableRentalHousing), 
          size = .1) +
  facet_wrap(~Legend, ncol = 4) +
  labs(title = "Risk Factors") +  
  mapTheme()

3.2 Counts of risk factors by fishnet

It classified the number of each risk factor into grids and map them out. The police station is a strong binary factor: the grids may have one station or no station. The abandoned buildings, liquor retail, graffiti, and affordable rental housing are the factors that cluster in certain areas. The rest factors are more likely in even distributions.

vars_net <- 
  rbind(abandonCars,streetLightsOut,abandonBuildings,
        liquorRetail, graffiti, sanitation, policeStation, AffordableRentalHousing) %>%
  st_join(., fishnet, join=st_within) %>%
  st_set_geometry(NULL) %>%
  group_by(uniqueID, Legend) %>%
  summarize(count = n()) %>%
  full_join(fishnet) %>%
  spread(Legend, count, fill=0) %>%
  st_sf() %>%
  dplyr::select(-`<NA>`) %>%
  na.omit()

vars_net
vars_net.long <- 
  vars_net %>%
  gather(Variable, value, -geometry, -uniqueID)

vars <- unique(vars_net.long$Variable)
mapList <- list()

for(i in vars){
  mapList[[i]] <- 
    ggplot() +
      geom_sf(data = filter(vars_net.long, Variable == i), aes(fill=value), colour=NA) +
      scale_fill_viridis(name="") +
      labs(title=i) +
      mapTheme()}

do.call(grid.arrange,c(mapList, ncol =4, top = "Risk Factors by Fishnet"))

3.3 Distance from risk factors

nn_function <- function(measureFrom,measureTo,k) {
  measureFrom_Matrix <-
    as.matrix(measureFrom)
  measureTo_Matrix <-
    as.matrix(measureTo)
  nn <-   
    get.knnx(measureTo, measureFrom, k)$nn.dist
    output <-
      as.data.frame(nn) %>%
      rownames_to_column(var = "thisPoint") %>%
      gather(points, point_distance, V1:ncol(.)) %>%
      arrange(as.numeric(thisPoint)) %>%
      group_by(thisPoint) %>%
      summarize(pointDistance = mean(point_distance)) %>%
      arrange(as.numeric(thisPoint)) %>% 
      dplyr::select(-thisPoint) %>%
      pull()
  
  return(output)  
}
vars_net$Abandoned_Buildings.nn =
    nn_function(st_coordinates(st_centroid(vars_net)), st_coordinates(abandonBuildings), 3)
    
vars_net$Abandoned_Cars.nn =
    nn_function(st_coordinates(st_centroid(vars_net)), st_coordinates(abandonCars), 3)
    
vars_net$Graffiti.nn =
    nn_function(st_coordinates(st_centroid(vars_net)), st_coordinates(graffiti), 3)
    
vars_net$Liquor_Retail.nn =
    nn_function(st_coordinates(st_centroid(vars_net)), st_coordinates(liquorRetail), 3)

vars_net$Street_Lights_Out.nn =
    nn_function(st_coordinates(st_centroid(vars_net)), st_coordinates(streetLightsOut), 3)
    
vars_net$Sanitation.nn =
    nn_function(st_coordinates(st_centroid(vars_net)), st_coordinates(sanitation), 3)

vars_net$policeStation.nn=
    nn_function(st_coordinates(st_centroid(vars_net)), st_coordinates(policeStation), 3)
    
vars_net$AffordableRentalHousing.nn =
    nn_function(st_coordinates(st_centroid(vars_net)), st_coordinates(AffordableRentalHousing), 3)

Nearest neighbor distance is used to plot the distance to the risk factor. Brighter color represents longer distance. The south-end Chicago usualy have the longest distance to all risk factors.

vars_net.long.nn <- 
  vars_net %>%
  dplyr::select(ends_with(".nn")) %>%
  gather(Variable, value, -geometry, -uniqueID)

vars <- unique(vars_net.long.nn$Variable)
mapList <- list()

for(i in vars){
  mapList[[i]] <- 
    ggplot() +
      geom_sf(data = filter(vars_net.long.nn, Variable == i), aes(fill=value), colour=NA) +
      scale_fill_viridis(name="") +
      labs(title=i) +
      mapTheme()}

do.call(grid.arrange,c(mapList, ncol = 4, top = "Nearest Neighbor risk Factors by Fishnet"))

4. Local Moran’s I

loopPoint <-
  neighborhoods %>%
  filter(name == "Loop") %>%
  st_centroid()

vars_net$loopDistance =
  st_distance(st_centroid(vars_net),loopPoint) %>%
  as.numeric() 
policeDistricts <- 
  st_read("C:/Users/Xiaoran Wang/Desktop/MUSA507/riskPrediction_data/Boundaries-Police DistrictsCurrent.geojson") %>%
  st_transform(st_crs(fishnet)) 

final_net <-
  left_join(crime_net, st_set_geometry(vars_net, NULL), by="uniqueID") 

final_net <-
  st_centroid(final_net) %>%
    st_join(., dplyr::select(neighborhoods, name)) %>%
    st_join(., dplyr::select(policeDistricts, dist_label)) %>%
      st_set_geometry(NULL) %>%
      left_join(dplyr::select(final_net, geometry, uniqueID)) %>%
      st_sf() %>%
  na.omit()

dplyr::select(final_net, name, dist_label) %>%
  gather(Variable, Value, -geometry) 

The null hypothesis is that the data does not have spatial autocorrelation.

The smaller p-values in grids with more weapon violations reject the null hypothesis that the grids do not have spatial autocorrelation. Said differently, weapon violations strongly cluster at these grids and they have strong spatial autocorrelations. The large local Moran’s I value at those grids further indicate that the weapon violations specifically concentrate on those grids. The significant hotspots mainly locate at the grids with high p-values, and they are statistically significant.

final_net.nb <- poly2nb(final_net, queen=TRUE)
final_net.weights <- nb2listw(final_net.nb, style="W", zero.policy=TRUE)

final_net.localMorans <- 
  cbind(
    as.data.frame(localmoran(final_net$countwv, final_net.weights)),
    as.data.frame(final_net, NULL)) %>% 
    st_sf() %>%
    dplyr::select(wv_Count = countwv, 
                  Local_Morans_I = Ii, 
                  P_Value = `Pr(z > 0)`) %>%
    mutate(Significant_Hotspots = ifelse(P_Value <= 0.05, 1, 0)) %>%
    gather(Variable, Value, -geometry)
  
vars <- unique(final_net.localMorans$Variable)
varList <- list()

for(i in vars){
  varList[[i]] <- 
    ggplot() +
      geom_sf(data = filter(final_net.localMorans, Variable == i), aes(fill = Value), colour=NA) +
      scale_fill_viridis(name="") +
      labs(title=i) +
      mapTheme()}

do.call(grid.arrange,c(varList, ncol = 4, top = "Local Morans I statistics, Weapon violation"))

5. Scatterplot for Correlations

The weapon violation has the highest positive correlation with the number of abandoned buildings: the r of which is 0.57. Similarly, if the grid is far away from the abandon buildings, it tends to have a smaller number of weapon violations. There is no strong linear relationship between the number of weapon violations and the location of police stations, which does not match the assumption.

final_net <-
  final_net %>% 
  mutate(wv.isSig = ifelse(localmoran(final_net$countwv, 
                                            final_net.weights)[,5] <= 0.0000001, 1, 0)) %>%
  mutate(wv.isSig.dist = nn_function(st_coordinates(st_centroid(final_net)),
                                           st_coordinates(st_centroid(
                                             filter(final_net, wv.isSig == 1))), 1 ))
correlation.long <-
  st_set_geometry(final_net, NULL) %>%
    dplyr::select(-uniqueID, -cvID, -loopDistance, -name, -dist_label) %>%
    gather(Variable, Value, -countwv)

correlation.cor <-
  correlation.long %>%
    group_by(Variable) %>%
    summarize(correlation = cor(Value, countwv, use = "complete.obs"))
    
ggplot(correlation.long, aes(Value, countwv)) +
  geom_point(size = 0.1) +
  geom_text(data = correlation.cor, aes(label = paste("r =", round(correlation, 2))),
            x=-Inf, y=Inf, vjust = 1, hjust = -.1) +
  geom_smooth(method = "lm", se = FALSE, colour = "#a2d7d8") +
  facet_wrap(~Variable, ncol = 4, scales = "free") +
  labs(title = "Weapon violation count as a function of risk factors")

6. Histogram of Dependent Variable

The histogram indicates that the weapon violation is a relatively rare event. The histogram is heavily skewed to right, and the majority of the grids do not contain the case of weapon violation or only have a few weapon violations.

ggplot(final_net, aes(countwv)) + 
  geom_histogram(binwidth = 1) +
  labs(title = "Distribution of Weapon violation by grid cell")

7. Cross Validation

The code block below runs cross-validate to estimate four different regressions that are then compared below. reg.cv and reg.ss.cv perform random k-fold cross-validation using Just Risk Factors and the Spatial Structure features, respectively. reg.spatialCV and reg.ss.spatialCV perform LOGO spatial cross-validation using the aforementioned two groups of features.

k-fold cross validation uses the cvID. LOGO uses the neighborhood name.

7.1 Cross validation results

reg.vars <- c("Abandoned_Buildings.nn", "Abandoned_Cars.nn", "Graffiti.nn", 
              "Liquor_Retail.nn", "Street_Lights_Out.nn", "Sanitation.nn", 
              "policeStation.nn", "AffordableRentalHousing.nn", "loopDistance")

reg.ss.vars <- c("Abandoned_Buildings.nn", "Abandoned_Cars.nn", "Graffiti.nn", "Liquor_Retail.nn", 
                 "Street_Lights_Out.nn", "Sanitation.nn", 
                 "policeStation.nn", "AffordableRentalHousing.nn", "loopDistance", 
                 "wv.isSig", "wv.isSig.dist")
crossValidate <- function(dataset, id, dependentVariable, indVariables) {

allPredictions <- data.frame()
cvID_list <- unique(dataset[[id]])

for (i in cvID_list) {

  thisFold <- i
  cat("This hold out fold is", thisFold, "\n")

  fold.train <- filter(dataset, dataset[[id]] != thisFold) %>% as.data.frame() %>% 
                dplyr::select(id, geometry, indVariables, dependentVariable)
  fold.test  <- filter(dataset, dataset[[id]] == thisFold) %>% as.data.frame() %>% 
                dplyr::select(id, geometry, indVariables, dependentVariable)
  
  regression <-
    glm(countwv ~ ., family = "poisson", 
      data = fold.train %>% 
      dplyr::select(-geometry, -id))
  
  thisPrediction <- 
    mutate(fold.test, Prediction = predict(regression, fold.test, type = "response"))
    
  allPredictions <-
    rbind(allPredictions, thisPrediction)
    
  }
  return(st_sf(allPredictions))
}
reg.cv <- crossValidate(
  dataset = final_net,
  id = "cvID",
  dependentVariable = "countwv",
  indVariables = reg.vars) %>%
    dplyr::select(cvID = cvID, countwv, Prediction, geometry)

reg.ss.cv <- crossValidate(
  dataset = final_net,
  id = "cvID",
  dependentVariable = "countwv",
  indVariables = reg.ss.vars) %>%
    dplyr::select(cvID = cvID, countwv, Prediction, geometry)
  
reg.spatialCV <- crossValidate(
  dataset = final_net,
  id = "name",
  dependentVariable = "countwv",
  indVariables = reg.vars) %>%
    dplyr::select(cvID = name, countwv, Prediction, geometry)

reg.ss.spatialCV <- crossValidate(
  dataset = final_net,
  id = "name",
  dependentVariable = "countwv",
  indVariables = reg.ss.vars) %>%
    dplyr::select(cvID = name, countwv, Prediction, geometry)
reg.summary <- 
  rbind(
    mutate(reg.cv,           Error = countwv - Prediction,
                             Regression = "Random k-fold CV: Just Risk Factors"),
                             
    mutate(reg.ss.cv,        Error = countwv - Prediction,
                             Regression = "Random k-fold CV: Spatial Structure"),
    
    mutate(reg.spatialCV,    Error = countwv - Prediction,
                             Regression = "Spatial LOGO-CV: Just Risk Factors"),
                             
    mutate(reg.ss.spatialCV, Error = countwv - Prediction,
                             Regression = "Spatial LOGO-CV: Spatial Structure")) %>%
    st_sf() 

7.1.1 k-fold cross-validation

The k-Fold cross-validation is to test the generalizability of the model. In the map of predicted weapon violation cases, the “just risk factors” under-predicted the hotspots of the weapon violations. It can only provide a general tendency, but the grids with a larger number of observed values cannot be predicted. In contrast, the risk factors plus the spatial structure leads to model overpredicting. The grids near to the grids with large weapon violation numbers tend to be predicted as having more weapon violation cases. The overpredictions cluster in the area with a large number of weapon violations. Thus, the models are not generalized enough.

7.1.2 Spatial cross-validation

Similar to the k-fold cross-validation, the model performs better in predicting the grids with fewer weapon violations. Adding spatial features would cause overprediction, while just using risk factors would cause underprediction. Thus, there is a trade-off between them.

grid.arrange(
  reg.summary %>%
    ggplot() +
      geom_sf(aes(fill = Prediction)) +
      facet_wrap(~Regression) +
      scale_fill_viridis() +
      labs(title = "Predicted weapon violations \n by Regression") +
      mapTheme() + theme(legend.position="bottom"),

  filter(reg.summary, Regression == "Random k-fold CV: Just Risk Factors") %>%
    ggplot() +
      geom_sf(aes(fill = countwv)) +
      scale_fill_viridis() +
      labs(title = "Observed weapon violations\n") +
      mapTheme() + theme(legend.position="bottom"), ncol = 2)

7.2 Model errors by cross validation

7.2.1 Error distribution

The map of error for the “just risk” contains more error than that of “spatial structure”. The errors usually appear at grids with more weapon violations. However, with spatial structures, the model is able to give a better prediction on the neighboring grids of the grids with more weapon violations. In the map of errors, the neighboring grids usually have fewer errors. Thus, the maps indicate that spatial characteristics can improve model performance.

7.2.2 k-fold cross-validation vs. Error for spatial cross-validation

The spatial cross-validation performs better when predicting the neighboring grids of the grids with more weapon violations.

reg.summary%>%
  ggplot() +
  geom_sf(aes(fill = Error)) +
  facet_wrap(~Regression) +
  scale_fill_viridis() +
  labs(title = "Weapon violations errors by regression") +
  mapTheme()

8. MAE and Standard Deviation MAE by Regression

The table below can be interpret as the models error by roughly 1 weapon violations on an absolute basis and on average. It is acceptable value. The MAE keeps indicating that the regressions with the spatial structural features are much more accurate and more generalizable.

st_set_geometry(reg.summary, NULL) %>%
  group_by(Regression) %>% 
  summarize(MAE = round(mean(abs(Prediction - countwv), na.rm = T),2),
            SD_MAE = round(sd(abs(Prediction - countwv), na.rm = T),2)) %>% 
  kable(caption = "MAE by regression") %>%
    kable_styling("striped", full_width = F) %>%
    row_spec(2, color = "black", background = "#FDE725FF") %>%
    row_spec(4, color = "black", background = "#FDE725FF") 
MAE by regression
Regression MAE SD_MAE
Random k-fold CV: Just Risk Factors 1.18 1.86
Random k-fold CV: Spatial Structure 0.96 1.42
Spatial LOGO-CV: Just Risk Factors 1.20 1.91
Spatial LOGO-CV: Spatial Structure 0.97 1.46
st_set_geometry(reg.summary, NULL) %>%
  group_by(Regression) %>%
    mutate(wv_Decile = ntile(countwv, 10)) %>%
  group_by(Regression, wv_Decile) %>%
    summarize(meanObserved = mean(countwv, na.rm=T),
              meanPrediction = mean(Prediction, na.rm=T)) %>%
    gather(Variable, Value, -Regression, -wv_Decile) %>%          
    ggplot(aes(wv_Decile, Value, shape = Variable)) +
      geom_point(size = 2) + geom_path(aes(group = wv_Decile), colour = "black") +
      scale_shape_manual(values = c(2, 17)) +
      facet_wrap(~Regression) + xlim(0,10) +
      labs(title = "Predicted and observed Weapon violations by observed Weapon violations decile")

9. Raw Errors by Race Context for Cross Validation Regression

tracts17 <- 
  get_acs(geography = "tract", variables = c("B01001_001E","B01001A_001E"), 
          year = 2017, state=17, county=031, geometry=T) %>%
  st_transform(102271)  %>% 
  dplyr::select(variable, estimate, GEOID) %>%
  spread(variable, estimate) %>%
  rename(TotalPop = B01001_001,
         NumberWhites = B01001A_001) %>%
  mutate(percentWhite = NumberWhites / TotalPop,
         raceContext = ifelse(percentWhite > .5, "Majority_White", "Majority_Non_White")) %>%
  .[neighborhoods,]
# 
# ggplot() + 
#   geom_sf(data = tracts17, aes(fill = raceContext)) +
#   scale_fill_viridis(discrete = TRUE) +
#   labs(title = "Race Context", name="Race Context") +
#   mapTheme() 

In general, both methods overpredict the Majority_Non_White and underpredict the Majority_White. Using just risk factors tends to cause more over and under prediction. Said differently, the inclusion of spatial structures would effectively reduce the degree of over and under predictions. The spatial cross-validation models have higher likelihood on over and under predicting than k-fold cross-validation models do.

final_reg <-
  reg.summary %>%
  mutate(uniqueID = rownames(.))

final_reg.tracts <- 
  st_join(st_centroid(final_reg), tracts17) %>%
    st_set_geometry(NULL) %>%
    left_join(dplyr::select(final_reg, uniqueID)) %>%
    st_sf() %>%
    na.omit()

st_set_geometry(final_reg.tracts, NULL) %>%
  group_by(Regression, raceContext) %>%
  summarize(mean.Error = mean(Error, na.rm = T)) %>%
  spread(raceContext, mean.Error) %>%
  kable(caption = "Mean Error by neighborhood racial context") %>%
    kable_styling("striped", full_width = F) 
Mean Error by neighborhood racial context
Regression Majority_Non_White Majority_White
Random k-fold CV: Just Risk Factors 0.2489997 -0.2681716
Random k-fold CV: Spatial Structure 0.1081192 -0.1170272
Spatial LOGO-CV: Just Risk Factors 0.2908637 -0.2790828
Spatial LOGO-CV: Spatial Structure 0.1116163 -0.1193310

10. Kernel Density Map

The model narrowly edges out the kernel density in the highest risk category, and classifies extra grids as the highest risk category. It would help reallocate the limited policing resources.

library(raster)
wv_ppp <- as.ppp(st_coordinates(wv), W = st_bbox(final_net))
wv_KD.1000 <- spatstat::density.ppp(wv_ppp, 1000)
wv_KD.1500 <- spatstat::density.ppp(wv_ppp, 1500)
wv_KD.2000 <- spatstat::density.ppp(wv_ppp, 2000)
wv_KD.df <- rbind(
  mutate(data.frame(rasterToPoints(mask(raster(wv_KD.1000), as(neighborhoods, 'Spatial')))), Legend = "1000 Ft."),
  mutate(data.frame(rasterToPoints(mask(raster(wv_KD.1500), as(neighborhoods, 'Spatial')))), Legend = "1500 Ft."),
  mutate(data.frame(rasterToPoints(mask(raster(wv_KD.2000), as(neighborhoods, 'Spatial')))), Legend = "2000 Ft.")) 

wv_KD.df$Legend <- factor(wv_KD.df$Legend, levels = c("1000 Ft.", "1500 Ft.", "2000 Ft."))

# ggplot(data=wv_KD.df, aes(x=x, y=y)) +
#   geom_raster(aes(fill=layer)) + 
#   facet_wrap(~Legend) +
#   coord_sf(crs=st_crs(final_net)) + 
#   scale_fill_viridis() +
#   mapTheme()
wv_ppp <- as.ppp(st_coordinates(wv), W = st_bbox(final_net))
wv_KD <- spatstat::density.ppp(wv_ppp, 1000)

as.data.frame(wv_KD) %>%
  st_as_sf(coords = c("x", "y"), crs = st_crs(final_net)) %>%
  aggregate(., final_net, mean) %>%
   ggplot() +
     geom_sf(aes(fill=value)) +
     geom_sf(data = sample_n(wv, 1500), size = .5) +
     scale_fill_viridis() + 
     labs(title= "Kernel density with a 1000 ft radius - weapon violation") +
     mapTheme()

# Compute kernel density
wv_ppp <- as.ppp(st_coordinates(wv), W = st_bbox(final_net))
wv_KD <- spatstat::density.ppp(wv_ppp, 1000)
# Convert kernel density to grid cells taking the mean
wv_KDE_sf <- as.data.frame(wv_KD) %>%
  st_as_sf(coords = c("x", "y"), crs = st_crs(final_net)) %>%
  aggregate(., final_net, mean) %>%
#Mutate the Risk_Category field as defined below.
  mutate(label = "Kernel Density",
         Risk_Category = ntile(value, 100),
         Risk_Category = case_when(
           Risk_Category >= 90 ~ "90% to 100%",
           Risk_Category >= 70 & Risk_Category <= 89 ~ "70% to 89%",
           Risk_Category >= 50 & Risk_Category <= 69 ~ "50% to 69%",
           Risk_Category >= 30 & Risk_Category <= 49 ~ "30% to 49%",
           Risk_Category >= 1 & Risk_Category <= 29 ~ "1% to 29%")) %>%
#Bind to a layer where test set crime counts are spatially joined to the fisnnet.
  bind_cols(
    aggregate(
      dplyr::select(wv) %>% mutate(wvCount = 1), ., length) %>%
    mutate(wvCount = replace_na(wvCount, 0))) %>%
#Select the fields we need
  dplyr::select(label, Risk_Category, wvCount)

head(wv_KDE_sf)
wv_risk_sf <-
  filter(final_reg, Regression == "Spatial LOGO-CV: Spatial Structure") %>%
  mutate(label = "Risk Predictions",
         Risk_Category = ntile(Prediction, 100),
         Risk_Category = case_when(
           Risk_Category >= 90 ~ "90% to 100%",
           Risk_Category >= 70 & Risk_Category <= 89 ~ "70% to 89%",
           Risk_Category >= 50 & Risk_Category <= 69 ~ "50% to 69%",
           Risk_Category >= 30 & Risk_Category <= 49 ~ "30% to 49%",
           Risk_Category >= 1 & Risk_Category <= 29 ~ "1% to 29%")) %>%
  bind_cols(
    aggregate(
      dplyr::select(wv) %>% mutate(wvCount = 1), ., length) %>%
      mutate(wvCount = replace_na(wvCount, 0))) %>%
  dplyr::select(label,Risk_Category, wvCount)
rbind(wv_KDE_sf, wv_risk_sf) %>%
  gather(Variable, Value, -label, -Risk_Category, -geometry) %>%
  ggplot() +
    geom_sf(aes(fill = Risk_Category), colour = NA) +
    geom_sf(data = sample_n(wv, 1500), size = .1, colour = "black") +
    facet_wrap(~label, ) +
    scale_fill_viridis(discrete = TRUE) +
    labs(title="Comparison of Kernel Density and Risk Predictions",
         subtitle="Relative to test set points (in black)") +
    mapTheme()

11. Kernel Density vs. Predictions

The bar plot shows the kernel density of observed data and the predicting data. The purple bar represents the kernal density of the risk category in reality, and the yellow bar represents the kernal density of predicted values.

rbind(wv_KDE_sf, wv_risk_sf) %>%
  st_set_geometry(NULL) %>%
  gather(Variable, Value, -label, -Risk_Category) %>%
  group_by(label, Risk_Category) %>%
  summarize(countwvlaries = sum(Value)) %>%
  ungroup() %>%
  group_by(label) %>%
  mutate(Rate_of_test_set_crimes = countwvlaries / sum(countwvlaries)) %>%
    ggplot(aes(Risk_Category,Rate_of_test_set_crimes)) +
      geom_bar(aes(fill=label), position="dodge", stat="identity") +
      scale_fill_viridis(discrete = TRUE)

12. Conclusion

I do not recommend this algorithm be put into production. Though the number of errors is in an acceptable range and the MAE values indicate the model is able to provide a relatively reasonable prediction, the over and under prediction in different ethnic groups may intensify the problem of over-policing at certain areas or to certain ethnic groups. It would reduce the general equity across the neighborhoods and ethnic groups. Thus, the model does not effectively balance the selection bias of the dataset.

Weapon violation (unlawful poss of handguns) as the outcome of interest with more selection bias would potentially lead to over or under predictions on certain areas. Unsafe neighborhoods usually have more policing resources and more reporting. Besides, one can easily hide a handgun, which increases the difficulty of reporting and searching. The good point about this model that the risk predictions capture a greater share of observed weapon violation events in the highest risk category relative to the kernel density. To the area in the lowest risk category, it also suggests allocating more resources, which may help to reduce the selection bias and increase the level of safety. The rare weapon violation event would be effectively curbed. However, the algorithm does not perform well with the risk categories in the middle, and this underprediction potentially misleads the resources reallocation. The misleading plus the bias on different ethnic groups, I still do not recommend this algorithm.