A bit more communicative, though the overplotting is somewhat distracting.

Code:

## ======================================================================
## Baltimore vacant properties & crime: per-neighborhood parcel maps.
## Reads the city / neighborhood / parcel shapefiles and the open-data
## CSVs (vacant buildings, vacant lots, Part 1 crime), reprojects the
## lat/long sources into the shapefiles' native CRS, then renders one
## map per neighborhood: parcel outlines, vacant parcels filled pink,
## and 2012 crime incidents jittered on top.
## ======================================================================

## GIS libraries
library(spBayes)
library(MBA)
library(geoR)
library(fields)
library(sp)
library(maptools)
library(rgdal)
library(classInt)
library(lattice)
library(xtable)     # (was loaded twice; deduplicated)
library(spatstat)
library(splancs)    # pip()
## Other packages
library(ggplot2)
library(foreign)    # read.dbf()
library(stringr)
library(lubridate)
library(plyr)
library(scales)
library(RColorBrewer)
library(grid)
library(ggmap)
library(gridExtra)
library(ggmcmc)

## NOTE(review): setwd() in a script is fragile -- kept because this is a
## personal analysis script, but the path only works on the author's box.
setwd('/home/rmealey/Dropbox/school/gisClass/FinalProject')
options(digits = 10)

## Save history and workspace under the project name.
Save <- function(projName) {
  savehistory(paste(projName, '.Rhistory', sep = ''))
  save.image(paste(projName, '.RData', sep = ''))
}
sv <- function() Save('FinalProject')

########################################################################
## Utility Functions

## Parse the "(lat, long)" strings in the Location.1 column into numeric
## lat/long columns. Rows with an empty Location.1 are skipped by the
## parse, so in_df should already be filtered to non-empty locations.
str2LatLong <- function(in_df) {
  latlng <- str_replace(str_replace(in_df$Location.1, '\\(', ''), ')', '')
  latlng <- str_split(latlng, ', ')
  latlng_df <- ldply(latlng[in_df$Location.1 != ''])
  out_df <- in_df
  out_df$lat <- as.numeric(latlng_df[, 1])
  out_df$long <- as.numeric(latlng_df[, 2])
  return(out_df)
}

## Reproject the long/lat columns of in_df from in_proj to out_proj,
## returning the data frame with long/lat overwritten in the new CRS.
convProj <- function(in_df, in_proj, out_proj) {
  latlong <- in_df[, c('long', 'lat')]
  latlong_spdf <- SpatialPoints(latlong, proj4string = in_proj)
  latlong_spdf <- spTransform(latlong_spdf, out_proj)
  latlong_spdf_coords <- coordinates(latlong_spdf)
  out_df <- in_df
  out_df$long <- latlong_spdf_coords[, 1]
  out_df$lat <- latlong_spdf_coords[, 2]
  return(out_df)
}

########################################################################
# City Boundary Shape File
city_df <- read.dbf('Baltcity_20Line/baltcity_line.dbf')
city_shp <- readOGR(dsn = 'Baltcity_20Line', layer = 'baltcity_line')
origProj <- city_shp@proj4string  ## Store original projection
#city_shp = spTransform(city_shp, CRS("+proj=longlat +datum=WGS84"))
city_pl_df <- fortify(city_shp, region = 'LABEL')
cityLineCoords <- data.frame(city_shp@lines[[1]]@Lines[[1]]@coords)
cityLinePoly <- Polygon(cityLineCoords)
cityLinePolys <- Polygons(list(cityLinePoly), ID = 'cityline')
cityLineSpPoly <- SpatialPolygons(list(cityLinePolys), proj4string = origProj)
cityLineCoords <- cityLineCoords[, c(2, 1)]  ## swap to (lat, long) order

########################################################################
# Neighborhood Shape Files
# Source:
## Neighborhood Shape Files read in v1
nbhds_df <- read.dbf('Neighborhood_202010/nhood_2010.dbf')
nbhds_shp <- readOGR(dsn = 'Neighborhood_202010', layer = 'nhood_2010')
origProj <- nbhds_shp@proj4string  ## Store original projection
#nbhds_shp = spTransform(nbhds_shp, CRS("+proj=longlat +datum=WGS84"))
nbhds_pl_df <- fortify(nbhds_shp, region = 'LABEL')
## Name the polygon list so neighborhoods can be looked up by label.
names(nbhds_shp@polygons) <- nbhds_shp@data$LABEL
## Neighborhood Shape Files read in v2 (from spatstat docs)
#nbhds_shp <- readShapePoly('Neighborhood_202010/nhood_2010.shp')
#nbhds_sp <- as(nbhds_shp, "SpatialPolygons")
#nbhds_owin <- as(nbhds_sp, "owin")
#centroids <- coordinates(nbhds_shp)

## Quick sanity plot of a single neighborhood boundary.
hoodNames <- 'Mount Vernon'
ggplot(data = nbhds_pl_df[nbhds_pl_df$id == hoodNames, ],
       aes(x = long, y = lat, group = group)) +
  geom_path() +
  ggtitle(hoodNames) +
  coord_equal()

########################################################################
# Parcel Shape Polygon Data
parcel_shp <- readOGR(dsn = 'Parcel_Shp', layer = 'parcel')
## Deduplicate polygons and dataframe by block/lot identifier.
parcel_shp2 <- parcel_shp[!duplicated(parcel_shp$BLOCKLOT), ]
parcel_mtrx <- as.matrix(coordinates(parcel_shp2))
colnames(parcel_mtrx) <- c('long', 'lat')
## Rownames carry BLOCKLOT so pip() results can be joined back.
rownames(parcel_mtrx) <- parcel_shp2$BLOCKLOT
parcel_shp2$Type <- NA

########################################################################
# Vacant Buildings
vacantBuildings_df <- read.csv('OpenDataSets/Vacant_Buildings.csv')
vacantBuildings_df <- str2LatLong(vacantBuildings_df)
inProj <- CRS("+proj=longlat +datum=WGS84")
outProj <- origProj
vacantBuildings_df <- convProj(vacantBuildings_df, inProj, outProj)
vacantBuildings_df$type <- 'Vacant Building'
vacBld_mtrx <- as.matrix(vacantBuildings_df[, c('long', 'lat')])
## Parcel polygons corresponding to vacant buildings.
vacantBuildings_parc <-
  parcel_shp2[parcel_shp2$BLOCKLOT %in% vacantBuildings_df$blockLot, ]

########################################################################
# Vacant Lots
# Source:
vacantLots_df <- read.csv('OpenDataSets/Vacant_Lots.csv')
vacantLots_df <- str2LatLong(vacantLots_df)
inProj <- CRS("+proj=longlat +datum=WGS84")
outProj <- origProj
vacantLots_df <- convProj(vacantLots_df, inProj, outProj)
vacantLots_df$type <- 'Vacant Lot'
vacantLots_mtrx <- as.matrix(vacantLots_df[, c('long', 'lat')])
vacantLots_parc <- parcel_shp2[parcel_shp2$BLOCKLOT %in% vacantLots_df$blockLot, ]

########################################################################
## Crime Data
crimeData <- read.csv('OpenDataSets/BPD_Part_1_Victim_Based_Crime_Data.csv')
crimeData_NoCoords <- crimeData[crimeData$Location.1 == '', ]
crimeData <- crimeData[crimeData$Location.1 != '', ]
## Get and convert projection
crimeData <- str2LatLong(crimeData)
## Incidents with lat > 100 are already in the projected CRS, not
## long/lat degrees; split them off before reprojecting the rest.
crimeData_ProjOrig <- crimeData[crimeData$lat > 100, ]
crimeData <- crimeData[crimeData$lat < 100, ]
inProj <- CRS("+proj=longlat +datum=WGS84")
outProj <- origProj
crimeData <- convProj(crimeData, inProj, outProj)
crime_mtrx <- as.matrix(crimeData[, c('long', 'lat')])
## Parse Dates
crimeData$crimeDate2 <- parse_date_time(crimeData$crimeDate,
                                        orders = '%m/%d/%Y')

## Per-crime-type subsets, each with 2012 held out, plus quick EDA plots.
## Get Burglary Incidents
burg_df <- subset(crimeData, description == 'BURGLARY')
burg_df_ho <- subset(burg_df, year(crimeDate2) == '2012')
burg_df <- subset(burg_df, year(crimeDate2) != '2012')
ggplot(data = burg_df, aes(x = long, y = lat)) + geom_point() + coord_equal()
## Get Street Robbery Incidents
robbStr_df <- subset(crimeData, description == "ROBBERY - STREET")
robbStr_df_ho <- subset(robbStr_df, year(crimeDate2) == '2012')
robbStr_df <- subset(robbStr_df, year(crimeDate2) != '2012')
ggplot(data = robbStr_df, aes(x = long, y = lat)) + geom_point() + coord_equal()
## Homicide
homic_df <- subset(crimeData, description == 'HOMICIDE')
homic_df_ho <- subset(homic_df, year(crimeDate2) == '2012')
homic_df <- subset(homic_df, year(crimeDate2) != '2012')
ggplot(data = homic_df, aes(x = long, y = lat)) + geom_point() + coord_equal()
## Aggravated Assault
aggrAslt_df <- subset(crimeData, description == 'AGG. ASSAULT')
aggrAslt_df_ho <- subset(aggrAslt_df, year(crimeDate2) == '2012')
aggrAslt_df <- subset(aggrAslt_df, year(crimeDate2) != '2012')
ggplot(data = aggrAslt_df, aes(x = long, y = lat)) + geom_point() + coord_equal()

########################################################################
# Shared plot theme and crime palette (defined ONCE; the original
# duplicated this whole block per neighborhood).
# https://github.com/wch/ggplot2/wiki/New-theme-system
new_theme_empty <- theme_bw()
new_theme_empty$line <- element_blank()
new_theme_empty$rect <- element_blank()
new_theme_empty$strip.text <- element_blank()
new_theme_empty$axis.text <- element_blank()
new_theme_empty$plot.title <- element_blank()
new_theme_empty$axis.title <- element_blank()
new_theme_empty$legend.position <- 'bottom'
new_theme_empty$plot.margin <- structure(c(0, 0, -1, -1),
                                         unit = "lines", valid.unit = 3L,
                                         class = "unit")

## Each crime type maps to (border color, fill color, legend glyph);
## related crime types share a color pair.
## BUG FIX: the original defined the glyph column only AFTER the first
## plot, so the first scale_shape_manual(values = crime_shapes) errored.
crimeCols <- brewer.pal(12, 'Paired')
crimeTypes <- list(
  'RAPE'                 = c(crimeCols[1],  crimeCols[2],  '\u2460'),
  'ARSON'                = c(crimeCols[1],  crimeCols[2],  '\u2461'),
  'COMMON ASSAULT'       = c(crimeCols[3],  crimeCols[4],  '\u2462'),
  'AGG. ASSAULT'         = c(crimeCols[3],  crimeCols[4],  '\u2463'),
  'SHOOTING'             = c(crimeCols[5],  crimeCols[6],  '\u2464'),
  'HOMICIDE'             = c(crimeCols[5],  crimeCols[6],  '\u2465'),
  'ROBBERY - STREET'     = c(crimeCols[7],  crimeCols[8],  '\u2466'),
  'ROBBERY - CARJACKING' = c(crimeCols[7],  crimeCols[8],  '\u2467'),
  'ROBBERY - RESIDENCE'  = c(crimeCols[7],  crimeCols[8],  '\u2468'),
  'ROBBERY - COMMERCIAL' = c(crimeCols[7],  crimeCols[8],  '\u2469'),
  'BURGLARY'             = c(crimeCols[9],  crimeCols[10], '\u24B6'),
  'LARCENY'              = c(crimeCols[9],  crimeCols[10], '\u24B7'),
  'AUTO THEFT'           = c(crimeCols[11], crimeCols[12], '\u24B8'),
  'LARCENY FROM AUTO'    = c(crimeCols[11], crimeCols[12], '\u24B9'))
crimeCols <- as.data.frame(t(data.frame(crimeTypes)))
col_cols <- crimeCols[, 2]
crime_shapes <- crimeCols[, 3]
names(col_cols) <- names(crimeTypes)
names(crime_shapes) <- names(crimeTypes)

## Render one neighborhood map (parcel outlines, vacant parcels pink,
## 2012 crime incidents jittered on top) and save it to img/<file_name>.png.
## Uses the globals parcel_mtrx, parcel_shp2, vacantBuildings_parc,
## vacantLots_parc, crimeData, crime_mtrx, col_cols, crime_shapes,
## crimeTypes, and new_theme_empty defined above.
plotNbhdVacantsCrime <- function(nbhd_name, plot_title,
                                 plot_title_x, plot_title_y, file_name) {
  ## Neighborhood border polygon
  nbhd_border_df <- fortify(nbhds_shp@polygons[[nbhd_name]])
  nbhd_mtrx <- as.matrix(nbhd_border_df[, 1:2])
  ## Parcels in nbhd (pip() result keeps BLOCKLOT rownames)
  props <- data.frame(pip(parcel_mtrx, nbhd_mtrx))
  polys <- parcel_shp2[parcel_shp2$BLOCKLOT %in% rownames(props), ]
  polys_df <- fortify(polys)
  ## Vacant parcels in nbhd
  vb <- vacantBuildings_parc[
    vacantBuildings_parc$BLOCKLOT %in% rownames(props), ]
  vl <- vacantLots_parc[vacantLots_parc$BLOCKLOT %in% rownames(props), ]
  ## 2012 crime in nbhd
  nbhd_crime <- data.frame(pip(crime_mtrx, nbhd_mtrx))
  nbhd_crime <- crimeData[rownames(nbhd_crime), ]
  crime_2012 <- subset(nbhd_crime, year(crimeDate2) == 2012)
  ## Fix legend ordering so colors/shapes pair up consistently.
  crime_2012$description <- ordered(crime_2012$description,
                                    levels = names(crimeTypes))
  ## NOTE: the original also ran colnames(sw_vacB)/colnames(sw_vacL)
  ## on variables that were never defined here -- dead leftovers removed.
  p <- ggplot(data = nbhd_border_df) +
    geom_polygon(aes(x = long, y = lat, group = group),
                 color = 'black', fill = 'white') +
    geom_path(data = polys_df, aes(x = long, y = lat, group = group),
              size = .3) +
    geom_polygon(data = vb, aes(x = long, y = lat, group = group),
                 color = 'black', fill = 'pink', size = .3) +
    geom_polygon(data = vl, aes(x = long, y = lat, group = group),
                 color = 'black', fill = 'pink', size = .3) +
    ## BUG FIX: alpha was the string '.8'; it must be numeric.
    geom_jitter(data = crime_2012,
                aes(x = long, y = lat,
                    color = description, shape = description),
                size = 2, alpha = 0.8) +
    scale_color_manual(values = col_cols) +
    scale_shape_manual(values = crime_shapes) +
    coord_equal() +
    annotate("text", x = plot_title_x, y = plot_title_y,
             label = plot_title, size = 6, color = "black") +
    new_theme_empty +
    guides(color = guide_legend("", nrow = 5),
           shape = guide_legend("", nrow = 5))
  ## BUG FIX: ggsave() was chained onto the plot with '+'; save
  ## explicitly with the plot passed in.
  ggsave(paste('img/', file_name, '.png', sep = ''), plot = p,
         width = 11, height = 8.5)
  invisible(p)
}

########################################################################
# Per-neighborhood maps (the original duplicated ~40 statements per
# neighborhood; now a single parameterized call each).
plotNbhdVacantsCrime(
  'Sandtown-Winchester',
  "Sandtown-\nWinchester\nVacant Properties\nand Crime",
  1415200, 598300, 'SandtownWinchesterVacantsandCrime')

plotNbhdVacantsCrime(
  'Harlem Park',
  "Harlem Park\nVacant Properties\nand Crime",
  1416400, 594500, 'HarlemParkVacantsandCrime')

and Harlem Park:

These aren't very polished; I'll put up better versions later.

Here's the code for those that want it:

## ======================================================================
## Baltimore vacant properties & crime: point-based neighborhood maps.
## Reads the city / neighborhood / parcel shapefiles and the open-data
## CSVs, reprojects everything to the shapefiles' native CRS, saves a
## blank city outline and a neighborhood-boundary map, then plots parcel
## centroids, vacant buildings, and 2012 crime per neighborhood.
## ======================================================================

## GIS libraries
library(spBayes)
library(MBA)
library(geoR)
library(fields)
library(sp)
library(maptools)
library(rgdal)
library(classInt)
library(lattice)
library(xtable)     # (was loaded twice; deduplicated)
library(spatstat)
library(splancs)    # pip()
## Other packages
library(ggplot2)
library(foreign)    # read.dbf()
library(stringr)
library(lubridate)
library(plyr)
library(scales)
library(RColorBrewer)
library(grid)
library(ggmap)
library(gridExtra)
library(ggmcmc)

########################################################################
## Utility Functions
## BUG FIX: this listing called str2LatLong()/convProj() without ever
## defining them; definitions restored from the companion script.

## Parse the "(lat, long)" strings in Location.1 into numeric lat/long
## columns. Rows with an empty Location.1 are skipped by the parse.
str2LatLong <- function(in_df) {
  latlng <- str_replace(str_replace(in_df$Location.1, '\\(', ''), ')', '')
  latlng <- str_split(latlng, ', ')
  latlng_df <- ldply(latlng[in_df$Location.1 != ''])
  out_df <- in_df
  out_df$lat <- as.numeric(latlng_df[, 1])
  out_df$long <- as.numeric(latlng_df[, 2])
  return(out_df)
}

## Reproject the long/lat columns of in_df from in_proj to out_proj.
convProj <- function(in_df, in_proj, out_proj) {
  latlong <- in_df[, c('long', 'lat')]
  latlong_spdf <- SpatialPoints(latlong, proj4string = in_proj)
  latlong_spdf <- spTransform(latlong_spdf, out_proj)
  latlong_spdf_coords <- coordinates(latlong_spdf)
  out_df <- in_df
  out_df$long <- latlong_spdf_coords[, 1]
  out_df$lat <- latlong_spdf_coords[, 2]
  return(out_df)
}

########################################################################
# City Boundary Shape File
city_df <- read.dbf('Baltcity_20Line/baltcity_line.dbf')
city_shp <- readOGR(dsn = 'Baltcity_20Line', layer = 'baltcity_line')
origProj <- city_shp@proj4string  ## Store original projection
#city_shp = spTransform(city_shp, CRS("+proj=longlat +datum=WGS84"))
city_pl_df <- fortify(city_shp, region = 'LABEL')
cityLineCoords <- data.frame(city_shp@lines[[1]]@Lines[[1]]@coords)
cityLinePoly <- Polygon(cityLineCoords)
cityLinePolys <- Polygons(list(cityLinePoly), ID = 'cityline')
cityLineSpPoly <- SpatialPolygons(list(cityLinePolys), proj4string = origProj)
cityLineCoords <- cityLineCoords[, c(2, 1)]

########################################################################
# Neighborhood Shape Files
# Source:
## Neighborhood Shape Files read in v1
nbhds_df <- read.dbf('Neighborhood_202010/nhood_2010.dbf')
nbhds_shp <- readOGR(dsn = 'Neighborhood_202010', layer = 'nhood_2010')
origProj <- nbhds_shp@proj4string  ## Store original projection
#nbhds_shp = spTransform(nbhds_shp, CRS("+proj=longlat +datum=WGS84"))
nbhds_pl_df <- fortify(nbhds_shp, region = 'LABEL')
names(nbhds_shp@polygons) <- nbhds_shp@data$LABEL
## Neighborhood Shape Files read in v2 (from spatstat docs)
#nbhds_shp <- readShapePoly('Neighborhood_202010/nhood_2010.shp')
#nbhds_sp <- as(nbhds_shp, "SpatialPolygons")
#nbhds_owin <- as(nbhds_sp, "owin")
#centroids <- coordinates(nbhds_shp)

## Quick sanity plot of a single neighborhood boundary.
hoodNames <- 'Mount Vernon'
ggplot(data = nbhds_pl_df[nbhds_pl_df$id == hoodNames, ],
       aes(x = long, y = lat, group = group)) +
  geom_path() +
  ggtitle(hoodNames) +
  coord_equal()

## NOTE(review): `bound_plot` was used below but never defined in this
## listing; reconstructed here as the bare city-boundary base plot --
## confirm against the original script.
bound_plot <- ggplot(data = city_pl_df,
                     aes(x = long, y = lat, group = group)) +
  geom_path() +
  coord_equal()

## plot actual city shape using empty nbhd boundaries
## BUG FIX: ggsave() defaulted to last_plot(), which was NOT the plot
## just assigned (it was never printed); pass plot= explicitly.
city_plot <- bound_plot +
  geom_polygon(data = nbhds_pl_df, fill = 'white', color = 'white')
ggsave('img/emptyCity.png', plot = city_plot)
## plot nbhd boundaries
nbhds_plot <- bound_plot +
  geom_polygon(data = nbhds_pl_df, color = 'gray', fill = 'white')
ggsave('img/nbhds.png', plot = nbhds_plot)

########################################################################
# Parcel Shape Polygon Data
# Source:
parcel_df <- read.dbf('Parcel_Shp/parcel.dbf')
parcel_shp <- readOGR(dsn = 'Parcel_Shp', layer = 'parcel')
parcel_df <- data.frame(parcel_df, coordinates(parcel_shp))
parcel_mtrx <- as.matrix(coordinates(parcel_shp))

########################################################################
# Vacant Buildings
# Source:
vacantBuildings_df <- read.csv('OpenDataSets/Vacant_Buildings.csv')
vacantBuildings_df <- str2LatLong(vacantBuildings_df)
inProj <- CRS("+proj=longlat +datum=WGS84")
outProj <- origProj
vacantBuildings_df <- convProj(vacantBuildings_df, inProj, outProj)
vacantBuildings_df$type <- 'Vacant Building'
vacBld_mtrx <- as.matrix(vacantBuildings_df[, c('long', 'lat')])

########################################################################
# Vacant Lots
# Source:
vacantLots_df <- read.csv('OpenDataSets/Vacant_Lots.csv')
vacantLots_df <- str2LatLong(vacantLots_df)
inProj <- CRS("+proj=longlat +datum=WGS84")
outProj <- origProj
vacantLots_df <- convProj(vacantLots_df, inProj, outProj)
vacantLots_df$type <- 'Vacant Lot'
vacantLots_mtrx <- as.matrix(vacantLots_df[, c('long', 'lat')])

########################################################################
## Crime data preprocessing
crimeData <- read.csv('OpenDataSets/BPD_Part_1_Victim_Based_Crime_Data.csv')
crimeData_NoCoords <- crimeData[crimeData$Location.1 == '', ]
crimeData <- crimeData[crimeData$Location.1 != '', ]
## Get and convert projection
crimeData <- str2LatLong(crimeData)
## Incidents with lat > 100 are already in the projected CRS; split
## them off before reprojecting the long/lat remainder.
crimeData_ProjOrig <- crimeData[crimeData$lat > 100, ]
crimeData <- crimeData[crimeData$lat < 100, ]
inProj <- CRS("+proj=longlat +datum=WGS84")
outProj <- origProj
crimeData <- convProj(crimeData, inProj, outProj)
crime_mtrx <- as.matrix(crimeData[, c('long', 'lat')])
## Parse Dates
crimeData$crimeDate2 <- parse_date_time(crimeData$crimeDate,
                                        orders = '%m/%d/%Y')

## Per-crime-type subsets, each with 2012 held out, plus quick EDA plots.
## Get Burglary Incidents
burg_df <- subset(crimeData, description == 'BURGLARY')
burg_df_ho <- subset(burg_df, year(crimeDate2) == '2012')
burg_df <- subset(burg_df, year(crimeDate2) != '2012')
ggplot(data = burg_df, aes(x = long, y = lat)) + geom_point() + coord_equal()
## Get Street Robbery Incidents
robbStr_df <- subset(crimeData, description == "ROBBERY - STREET")
robbStr_df_ho <- subset(robbStr_df, year(crimeDate2) == '2012')
robbStr_df <- subset(robbStr_df, year(crimeDate2) != '2012')
ggplot(data = robbStr_df, aes(x = long, y = lat)) + geom_point() + coord_equal()
## Homicide
homic_df <- subset(crimeData, description == 'HOMICIDE')
homic_df_ho <- subset(homic_df, year(crimeDate2) == '2012')
homic_df <- subset(homic_df, year(crimeDate2) != '2012')
ggplot(data = homic_df, aes(x = long, y = lat)) + geom_point() + coord_equal()
## Aggravated Assault
aggrAslt_df <- subset(crimeData, description == 'AGG. ASSAULT')
aggrAslt_df_ho <- subset(aggrAslt_df, year(crimeDate2) == '2012')
aggrAslt_df <- subset(aggrAslt_df, year(crimeDate2) != '2012')
ggplot(data = aggrAslt_df, aes(x = long, y = lat)) + geom_point() + coord_equal()

########################################################################
# Shared plot theme and crime palette (defined ONCE; the original
# duplicated this whole block per neighborhood).
# https://github.com/wch/ggplot2/wiki/New-theme-system
new_theme_empty <- theme_bw()
new_theme_empty$line <- element_blank()
new_theme_empty$rect <- element_blank()
new_theme_empty$strip.text <- element_blank()
new_theme_empty$axis.text <- element_blank()
new_theme_empty$plot.title <- element_blank()
new_theme_empty$axis.title <- element_blank()
new_theme_empty$legend.position <- 'bottom'
new_theme_empty$plot.margin <- structure(c(0, 0, -1, -1),
                                         unit = "lines", valid.unit = 3L,
                                         class = "unit")

## Each crime type maps to a (border, fill) color pair; related crime
## types share a pair.
crimeCols <- brewer.pal(12, 'Paired')
crimeTypes <- list(
  'RAPE'                 = c(crimeCols[1],  crimeCols[2]),
  'ARSON'                = c(crimeCols[1],  crimeCols[2]),
  'COMMON ASSAULT'       = c(crimeCols[3],  crimeCols[4]),
  'AGG. ASSAULT'         = c(crimeCols[3],  crimeCols[4]),
  'SHOOTING'             = c(crimeCols[5],  crimeCols[6]),
  'HOMICIDE'             = c(crimeCols[5],  crimeCols[6]),
  'ROBBERY - STREET'     = c(crimeCols[7],  crimeCols[8]),
  'ROBBERY - CARJACKING' = c(crimeCols[7],  crimeCols[8]),
  'ROBBERY - RESIDENCE'  = c(crimeCols[7],  crimeCols[8]),
  'ROBBERY - COMMERCIAL' = c(crimeCols[7],  crimeCols[8]),
  'BURGLARY'             = c(crimeCols[9],  crimeCols[10]),
  'LARCENY'              = c(crimeCols[9],  crimeCols[10]),
  'AUTO THEFT'           = c(crimeCols[11], crimeCols[12]),
  'LARCENY FROM AUTO'    = c(crimeCols[11], crimeCols[12]))
crimeCols <- as.data.frame(t(data.frame(crimeTypes)))
col_cols <- crimeCols[, 2]
names(col_cols) <- names(crimeTypes)

## Render one neighborhood map as points -- parcel centroids (gray
## squares), vacant buildings (red crosses), 2012 crime (colored 'o') --
## and save it to img/<file_name>.png. Extra arguments (e.g. width,
## height) are forwarded to ggsave().
plotNbhdVacantCrimePoints <- function(nbhd_name, plot_title,
                                      title_x, title_y, file_name,
                                      title_size = 6, point_size = 1.5,
                                      crime_size = 2, ...) {
  nbhd_df <- fortify(nbhds_shp@polygons[[nbhd_name]])
  nbhd_mtrx <- as.matrix(nbhd_df[, 1:2])
  props <- data.frame(pip(parcel_mtrx, nbhd_mtrx))
  vacB <- data.frame(pip(vacBld_mtrx, nbhd_mtrx))
  ## BUG FIX: vacant lots were filtered with vacBld_mtrx (copy-paste
  ## error); use the vacant-lots matrix.
  vacL <- data.frame(pip(vacantLots_mtrx, nbhd_mtrx))
  nbhd_crime <- data.frame(pip(crime_mtrx, nbhd_mtrx))
  nbhd_crime <- crimeData[rownames(nbhd_crime), ]
  crime_2012 <- subset(nbhd_crime, year(crimeDate2) == 2012)
  colnames(props) <- c('long', 'lat')
  colnames(vacB) <- c('long', 'lat')
  colnames(vacL) <- c('long', 'lat')
  p <- ggplot(data = nbhd_df) +
    geom_polygon(aes(x = long, y = lat, group = group),
                 color = 'black', fill = 'white') +
    geom_point(data = props, aes(x = long, y = lat),
               shape = 0, color = 'gray', size = point_size) +
    geom_point(data = vacB, aes(x = long, y = lat),
               shape = 4, color = 'red', size = point_size) +
    geom_point(data = crime_2012,
               aes(x = long, y = lat, color = description),
               shape = 'o', size = crime_size) +
    scale_color_manual(values = col_cols) +
    coord_equal() +
    annotate("text", x = title_x, y = title_y, label = plot_title,
             size = title_size, color = "black") +
    new_theme_empty +
    guides(color = guide_legend("", nrow = 5))
  ## BUG FIX: the original either chained ggsave() with '+' or relied on
  ## last_plot() for a plot that was never printed; pass plot= explicitly.
  ggsave(paste('img/', file_name, '.png', sep = ''), plot = p, ...)
  invisible(p)
}

########################################################################
# Per-neighborhood maps
plotNbhdVacantCrimePoints(
  'Sandtown-Winchester',
  "Sandtown-\nWinchester\nVacant Properties\nand Crime",
  1415200, 598300, 'SandtownWinchesterVacantsandCrime')

plotNbhdVacantCrimePoints(
  'Harlem Park',
  "Harlem Park\nVacant Properties\nand Crime",
  1416400, 594500, 'HarlemParkVacantsandCrime',
  title_size = 4, point_size = 5, crime_size = 3,
  width = 11, height = 8.5)

This plot is of 15,928 vacant buildings and 17,169 vacant lots (according to the datasets here) across the city of Baltimore:

Here are visualizations of the 2-dimensional kernel density estimates for both of them. A kernel density estimate assigns a value to every point on the plane that reflects how many observations of the variable or point process you care about lie nearby, and how close they are. So the more red an area is, the more densely vacant properties are clustered there.

And here are kernel density visualizations for homicide and aggravated assault:

And if you're interested, all the data is from here and here's the code:

## ======================================================================
## Baltimore citywide 2D kernel density maps.
## Loads shapefiles and point datasets (crime subsets, religious
## buildings, police stations, hospitals, CCTV cameras, vacant
## buildings/lots), computes a 2D Gaussian KDE for each on a shared
## extent, and saves one density map per dataset to img/.
## ======================================================================

## GIS libraries
library(spBayes)
library(MBA)
library(geoR)
library(fields)
library(sp)
library(maptools)
library(rgdal)
library(classInt)
library(lattice)
library(xtable)     # (was loaded twice; deduplicated)
library(spatstat)
library(splancs)
## BUG FIX: kde2d() comes from MASS, which was never loaded.
library(MASS)
## Other packages
library(ggplot2)
library(foreign)    # read.dbf()
library(stringr)
library(lubridate)
library(plyr)
library(scales)
library(RColorBrewer)
library(grid)
library(ggmap)
library(gridExtra)
library(ggmcmc)

## NOTE(review): setwd() in a script is fragile -- kept for the author's
## own environment.
setwd('/home/rmealey/Dropbox/school/gisClass/FinalProject')
options(digits = 10)

## Save history and workspace under the project name.
Save <- function(projName) {
  savehistory(paste(projName, '.Rhistory', sep = ''))
  save.image(paste(projName, '.RData', sep = ''))
}
sv <- function() Save('FinalProject')

########################################################################
# City Boundary Shape File
city_df <- read.dbf('Baltcity_20Line/baltcity_line.dbf')
city_shp <- readOGR(dsn = 'Baltcity_20Line', layer = 'baltcity_line')
origProj <- city_shp@proj4string  ## Store original projection
#city_shp = spTransform(city_shp, CRS("+proj=longlat +datum=WGS84"))
city_pl_df <- fortify(city_shp, region = 'LABEL')
cityLineCoords <- data.frame(city_shp@lines[[1]]@Lines[[1]]@coords)
cityLinePoly <- Polygon(cityLineCoords)
cityLinePolys <- Polygons(list(cityLinePoly), ID = 'cityline')
cityLineSpPoly <- SpatialPolygons(list(cityLinePolys), proj4string = origProj)
## BUG FIX: the original had a stray trailing ')' here (syntax error).
cityLineCoords <- cityLineCoords[, c(2, 1)]

########################################################################
## Neighborhood Shape Files read in v1
nbhds_df <- read.dbf('Neighborhood_202010/nhood_2010.dbf')
nbhds_shp <- readOGR(dsn = 'Neighborhood_202010', layer = 'nhood_2010')
origProj <- nbhds_shp@proj4string  ## Store original projection
#nbhds_shp = spTransform(nbhds_shp, CRS("+proj=longlat +datum=WGS84"))
nbhds_pl_df <- fortify(nbhds_shp, region = 'LABEL')

########################################################################
## Utility Functions

## Parse the "(lat, long)" strings in Location.1 into numeric lat/long
## columns. Rows with an empty Location.1 are skipped by the parse.
str2LatLong <- function(in_df) {
  latlng <- str_replace(str_replace(in_df$Location.1, '\\(', ''), ')', '')
  latlng <- str_split(latlng, ', ')
  latlng_df <- ldply(latlng[in_df$Location.1 != ''])
  out_df <- in_df
  out_df$lat <- as.numeric(latlng_df[, 1])
  out_df$long <- as.numeric(latlng_df[, 2])
  return(out_df)
}

## Reproject the long/lat columns of in_df from in_proj to out_proj.
convProj <- function(in_df, in_proj, out_proj) {
  latlong <- in_df[, c('long', 'lat')]
  latlong_spdf <- SpatialPoints(latlong, proj4string = in_proj)
  latlong_spdf <- spTransform(latlong_spdf, out_proj)
  latlong_spdf_coords <- coordinates(latlong_spdf)
  out_df <- in_df
  out_df$long <- latlong_spdf_coords[, 1]
  out_df$lat <- latlong_spdf_coords[, 2]
  return(out_df)
}

########################################################################
## Preprocess Crime Data
crimeData <- read.csv('OpenDataSets/BPD_Part_1_Victim_Based_Crime_Data.csv')
crimeData_NoCoords <- crimeData[crimeData$Location.1 == '', ]
crimeData <- crimeData[crimeData$Location.1 != '', ]
## Get and convert projection
crimeData <- str2LatLong(crimeData)
## Incidents with lat > 100 are already in the projected CRS; split
## them off before reprojecting the long/lat remainder.
crimeData_ProjOrig <- crimeData[crimeData$lat > 100, ]
crimeData <- crimeData[crimeData$lat < 100, ]
inProj <- CRS("+proj=longlat +datum=WGS84")
outProj <- origProj
crimeData <- convProj(crimeData, inProj, outProj)
## Parse Dates
crimeData$crimeDate2 <- parse_date_time(crimeData$crimeDate,
                                        orders = '%m/%d/%Y')

## Per-crime-type subsets, each with 2012 held out, plus quick EDA plots.
## Get Burglary Incidents
burg_df <- subset(crimeData, description == 'BURGLARY')
burg_df_ho <- subset(burg_df, year(crimeDate2) == '2012')
burg_df <- subset(burg_df, year(crimeDate2) != '2012')
ggplot(data = burg_df, aes(x = long, y = lat)) + geom_point() + coord_equal()
## Get Street Robbery Incidents
robbStr_df <- subset(crimeData, description == "ROBBERY - STREET")
robbStr_df_ho <- subset(robbStr_df, year(crimeDate2) == '2012')
robbStr_df <- subset(robbStr_df, year(crimeDate2) != '2012')
ggplot(data = robbStr_df, aes(x = long, y = lat)) + geom_point() + coord_equal()
## Homicide
homic_df <- subset(crimeData, description == 'HOMICIDE')
homic_df_ho <- subset(homic_df, year(crimeDate2) == '2012')
homic_df <- subset(homic_df, year(crimeDate2) != '2012')
ggplot(data = homic_df, aes(x = long, y = lat)) + geom_point() + coord_equal()
## Aggravated Assault
aggrAslt_df <- subset(crimeData, description == 'AGG. ASSAULT')
aggrAslt_df_ho <- subset(aggrAslt_df, year(crimeDate2) == '2012')
aggrAslt_df <- subset(aggrAslt_df, year(crimeDate2) != '2012')
ggplot(data = aggrAslt_df, aes(x = long, y = lat)) + geom_point() + coord_equal()

########################################################################
# Religious Building Locations
relig_df <- read.csv('geocoded/Religious_Buildings_gc.csv')
## Remove na rows (failed geocodes)
relig_df <- relig_df[complete.cases(relig_df), ]
inProj <- CRS("+proj=longlat +datum=WGS84")
outProj <- origProj
relig_df <- convProj(relig_df, inProj, outProj)

########################################################################
# Police Station Locations
police_df <- read.csv('geocoded/Police_Stations_gc.csv')
inProj <- CRS("+proj=longlat +datum=WGS84")
outProj <- origProj
police_df <- convProj(police_df, inProj, outProj)

########################################################################
# Hospitals Locations
hospitals_df <- read.csv('geocoded/Hospitals.csv')
inProj <- CRS("+proj=longlat +datum=WGS84")
outProj <- origProj
hospitals_df <- convProj(hospitals_df, inProj, outProj)

########################################################################
# CCTV Locations
cams_df <- read.csv('OpenDataSets/CCTV_Locations.csv')
cams_df <- str2LatLong(cams_df)
inProj <- CRS("+proj=longlat +datum=WGS84")
outProj <- origProj
cams_df <- convProj(cams_df, inProj, outProj)
cams_df$type <- "CCTV Camera"

########################################################################
# Vacant Buildings
vacantBuildings_df <- read.csv('OpenDataSets/Vacant_Buildings.csv')
vacantBuildings_df <- str2LatLong(vacantBuildings_df)
inProj <- CRS("+proj=longlat +datum=WGS84")
outProj <- origProj
vacantBuildings_df <- convProj(vacantBuildings_df, inProj, outProj)
vacantBuildings_df$type <- 'Vacant Building'

########################################################################
# Vacant Lots
vacantLots_df <- read.csv('OpenDataSets/Vacant_Lots.csv')
vacantLots_df <- str2LatLong(vacantLots_df)
inProj <- CRS("+proj=longlat +datum=WGS84")
outProj <- origProj
vacantLots_df <- convProj(vacantLots_df, inProj, outProj)
vacantLots_df$type <- 'Vacant Lot'

########################################################################
## Kernel density estimates

## Shared extent (burglary bounding box) so every density map is drawn
## on the same coordinate window and the images align.
kde2dRange <- c(apply(burg_df[, c('long', 'lat')], 2, range))

## 2D Gaussian KDE of in_df's long/lat points on an N x N grid over
## Lims; returns a long-format data frame with columns x, y, z.
getKde <- function(in_df, N = 400, Lims = kde2dRange) {
  pts <- as.matrix(in_df[, c('long', 'lat')])
  dens <- kde2d(pts[, 1], pts[, 2], n = N, lims = Lims)
  dens_df <- data.frame(expand.grid(dens$x, dens$y), z = c(dens$z))
  colnames(dens_df) <- c('x', 'y', 'z')
  return(dens_df)
}

## Tile plot of a KDE data frame using a reversed Spectral ramp
## (red = dense).
plotKde2d <- function(in_df) {
  fillCols <- rev(brewer.pal(11, 'Spectral'))
  return(
    ggplot() +
      geom_tile(data = in_df, aes(x = x, y = y, fill = z, group = 1)) +
      scale_fill_gradientn(colours = fillCols) +
      theme_bw() +
      coord_equal()
  )
}

## Render a KDE map with neighborhood boundaries overlaid and a title
## annotation, then save it to img/<plotName>.png.
saveKde2Plot <- function(plotDf, plotName, plotTitle, titlCol = 'white') {
  # https://github.com/wch/ggplot2/wiki/New-theme-system
  new_theme_empty <- theme_bw()
  new_theme_empty$line <- element_blank()
  new_theme_empty$rect <- element_blank()
  new_theme_empty$strip.text <- element_blank()
  new_theme_empty$axis.text <- element_blank()
  new_theme_empty$plot.title <- element_blank()
  new_theme_empty$axis.title <- element_blank()
  new_theme_empty$legend.position <- 'none'
  new_theme_empty$plot.margin <- structure(c(0, 0, -1, -1),
                                           unit = "lines", valid.unit = 3L,
                                           class = "unit")
  nbhds_pl_df2 <- nbhds_pl_df[, c('long', 'lat', 'group')]
  colnames(nbhds_pl_df2) <- c('x', 'y', 'group')
  ## BUG FIX: the plot was built but never assigned or printed, so the
  ## following ggsave() (defaulting to last_plot()) could save the wrong
  ## plot. Build it into p and pass it explicitly.
  p <- plotKde2d(plotDf) +
    geom_path(data = nbhds_pl_df2, aes(x = x, y = y, group = group),
              color = 'black', alpha = 0.4) +
    new_theme_empty +
    annotate("text", x = 1405000, y = 568000, label = plotTitle,
             size = 8, color = titlCol)
  ggsave(paste('img/', plotName, '.png', sep = ''), plot = p)
  invisible(p)
}

## Get all simple gaussian 2d kernel density estimates
burgDens <- getKde(burg_df)                  ## Burglary, 7
robbStrDens <- getKde(robbStr_df)            ## Street Robbery, 7
homicDens <- getKde(homic_df)                ## Homicide, 7
aggrAsltDens <- getKde(aggrAslt_df)          ## Aggr Assault, 7
hospitalsDens <- getKde(hospitals_df)        ## Hospitals
policeDens <- getKde(police_df)              ## Police Stations
religDens <- getKde(relig_df)                ## Religious Buildings
camsDens <- getKde(cams_df)                  ## Cameras, 1
vacBldDens <- getKde(vacantBuildings_df)     ## Vacant Buildings, 5
vacLotsDens <- getKde(vacantLots_df)         ## Vacant Lots, 6

## plot densities
saveKde2Plot(burgDens, 'BurglaryKde2d', 'Burglary\n Density')
saveKde2Plot(robbStrDens, 'StreetRobberyKde2d',
             'Street\n Robbery\n Density')
saveKde2Plot(homicDens, 'HomicideKde2d', 'Homicide\n Density')
saveKde2Plot(aggrAsltDens, 'aggrAsltKde2d',
             'Aggravated\n Assault\n Density')
saveKde2Plot(hospitalsDens, 'HospitalKde2d',
             'Hospital\n Location\n Density')
saveKde2Plot(policeDens, 'PoliceKde2d', 'Police\n Station\n Density')
## BUG FIX: "Religous" -> "Religious" in the plot label.
saveKde2Plot(religDens, 'ReligiousKde2d',
             'Religious\n Building\n Density')
saveKde2Plot(camsDens, 'CCTVCamsKde2d', 'CCTV\n Cameras\n Density')
saveKde2Plot(vacBldDens, 'VacBldgKde2d', 'Vacant\n Building\n Density')
saveKde2Plot(vacLotsDens, 'vacLotsKde2d', 'Vacant\n Lot\n Density')

I collect such projects. There are about a dozen currently on a list that I have invested some amount of time in. At the current rate, I will finish about 12 by the time I die...but the list will have quadrupled.

My wife and I recently purchased a home in the Mount Vernon neighborhood of Baltimore, moving up from Washington, DC. One of Baltimore's many nicknames is "the City of Neighborhoods", and it is probably the most apt. The city is full of clusters, and arbitrary but obvious lines that separate this place from that place, and these people from those people.

The only exercise regime that I have been able to get myself to stick to over the years is running outside, no matter the weather. This is because the only way I can trick myself into keeping moving is to give myself an artificial destination somewhere X miles away or to give myself a direction to run in towards places I haven't yet been. It's a way for me to romanticize the process of making sure my stress levels stay manageable and my body doesn't slowly atrophy in front of this computer.

This habit has allowed me to cross a lot of those lines in a relatively short time here and I've tried within reason to cross some that maybe white dudes in jogging pants aren't expected to cross. No matter where you are in this city, one of those particular lines isn't far and once you cross one, you know it.

All that to say that I'm currently finishing up an intro to analytics in GIS class, and for my final project I chose one of those interests I'd collected but done very little about: using the fantastic wealth of data here to learn more about this city that I'm now calling home.

I'm building a lot of maps using good old ggplot2 for this project, and they're so pretty. There's already lots of ggplot2 mapping blog posts but in the interest of sharing that pretty, here's another.

Obviously:

## Crime Incident Plots library(ggplot2) library(foreign) library(stringr) library(lubridate) library(plyr) library(xtable) library(scales) library(RColorBrewer) library(ggmap) ## gis libraries library(maptools) library(sp) library(rgdal) library(spatstat) |

Then pulling in the data - shape files - using some of the great (but mostly HORRIBLY documented) GIS packages available in R, first the city boundary:

## Read the city-boundary shapefile: dsn is the directory, layer the file stem
city_shp <- readOGR(dsn='Baltcity_20Line', layer='baltcity_line')

and I store the original map projection. I've always had a bit of a map fetish, and learning details about the different projections have been way more fun than they should be. First thing to note is, these shapefiles are not in the latitude/longitude coordinate system. If I want to convert them to lat/long, there's a function for that:

`#city_shp <- spTransform(city_shp,CRS("+proj=longlat +datum=WGS84"))` |

But it's commented out because I don't want to do that. The projection they're currently in allows me to treat the distances between points as though they were on a plane, as opposed to a sphere. This is ok as my window of analysis is fairly small (just Bmore) and makes clustering and model fitting much more simple mathematically. It allows me to use more general tools in that part of my analysis. In fact, I'll store the original projection, and convert other data given to me in lat/long to it later on:

## Keep the shapefile's native (planar) CRS so lat/long datasets can be
## reprojected into it later.
origProj <- city_shp@proj4string ## Store original projection

ggplot2 only takes data frames, so I gotta convert the shape files to a data frame representation:

## ggplot2 only plots data frames: flatten the spatial object into a
## long/lat/group data frame, keyed on the 'LABEL' attribute.
city_pl_df <- fortify(city_shp, region='LABEL')

For all the city-wide plots, I use the city line as the first layer, so I'm going to store it as my "bound" blot and gray out the surrounding area in the plot background:

## Base layer for all city-wide maps: the city outline filled light blue,
## equal-aspect coordinates, and all chart furniture removed.
bound_plot <- ggplot(data = city_pl_df,
                     aes(x = long, y = lat, group = group)) +
  geom_polygon(color = 'gray', fill = 'lightblue') +
  coord_equal() +
  theme_nothing()

By itself, eh:

So how about all those neighborhoods then? Pull in the shape files and convert them to a data frame the same way:

## Neighborhood shapefiles, read the same way as the city boundary
nbhds_df  <- read.dbf('Neighborhood_202010/nhood_2010.dbf')
nbhds_shp <- readOGR(dsn = 'Neighborhood_202010', layer = 'nhood_2010')

# NOTE(review): this overwrites the origProj stored from the city
# boundary -- presumably both shapefiles share the same CRS; confirm.
origProj <- nbhds_shp@proj4string ## Store original projection

# Deliberately left in the native planar projection:
#nbhds_shp <- spTransform(nbhds_shp,CRS("+proj=longlat +datum=WGS84"))

nbhds_pl_df <- fortify(nbhds_shp, region = 'LABEL')

and THIS is why Baltimore is the "City of Neighborhoods":

## Overlay every neighborhood boundary on the city outline
nbhds_plot <- bound_plot +
  geom_path(data = nbhds_pl_df, color = 'gray')

I'm looking at lots of different datasets for this project. Some are point datasets, like 311 calls and crime incidents. Some are region or place data, like building footprints, or land use. And others are pre-summarized data by area, like demographic or economic data at the census block group or neighborhood level. Visualizing your data is important in all types of analysis, but in GIS data, it's essential. For instance, crime incidents. The crime data here locked and loaded like:

## BPD Part 1 victim-based crime incidents, one row per reported incident
crimeData <- read.csv('OpenDataSets/BPD_Part_1_Victim_Based_Crime_Data.csv')

The data are 285,415 individual incidents reported by victims of crime, in the categories:

- AGG. ASSAULT: 31,507 incidents
- ARSON: 1,948 incidents
- AUTO THEFT: 26,954 incidents
- BURGLARY: 45,168 incidents
- COMMON ASSAULT: 54,226 incidents
- HOMICIDE: 1,342 incidents
- LARCENY: 57,247 incidents
- LARCENY FROM AUTO: 40,260 incidents
- RAPE: 1,170 incidents
- ROBBERY - CARJACKING: 1,225 incidents
- ROBBERY - COMMERCIAL: 3,592 incidents
- ROBBERY - RESIDENCE: 2,720 incidents
- ROBBERY - STREET: 15,288 incidents
- SHOOTING: 2,768 incidents

The coordinates are given as text, so:

## Parse "(lat, long)" text coordinates out of Location.1.
## Only rows with a non-empty Location.1 are parsed; the rest get NA.
## (The previous version assigned the *filtered* parse results straight
## into crimeData$lat/long for *all* rows -- a length mismatch whenever
## any row is missing coordinates, as ~4,477 rows are here.)
hasLoc <- crimeData$Location.1 != ''
latlng <- str_replace(str_replace(crimeData$Location.1, '\\(', ''), ')', '')
latlng <- str_split(latlng, ', ')
latlng_df <- ldply(latlng[hasLoc])
crimeData$lat  <- NA_real_
crimeData$long <- NA_real_
crimeData$lat[hasLoc]  <- as.numeric(latlng_df[, 1])
crimeData$long[hasLoc] <- as.numeric(latlng_df[, 2])

The coordinates are given mostly (4,477 rows with no coordinates, and 6 rows in the same projection as the shapefiles) in latitude/longitude, and like I said before, distance between two points in lat/long gives distance on the surface of a sphere. so I gotta convert it:

## Convert lat/long to Maryland grid, reusing the convProj() helper
## defined above instead of duplicating its body inline.
## Rows with missing coordinates are skipped: SpatialPoints() rejects
## NA coordinates, so they would otherwise abort the conversion.
inProj <- CRS("+proj=longlat +datum=WGS84")
okCoords <- complete.cases(crimeData[, c('long', 'lat')])
crimeData[okCoords, c('long', 'lat')] <-
  convProj(crimeData[okCoords, ], inProj, origProj)[, c('long', 'lat')]

When I'm doing this kind of exploratory visualization, I like to store my plot parameters in a named list like this:

## Light/dark Paired-palette color pairs, one pair per crime family:
## element 1 (light) is used for incident points, element 2 (dark)
## for the density contours.
crimeCols <- brewer.pal(12, 'Paired')

# Adjacent light/dark pair starting at palette index i
pairFor <- function(i) c(crimeCols[i], crimeCols[i + 1])

crimeTypes <- list(
  'RAPE'                 = pairFor(1),
  'ARSON'                = pairFor(1),
  'COMMON ASSAULT'       = pairFor(3),
  'AGG. ASSAULT'         = pairFor(3),
  'SHOOTING'             = pairFor(5),
  'HOMICIDE'             = pairFor(5),
  'ROBBERY - STREET'     = pairFor(7),
  'ROBBERY - CARJACKING' = pairFor(7),
  'ROBBERY - RESIDENCE'  = pairFor(7),
  'ROBBERY - COMMERCIAL' = pairFor(7),
  'BURGLARY'             = pairFor(9),
  'LARCENY'              = pairFor(9),
  'AUTO THEFT'           = pairFor(11),
  'LARCENY FROM AUTO'    = pairFor(11)
)
crimeTypeNames <- names(crimeTypes)

Because that lets me loop through and plot all the subsets much more easily.

## Plot incidents + 2d KDE contours for every crime type, one png each
for (crimeType in crimeTypeNames){
  ## File-safe title: spaces -> underscores, then collapse the '_-_'
  ## left behind by names like 'ROBBERY - STREET'
  ttl <- str_replace_all(str_replace_all(crimeType, '\\s', '_'), '_-_', '_')
  crimeDataSubset <- subset(crimeData, description == crimeType)
  p <- nbhds_plot +
    geom_point(data = crimeDataSubset, aes(group = 1),
               shape = 'x',
               color = crimeTypes[[crimeType]][[1]],
               alpha = 0.8) +   # alpha must be numeric, not the string '.8';
                                # 'guide' is not a geom_point() argument, dropped
    stat_density2d(data = crimeDataSubset, aes(group = 1),
                   color = crimeTypes[[crimeType]][[2]]) +
    annotate("text", x = 1405000, y = 565000,
             label = str_replace_all(str_replace(ttl, '_', '\n'), '_', ' '),
             size = 8)
  ## ggsave() cannot be '+'-ed onto a ggplot chain (it is not a layer,
  ## and would save last_plot() rather than p); call it separately and
  ## pass the plot explicitly.
  ggsave(paste0('img/', ttl, '.png'), plot = p)
}

The loop above plots incidents and 2d kernel density estimates for all the crime types, allowing us to compare and contrast.

This allows us to see that while people get beat up all over the city...

...they really get shot and/or killed in mostly just East or West Baltimore.

And while people steal FROM cars downtown a lot...

...they steal the cars themselves pretty much everywhere BUT downtown.

And other, very similar city wide patterns for larceny vs burglary:

The different types of robbery: first, where the people are...

...and then where the property is...

I know, I know. Everyone plots crime data. Boring. I'll put up some of the other stuff I've been doing for this project as well. But I gotta tease it out, you know?

]]>Slides can be found here. Doesn't work well in mobile and touchscreen browsers.

Code for simulation can be found here.

Code for polling data example can be found here.

I learned a lot from giving the talk and really appreciated the opportunity.

]]>This means I use the command line for way more stuff than is healthy and I often rewrite when I could just copy and paste, obnoxious things like that. It also means that, when I'm learning a new model or tool, I try and write elaborate simulations that I usually never share to make sure I actually understand what's going in and what's coming out. This also happens when I'm trying to explain a concept or problem to someone, as evidenced by the earlier Monty Hall Monte Carlo post. You'll have to ask my coworkers, friends and family members whether this makes me more capable at imparting any of this knowledge to others or doing useful things with it, I guess. (On second thought, don't.) Part of the point of this site is to share these instances of "Incredible Overkill", as one of my professors once called it in the (unlikely?) event that someone else may find them useful.

Probabilistic topic modeling was added to my ever-growing list of obsessions in the last year and I have spent a lot of time consuming research papers, tutorials and tool documentation in an effort to apply this exciting area of research to problems at work and in personal projects. Wikipedia defines a topic model as "a type of statistical model for discovering abstract 'topics' that occur in a collection of documents." The research in this area is quite new, with the major developments of Probabilistic Latent Semantic Indexing and the most common topic model, Latent Dirichlet allocation models, in 1999 and 2003, respectively. The chief developer of the Latent Dirichlet allocation models, David Blei of Princeton's computer science department, has written many useful and accessible treatments of the technique, such as those available here, here, and here. Many of the most exciting areas of research in computational linguistics involve extensions of LDA, and many of those areas are being pursued by talented local machine learning, computer science and computation linguistics professionals and academics in the Baltimore/Washington DC area, where I call home.

This is the first of a set of R scripts I wrote a couple months ago, in an effort to understand all the moving parts and assumptions of the basic model and understand what I am putting in and what I am getting out. When I do these things for myself, I will usually lay out the basic model in (excruciating?) detail and attempt to examine the assumptions inherent in it, write code that simulates data generated according to those assumptions and that fits the model to that simulated data. I usually also try to also break those assumptions and see how those affect the model output. The output of this effort is usually not fit for public consumption - though it is ALWAYS well commented - so this post is the result of a fair amount of polishing. That said, it could still be wrong in any number of ways and if you think it is, please do let me know.

This simulated generative process is based essentially on Blei's descriptions and equations in the papers linked above. As he describes it:

"We formally define a topic to be a distribution over a fixed vocabulary...

We assume that these topics are specified before any data has been generated.

Now for each document in the collection, we generate the words in a two-stage process.

- Randomly choose a distribution over topics.
- For each word in the document:

- Randomly choose a topic from the distribution over topics in step #1.
- Randomly choose a word from the corresponding distribution over the vocabulary.

Code that simulates an LDA corpus and source simulation function:

### Basic LDA Topic Model Simulation ###
### Generate Simulated Corpus ###

library(ggplot2)
library(tm)
library(MCMCpack)

## Simulate a corpus under the LDA generative model.
##   M:          number of documents
##   nTerms:     vocabulary size
##   docLengths: length-M vector of per-document word counts
##   K:          number of topics
##   alphA:      symmetric Dirichlet parameter, document/topic dists
##   betA:       symmetric Dirichlet parameter, topic/term dists
##   Alpha:      K-vector of Dirichlet parameters (default symmetric)
##   Beta:       nTerms-vector of Dirichlet parameters (default symmetric)
## Returns list(docs, termFreqMatrix, Theta, Phi), where Theta (M x K)
## and Phi (K x nTerms) are the "true" latent distributions the
## observed corpus was generated from.
simulateCorpus <- function(M,
                           nTerms,
                           docLengths,
                           K,
                           alphA,
                           betA,
                           Alpha = rep(alphA, K),
                           Beta = rep(betA, nTerms))
{
  # Labels
  Terms <- paste("Term", seq(nTerms))
  Topics <- paste("Topic", seq(K))
  Documents <- paste("Document", seq(M))

  ## Generate latent topic and term distributions

  # "True" Document/Topic distribution matrix (M x K)
  Theta <- rdirichlet(M, Alpha)
  colnames(Theta) <- Topics
  rownames(Theta) <- Documents

  # "True" Topic/Term distribution matrix (K x nTerms)
  Phi <- rdirichlet(K, Beta)
  colnames(Phi) <- Terms
  rownames(Phi) <- Topics

  ## Generate one document as a vector of term labels.
  generateDoc <- function(docLength, topic_dist, terms_topics_dist){
    # Preallocate rather than growing with c() in the loop
    document <- character(docLength)
    for (i in seq_len(docLength)){
      # Choose a topic for this word from the document's topic dist.
      # BUG FIX: rmultinom() returns a 0/1 indicator vector; it must be
      # converted to an index with which.max().  Using the indicator
      # directly as a row subscript (`dist[indicator, ]`) drops the 0s
      # and selects row 1 every time, so every word was previously
      # drawn from Topic 1 regardless of topic_dist.
      topic <- which.max(rmultinom(1, 1, topic_dist))
      # Then choose a term from that topic's term distribution
      term <- rmultinom(1, 1, terms_topics_dist[topic, ])
      document[i] <- colnames(terms_topics_dist)[which.max(term)]
    }
    return(document)
  }

  ## Generate "observed" corpus as list of term vectors
  corpus <- vector("list", M)
  for (i in seq_len(M)){
    corpus[[i]] <- generateDoc(docLengths[i], Theta[i, ], Phi)
  }

  ## Convert document term vectors to a document/term frequency matrix.
  ## (base lapply/table instead of plyr::llply -- plyr is not loaded
  ## by this script, so llply would fail at run time.)
  freqsLists <- lapply(corpus, table)
  termFreqMatrix <- matrix(0, nrow = M, ncol = nTerms)
  colnames(termFreqMatrix) <- Terms
  rownames(termFreqMatrix) <- Documents
  for (i in seq_len(M)){
    termFreqMatrix[i, names(freqsLists[[i]])] <- freqsLists[[i]]
  }
  # Sanity check: every document's term counts sum to its length
  stopifnot(rowSums(termFreqMatrix) == docLengths)

  return(list("docs" = corpus,
              'termFreqMatrix' = termFreqMatrix,
              "Theta" = Theta,
              "Phi" = Phi))
}

I will finish polishing the inferential and viz code and put that up shortly. But you should be able, if you're exploring topic models, to use this code to see how best to tune the canned R packages and play with the "true" hyperparameters and see what effect they have on the output.

]]>It could be the votes cast in a two-way election in your town, or the free throw shots the center on your favorite basketball team takes, the survival of the people diagnosed with a specific form of cancer after five years, all the red/black bets ever placed on a specific roulette wheel, the gender of all the children in 18th century France; many phenomena in the world either fit this description or can be thought of in this way. The most important thing is that the outcome of this phenomenon - or the way you group the outcomes - can only take one of two values: or :

Let's call this phenomenon .

Let's also define a number, , as the total number of occurances of . So it is the total number of shots taken, the total count of people diagnosed, the total number of votes cast.

Now remember, is something you care about. It is likely, since you care so much about , that you have some preference for one of the two possible outcomes. For the purposes of this example, let's say we prefer . is a shot made, a vote received, a survival. is a "success", from your perspective. And an outcome of is a failure.

Lets define another value, given that we prefer outcomes over outcomes. is the number of outcomes out of all the times happens. It is the number of votes that your candidate is going to get in that election, the number of free throws the big man makes, the number of cancer patients who survive.

Of course, we can define this kind of process in this way even if we don't actually prefer one outcome over the other. In the case of the gender of newborns, for instance: we can define as the birth of a girl and as the birth of a boy, and treat the birth of a boy as a failure, just for the purposes of our model and not because all little boys are born clinically insane.

You, as a keen, long-time observer of , probably have some opinion on the share of outcomes in the total of all occurrences of that you likely express as an opinion on the value of the ratio of outcomes to all occurances of , or , e.g.:

- More often than not, equals or
- Most of the time, equal or
- is as likely to equal as it is to equal or

It would be nice, probably, to have an actual number as an estimate for , or perhaps a range of numbers you can be confident contains the value of .

And perhaps you want to make some prediction about future occurances of . You want to know if someone you know with that particular form of cancer is likely to still be alive five years from now.

Or maybe your friend, also a fan of that same basketball team, thinks that your guy actually only misses about half the time. Or a political talking head says that your candidate is going to lose big. You probably want a way to compare your beliefs with theirs.

Essentially, we would like to estimate the unknown quantity , preferably with some additional estimate of our uncertainty of this value, use that estimate to predict future values of trials of , and, given that estimate, get an idea of who is more likely right about that quantity given disagreements.

Translating our above situation into the language of probability, this phenomenon - any phenomenon with a ``this'' or ``that'' outcome - can be modeled mathematically as a ``random variable'' , with a binomial probability distribution:

where is number of trials - the total number of cancer patients, shots taken, votes cast - is the number of successes or cases where the outcome equals A, and is that unknown value between 0 and 1 that equals the proportion . This distribution describes the probability that equals times in occurances (also called trials) of .

can also be modeled as the product of individual occurances of , , where equals 1 if the outcome equals A and 0 if the outcome equals B, and is still the unknown proportion of A outcomes in all occurances of :

This formulation - really a special case of the binomial distribution where N equals 1 - is often referred to as a Bernoulli random variable. The product of individual occurrences - - is also equal to the conditional probability of the total number of successes Y on the value of , because the probability of a number of independent events occurring together is the product of all of their individual probabilities:

And in fact this product simplifies to the formula for the binomial distribution above.

It is important to realize that the value most useful for us to know the most about in the formulas above is not or or any single but .

To illustrate all of this further, I'm going to let R simulate an ``true'', unknown or and hide that value from myself as a fixed quantity :

That function call generates a single pseudo-random number between 0 and 1 from a uniform distribution, meaning that the value is equally as likely to be anywhere in that (0,1) interval.

I'm going to use that hidden proportion to generate a ``true population'' Z, with an unknown N total number of occurances, distributed according to our hidden :

Since Z is very large, and we are pretending it is not data in memory on my computer but a set of outcomes that are very difficult or impossible to count in their entireity, we will be working with a sample pulled from Z, some small subset that we can count and infer something about the value of from. But as I said above, we know something about Z already. We follow Z very closely. We talk to everyone we know about the election, even if we don't keep a running tally of who they say they're voting for. We watch every game, even if we don't have an exact count of shots missed vs shots made. We have some prior understanding of Z already. Of course, most of the time, our first prior looks something like this:

I can't calculate from that understanding, at least not just by looking at it. It probably wouldn't be possible to count all the outcomes up there, given how jumbled they are, and even if it was, there's no way I can count them in any reasonable time. Maybe most of them haven't even happened yet, in which case I definitely can't count them. Actually, I don't even know what the total value of N is.

But I can tell some things from that picture, right? For instance, I know that there are definitely at least some trials where the outcome of is . I know that there are at least some trials where the outcome of is . In this case, actually, outcomes look pretty rare and it seems a pretty safe bet to say even more than that: our boy misses most of his free throws, or our guy is going to lose this election.

Lets get our sample, which we will call , from our population:

In this case, we were able to obtain a sample of 500 trials of Z, and they look, before we do anything to them, like this:

Since our random variable stores outcomes as successes or 1's and outcomes as failures or 0's, we can easily obtain our sample - by summing up all our occurances of .

Which in this case turns out to be 26.

We could just calculate the sample proportion of and take that for our estimate of . In this case, that would be 0.052.

But that estimate only allows us to achieve part of one of our three goals above. We can't really compare our opinions to anyone elses with any meaning, and we can't use that number by itself to predict future values.

But what if we treat our parameter as a random variable? What if we assign it a 'mean' or a most likely value, and a variance, or some quantification of uncertainty around that mean?

If we have probability distributions for all of our values of interest, we can use Bayes theorem:

In this case, the posterior probabilility is the conditional probability distribution we get for given the data and our prior distribution for .

The generalization of Bayes' theorem for use in inference involving the entire probability distribution of a random variable instead of just a point estimate of a probability allows us to, in essence, ignore the term in the expression:

because we know that it, with respect to the conditional distribution of - - is just a constant. And since is a probability distribution, we know that it has to integrate to 1 in the end, so determining that normalizing constant after we have the non-normalized distribution shouldn't be a problem.

This allows us to work with the proportional relationship, giving us our model:

or:

This is the central - really the only - tool of Bayesian statistical inference. And it suggests one of the central appeals, to me, of the approach: every input into a Bayesian framework is expressed as probability and every output of a Bayesian framework is expressed as probability.

To use this generalization of Bayes' theorem to answer our above questions, we first need to come up with a model for 's distribution. There are a number of ways that could be distributed. In fact, any distribution that ensures that the value of will be between 0 and 1 will do.

In this first example, we will take advantage of the fact that there exists a conjugate prior for the binomial distribution: the beta distribution.

The beta distribution

looks very similar in form to the binomial distribution

except it represents the probabilities assigned to values of in the domain given values for the parameters and , as opposed to the binomial distribution above, which represents the probability of values of given .

The concept of conjugacy is fairly simple. It just means that the functional forms of the distributions of which you are calculating the product are the same, so they multiply easily. The product of a beta and a binomial, given their identical functional forms, is simply:

and since is just a constant in relationship to , our final Bayes formulation of our beta prior, binomial likelihood model is:

This also is a beta probability distribution, with equal to and equal to .

But how do we choose our beta priors?

The shape of a beta distribution is dictated by the values of those and parameters and shifting those values can allow you to represent a wide range of different prior beliefs about the distribution of . Priors can be ``uninformative'' or ``informative'', meaning we can weight our prior probabilities very low in relationship to the data or we can weight them higher, informing our outcome - the posterior - more as we weight them more.

A simple function - using ggplot2's qplot - to examine different values of and and their effect on the shape of the distribution allows us to show this:

Setting and both equal to 1 gives us a non-informative uniform prior, allowing us to express that we believe could be anywhere in the interval with equal probability, meaning that the proportion of successes to failures - A outcomes to B outcomes - could be anything:

Setting and both equal to 0.5 gives us a weakly informative prior that expresses a belief that is more likely to be at either extreme end of the distribution than anywhere in the center of it, meaning it is more likely that we get all successes or all failures than it is we get some more even mixture of outcomes:

Setting and both equal to a high value gives us a more strongly informative prior expressing that we believe that is likely to be at the center or that it is equally likely to see successes and failures:

We could express a stronger belief that is high - that success is very likely - with a higher and a lower :

or a stronger belief that is very low - that success is unlikely - with a lower and a higher :

Essentially, higher values of the ratio of to weights higher values of higher, lower values of that ratio place greater weight on lower values, and higher value of indicates higher certainty.

Still, choosing these 's and 's may seem a bit arbitrary. Perhaps a more intuitive way to choose an informative prior is to allow ourselves the ability to calculate analogous values to and - essentially a value that actually quantifies our prior belief about the likelihood of success and a value that quantifies how strongly we weigh that belief as a prior ``sample size''. We want to be able to express the 'mean' of our prior distribution - its most likely value - and something like a variance or how tightly clustered it is around that mean.

The mean of a beta distribution is:

and the ``sample size'' is:

and solving those two equations for and gives us

where, again, expresses how large our prior ``sample size'' is - i.e. the higher it is, the stronger our beliefs - and expresses our actual prior belief for the value of .

Getting the values for our prior distribution using any chosen values for and can be accomplished with a simple R function:

And expressing the likelihood - a binomial - as a beta where equals and equals is another simple function.

And combining them into the posterior beta distribution:

and getting the mean:

the mode:

and the standard deviation of the posterior:

can all be accomplished using functions of similar structure.

First, we generate a model with a uniform prior:

where the dotted gray lines indicate the outer bounds of our credibility interval and the dotted blue line indicates our mean.

The mean of our posterior distribution equals 0.052, the mode equals 0.05, and the standard deviation equals 0.01.

This gives us a 95% (normal-approximation) credibility interval of 0.033 to 0.071.

Our posterior and our likelihood distributions are almost identical, as would be expected, since our prior is essentially that we have no idea and the data should give us all of the information in our posterior.

A weak, equal probability prior gives:

The mean of our posterior distribution equals 0.059, the mode equals 0.057, and the standard deviation equals 0.01.

This gives us a 95% (normal-approximation) credibility interval of 0.039 to 0.08.

Our mean and mode is a bit higher than before, as we weighted our prior beliefs a little bit, but our posterior is very close to our likelihood, meaning that most of the result was informed by the data.

A strong equal prior gives:

The mean of our posterior distribution equals 0.276, the mode equals 0.275, and the standard deviation equals 0.014.

This gives us a 95% (normal-approximation) credibility interval of 0.248 to 0.303.

A model with a medium, high success prior looks like:

The mean of our posterior distribution equals 0.201, the mode equals 0.2, and the standard deviation equals 0.016.

This gives us a 95% (normal-approximation) credibility interval of 0.169 to 0.233.

And finally a medium, low success proportion prior:

The mean of our posterior distribution equals 0.05, the mode equals 0.049, and the standard deviation equals 0.009.

This gives us a 95% (normal-approximation) credibility interval of 0.033 to 0.068.

So how did each of our models do, in this case? Well, since we simulated this data, we can discover that actually equals 0.042, or about 4% of all occurances of result in outcomes.

The results of all of our models are:

Mean of Dist | Mode of Dist | Std Dev of Dist | |

Uniform Prior | 0.052 | 0.050 | 0.010 |

Weak, Equal Proportions | 0.059 | 0.057 | 0.010 |

Strong, Equal Proportions | 0.276 | 0.275 | 0.014 |

Medium, High Success | 0.201 | 0.200 | 0.016 |

Medium, Low Success | 0.050 | 0.049 | 0.009 |

And it is obvious that our two initial priors - the non-informative uniform and the weakly informative equal proportions - and our last prior - the medium confidence of a low proportion of success outcomes - all gave fairly accurate estimates of .

Our strong, equal proportions prior and medium, high success prior - what could be called, with a terminology nod to John Myles White, our "strong, wrong" priors - gave pretty bad estimates, obviously, though the likelihood moved our posterior much closer to the truth in the second case, and our updated belief is much better in both cases than where we started.

This susceptibility to strong, wrong priors is a common critique of Bayesian inference. But these results aren't incorrect, are they? My results in the cases of my strong, wrong priors are the correct highest-probability distributions of θ **conditional on my incorrect priors**. But I would be an idiot to choose those priors, given what I already know about θ, which is, remember:

And if I didn't know anything about θ, or I had an idea but not a lot of confidence in it, why wouldn't I choose either of my first two priors, both of which arrived at perfectly serviceable estimates for θ?

As importantly, in both my strong, wrong priors, my assumptions are clearly stated and easy to interpret and critique. If I published something using those assumptions, and everyone and their mother could just look out and see that:

It would be easy to establish that my analysis was based on those flawed assumptions. It could even be done by someone who has only a cursory understanding of how I actually arrived at those estimates.

In future posts, I would like to continue this example and examine the effects of smaller sample sizes and the ease of updating beliefs using a series of smaller samples, much as Kruschke and Bolstad do in their texts. I think their choices for visualizations and some of the explanations in Bolstad are sometimes more confusing than necessary, and establishing a stronger single thread through the explanations would make things more intuitive, so I'm going to attempt to actually do that here. Kruschke and Gelman have excellent explanations for all of this that are well worth the read. I would also like to look at comparing beliefs and predicting future values in binomial proportions as well.

At his seminar on Bayesian methods back in April, John Myles White said something about traditional statisticians being better at actually getting things done over the course of the development of modern statistics. I didn't really understand what he meant until recently.

The basic toolkit of Bayesian statistics produces intuitive, easier to understand - and use and update and compare - outputs through comparatively difficult computational and mathematical procedures. Everything in and out of a Bayesian analysis is probability and can be combined or broken apart according to the rules of probability. But understanding code and sampling algorithms - really understanding algorithms and computation generally - and a much deeper grasp of probability distribution theory are much more important in understanding Bayesian inference much earlier on.

Basic traditional statistical methods produce output that is fairly difficult to understand through comparatively simple computational and mathematical procedures. Most results in traditional statistics depend on logical appeals to unseen - really un-see-able - asymptotic properties of the estimators being used and assumptions and relationships between samples and populations that may be valid or not in any given case.

This is a very real catch-22: always easier to understand and use, much harder to do initially versus always harder to understand and use, much easier to do initially. I think that much of the difficulty so many have when faced with statistics comes from the fact that traditional OUTPUTS are so unintuitive and seem to exist in isolation or only in relationship to something with a touch of the `other' about it.

The concept of maximum likelihood and the MLE methods that comprise the basis of much of traditional methods are very elegant - actually quite beautiful - logical constructs that manage to give one the ability to say SOMETHING when faced with the problem of lots of data and not a lot of computational power.

But that's not our problem anymore. Now we have lots of data AND lots of computational power. Our problem now is statistical literacy, and building on the body of human knowledge in a way that is both rigorous and democratic.

Credit where credit is due: I've studied and digested the work of the following to learn this stuff and everything I'm going to post here and much of what I use on a daily basis in my work:

- Hadley Wickham's ggplot2 - this R package is something of an obsession of mine. Learn it well and essentially any static visualization is available to you. Incredibly powerful tool.
- Scott Lynch's
*Introduction to Applied Bayesian Statistics and Estimation for Social Scientists*- This book wasn't on my original list, but it has become my first stop. Especially if you think in code instead of equations, his explanations are fantastic and his walkthroughs of sampling algorithms and MCMC are great. It can be purchased here.

and of course the Bolstad, Kruschke and Gelman books and the work of John Myles White mentioned in my initial post here.

]]>For anyone who doesn't know, the Monty Hall problem is a now classic probability thought experiment that goes something like:

You are a contestant on the famous television game show "Let's Make A Deal". The host of the show, Monty Hall, points towards a wall with three doors, numbered 1, 2 and 3, and says, "Behind two of those doors is your garden variety mountain goat, completely worthless to you, but behind just one of those doors is A NEW CAR!". He then tells you to pickadooranydoor, after which instead of opening the door you chose, he proceeds to open one of the two remaining doors, revealing a goat behind it. He then gives you the chance to either stay on your door and claim the prize behind it, or switch to the only remaining door and claim the prize behind that. The question is: should you switch?

If you're like most people, and you haven't seen this before in some probability lecture, you'd likely say that it doesn't matter, that either door is likely to contain the car or the goat. 50/50. But actually, the logical and mathematical answer to the question is that **it is ALWAYS better to switch**.

You have a one-in-three chance of choosing the right door in the first place and so a two-in-three chance of choosing the wrong door. It is more likely that you choose the wrong door initially. If you choose the right door initially, Monty can open either of the other two doors to reveal a goat and if you switch you will lose. If you choose the wrong door initially, Monty only has one door left to open to reveal a goat, as he can't open your door and the other door reveals the car.

Probabilistically, after Monty opens that door, the door you chose initially still has that same one-in-three chance of containing a car. But now the door that is left has all of that left-over TWO-in-three chance of containing that car because of the initial two-in-three chance that YOU chose the wrong door. So always switch.

The initial reaction of my wife and a few other people to this was something along the lines of: "Idon'tlikethatthatmakesmeangrystoptalking." To be fair, that is most people's reaction to my nonsense. Nevertheless, it got me thinking about how best to visualize this problem to make the result more intuitive.

Aside: There is at least one other thing that SHOULD bother you about all this. A goat is ONLY completely worthless to silly fat Americans, right? I mean, yes, most people would value a new car over a goat, but it's a bit much to equate a GOAT with WORTHLESS. You can literally live off a goat. Drink its milk. Make cheese that people will buy for ten dollars and call 'chev-RA' or however you pronounce that. Goat meat is delicious when cooked right. A goat definitely is not worthless.

Okay. So my first attempt was just to code up the simulation (a "Monte Carlo" simulation, as it's referred to, hence the clever title that has probably been used ten thousand times) in R:

########## Simulation Loop #########
for (i in seq(n)) {
  ## 1. Randomly place prize behind one of three doors
  PlacePrize <- c(1, 0, 0)[sample(1:3, 3)]
  ## 2. Randomly pick one of three doors
  YouPick <- Doors[sample(1:3, 1)]
  ## 3. Monty either randomly opens one of the two doors left over if
  ##    you happen to pick the correct door, or opens the only door left
  ##    if you pick one of the two incorrect doors
  MontyOpens <- ifelse(PlacePrize[Doors == YouPick] == 1,
                       Doors[!Doors %in% YouPick][sample(1:2, 1)],
                       Doors[!Doors %in% c(YouPick, Doors[PlacePrize == 1])])
  PrizeIsBehind <- Doors[PlacePrize == 1]
  ## 4. If the prize is behind the leftover door, you win if you switch.
  ##    Else you win if you stick on your original choice.
  WinIfSwitch <- ifelse(PlacePrize[!Doors %in% c(YouPick, MontyOpens)] == 1, 1, 0)
  Picks <- c(Picks, YouPick)
  Opens <- c(Opens, MontyOpens)
  WinningDoor <- c(WinningDoor, PrizeIsBehind)
  WinsIfSwitch <- c(WinsIfSwitch, WinIfSwitch)
  ### Write results to data frames
  PlacedDf[i, ] <- PlacePrize
  PicksDf[i, YouPick] <- 2
  OpensDf[i, MontyOpens] <- 3
}
########## End Simulation Loop #########

The key in the above code is that all the choices, the placement of the prize, your choice of a door, your choice to switch, are random EXCEPT Monty's choice of which door to open. That choice is conditional on your choice, as if you choose the wrong door, he is constrained to only one of the two remaining. That's why #3 is not just a simple random function call, it is a conditional statement on the results of a random function.

Plotting the results, first of three trials or runs of the simulation, gives us:

In which, probably confusingly, randomness didn't go our way and for all three trials, we would have won if we switched doors. That's the thing about probability, though. It is not that the course of action a probability-based analysis suggests always turns out to be the right one each time an event occurs, but that in the aggregate, given what you know about the situation and the type of uncertainty you are dealing with, the course of action suggested is the right one to take BEFORE you know the outcome.

Also likely to be confusing, as in these particular 10 runs, we got unlucky again, and only 50 percent of the time we were likely to win if we switched.

But as n (the number of trials) gets higher and higher, the result becomes more and more obvious, first for 100:

It is obvious from these plots of high numbers of simulations that it is the right choice to switch. I mean, the orange bar is almost exactly twice as large as the purple. Don't you see? Yeah, my wife was still confused and mildly annoyed as well. This plot shows the RESULT clearly but gives no additional intuition to help understand the WHY. Fail.

I have another plot that I'm slowly tinkering with that I hope will actually demonstrate the why of the thing, because I REALLY want to explain this to my wife for some reason. But for now, here's my quick and dirty R code to simulate the data and generate the above plots:

#### Monty Hall Monte Carlo
#### Rob Mealey
library(ggplot2)
library(RColorBrewer)
library(reshape2)

### Function: Run the Monty Hall simulation n times, save a stacked-bar
### summary plot to 'MontyMonteHistograms<n>.png', and return a long-format
### data frame of per-trial results (Type / Trial / Door / value / lineTypes).
### Interface unchanged: montyMonte(n, titleSize, legendTitle, ytextSize, xtextSize).
montyMonte <- function(n, titleSize = 7, legendTitle = 5, ytextSize = 5, xtextSize = 5) {
  ## Preallocate per-trial result vectors; growing them with c() inside the
  ## loop copies the whole vector on every append (O(n^2)).
  Picks <- character(n)
  Opens <- character(n)
  WinningDoor <- character(n)
  WinsIfSwitch <- numeric(n)
  PlacedDf <- matrix(nrow = n, ncol = 3)
  OpensDf <- matrix(nrow = n, ncol = 3)
  PicksDf <- matrix(nrow = n, ncol = 3)
  Doors <- c('Door 1', 'Door 2', 'Door 3')
  colnames(PlacedDf) <- Doors
  colnames(PicksDf) <- Doors
  colnames(OpensDf) <- Doors
  ########## Simulation Loop #########
  for (i in seq_len(n)) {
    ## 1. Randomly place prize behind one of three doors
    PlacePrize <- c(1, 0, 0)[sample(1:3, 3)]
    ## 2. Randomly pick one of three doors
    YouPick <- Doors[sample(1:3, 1)]
    ## 3. Monty either randomly opens one of the two doors left over if
    ##    you happen to pick the correct door, or opens the only door left
    ##    if you pick one of the two incorrect doors. The condition is a
    ##    scalar, so plain if/else is clearer (and safer) than ifelse().
    if (PlacePrize[Doors == YouPick] == 1) {
      MontyOpens <- Doors[!Doors %in% YouPick][sample(1:2, 1)]
    } else {
      MontyOpens <- Doors[!Doors %in% c(YouPick, Doors[PlacePrize == 1])]
    }
    PrizeIsBehind <- Doors[PlacePrize == 1]
    ## 4. If the prize is behind the leftover door, you win if you switch.
    ##    Else you win if you stick on your original choice.
    WinIfSwitch <- as.numeric(PlacePrize[!Doors %in% c(YouPick, MontyOpens)] == 1)
    Picks[i] <- YouPick
    Opens[i] <- MontyOpens
    WinningDoor[i] <- PrizeIsBehind
    WinsIfSwitch[i] <- WinIfSwitch
    ### Write results to data frames
    PlacedDf[i, ] <- PlacePrize
    PicksDf[i, YouPick] <- 2
    OpensDf[i, MontyOpens] <- 3
  }
  ########## End Simulation Loop #########
  WinsIfSwitches <- ifelse(WinsIfSwitch == 1, 'Switch Door = Win', 'Switch Door = Lose')
  Games <- data.frame(Picks, Opens, WinningDoor, WinsIfSwitches)
  Wins <- sum(WinsIfSwitch) / n
  Games <- melt(Games, measure.vars = c('Picks', 'Opens', 'WinningDoor', 'WinsIfSwitches'))
  Games$variable <- ordered(Games$variable,
                            levels = c('WinsIfSwitches', 'WinningDoor', 'Opens', 'Picks'))
  PicksDf[is.na(PicksDf)] <- 0
  OpensDf[is.na(OpensDf)] <- 0
  ResultsDf <- rbind(
    data.frame('Type' = rep('Placed', n * 3), melt(PlacedDf)),
    data.frame('Type' = rep('Picked', n * 3), melt(PicksDf)),
    data.frame('Type' = rep('Opens',  n * 3), melt(OpensDf)))
  colnames(ResultsDf) <- c('Type', 'Trial', 'Door', 'value')
  ## Plot stacked bars of your picks, Monty's opens, winning doors and
  ## win-if-switch. Build the plot as an object instead of chaining
  ## last_plot(): inside a function the ggplot object is never printed, so
  ## last_plot() would not refer to it. The removed opts()/theme_text()/
  ## theme_blank() API is replaced by labs()/theme()/element_*(), and
  ## geom_bar() replaces geom_histogram() on the discrete x axis.
  p <- ggplot(Games, aes(x = variable, fill = factor(value))) +
    geom_bar() +
    scale_x_discrete(labels = rev(c('Your Picks', "Monty's Opens",
                                    'Winning Door', 'Switch=Win/Lose'))) +
    scale_fill_brewer(type = 'qual', palette = 6) +
    xlab('') + ylab('') +
    theme_bw() + coord_flip() +
    labs(title = paste('Monty Hall Monte Carlo Total Simulation Results, N = ',
                       n, ', Pct Switches Win = ', Wins, sep = '')) +
    theme(legend.position = 'bottom',
          legend.title = element_blank(),
          plot.title = element_text(size = titleSize),
          legend.text = element_text(size = legendTitle),
          axis.text.y = element_text(size = ytextSize),
          axis.text.x = element_text(size = xtextSize))
  ggsave(paste('MontyMonteHistograms', n, '.png', sep = ''),
         plot = p, width = 5, height = 3)
  WinsIfSwitches <- factor(WinsIfSwitches)
  ## One lineTypes value per row of ResultsDf (3 frames x 3 doors per trial).
  ResultsDf$lineTypes <- ordered(rep(WinsIfSwitches, 3 * 3),
                                 levels = rev(levels(WinsIfSwitches)))
  ResultsDf$Trial <- ordered(ResultsDf$Trial, levels = rev(seq_len(nrow(ResultsDf))))
  return(ResultsDf)
}

trialLengths <- c(3, 10, 100, 1000, 10000)
resultsList <- list()
for (i in seq_along(trialLengths)) {
  resultsList[[i]] <- montyMonte(trialLengths[i])
}

Good times.

]]>- Introduction to Bayesian Statistics, William Bolstad
- Doing Bayesian Data Analysis, John K Kruschke
- Bayesian Data Analysis, Andrew Gelman, John B. Carlin, Hal S. Stern and Donald B. Rubin

The first book is a true introductory textbook, whose audience is first-time statistics students. It aims to introduce probability and statistics in a Bayesian framework. Kruschke's book is great for the practical R and BUGS code, and is entertainingly written, and Gelman's book has a lot of great detail on distribution theory and social science examples that have more immediate traction in my brain. I think between the three, I will be able to get anything I could want from a textbook on Bayesian statistics. Well...not anything...but no textbook can actually provide that...

]]>Gaining additional marketable skills, for one. Becoming a Bayesian samurai can only help me in my quest to continue to get paid to play with data, right? Data science street cred, of course. Very difficult to come by honestly. General fame and fortune, really. Mostly, though, this undertaking is driven by a long-held and burning desire for most, not just some, of the things I say in any work I do to be useful and meaningful to real people in the real world.

Most of my formal statistical training is through the study of applied economics, where I've studied both quantitative methods and theory. I've learned a lot and am grateful for the education, but by far the most obvious thing I've learned is that I am decidedly not a theory guy. My economic theory classes have often felt uncomfortably similar to the Sunday school of my childhood, the major difference being the expression of the core theological principles in equations. My quantitative and modeling classes have mostly been real pleasures, and I value the additional tools I've gained through them. But another thing I've learned is that a major part of the reason so many have difficulty grasping statistics and probability is that the traditional interpretations of probability and asymptotic justifications for the most common methods of statistical inference are fundamentally and unnecessarily unintuitive.

There are other people, far more able than me, currently engaged in laying out many of the problems with traditional statistical inference: John Myles White just posted his third in a great series articulating the weaknesses of Null Hypothesis Significance Testing (NHST). His twitter feed is abuzz with links to other great sources of why p-values, besides being unintuitive, are not nearly as useful in judging the quality of research results as their ubiquity would suggest. (First post in the series is here, second here). My first really useful overview of Bayesian methods of inference was at a recent talk John gave at George Mason University, the content and related code of which is available through his github repository, and he and Drew Conway wrote a really great O'Reilly book called Machine Learning for Hackers that I've used at work. Incredibly useful guy to know about.

In the comments of John Myles White's first post, Ethan Fosse effectively articulates the fundamental strangeness of the p-value and the null hypothesis:

It’s incredibly difficult to interpret them correctly, in part because they really are very weird constructions. As is well-known, the p-values and confidence intervals of a particular parameter describe neither the properties of the data set at hand nor the particular model fit to the data. Instead, the p-values and confidence intervals describe the imagined properties of an imagined distribution of an unobserved parameter from a set of unobserved models that we imagine have been fit repeatedly to imagined data sets gathered in a similarly imagined way from the same unobserved population. Thus, a p-value never gives the probability that our parameter is above a certain observed threshold, and a confidence interval never indicates the probability that our parameter lies within a certain set of observed values.

Read that again.

I know that if you have studied statistics, it is very likely that you on some level knew that that is what a p-value is. But are you really confident that everyone you studied with, every professor you had, all the people that are now likely employing these techniques in analyzing data in all sorts of fields, that they truly understand and could articulate what these values actually mean in the context of their research? Especially in the private sector, p-values and confidence intervals are abused and repackaged in all sorts of meaningless ways. Part of my plan with this research is to show examples of this and rework them in more useful and interpretable ways in the bayesian framework.

Pierre Laplace - of the "Laplacian ambitions" - was an 18th century French mathematician and my favorite. I'm calling this pursuit my Laplacian ambition not JUST to show off that I am huge nerd and have a favorite mathematician and not JUST to add an absurd level of grandiosity to the whole thing. I'm calling the man out because he was really the first to apply the Bayesian idea to attempts to learn useful things in astronomy, demography and other areas. That's why I'm working to master these methods: to rigorously and continuously learn and communicate broadly useful, intuitively meaningful things about our world.

I'm not entirely sure why I'm ** blogging ** my attempt. I do feel a need for some very public accountability, both in ensuring that I'm not doing it wrong and that I'm continuing to do it. At the very least, a long period between posts will make me look foolish to my professors and coworkers and the few friends and family members nerdy enough to actually read this stuff. Though the risk of severely embarrassing myself by doing it wrong may end up reducing my aforementioned 'marketability'. I guess we'll just have to see how it goes. I can always drop this whole thing down the memoryhole...

I'll be keeping all the code and output in a repository on github and posting condensed versions of my work here. Please feel free to embarrass me, it's the only way I'll learn. At least that's what my parents always said.

]]>