Graduate Urban Economics Lecture 11, May 10, 2018
Gentzkow and Shapiro wrote a very helpful document called "Code and Data for the Social Sciences: A Practitioner's Guide"
My Stata trick:
Write a test procedure
Know your data: I have caught countless errors by realizing data changed unexpectedly
library(rgdal) #manipulating shape files (.shp) library(ggplot2) #very advanced plotting/graphics library library(sp) #spatial data utility library(maptools) #mapping utility library(rgeos) #additional geographic utility
## Warning: package 'rgeos' was built under R version 3.3.2
library(raster) #raster data
## Warning: package 'raster' was built under R version 3.3.2
library(spatstat) #spatial statistics #library(ggmap) #interfacing with Google Maps (VPN required) library(OpenStreetMap)
## Warning: package 'OpenStreetMap' was built under R version 3.3.2
#chinaMap<-getData('GADM',country="CHN",level=1) #download data from GADM; getData is part of package raster ChinaRD0<-readRDS("data/CHN_adm0.rds") #Country outline; data folder is local ChinaRD1<-readRDS("data/CHN_adm1.rds") #provinces ChinaRD2<-readRDS("data/CHN_adm2.rds") #prefectures ChinaRD3<-readRDS("data/CHN_adm3.rds") #counties plot(ChinaRD0,main="Mainland China country outline") #Plot Mainland China
ChinaRD1@data[1:5,1:7] #shows first 5 rows 7 cols of ChinaRD1; could also use: slot(ChinaRD1,"data")[1:10,]
## OBJECTID ID_0 ISO NAME_0 ID_1 NAME_1 HASC_1 ## 1 1 49 CHN China 1 Anhui CN.AH ## 2 2 49 CHN China 2 Beijing CN.BJ ## 3 3 49 CHN China 3 Chongqing CN.CQ ## 4 4 49 CHN China 4 Fujian CN.FJ ## 5 5 49 CHN China 5 Gansu CN.GS
names(ChinaRD3) #names of columns in data
## [1] "OBJECTID" "ID_0" "ISO" "NAME_0" "ID_1" ## [6] "NAME_1" "ID_2" "NAME_2" "ID_3" "NAME_3" ## [11] "CCN_3" "CCA_3" "TYPE_3" "ENGTYPE_3" "NL_NAME_3" ## [16] "VARNAME_3"
shanghaiCities<-ChinaRD3[ChinaRD3$NAME_2=="Shanghai",] #Selects only cities in Shanghai municipality
plot(shanghaiCities) plot(shanghaiCities[shanghaiCities$NAME_3=="Shanghai",],border="blue",add=TRUE) #add=T draws on current plot text(coordinates(shanghaiCities), labels=shanghaiCities$NAME_3,col="red",cex=0.7) #Adds names
popData<-read.table("data/china_pop_9293.csv",header=T,sep=",",stringsAsFactors=FALSE) #examine: str(popData) ChinaData2 <- slot(ChinaRD2, "data") names(ChinaData2)[c(6,8)]<-c("province_name","city_name") table(ChinaData2$province_name) #Show frequency table of prefectures by province prefectureData<-merge(ChinaData2, popData, by=c("province_name", "city_name"),all.x=TRUE) #merging by two columns prefectureData$city_name[is.na(prefectureData$pop92)] #not a great way to match names(ChinaRD2)[c(6,8)]<-c("province_name","city_name") #SP package allows merging onto SpatialPolygonsDataFrame popDataSP<-merge(ChinaRD2, popData, by=c("province_name", "city_name"),all.x=TRUE) #rm(ChinaRD2) #Removes old object--not really necessary summary(popDataSP) #shows summary statistics on new object
gg_map_obj<-fortify(ChinaRD2, region= "OBJECTID") china_map<-ggplot() + geom_map(data = slot(popDataSP, "data"), aes(map_id = OBJECTID, fill = pop92), map = gg_map_obj) + expand_limits(x = gg_map_obj$long, y = gg_map_obj$lat) + ggtitle("Population in 1992, Selected Prefectures") china_map
#ggsave("china_pop92.pdf", china_map, width = 10, height = 6)
provinceCentroids<-coordinates(ChinaRD1) #gets coordinates plot(ChinaRD1) points(provinceCentroids,col="red") #adds points to current plot
#plot(ChinaRD3,border="blue",add=T) #draws borders of prefectures
spdf_pts <- SpatialPointsDataFrame(coords=provinceCentroids, data=data.frame(ChinaRD1$NAME_1)) proj4string(spdf_pts) <-proj4string(popDataSP) #Make sure projection is the same (should be since comes from same data source) #proj4string(spdf_pts)<-CRS(" +proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0") #could use this code to adjust pref_pts<-over(spdf_pts,popDataSP) pref_pts$x<-slot(spdf_pts,"coords")[,1] pref_pts$y<-slot(spdf_pts,"coords")[,2] #write.table(pref_pts, file = "centroids_in_cities.CSV", sep = ",", col.names=names(pref_pts),row.names=FALSE,na="") #write CSV
plot(ChinaRD1) points(provinceCentroids,col="red") #adds points to current plot text(pref_pts$x,pref_pts$y, labels=pref_pts$city_name,col="blue",cex=0.7) #Add names
sufe_names="SUFE" sufe_pts<-SpatialPointsDataFrame(coords=cbind(121.5153883,31.2948375), data=data.frame(sufe_names)) sufe_pts$lon<-coordinates(sufe_pts)[1,1] sufe_pts$lat<-coordinates(sufe_pts)[1,2] sufe_map <- invisible(openmap(c(sufe_pts$lat+0.03,sufe_pts$lon-0.03), c(sufe_pts$lat-0.03,sufe_pts$lon+0.03), minNumTiles=10,type="osm")) #invisible suppresses warning messages
plot(sufe_map)
Geocoding is the process of converting street addresses to latitude and longitude
This can be done with a geodatabase or by calling a service (ex: Google, Baidu)
In R, there are multiple packages to do this, including geoChina and ggmap
You will need an API and the output will often come back as a JSON; Google gives 2500 free requests per day, Baidu seems to allow even more
Google will also allow network analysis (distance between two points by travel mode)
Note that Chinese maps use a shifted coordinate system (GCJ02) that may require additional work to use
We use a projection (NAD1983 Long Island, feet) to do simple distance calcuations
rest_data<-read.table("data/menupages_Wb126.csv", header=T, sep=",",stringsAsFactors =FALSE) nyc_tracts<-readOGR("data","manhattan_tracts") #Read shapefiles with readOGR
ggplot() + geom_polygon(data = nyc_tracts, aes(x=long, y = lat, group = group), fill = NA, color = "black") + geom_point(data=rest_data,aes(x=x_feet, y=y_feet), size=1, color = "blue") + ggtitle("Manhattan Restaurants")
## Regions defined for each Polygons
ggplot() + geom_polygon(data = nyc_tracts, aes(x=long, y = lat, group = group), fill = NA, color = "black") + geom_point(data=rest_data[rest_data$cuisName1=="Chinese",],aes(x=x_feet, y=y_feet), size=1, color = "red") + ggtitle("Chinese Restaurants")
## Regions defined for each Polygons
min_x<-min(rest_data$x_feet) max_x<-max(rest_data$x_feet) min_y<-min(rest_data$y_feet) max_y<-max(rest_data$y_feet) W <- owin(xrange=c(min_x,max_x),yrange=c(min_y,max_y)) #a window defined by limits of data allrests.ppp<-ppp(rest_data$x_feet,rest_data$y_feet,window=W)
## Warning in ppp(rest_data$x_feet, rest_data$y_feet, window = W): data ## contain duplicated points
italian.ppp<-allrests.ppp[rest_data$cuisName1=="Italian",] chinese.ppp<-allrests.ppp[rest_data$cuisName1=="Chinese",] pizza.ppp<-allrests.ppp[rest_data$cuisName1=="Pizza",]
neighDist<-pairdist(chinese.ppp) radii_set<-seq(200,3600,200) chinese_mat<-array(,c(length(radii_set),2)) for (i in 1:length(radii_set)) { chinese_mat[i,1]<-radii_set[i] chinese_mat[i,2]<-mean(rowSums((neighDist<radii_set[i]), na.rm=TRUE))-1 #remove own distance } plot(chinese_mat,xlab="radius",ylab="Count",main="Count Chinese Restaurants within Radius")
numSims<-1000 sim_mat<-array(,c(length(chinese_mat[,1]),numSims)) for (i in 1:numSims) { sim.ppp<-allrests.ppp[sample(1:length(allrests.ppp$x),length(chinese.ppp$x),replace=F)] neighDist_sim<-pairdist(sim.ppp) for (j in 1:length(radii_set)) { sim_mat[j,i]<-mean(rowSums(neighDist_sim<radii_set[j], na.rm=TRUE))-1 } } conf_mat<-array(,c(length(chinese_mat[,1]),2)) for (j in 1:length(radii_set)) { conf_mat[j,1]<-quantile(sim_mat[j,],probs=.025,na.rm=TRUE) conf_mat[j,2]<-quantile(sim_mat[j,],probs=.975,na.rm=TRUE) } chinese_mat<-cbind(chinese_mat,conf_mat)