This notebook contains R code to accompany Chapter 6 of the book “Real-World Machine Learning”, by Henrik Brink, Joseph W. Richards, and Mark Fetherolf. The code was contributed by Paul Adamson.

NOTE: working directory should be set to this file’s location.

Data for NYC taxi example

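The data files for the examples in this chapter are available at http://www.andresmh.com/nyctaxitrips/; the code below downloads them from https://archive.org/download/nycTaxiTripData2013/. They are distributed as 7-Zip archives, so you will need to have the 7z command (provided by, e.g., p7zip) available in your path for the code below to work. (On a Mac, you can use Homebrew to install p7zip with the command brew install p7zip.) Because the code calls download.file with method = "wget", the wget command must also be available in your path.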

NOTE: downloading and decompressing the archives will take a while

# Fetch the NYC taxi trip and fare archives and unpack them into ../data.
#
# Requires the `wget` and `7z` commands on the PATH. All paths are relative
# to the working directory, which should be this file's location.
baseUrl <- "https://archive.org/download/nycTaxiTripData2013/"
dataDir <- "../data"
tripFile <- "trip_data.7z"
localTripFile <- file.path(dataDir, tripFile)
tripFile1 <- file.path(dataDir, "trip_data_1.csv")
fareFile <- "trip_fare.7z"
localFareFile <- file.path(dataDir, fareFile)
fareFile1 <- file.path(dataDir, "trip_fare_1.csv")

# Make sure `csvFile` is present locally.
#
# url:     remote location of the 7-Zip archive.
# archive: local path the archive is (or will be) stored at.
# csvFile: a file expected to exist once the archive has been extracted.
# outDir:  directory the archive is extracted into.
#
# The archive is downloaded only when it is not already on disk, and it is
# extracted whenever `csvFile` is missing. The two checks are independent so
# that an archive left over from an earlier run (e.g. one whose extraction
# was interrupted) still gets unpacked. Stops with an error if the download
# fails or if `csvFile` is still missing after extraction. Returns `csvFile`
# invisibly.
fetchAndExtract <- function(url, archive, csvFile, outDir) {
  if (!file.exists(archive)) {
    status <- tryCatch(
      download.file(url,
                    destfile = archive,
                    method = "wget",
                    mode = "wb",
                    quiet = TRUE),
      error = function(e) {
        # Do not leave a partial archive behind: the next run would
        # mistake it for a completed download.
        unlink(archive)
        stop(e)
      }
    )
    if (status != 0) {
      unlink(archive)
      stop("Download of ", url, " failed with status ", status,
           call. = FALSE)
    }
  }
  if (!file.exists(csvFile)) {
    status <- system(paste0("7z x ", archive, " -o", outDir))
    if (status != 0) {
      warning("7z exited with status ", status, " while extracting ",
              archive, call. = FALSE)
    }
    if (!file.exists(csvFile)) {
      stop("Extracting ", archive, " did not produce ", csvFile,
           call. = FALSE)
    }
  }
  invisible(csvFile)
}

# download.file() cannot write into a directory that does not exist yet.
dir.create(dataDir, showWarnings = FALSE, recursive = TRUE)

fetchAndExtract(paste0(baseUrl, tripFile), localTripFile, tripFile1, dataDir)
fetchAndExtract(paste0(baseUrl, fareFile), localFareFile, fareFile1, dataDir)
# Number of leading rows to read from each CSV file.
npoints <- 50000

# Trip records: read, then clean step by step.
tripData <- fread(tripFile1, nrows = npoints, stringsAsFactors = TRUE)
# Recode blank store-and-forward flags as "N".
tripData <- mutate(
  tripData,
  store_and_fwd_flag =
    replace(store_and_fwd_flag, which(store_and_fwd_flag == ""), "N")
)
# Keep only rows with positive distance, duration and passenger count, and
# with pickup/dropoff coordinates inside the accepted bounds.
tripData <- filter(
  tripData,
  trip_distance > 0,
  trip_time_in_secs > 0,
  passenger_count > 0,
  pickup_longitude > -80,
  pickup_longitude < -70,
  pickup_latitude > 0,
  pickup_latitude < 41,
  dropoff_longitude < 0,
  dropoff_latitude > 0
)
# Rebuild the factor so that levels no longer in use (the blank one) go away.
tripData$store_and_fwd_flag <- factor(tripData$store_and_fwd_flag)

# Fare records: same number of leading rows, no cleaning.
fareData <- fread(fareFile1, nrows = npoints, stringsAsFactors = TRUE)

# Join on all columns the two tables have in common.
dataJoined <- tripData %>%
  inner_join(fareData)
## Joining, by = c("medallion", "hack_license", "vendor_id", "pickup_datetime")
# The separate tables are no longer needed once joined.
rm(tripData, fareData)

Figure 6.1 The first six rows of the NYC taxi trip and fare record data

# Number of leading rows shown in each slice of the preview table.
tableRows <- 6
# Preview of columns 1 through 5 of the joined data.
dataJoined[, 1:5] %>%
  head(tableRows) %>%
  kable()
medallion hack_license vendor_id rate_code store_and_fwd_flag
89D227B655E5C82AECF13C3F540D4CF4 BA96DE419E711691B9445D6A6307C170 CMT 1 N
0BD7C8F5BA12B88E0B67BED28BEA73D8 9FD8F69F0804BDB5549F40E9DA1BE472 CMT 1 N
0BD7C8F5BA12B88E0B67BED28BEA73D8 9FD8F69F0804BDB5549F40E9DA1BE472 CMT 1 N
DFD2202EE08F7A8DC9A57B02ACB81FE2 51EE87E3205C985EF8431D850C786310 CMT 1 N
DFD2202EE08F7A8DC9A57B02ACB81FE2 51EE87E3205C985EF8431D850C786310 CMT 1 N
20D9ECB2CA0767CF7A01564DF2844A3E 598CCE5B9C1918568DEE71F43CF26CD2 CMT 1 N
# Preview of columns 6 through 10 of the joined data.
dataJoined[, 6:10] %>%
  head(tableRows) %>%
  kable()
pickup_datetime dropoff_datetime passenger_count trip_time_in_secs trip_distance
2013-01-01 15:11:48 2013-01-01 15:18:10 4 382 1.0
2013-01-06 00:18:35 2013-01-06 00:22:54 1 259 1.5
2013-01-05 18:49:41 2013-01-05 18:54:23 1 282 1.1
2013-01-07 23:54:15 2013-01-07 23:58:20 2 244 0.7
2013-01-07 23:25:03 2013-01-07 23:34:24 1 560 2.1
2013-01-07 15:27:48 2013-01-07 15:38:37 1 648 1.7
# Preview of columns 11 through 15 of the joined data.
dataJoined[, 11:15] %>%
  head(tableRows) %>%
  kable()
pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude payment_type
-73.97817 40.75798 -73.98984 40.75117 CSH
-74.00668 40.73178 -73.99450 40.75066 CSH
-74.00471 40.73777 -74.00983 40.72600 CSH
-73.97460 40.75995 -73.98473 40.75939 CSH
-73.97625 40.74853 -74.00259 40.74787 CSH
-73.96674 40.76425 -73.98332 40.74376 CSH
# Preview of columns 16 through 21 of the joined data.
dataJoined[, 16:21] %>%
  head(tableRows) %>%
  kable()
fare_amount surcharge mta_tax tip_amount tolls_amount total_amount
6.5 0.0 0.5 0 0 7.0
6.0 0.5 0.5 0 0 7.0
5.5 1.0 0.5 0 0 7.0
5.0 0.5 0.5 0 0 6.0
9.5 0.5 0.5 0 0 10.5
9.5 0.0 0.5 0 0 10.0

Figure 6.2 The distribution of values across some of the categorical-looking columns in our dataset

# One bar chart of row counts per value for each of four columns,
# laid out on a grid with two rows.
p1 <- ggplot(data = dataJoined, mapping = aes(x = vendor_id)) + geom_bar()
p2 <- ggplot(data = dataJoined, mapping = aes(x = rate_code)) + geom_bar()
p3 <- ggplot(data = dataJoined, mapping = aes(x = store_and_fwd_flag)) +
  geom_bar()
p4 <- ggplot(data = dataJoined, mapping = aes(x = payment_type)) + geom_bar()
grid.arrange(p1, p2, p3, p4, nrow = 2)

Figure 6.3 Scatter plots of taxi trips for the time in seconds versus the trip distance, and the time in seconds versus the trip amount (USD), respectively.

# Two scatter plots sharing trip time (seconds) on the x axis: trip distance
# on top, total amount below. Points are drawn mostly transparent so that
# overlapping points show up as darker regions.
p5 <- ggplot(
  data = dataJoined,
  mapping = aes(x = trip_time_in_secs, y = trip_distance)
) +
  geom_point(alpha = 0.1)
p6 <- ggplot(
  data = dataJoined,
  mapping = aes(x = trip_time_in_secs, y = total_amount)
) +
  geom_point(alpha = 0.1)
grid.arrange(p5, p6, nrow = 2)