This notebook contains R code to accompany Chapter 6 of the book “Real-World Machine Learning”, by Henrik Brink, Joseph W. Richards, and Mark Fetherolf. The code was contributed by Paul Adamson.

NOTE: working directory should be set to this file’s location.

Data for NYC taxi example

The data files for the examples in this chapter are available at http://www.andresmh.com/nyctaxitrips/. They are compressed as a 7-Zip file archive (e.g. with p7zip), so you will need to have the 7z command available in your path for the below code to work. (On a mac, you can use Homebrew to install p7zip with the command brew install p7zip.)

NOTE: downloading and decompressing the archives will take a while

baseUrl <- "https://archive.org/download/nycTaxiTripData2013/"
tripFile <- "trip_data.7z"
localTripFile <- paste0("../data/", tripFile)
tripFile1 <- "../data/trip_data_1.csv"
fareFile <- "trip_fare.7z"
localFareFile <- paste0("../data/", fareFile)
fareFile1 <- "../data/trip_fare_1.csv"
if(!file.exists(localTripFile)){
  download.file(paste0(baseUrl, tripFile), 
                destfile = localTripFile,
                method = "wget",
                mode = "wb",
                quiet = TRUE)
  if(!file.exists(tripFile1)){
    system(paste0("7z x ", localTripFile, " -o../data"))
  }
}
if(!file.exists(localFareFile)){
  download.file(paste0(baseUrl, fareFile), 
                destfile = localFareFile,
                method = "wget",
                mode = "wb",
                quiet = TRUE)
  if(!file.exists(fareFile1)){
    system(paste0("7z x ", localFareFile, " -o../data"))
  }
}
npoints <- 50000
tripData <- fread(tripFile1, nrows=npoints, stringsAsFactors = TRUE) %>%
  mutate(store_and_fwd_flag = 
           replace(store_and_fwd_flag, which(store_and_fwd_flag == ""), "N")) %>%
  filter(trip_distance > 0 & trip_time_in_secs > 0 & passenger_count > 0) %>%
  filter(pickup_longitude < -70 & pickup_longitude > -80) %>%
  filter(pickup_latitude > 0 & pickup_latitude < 41) %>%
  filter(dropoff_longitude < 0 & dropoff_latitude > 0)
tripData$store_and_fwd_flag <- factor(tripData$store_and_fwd_flag)
fareData <- fread(fareFile1, nrows=npoints, stringsAsFactors = TRUE)
dataJoined <- inner_join(tripData, fareData)
## Joining, by = c("medallion", "hack_license", "vendor_id", "pickup_datetime")
remove(fareData, tripData)

Figure 6.1 The first six rows of the NYC taxi trip and fare record data

tableRows <- 6
kable(head(dataJoined[,1:5],tableRows))
medallion hack_license vendor_id rate_code store_and_fwd_flag
89D227B655E5C82AECF13C3F540D4CF4 BA96DE419E711691B9445D6A6307C170 CMT 1 N
0BD7C8F5BA12B88E0B67BED28BEA73D8 9FD8F69F0804BDB5549F40E9DA1BE472 CMT 1 N
0BD7C8F5BA12B88E0B67BED28BEA73D8 9FD8F69F0804BDB5549F40E9DA1BE472 CMT 1 N
DFD2202EE08F7A8DC9A57B02ACB81FE2 51EE87E3205C985EF8431D850C786310 CMT 1 N
DFD2202EE08F7A8DC9A57B02ACB81FE2 51EE87E3205C985EF8431D850C786310 CMT 1 N
20D9ECB2CA0767CF7A01564DF2844A3E 598CCE5B9C1918568DEE71F43CF26CD2 CMT 1 N
kable(head(dataJoined[,6:10],tableRows))
pickup_datetime dropoff_datetime passenger_count trip_time_in_secs trip_distance
2013-01-01 15:11:48 2013-01-01 15:18:10 4 382 1.0
2013-01-06 00:18:35 2013-01-06 00:22:54 1 259 1.5
2013-01-05 18:49:41 2013-01-05 18:54:23 1 282 1.1
2013-01-07 23:54:15 2013-01-07 23:58:20 2 244 0.7
2013-01-07 23:25:03 2013-01-07 23:34:24 1 560 2.1
2013-01-07 15:27:48 2013-01-07 15:38:37 1 648 1.7
kable(head(dataJoined[,11:15],tableRows))
pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude payment_type
-73.97817 40.75798 -73.98984 40.75117 CSH
-74.00668 40.73178 -73.99450 40.75066 CSH
-74.00471 40.73777 -74.00983 40.72600 CSH
-73.97460 40.75995 -73.98473 40.75939 CSH
-73.97625 40.74853 -74.00259 40.74787 CSH
-73.96674 40.76425 -73.98332 40.74376 CSH
kable(head(dataJoined[,16:21],tableRows))
fare_amount surcharge mta_tax tip_amount tolls_amount total_amount
6.5 0.0 0.5 0 0 7.0
6.0 0.5 0.5 0 0 7.0
5.5 1.0 0.5 0 0 7.0
5.0 0.5 0.5 0 0 6.0
9.5 0.5 0.5 0 0 10.5
9.5 0.0 0.5 0 0 10.0

Figure 6.2 The distribution of values across some of the categorical-looking columns in our dataset

p1 <- ggplot(dataJoined, aes(vendor_id)) +
  geom_bar()
p2 <- ggplot(dataJoined, aes(rate_code)) +
  geom_bar()
p3 <- ggplot(dataJoined, aes(store_and_fwd_flag)) +
  geom_bar()
p4 <- ggplot(dataJoined, aes(payment_type)) +
  geom_bar()
grid.arrange(p1,p2,p3,p4,nrow=2)

Figure 6.3 Scatter plots of taxi trips for the time in seconds versus the trip distance, and the time in seconds versus the trip amount (USD), respectively.

p5 <- ggplot(dataJoined, aes(trip_time_in_secs, trip_distance)) +
  geom_point(alpha = 0.1)
p6 <- ggplot(dataJoined, aes(trip_time_in_secs, total_amount)) +
  geom_point(alpha = 0.1)
grid.arrange(p5,p6,nrow=2)

Figure 6.4 The latitude/longitude of pickup locations. Note that the x-axis is flipped, compared to a regular map.

p7 <- ggplot(dataJoined, aes(pickup_latitude, pickup_longitude)) +
  geom_point(shape = ".") +
  scale_x_continuous(limits = c(40.6, 40.9)) +
  scale_y_continuous(limits = c(-74.04, -73.90))
p7
## Warning: Removed 1852 rows containing missing values (geom_point).

Figure 6.5 The distribution of tip amount.

p8 <- ggplot(dataJoined, aes(tip_amount)) +
  geom_histogram() +
  scale_x_continuous(limits = c(-1, 16.0), name = "Tip amount (USD)") +
  scale_y_continuous(name = "Count")
p8
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 41 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing missing values (geom_bar).

Figure 6.6 The distribution of tip amounts when omitting cash payments

dataJoined <- dataJoined %>%
  filter(payment_type != "CSH")
p9 <- ggplot(dataJoined, aes(tip_amount)) +
  geom_histogram() +
  scale_x_continuous(limits = c(-1, 16.0), name = "Tip amount (USD)") +
  scale_y_continuous(name = "Count")
p9
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 41 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing missing values (geom_bar).

Functions to plot a ROC curve adapted from Chapter 4

# Returns the false-positive and true-positive rates at nPoints thresholds for 
# the given true and predicted labels
# trueLabels: 0=FALSE; 1=TRUE
rocCurve <- function(trueLabels, predictedProbs, nPoints=100, posClass=1){
  # Allocates the threshold and ROC lists
  thr <- seq(0,1,length=nPoints)
  tpr <- numeric(nPoints)
  fpr <- numeric(nPoints)
  
  # Precalculates values for the positive and negative cases, used in the loop
  pos <- trueLabels == posClass
  neg <- !pos 
  nPos <- sum(pos, na.rm=TRUE)
  nNeg <- sum(neg, na.rm=TRUE)
  
  # For each threshold, calculates the rate of true and false positives
  for (i in 2:length(thr)) {
    t <- thr[i]
    meetOrExceedThreshold <- predictedProbs >= t
    tpr[i] <-  sum((meetOrExceedThreshold & pos), na.rm=TRUE) / nPos
    fpr[i] <-  sum((meetOrExceedThreshold & neg), na.rm=TRUE) / nNeg
  }
  
  # Create data frame without duplicated fpr's to return
  duplicatedFPRs <- duplicated(fpr)
  df <- data.frame(fpr=fpr[!duplicatedFPRs],tpr=tpr[!duplicatedFPRs],thr=thr[!duplicatedFPRs])
  
  return(df)
}
plotROC <- function(trueLabels, predictedProbs, nPoints=100, posClass=1){
  auc <- 0
  df <- rocCurve(trueLabels = trueLabels,
                 predictedProbs = predictedProbs,
                 nPoints = nPoints,
                 posClass = posClass)
  
  idx <- 3:length(df$fpr)
  auc <- as.double( 
    (df$fpr[idx-1] - df$fpr[idx]) %*% (df$tpr[idx] + df$tpr[idx-1]) ) / 2.0 
  
  ggplot(df,aes(x=fpr,y=tpr)) +
  geom_abline(slope=1, intercept = 0,
              colour = "green") + #we've added the line for random baseline for reference
  geom_step(direction="vh", colour="blue") +
  scale_x_continuous(limits = c(0,1)) +
  scale_y_continuous(limits = c(0,1)) +
  labs(x = "False-positive rate",
       y = "True-positive rate") +
  annotate("text", x=.6,y=.2,label=paste0("Area under the curve (AUC) = ",round(auc,digits = 2)))
  
}

Listing 6.1 Logistic regression tip-prediction model

numericalData <- dataJoined[,c(8:14,16:18,20)]
numericalData[,c(1:11)] <- scale(numericalData[,c(1:11)], scale=TRUE)
numericalData$tipped <- as.factor(c(dataJoined$tip_amount > 0))
levels(numericalData$tipped) <- c("false", "true")
trainIndex <- createDataPartition(numericalData$tipped, p = .8, 
                                  list = FALSE, 
                                  times = 1)

numericalDataTrain <- numericalData[ trainIndex,]
numericalDataTest <- numericalData[-trainIndex,]

fitControl <- trainControl(method = "repeatedcv", 
                           number = 10, 
                           savePredictions = TRUE,
                           summaryFunction=twoClassSummary, 
                           classProbs=TRUE)

plrFit <- train(tipped ~ ., data = numericalDataTrain,
                method = "plr",
                trControl = fitControl) 
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.

Figure 6.7 The receiver operating characteristic (ROC) curve of the logistic regression tip/no-tip classifier.

plrPredProb <- predict(object=plrFit, 
                       numericalDataTest, type='prob')
plotROC(revalue(numericalDataTest$tipped, c("false" = 0, "true" = 1)),
        plrPredProb$true, nPoints = 4000)

Listing 6.2 Random forest tip-prediction model

rfFit <- train(tipped ~ ., data = numericalDataTrain,
               method = "ranger",
               importance = "impurity",
               trControl = fitControl) 
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.

Figure 6.8 The ROC curve of the nonlinear random forest model.

rfPredProb <- predict(object=rfFit, 
                       numericalDataTest, type='prob')

plotROC(revalue(numericalDataTest$tipped, c("false" = 0, "true" = 1)),
        rfPredProb$true, nPoints = 4000)

Figure 6.9 The important features of the random forest model.

rfImp <- varImp(rfFit, scale = FALSE)
setDT(rfImp$importance, keep.rownames = TRUE)[]
##                    rn     Overall
##  1:   passenger_count  20.2661048
##  2: trip_time_in_secs  55.9549999
##  3:     trip_distance  96.1466936
##  4:  pickup_longitude 113.2667645
##  5:   pickup_latitude 112.6390474
##  6: dropoff_longitude 114.6606390
##  7:  dropoff_latitude 111.9583849
##  8:       fare_amount  58.4955870
##  9:         surcharge   7.7041169
## 10:           mta_tax   0.2159311
## 11:      tolls_amount   5.0873816
rfImp$importance <- 
  rfImp$importance[order(rfImp$importance$Overall,
                         decreasing = TRUE),]
rfImp$importance$rn <- reorder(rfImp$importance$rn, rfImp$importance$Overall)
impDF <- data.frame(Feature=rfImp$importance$rn,
                 Importance=(rfImp$importance$Overall/sum(rfImp$importance$Overall)))
fig6.9 <- tableGrob(head(impDF,11), rows=NULL, theme=tt)
grid.arrange(fig6.9,
             as.table=TRUE)

Listing 6.3 Converting categorical columns to numerical features

categoryData <- dataJoined[,c(3:5,15)]
dummies <- dummyVars(" ~ .", data = categoryData, fullRank = TRUE, 
                     levelsOnly = FALSE)
catNumeric <- data.frame(predict(dummies, newdata = categoryData))
numCatData <- data.frame(numericalData, catNumeric)

Figure 6.10 The ROC curve and feature importance list of the random forest model with all categorical variables converted to Boolean (0/1) columns, one per category per feature.

numCatDataTrain <- numCatData[ trainIndex,]
numCatDataTest <- numCatData[-trainIndex,]

rfFitNumCatData <- train(tipped ~ ., data = numCatDataTrain,
                         method = "ranger",
                         importance = "impurity", 
                         trControl = fitControl) 
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
rfNumCatPredProb <- predict(object=rfFitNumCatData, 
                            numCatDataTest, type='prob')

fig6.10a <- plotROC(revalue(numCatDataTest$tipped, c("false" = 0, "true" = 1)),
        rfNumCatPredProb$true, nPoints = 4000)

rfImp <- varImp(rfFitNumCatData, scale = FALSE)
setDT(rfImp$importance, keep.rownames = TRUE)[]
##                       rn     Overall
##  1:      passenger_count  4.23651685
##  2:    trip_time_in_secs 13.15351581
##  3:        trip_distance 17.85025778
##  4:     pickup_longitude 20.65590678
##  5:      pickup_latitude 19.74232896
##  6:    dropoff_longitude 21.38594163
##  7:     dropoff_latitude 20.28050391
##  8:          fare_amount 13.90309525
##  9:            surcharge  2.65079523
## 10:              mta_tax  0.22165302
## 11:         tolls_amount  2.26378031
## 12:        vendor_id.VTS  0.55211380
## 13:            rate_code  2.66180999
## 14: store_and_fwd_flag.Y  0.00000000
## 15:     payment_type.CSH  0.00000000
## 16:     payment_type.DIS  1.96495720
## 17:     payment_type.NOC  0.00000000
## 18:     payment_type.UNK  0.01864229
rfImp$importance <- 
  rfImp$importance[order(rfImp$importance$Overall,
                         decreasing = TRUE),]
rfImp$importance$rn <- reorder(rfImp$importance$rn, rfImp$importance$Overall)
impDF <- data.frame(Feature=rfImp$importance$rn,
                    Importance=(rfImp$importance$Overall/sum(rfImp$importance$Overall)))

fig6.10b <- tableGrob(head(impDF,11), rows=NULL, theme=tt)
# Plot chart and table into one object
grid.arrange(fig6.10a, fig6.10b,
             nrow=1,
             as.table=TRUE,
             widths = c(5,3))

Listing 6.4 Date-time features

pickupData <- data.frame(pickup_datetime = dataJoined$pickup_datetime) %>%
  mutate(datetime = as.POSIXct(strptime(pickup_datetime, format="%Y-%m-%d %H:%M:%OS", tz="UTC")),
         pickup_hour_of_day = as.numeric(substr(datetime, 12, 13)), 
         pickup_day_of_week = as.integer(as.factor(weekdays(datetime))),
         pickup_week_of_year = week(datetime)
  ) %>%
  select(c(-datetime, -pickup_datetime))

dropoffData <- data.frame(dropoff_datetime = dataJoined$dropoff_datetime) %>%
  mutate(datetime = as.POSIXct(strptime(dropoff_datetime, format="%Y-%m-%d %H:%M:%OS", tz="UTC")),
         dropoff_hour_of_day = as.numeric(substr(datetime, 12, 13)), 
         dropoff_day_of_week = as.integer(as.factor(weekdays(datetime))),
         dropoff_week_of_year = week(datetime)
  ) %>%
  select(c(-datetime, -dropoff_datetime))

numCatTimeData <- data.frame(numCatData, pickupData, dropoffData)

Figure 6.11 The ROC curve and feature importance list for the random forest model, including all categorical features and additional date-time features

numCatTimeDataTrain <- numCatTimeData[ trainIndex,]
numCatTimeDataTest <- numCatTimeData[-trainIndex,]

rfFitNumCatTimeData <- train(tipped ~ ., data = numCatTimeDataTrain,
                         method = "ranger",
                         importance = "impurity", 
                         trControl = fitControl) 
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
rfNumCatTimePredProb <- predict(object=rfFitNumCatTimeData, 
                            numCatTimeDataTest, type='prob')

fig6.11a <- plotROC(revalue(numCatTimeDataTest$tipped, c("false" = 0, "true" = 1)),
                    rfNumCatTimePredProb$true, nPoints = 4000)

rfImp <- varImp(rfFitNumCatTimeData, scale = FALSE)
setDT(rfImp$importance, keep.rownames = TRUE)[]
##                       rn     Overall
##  1:      passenger_count  2.91937611
##  2:    trip_time_in_secs  9.88302565
##  3:        trip_distance 13.33529243
##  4:     pickup_longitude 14.03307071
##  5:      pickup_latitude 14.11743583
##  6:    dropoff_longitude 14.58962191
##  7:     dropoff_latitude 13.65313199
##  8:          fare_amount 10.71341885
##  9:            surcharge  1.19939251
## 10:              mta_tax  0.20980671
## 11:         tolls_amount  1.92016725
## 12:        vendor_id.VTS  0.32077126
## 13:            rate_code  2.45313817
## 14: store_and_fwd_flag.Y  0.00000000
## 15:     payment_type.CSH  0.00000000
## 16:     payment_type.DIS  1.15685977
## 17:     payment_type.NOC  0.00000000
## 18:     payment_type.UNK  0.01500323
## 19:   pickup_hour_of_day  6.30524098
## 20:   pickup_day_of_week  0.81318686
## 21:  pickup_week_of_year  0.20749978
## 22:  dropoff_hour_of_day  5.84304509
## 23:  dropoff_day_of_week  0.54417903
## 24: dropoff_week_of_year  0.22832888
##                       rn     Overall
rfImp$importance <- 
  rfImp$importance[order(rfImp$importance$Overall,
                         decreasing = TRUE),]
rfImp$importance$rn <- reorder(rfImp$importance$rn, rfImp$importance$Overall)
impDF <- data.frame(Feature=rfImp$importance$rn,
                    Importance=(rfImp$importance$Overall/sum(rfImp$importance$Overall)))

fig6.11b <- tableGrob(head(impDF,11), rows=NULL, theme=tt)
# Plot chart and table into one object
grid.arrange(fig6.11a, fig6.11b,
             nrow=1,
             as.table=TRUE,
             widths = c(5,3))