Real-World Machine Learning

This notebook contains R code to accompany Chapter 6 of the book “Real-World Machine Learning”, by Henrik Brink, Joseph W. Richards, and Mark Fetherolf. The code was contributed by Paul Adamson.

NOTE: working directory should be set to this file’s location.

Data for NYC taxi example

The data files for the examples in this chapter are available at http://www.andresmh.com/nyctaxitrips/. They are compressed as a 7-Zip file archive (e.g. with p7zip), so you will need to have the 7z command available in your path for the below code to work. (On a mac, you can use Homebrew to install p7zip with the command brew install p7zip.)

NOTE: downloading and decompressing the archives will take a while

baseUrl <- "https://archive.org/download/nycTaxiTripData2013/"
tripFile <- "trip_data.7z"
localTripFile <- paste0("../data/", tripFile)
tripFile1 <- "../data/trip_data_1.csv"
fareFile <- "trip_fare.7z"
localFareFile <- paste0("../data/", fareFile)
fareFile1 <- "../data/trip_fare_1.csv"
if(!file.exists(localTripFile)){
  download.file(paste0(baseUrl, tripFile), 
                destfile = localTripFile,
                method = "wget",
                mode = "wb",
                quiet = TRUE)
  if(!file.exists(tripFile1)){
    system(paste0("7z x ", localTripFile, " -o../data"))
  }
}
if(!file.exists(localFareFile)){
  download.file(paste0(baseUrl, fareFile), 
                destfile = localFareFile,
                method = "wget",
                mode = "wb",
                quiet = TRUE)
  if(!file.exists(fareFile1)){
    system(paste0("7z x ", localFareFile, " -o../data"))
  }
}

npoints <- 50000
tripData <- fread(tripFile1, nrows=npoints, stringsAsFactors = TRUE) %>%
  mutate(store_and_fwd_flag = 
           replace(store_and_fwd_flag, which(store_and_fwd_flag == ""), "N")) %>%
  filter(trip_distance > 0 & trip_time_in_secs > 0 & passenger_count > 0) %>%
  filter(pickup_longitude < -70 & pickup_longitude > -80) %>%
  filter(pickup_latitude > 0 & pickup_latitude < 41) %>%
  filter(dropoff_longitude < 0 & dropoff_latitude > 0)
tripData$store_and_fwd_flag <- factor(tripData$store_and_fwd_flag)
fareData <- fread(fareFile1, nrows=npoints, stringsAsFactors = TRUE)
dataJoined <- inner_join(tripData, fareData)

## Joining, by = c("medallion", "hack_license", "vendor_id", "pickup_datetime")

remove(fareData, tripData)

Figure 6.1 The first six rows of the NYC taxi trip and fare record data

tableRows <- 6
kable(head(dataJoined[,1:5],tableRows))

medallion	hack_license	vendor_id	rate_code	store_and_fwd_flag
89D227B655E5C82AECF13C3F540D4CF4	BA96DE419E711691B9445D6A6307C170	CMT	1	N
0BD7C8F5BA12B88E0B67BED28BEA73D8	9FD8F69F0804BDB5549F40E9DA1BE472	CMT	1	N
0BD7C8F5BA12B88E0B67BED28BEA73D8	9FD8F69F0804BDB5549F40E9DA1BE472	CMT	1	N
DFD2202EE08F7A8DC9A57B02ACB81FE2	51EE87E3205C985EF8431D850C786310	CMT	1	N
DFD2202EE08F7A8DC9A57B02ACB81FE2	51EE87E3205C985EF8431D850C786310	CMT	1	N
20D9ECB2CA0767CF7A01564DF2844A3E	598CCE5B9C1918568DEE71F43CF26CD2	CMT	1	N

kable(head(dataJoined[,6:10],tableRows))

pickup_datetime	dropoff_datetime	passenger_count	trip_time_in_secs	trip_distance
2013-01-01 15:11:48	2013-01-01 15:18:10	4	382	1.0
2013-01-06 00:18:35	2013-01-06 00:22:54	1	259	1.5
2013-01-05 18:49:41	2013-01-05 18:54:23	1	282	1.1
2013-01-07 23:54:15	2013-01-07 23:58:20	2	244	0.7
2013-01-07 23:25:03	2013-01-07 23:34:24	1	560	2.1
2013-01-07 15:27:48	2013-01-07 15:38:37	1	648	1.7

kable(head(dataJoined[,11:15],tableRows))

pickup_longitude	pickup_latitude	dropoff_longitude	dropoff_latitude	payment_type
-73.97817	40.75798	-73.98984	40.75117	CSH
-74.00668	40.73178	-73.99450	40.75066	CSH
-74.00471	40.73777	-74.00983	40.72600	CSH
-73.97460	40.75995	-73.98473	40.75939	CSH
-73.97625	40.74853	-74.00259	40.74787	CSH
-73.96674	40.76425	-73.98332	40.74376	CSH

kable(head(dataJoined[,16:21],tableRows))

fare_amount	surcharge	mta_tax	total_amount
6.5	0.0	0.5	7.0
6.0	0.5	0.5	7.0
5.5	1.0	0.5	7.0
5.0	0.5	0.5	6.0
9.5	0.5	0.5	10.5
9.5	0.0	0.5	10.0

Figure 6.2 The distribution of values across some of the categorical-looking columns in our dataset

p1 <- ggplot(dataJoined, aes(vendor_id)) +
  geom_bar()
p2 <- ggplot(dataJoined, aes(rate_code)) +
  geom_bar()
p3 <- ggplot(dataJoined, aes(store_and_fwd_flag)) +
  geom_bar()
p4 <- ggplot(dataJoined, aes(payment_type)) +
  geom_bar()
grid.arrange(p1,p2,p3,p4,nrow=2)

Figure 6.3 Scatter plots of taxi trips for the time in seconds versus the trip distance, and the time in seconds versus the trip amount (USD), respectively.

p5 <- ggplot(dataJoined, aes(trip_time_in_secs, trip_distance)) +
  geom_point(alpha = 0.1)
p6 <- ggplot(dataJoined, aes(trip_time_in_secs, total_amount)) +
  geom_point(alpha = 0.1)
grid.arrange(p5,p6,nrow=2)

Figure 6.4 The latitude/longitude of pickup locations. Note that the x-axis is flipped, compared to a regular map.

p7 <- ggplot(dataJoined, aes(pickup_latitude, pickup_longitude)) +
  geom_point(shape = ".") +
  scale_x_continuous(limits = c(40.6, 40.9)) +
  scale_y_continuous(limits = c(-74.04, -73.90))
p7

## Warning: Removed 1852 rows containing missing values (geom_point).

Figure 6.5 The distribution of tip amount.

p8 <- ggplot(dataJoined, aes(tip_amount)) +
  geom_histogram() +
  scale_x_continuous(limits = c(-1, 16.0), name = "Tip amount (USD)") +
  scale_y_continuous(name = "Count")
p8

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 41 rows containing non-finite values (stat_bin).

## Warning: Removed 1 rows containing missing values (geom_bar).

Figure 6.6 The distribution of tip amounts when omitting cash payments

dataJoined <- dataJoined %>%
  filter(payment_type != "CSH")
p9 <- ggplot(dataJoined, aes(tip_amount)) +
  geom_histogram() +
  scale_x_continuous(limits = c(-1, 16.0), name = "Tip amount (USD)") +
  scale_y_continuous(name = "Count")
p9

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 41 rows containing non-finite values (stat_bin).

## Warning: Removed 1 rows containing missing values (geom_bar).

Functions to plot a ROC curve adapted from Chapter 4

# Returns the false-positive and true-positive rates at nPoints thresholds for 
# the given true and predicted labels
# trueLabels: 0=FALSE; 1=TRUE
rocCurve <- function(trueLabels, predictedProbs, nPoints=100, posClass=1){
  # Allocates the threshold and ROC lists
  thr <- seq(0,1,length=nPoints)
  tpr <- numeric(nPoints)
  fpr <- numeric(nPoints)
  
  # Precalculates values for the positive and negative cases, used in the loop
  pos <- trueLabels == posClass
  neg <- !pos 
  nPos <- sum(pos, na.rm=TRUE)
  nNeg <- sum(neg, na.rm=TRUE)
  
  # For each threshold, calculates the rate of true and false positives
  for (i in 2:length(thr)) {
    t <- thr[i]
    meetOrExceedThreshold <- predictedProbs >= t
    tpr[i] <-  sum((meetOrExceedThreshold & pos), na.rm=TRUE) / nPos
    fpr[i] <-  sum((meetOrExceedThreshold & neg), na.rm=TRUE) / nNeg
  }
  
  # Create data frame without duplicated fpr's to return
  duplicatedFPRs <- duplicated(fpr)
  df <- data.frame(fpr=fpr[!duplicatedFPRs],tpr=tpr[!duplicatedFPRs],thr=thr[!duplicatedFPRs])
  
  return(df)
}

plotROC <- function(trueLabels, predictedProbs, nPoints=100, posClass=1){
  auc <- 0
  df <- rocCurve(trueLabels = trueLabels,
                 predictedProbs = predictedProbs,
                 nPoints = nPoints,
                 posClass = posClass)
  
  idx <- 3:length(df$fpr)
  auc <- as.double( 
    (df$fpr[idx-1] - df$fpr[idx]) %*% (df$tpr[idx] + df$tpr[idx-1]) ) / 2.0 
  
  ggplot(df,aes(x=fpr,y=tpr)) +
  geom_abline(slope=1, intercept = 0,
              colour = "green") + #we've added the line for random baseline for reference
  geom_step(direction="vh", colour="blue") +
  scale_x_continuous(limits = c(0,1)) +
  scale_y_continuous(limits = c(0,1)) +
  labs(x = "False-positive rate",
       y = "True-positive rate") +
  annotate("text", x=.6,y=.2,label=paste0("Area under the curve (AUC) = ",round(auc,digits = 2)))
  
}

Listing 6.1 Logistic regression tip-prediction model

numericalData <- dataJoined[,c(8:14,16:18,20)]
numericalData[,c(1:11)] <- scale(numericalData[,c(1:11)], scale=TRUE)
numericalData$tipped <- as.factor(c(dataJoined$tip_amount > 0))
levels(numericalData$tipped) <- c("false", "true")
trainIndex <- createDataPartition(numericalData$tipped, p = .8, 
                                  list = FALSE, 
                                  times = 1)

numericalDataTrain <- numericalData[ trainIndex,]
numericalDataTest <- numericalData[-trainIndex,]

fitControl <- trainControl(method = "repeatedcv", 
                           number = 10, 
                           savePredictions = TRUE,
                           summaryFunction=twoClassSummary, 
                           classProbs=TRUE)

plrFit <- train(tipped ~ ., data = numericalDataTrain,
                method = "plr",
                trControl = fitControl)

## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.

Figure 6.7 The receiver operating characteristic (ROC) curve of the logistic regression tip/no-tip classifier.

plrPredProb <- predict(object=plrFit, 
                       numericalDataTest, type='prob')
plotROC(revalue(numericalDataTest$tipped, c("false" = 0, "true" = 1)),
        plrPredProb$true, nPoints = 4000)

Listing 6.2 Random forest tip-prediction model

rfFit <- train(tipped ~ ., data = numericalDataTrain,
               method = "ranger",
               importance = "impurity",
               trControl = fitControl)

## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.

Figure 6.8 The ROC curve of the nonlinear random forest model.

rfPredProb <- predict(object=rfFit, 
                       numericalDataTest, type='prob')

plotROC(revalue(numericalDataTest$tipped, c("false" = 0, "true" = 1)),
        rfPredProb$true, nPoints = 4000)

Figure 6.9 The important features of the random forest model.

rfImp <- varImp(rfFit, scale = FALSE)
setDT(rfImp$importance, keep.rownames = TRUE)[]

##                    rn     Overall
##  1:   passenger_count  20.2661048
##  2: trip_time_in_secs  55.9549999
##  3:     trip_distance  96.1466936
##  4:  pickup_longitude 113.2667645
##  5:   pickup_latitude 112.6390474
##  6: dropoff_longitude 114.6606390
##  7:  dropoff_latitude 111.9583849
##  8:       fare_amount  58.4955870
##  9:         surcharge   7.7041169
## 10:           mta_tax   0.2159311
## 11:      tolls_amount   5.0873816

rfImp$importance <- 
  rfImp$importance[order(rfImp$importance$Overall,
                         decreasing = TRUE),]
rfImp$importance$rn <- reorder(rfImp$importance$rn, rfImp$importance$Overall)
impDF <- data.frame(Feature=rfImp$importance$rn,
                 Importance=(rfImp$importance$Overall/sum(rfImp$importance$Overall)))
fig6.9 <- tableGrob(head(impDF,11), rows=NULL, theme=tt)
grid.arrange(fig6.9,
             as.table=TRUE)

Listing 6.3 Converting categorical columns to numerical features

categoryData <- dataJoined[,c(3:5,15)]
dummies <- dummyVars(" ~ .", data = categoryData, fullRank = TRUE, 
                     levelsOnly = FALSE)
catNumeric <- data.frame(predict(dummies, newdata = categoryData))
numCatData <- data.frame(numericalData, catNumeric)

Figure 6.10 The ROC curve and feature importance list of the random forest model with all categorical variables converted to Boolean (0/1) columns, one per category per feature.

numCatDataTrain <- numCatData[ trainIndex,]
numCatDataTest <- numCatData[-trainIndex,]

rfFitNumCatData <- train(tipped ~ ., data = numCatDataTrain,
                         method = "ranger",
                         importance = "impurity", 
                         trControl = fitControl)

## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.

rfNumCatPredProb <- predict(object=rfFitNumCatData, 
                            numCatDataTest, type='prob')

fig6.10a <- plotROC(revalue(numCatDataTest$tipped, c("false" = 0, "true" = 1)),
        rfNumCatPredProb$true, nPoints = 4000)

rfImp <- varImp(rfFitNumCatData, scale = FALSE)
setDT(rfImp$importance, keep.rownames = TRUE)[]

##                       rn     Overall
##  1:      passenger_count  4.23651685
##  2:    trip_time_in_secs 13.15351581
##  3:        trip_distance 17.85025778
##  4:     pickup_longitude 20.65590678
##  5:      pickup_latitude 19.74232896
##  6:    dropoff_longitude 21.38594163
##  7:     dropoff_latitude 20.28050391
##  8:          fare_amount 13.90309525
##  9:            surcharge  2.65079523
## 10:              mta_tax  0.22165302
## 11:         tolls_amount  2.26378031
## 12:        vendor_id.VTS  0.55211380
## 13:            rate_code  2.66180999
## 14: store_and_fwd_flag.Y  0.00000000
## 15:     payment_type.CSH  0.00000000
## 16:     payment_type.DIS  1.96495720
## 17:     payment_type.NOC  0.00000000
## 18:     payment_type.UNK  0.01864229

rfImp$importance <- 
  rfImp$importance[order(rfImp$importance$Overall,
                         decreasing = TRUE),]
rfImp$importance$rn <- reorder(rfImp$importance$rn, rfImp$importance$Overall)
impDF <- data.frame(Feature=rfImp$importance$rn,
                    Importance=(rfImp$importance$Overall/sum(rfImp$importance$Overall)))

fig6.10b <- tableGrob(head(impDF,11), rows=NULL, theme=tt)
# Plot chart and table into one object
grid.arrange(fig6.10a, fig6.10b,
             nrow=1,
             as.table=TRUE,
             widths = c(5,3))

Listing 6.4 Date-time features

pickupData <- data.frame(pickup_datetime = dataJoined$pickup_datetime) %>%
  mutate(datetime = as.POSIXct(strptime(pickup_datetime, format="%Y-%m-%d %H:%M:%OS", tz="UTC")),
         pickup_hour_of_day = as.numeric(substr(datetime, 12, 13)), 
         pickup_day_of_week = as.integer(as.factor(weekdays(datetime))),
         pickup_week_of_year = week(datetime)
  ) %>%
  select(c(-datetime, -pickup_datetime))

dropoffData <- data.frame(dropoff_datetime = dataJoined$dropoff_datetime) %>%
  mutate(datetime = as.POSIXct(strptime(dropoff_datetime, format="%Y-%m-%d %H:%M:%OS", tz="UTC")),
         dropoff_hour_of_day = as.numeric(substr(datetime, 12, 13)), 
         dropoff_day_of_week = as.integer(as.factor(weekdays(datetime))),
         dropoff_week_of_year = week(datetime)
  ) %>%
  select(c(-datetime, -dropoff_datetime))

numCatTimeData <- data.frame(numCatData, pickupData, dropoffData)

Figure 6.11 The ROC curve and feature importance list for the random forest model, including all categorical features and additional date-time features

numCatTimeDataTrain <- numCatTimeData[ trainIndex,]
numCatTimeDataTest <- numCatTimeData[-trainIndex,]

rfFitNumCatTimeData <- train(tipped ~ ., data = numCatTimeDataTrain,
                         method = "ranger",
                         importance = "impurity", 
                         trControl = fitControl)

## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.

rfNumCatTimePredProb <- predict(object=rfFitNumCatTimeData, 
                            numCatTimeDataTest, type='prob')

fig6.11a <- plotROC(revalue(numCatTimeDataTest$tipped, c("false" = 0, "true" = 1)),
                    rfNumCatTimePredProb$true, nPoints = 4000)

rfImp <- varImp(rfFitNumCatTimeData, scale = FALSE)
setDT(rfImp$importance, keep.rownames = TRUE)[]

##                       rn     Overall
##  1:      passenger_count  2.91937611
##  2:    trip_time_in_secs  9.88302565
##  3:        trip_distance 13.33529243
##  4:     pickup_longitude 14.03307071
##  5:      pickup_latitude 14.11743583
##  6:    dropoff_longitude 14.58962191
##  7:     dropoff_latitude 13.65313199
##  8:          fare_amount 10.71341885
##  9:            surcharge  1.19939251
## 10:              mta_tax  0.20980671
## 11:         tolls_amount  1.92016725
## 12:        vendor_id.VTS  0.32077126
## 13:            rate_code  2.45313817
## 14: store_and_fwd_flag.Y  0.00000000
## 15:     payment_type.CSH  0.00000000
## 16:     payment_type.DIS  1.15685977
## 17:     payment_type.NOC  0.00000000
## 18:     payment_type.UNK  0.01500323
## 19:   pickup_hour_of_day  6.30524098
## 20:   pickup_day_of_week  0.81318686
## 21:  pickup_week_of_year  0.20749978
## 22:  dropoff_hour_of_day  5.84304509
## 23:  dropoff_day_of_week  0.54417903
## 24: dropoff_week_of_year  0.22832888
##                       rn     Overall

rfImp$importance <- 
  rfImp$importance[order(rfImp$importance$Overall,
                         decreasing = TRUE),]
rfImp$importance$rn <- reorder(rfImp$importance$rn, rfImp$importance$Overall)
impDF <- data.frame(Feature=rfImp$importance$rn,
                    Importance=(rfImp$importance$Overall/sum(rfImp$importance$Overall)))

fig6.11b <- tableGrob(head(impDF,11), rows=NULL, theme=tt)
# Plot chart and table into one object
grid.arrange(fig6.11a, fig6.11b,
             nrow=1,
             as.table=TRUE,
             widths = c(5,3))