This notebook contains R code to accompany Chapter 6 of the book “Real-World Machine Learning”, by Henrik Brink, Joseph W. Richards, and Mark Fetherolf. The code was contributed by Paul Adamson.
NOTE: working directory should be set to this file’s location.
The data files for the examples in this chapter are available at http://www.andresmh.com/nyctaxitrips/. They are compressed as a 7-Zip file archive (e.g. with p7zip), so you will need to have the 7z
command available in your path for the below code to work. (On a mac, you can use Homebrew to install p7zip with the command brew install p7zip
.)
NOTE: downloading and decompressing the archives will take a while
baseUrl <- "https://archive.org/download/nycTaxiTripData2013/"
tripFile <- "trip_data.7z"
localTripFile <- paste0("../data/", tripFile)
tripFile1 <- "../data/trip_data_1.csv"
fareFile <- "trip_fare.7z"
localFareFile <- paste0("../data/", fareFile)
fareFile1 <- "../data/trip_fare_1.csv"
if(!file.exists(localTripFile)){
download.file(paste0(baseUrl, tripFile),
destfile = localTripFile,
method = "wget",
mode = "wb",
quiet = TRUE)
if(!file.exists(tripFile1)){
system(paste0("7z x ", localTripFile, " -o../data"))
}
}
if(!file.exists(localFareFile)){
download.file(paste0(baseUrl, fareFile),
destfile = localFareFile,
method = "wget",
mode = "wb",
quiet = TRUE)
if(!file.exists(fareFile1)){
system(paste0("7z x ", localFareFile, " -o../data"))
}
}
npoints <- 50000
tripData <- fread(tripFile1, nrows=npoints, stringsAsFactors = TRUE) %>%
mutate(store_and_fwd_flag =
replace(store_and_fwd_flag, which(store_and_fwd_flag == ""), "N")) %>%
filter(trip_distance > 0 & trip_time_in_secs > 0 & passenger_count > 0) %>%
filter(pickup_longitude < -70 & pickup_longitude > -80) %>%
filter(pickup_latitude > 0 & pickup_latitude < 41) %>%
filter(dropoff_longitude < 0 & dropoff_latitude > 0)
tripData$store_and_fwd_flag <- factor(tripData$store_and_fwd_flag)
fareData <- fread(fareFile1, nrows=npoints, stringsAsFactors = TRUE)
dataJoined <- inner_join(tripData, fareData)
## Joining, by = c("medallion", "hack_license", "vendor_id", "pickup_datetime")
remove(fareData, tripData)
tableRows <- 6
kable(head(dataJoined[,1:5],tableRows))
medallion | hack_license | vendor_id | rate_code | store_and_fwd_flag |
---|---|---|---|---|
89D227B655E5C82AECF13C3F540D4CF4 | BA96DE419E711691B9445D6A6307C170 | CMT | 1 | N |
0BD7C8F5BA12B88E0B67BED28BEA73D8 | 9FD8F69F0804BDB5549F40E9DA1BE472 | CMT | 1 | N |
0BD7C8F5BA12B88E0B67BED28BEA73D8 | 9FD8F69F0804BDB5549F40E9DA1BE472 | CMT | 1 | N |
DFD2202EE08F7A8DC9A57B02ACB81FE2 | 51EE87E3205C985EF8431D850C786310 | CMT | 1 | N |
DFD2202EE08F7A8DC9A57B02ACB81FE2 | 51EE87E3205C985EF8431D850C786310 | CMT | 1 | N |
20D9ECB2CA0767CF7A01564DF2844A3E | 598CCE5B9C1918568DEE71F43CF26CD2 | CMT | 1 | N |
kable(head(dataJoined[,6:10],tableRows))
pickup_datetime | dropoff_datetime | passenger_count | trip_time_in_secs | trip_distance |
---|---|---|---|---|
2013-01-01 15:11:48 | 2013-01-01 15:18:10 | 4 | 382 | 1.0 |
2013-01-06 00:18:35 | 2013-01-06 00:22:54 | 1 | 259 | 1.5 |
2013-01-05 18:49:41 | 2013-01-05 18:54:23 | 1 | 282 | 1.1 |
2013-01-07 23:54:15 | 2013-01-07 23:58:20 | 2 | 244 | 0.7 |
2013-01-07 23:25:03 | 2013-01-07 23:34:24 | 1 | 560 | 2.1 |
2013-01-07 15:27:48 | 2013-01-07 15:38:37 | 1 | 648 | 1.7 |
kable(head(dataJoined[,11:15],tableRows))
pickup_longitude | pickup_latitude | dropoff_longitude | dropoff_latitude | payment_type |
---|---|---|---|---|
-73.97817 | 40.75798 | -73.98984 | 40.75117 | CSH |
-74.00668 | 40.73178 | -73.99450 | 40.75066 | CSH |
-74.00471 | 40.73777 | -74.00983 | 40.72600 | CSH |
-73.97460 | 40.75995 | -73.98473 | 40.75939 | CSH |
-73.97625 | 40.74853 | -74.00259 | 40.74787 | CSH |
-73.96674 | 40.76425 | -73.98332 | 40.74376 | CSH |
kable(head(dataJoined[,16:21],tableRows))
fare_amount | surcharge | mta_tax | tip_amount | tolls_amount | total_amount |
---|---|---|---|---|---|
6.5 | 0.0 | 0.5 | 0 | 0 | 7.0 |
6.0 | 0.5 | 0.5 | 0 | 0 | 7.0 |
5.5 | 1.0 | 0.5 | 0 | 0 | 7.0 |
5.0 | 0.5 | 0.5 | 0 | 0 | 6.0 |
9.5 | 0.5 | 0.5 | 0 | 0 | 10.5 |
9.5 | 0.0 | 0.5 | 0 | 0 | 10.0 |
p1 <- ggplot(dataJoined, aes(vendor_id)) +
geom_bar()
p2 <- ggplot(dataJoined, aes(rate_code)) +
geom_bar()
p3 <- ggplot(dataJoined, aes(store_and_fwd_flag)) +
geom_bar()
p4 <- ggplot(dataJoined, aes(payment_type)) +
geom_bar()
grid.arrange(p1,p2,p3,p4,nrow=2)
p5 <- ggplot(dataJoined, aes(trip_time_in_secs, trip_distance)) +
geom_point(alpha = 0.1)
p6 <- ggplot(dataJoined, aes(trip_time_in_secs, total_amount)) +
geom_point(alpha = 0.1)
grid.arrange(p5,p6,nrow=2)
p7 <- ggplot(dataJoined, aes(pickup_latitude, pickup_longitude)) +
geom_point(shape = ".") +
scale_x_continuous(limits = c(40.6, 40.9)) +
scale_y_continuous(limits = c(-74.04, -73.90))
p7
## Warning: Removed 1852 rows containing missing values (geom_point).
p8 <- ggplot(dataJoined, aes(tip_amount)) +
geom_histogram() +
scale_x_continuous(limits = c(-1, 16.0), name = "Tip amount (USD)") +
scale_y_continuous(name = "Count")
p8
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 41 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing missing values (geom_bar).
dataJoined <- dataJoined %>%
filter(payment_type != "CSH")
p9 <- ggplot(dataJoined, aes(tip_amount)) +
geom_histogram() +
scale_x_continuous(limits = c(-1, 16.0), name = "Tip amount (USD)") +
scale_y_continuous(name = "Count")
p9
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 41 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing missing values (geom_bar).
# Returns the false-positive and true-positive rates at nPoints thresholds for
# the given true and predicted labels
# trueLabels: 0=FALSE; 1=TRUE
rocCurve <- function(trueLabels, predictedProbs, nPoints=100, posClass=1){
# Allocates the threshold and ROC lists
thr <- seq(0,1,length=nPoints)
tpr <- numeric(nPoints)
fpr <- numeric(nPoints)
# Precalculates values for the positive and negative cases, used in the loop
pos <- trueLabels == posClass
neg <- !pos
nPos <- sum(pos, na.rm=TRUE)
nNeg <- sum(neg, na.rm=TRUE)
# For each threshold, calculates the rate of true and false positives
for (i in 2:length(thr)) {
t <- thr[i]
meetOrExceedThreshold <- predictedProbs >= t
tpr[i] <- sum((meetOrExceedThreshold & pos), na.rm=TRUE) / nPos
fpr[i] <- sum((meetOrExceedThreshold & neg), na.rm=TRUE) / nNeg
}
# Create data frame without duplicated fpr's to return
duplicatedFPRs <- duplicated(fpr)
df <- data.frame(fpr=fpr[!duplicatedFPRs],tpr=tpr[!duplicatedFPRs],thr=thr[!duplicatedFPRs])
return(df)
}
plotROC <- function(trueLabels, predictedProbs, nPoints=100, posClass=1){
auc <- 0
df <- rocCurve(trueLabels = trueLabels,
predictedProbs = predictedProbs,
nPoints = nPoints,
posClass = posClass)
idx <- 3:length(df$fpr)
auc <- as.double(
(df$fpr[idx-1] - df$fpr[idx]) %*% (df$tpr[idx] + df$tpr[idx-1]) ) / 2.0
ggplot(df,aes(x=fpr,y=tpr)) +
geom_abline(slope=1, intercept = 0,
colour = "green") + #we've added the line for random baseline for reference
geom_step(direction="vh", colour="blue") +
scale_x_continuous(limits = c(0,1)) +
scale_y_continuous(limits = c(0,1)) +
labs(x = "False-positive rate",
y = "True-positive rate") +
annotate("text", x=.6,y=.2,label=paste0("Area under the curve (AUC) = ",round(auc,digits = 2)))
}
numericalData <- dataJoined[,c(8:14,16:18,20)]
numericalData[,c(1:11)] <- scale(numericalData[,c(1:11)], scale=TRUE)
numericalData$tipped <- as.factor(c(dataJoined$tip_amount > 0))
levels(numericalData$tipped) <- c("false", "true")
trainIndex <- createDataPartition(numericalData$tipped, p = .8,
list = FALSE,
times = 1)
numericalDataTrain <- numericalData[ trainIndex,]
numericalDataTest <- numericalData[-trainIndex,]
fitControl <- trainControl(method = "repeatedcv",
number = 10,
savePredictions = TRUE,
summaryFunction=twoClassSummary,
classProbs=TRUE)
plrFit <- train(tipped ~ ., data = numericalDataTrain,
method = "plr",
trControl = fitControl)
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
plrPredProb <- predict(object=plrFit,
numericalDataTest, type='prob')
plotROC(revalue(numericalDataTest$tipped, c("false" = 0, "true" = 1)),
plrPredProb$true, nPoints = 4000)
rfFit <- train(tipped ~ ., data = numericalDataTrain,
method = "ranger",
importance = "impurity",
trControl = fitControl)
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
rfPredProb <- predict(object=rfFit,
numericalDataTest, type='prob')
plotROC(revalue(numericalDataTest$tipped, c("false" = 0, "true" = 1)),
rfPredProb$true, nPoints = 4000)
rfImp <- varImp(rfFit, scale = FALSE)
setDT(rfImp$importance, keep.rownames = TRUE)[]
## rn Overall
## 1: passenger_count 20.2661048
## 2: trip_time_in_secs 55.9549999
## 3: trip_distance 96.1466936
## 4: pickup_longitude 113.2667645
## 5: pickup_latitude 112.6390474
## 6: dropoff_longitude 114.6606390
## 7: dropoff_latitude 111.9583849
## 8: fare_amount 58.4955870
## 9: surcharge 7.7041169
## 10: mta_tax 0.2159311
## 11: tolls_amount 5.0873816
rfImp$importance <-
rfImp$importance[order(rfImp$importance$Overall,
decreasing = TRUE),]
rfImp$importance$rn <- reorder(rfImp$importance$rn, rfImp$importance$Overall)
impDF <- data.frame(Feature=rfImp$importance$rn,
Importance=(rfImp$importance$Overall/sum(rfImp$importance$Overall)))
fig6.9 <- tableGrob(head(impDF,11), rows=NULL, theme=tt)
grid.arrange(fig6.9,
as.table=TRUE)
categoryData <- dataJoined[,c(3:5,15)]
dummies <- dummyVars(" ~ .", data = categoryData, fullRank = TRUE,
levelsOnly = FALSE)
catNumeric <- data.frame(predict(dummies, newdata = categoryData))
numCatData <- data.frame(numericalData, catNumeric)
numCatDataTrain <- numCatData[ trainIndex,]
numCatDataTest <- numCatData[-trainIndex,]
rfFitNumCatData <- train(tipped ~ ., data = numCatDataTrain,
method = "ranger",
importance = "impurity",
trControl = fitControl)
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
rfNumCatPredProb <- predict(object=rfFitNumCatData,
numCatDataTest, type='prob')
fig6.10a <- plotROC(revalue(numCatDataTest$tipped, c("false" = 0, "true" = 1)),
rfNumCatPredProb$true, nPoints = 4000)
rfImp <- varImp(rfFitNumCatData, scale = FALSE)
setDT(rfImp$importance, keep.rownames = TRUE)[]
## rn Overall
## 1: passenger_count 4.23651685
## 2: trip_time_in_secs 13.15351581
## 3: trip_distance 17.85025778
## 4: pickup_longitude 20.65590678
## 5: pickup_latitude 19.74232896
## 6: dropoff_longitude 21.38594163
## 7: dropoff_latitude 20.28050391
## 8: fare_amount 13.90309525
## 9: surcharge 2.65079523
## 10: mta_tax 0.22165302
## 11: tolls_amount 2.26378031
## 12: vendor_id.VTS 0.55211380
## 13: rate_code 2.66180999
## 14: store_and_fwd_flag.Y 0.00000000
## 15: payment_type.CSH 0.00000000
## 16: payment_type.DIS 1.96495720
## 17: payment_type.NOC 0.00000000
## 18: payment_type.UNK 0.01864229
rfImp$importance <-
rfImp$importance[order(rfImp$importance$Overall,
decreasing = TRUE),]
rfImp$importance$rn <- reorder(rfImp$importance$rn, rfImp$importance$Overall)
impDF <- data.frame(Feature=rfImp$importance$rn,
Importance=(rfImp$importance$Overall/sum(rfImp$importance$Overall)))
fig6.10b <- tableGrob(head(impDF,11), rows=NULL, theme=tt)
# Plot chart and table into one object
grid.arrange(fig6.10a, fig6.10b,
nrow=1,
as.table=TRUE,
widths = c(5,3))
pickupData <- data.frame(pickup_datetime = dataJoined$pickup_datetime) %>%
mutate(datetime = as.POSIXct(strptime(pickup_datetime, format="%Y-%m-%d %H:%M:%OS", tz="UTC")),
pickup_hour_of_day = as.numeric(substr(datetime, 12, 13)),
pickup_day_of_week = as.integer(as.factor(weekdays(datetime))),
pickup_week_of_year = week(datetime)
) %>%
select(c(-datetime, -pickup_datetime))
dropoffData <- data.frame(dropoff_datetime = dataJoined$dropoff_datetime) %>%
mutate(datetime = as.POSIXct(strptime(dropoff_datetime, format="%Y-%m-%d %H:%M:%OS", tz="UTC")),
dropoff_hour_of_day = as.numeric(substr(datetime, 12, 13)),
dropoff_day_of_week = as.integer(as.factor(weekdays(datetime))),
dropoff_week_of_year = week(datetime)
) %>%
select(c(-datetime, -dropoff_datetime))
numCatTimeData <- data.frame(numCatData, pickupData, dropoffData)
numCatTimeDataTrain <- numCatTimeData[ trainIndex,]
numCatTimeDataTest <- numCatTimeData[-trainIndex,]
rfFitNumCatTimeData <- train(tipped ~ ., data = numCatTimeDataTrain,
method = "ranger",
importance = "impurity",
trControl = fitControl)
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
rfNumCatTimePredProb <- predict(object=rfFitNumCatTimeData,
numCatTimeDataTest, type='prob')
fig6.11a <- plotROC(revalue(numCatTimeDataTest$tipped, c("false" = 0, "true" = 1)),
rfNumCatTimePredProb$true, nPoints = 4000)
rfImp <- varImp(rfFitNumCatTimeData, scale = FALSE)
setDT(rfImp$importance, keep.rownames = TRUE)[]
## rn Overall
## 1: passenger_count 2.91937611
## 2: trip_time_in_secs 9.88302565
## 3: trip_distance 13.33529243
## 4: pickup_longitude 14.03307071
## 5: pickup_latitude 14.11743583
## 6: dropoff_longitude 14.58962191
## 7: dropoff_latitude 13.65313199
## 8: fare_amount 10.71341885
## 9: surcharge 1.19939251
## 10: mta_tax 0.20980671
## 11: tolls_amount 1.92016725
## 12: vendor_id.VTS 0.32077126
## 13: rate_code 2.45313817
## 14: store_and_fwd_flag.Y 0.00000000
## 15: payment_type.CSH 0.00000000
## 16: payment_type.DIS 1.15685977
## 17: payment_type.NOC 0.00000000
## 18: payment_type.UNK 0.01500323
## 19: pickup_hour_of_day 6.30524098
## 20: pickup_day_of_week 0.81318686
## 21: pickup_week_of_year 0.20749978
## 22: dropoff_hour_of_day 5.84304509
## 23: dropoff_day_of_week 0.54417903
## 24: dropoff_week_of_year 0.22832888
## rn Overall
rfImp$importance <-
rfImp$importance[order(rfImp$importance$Overall,
decreasing = TRUE),]
rfImp$importance$rn <- reorder(rfImp$importance$rn, rfImp$importance$Overall)
impDF <- data.frame(Feature=rfImp$importance$rn,
Importance=(rfImp$importance$Overall/sum(rfImp$importance$Overall)))
fig6.11b <- tableGrob(head(impDF,11), rows=NULL, theme=tt)
# Plot chart and table into one object
grid.arrange(fig6.11a, fig6.11b,
nrow=1,
as.table=TRUE,
widths = c(5,3))