Real-World Machine Learning

This notebook contains R code to accompany Chapter 5 of the book “Real-World Machine Learning”, by Henrik Brink, Joseph W. Richards, and Mark Fetherolf. The code was contributed by Paul Adamson.

NOTE: working directory should be set to this file’s location.

Data for event-recommendation model

The data for the examples from this chapter are available at the Kaggle Event Recommendation Engine Challenge. The rules of the challenge prohibit redristribution of the data. To run the example code, login to Kaggle, download the data, and put the train.csv, events.csv, and users.csv files in a folder named ../proprietary_data/.

Figure 5.2 A sample of the datasets used for training the event-recommendations model

dataDir <- "../proprietary_data"
# join on user and event identifiers
#from users data, get user_id, birthyear, gender, timezone
users <- read.csv(file.path(dataDir, "users.csv"),
                  colClasses = c('integer64', 
                                 "NULL", 
                                 "integer",
                                 "factor",
                                 "NULL",
                                 "NULL",
                                 "integer"),
                  na.strings = c("16-Mar", "23-May", "NA", "None"))

# from events data, get event_id, lat, lng
# note don't get confused about the user_id in this dataset. it is the user that created the event and is different from the user_id in the train dataset
events <- fread(file.path(dataDir, "events.csv"),
                sep=",",
                colClasses = c("integer64",
                               rep("NULL",6),
                               "numeric",
                               "numeric",
                               rep("NULL",101)))

## 
Read 0.0% of 3137972 rows
Read 25.2% of 3137972 rows
Read 51.0% of 3137972 rows
Read 78.7% of 3137972 rows
Read 3137972 rows and 3 (of 110) columns from 1.092 GB file in 00:00:08

# from train data, get user, event, invited, interested, not_interested 
# rename user to user_id and event to event_id to match users and events datasets
# also need to drop observations without explicit interested or not_interested selection
# then drop not_interested since we only need one of the categorical variables
train <- read.csv(file.path(dataDir, "train.csv"),
                  colClasses = c("integer64",
                                 "integer64",
                                 "integer",
                                 "NULL",
                                 "integer",
                                 "integer"),
                  col.names = c("user_id",
                                "event_id",
                                "invited",
                                "NULL",
                                "interested",
                                "not_interested")) %>%
  subset(interested == 1 | not_interested == 1) %>%
  select(-not_interested)

train.joined <- inner_join(train, users, by=c("user_id")) %>%
  inner_join(events, by=c("event_id")) %>%
  select(-c(user_id,event_id))
kable(head(train.joined,7))

interested	birthyear	gender	timezone	lat	lng
1	1990	male	480	3.567	98.650
1	1998	female	240	-20.162	57.499
1	1998	female	240	-20.161	57.499
1	1998	female	240	-20.245	57.482
1	1982	male	-360	NA	NA
1	1991	male	270	NA	NA
1	1992	female	420	-6.226	106.507

Figure 5.3 Cross-validated ROC curve and AUC metric for the simple event- recommendation model

dummies <- dummyVars(" ~ .", data = train.joined, fullRank = TRUE, 
                     levelsOnly = TRUE)
train.joined.numeric <- data.frame(predict(dummies, newdata = train.joined)) %>%
  select(c(-female)) %>%
  na.omit()
train.joined.numeric$interested <- factor(train.joined.numeric$interested)
train.joined.numeric$interested <- 
  revalue(train.joined.numeric$interested, c("0" = "no", "1" = "yes"))

rocCurve <- function(trueLabels, predictedProbs, nPoints = 100, posClass = 1) {
    thr <- seq(0, 1, length = nPoints)
    tpr <- numeric(nPoints)
    fpr <- numeric(nPoints)
    pos <- trueLabels == posClass
    neg <- !pos
    nPos <- sum(pos, na.rm = TRUE)
    nNeg <- sum(neg, na.rm = TRUE)
    for (i in 2:length(thr)) {
        t <- thr[i]
        meetOrExceedThreshold <- predictedProbs >= t
        tpr[i] <- sum((meetOrExceedThreshold & pos), na.rm = TRUE)/nPos
        fpr[i] <- sum((meetOrExceedThreshold & neg), na.rm = TRUE)/nNeg
    }
    duplicatedFPRs <- duplicated(fpr)
    df <- data.frame(fpr = fpr[!duplicatedFPRs], tpr = tpr[!duplicatedFPRs], 
        thr = thr[!duplicatedFPRs])
    return(df)
}

auc <- function(trueLabels, predictedProbs, nPoints = 100, posClass = 1) {
    auc <- 0
    df <- rocCurve(trueLabels = trueLabels, predictedProbs = predictedProbs, 
        nPoints = nPoints, posClass = posClass)
    idx <- 3:length(df$fpr)
    auc <- as.double((df$fpr[idx - 1] - df$fpr[idx]) %*% (df$tpr[idx] + df$tpr[idx - 
        1]))/2
    return(auc)
}

plotROC <- function(trueLabels, predictedProbs, nPoints=100, posClass=1){
  auc <- auc(trueLabels, predictedProbs, nPoints, posClass)
  df <- rocCurve(trueLabels = trueLabels,
                 predictedProbs = predictedProbs,
                 nPoints = nPoints,
                 posClass = posClass)
  ggplot(df,aes(x=fpr,y=tpr)) +
    geom_step(direction="vh") +
    scale_x_continuous(limits = c(0,1)) +
    scale_y_continuous(limits = c(0,1)) +
    labs(x = "False-positive rate",
         y = "True-positive rate") +
    annotate("text", x=.5,y=.5,label=paste0("Area under the curve: ",round(auc,digits = 2)))
  
}

trainIndex <- createDataPartition(train.joined.numeric$male, p = .8, 
                                  list = FALSE, 
                                  times = 1)

eventTrain <- train.joined.numeric[ trainIndex,]
eventTest  <- train.joined.numeric[-trainIndex,]


fitControl <- trainControl(method = "repeatedcv", 
                           number = 10, 
                           savePredictions = TRUE,
                           summaryFunction=twoClassSummary, 
                           classProbs=TRUE)

rfFit <- train(interested ~ ., data = eventTrain,
                method = "rf", 
                trControl = fitControl, 
                verbose = FALSE)

## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.

event.rf.pred.prob <- predict(object=rfFit, 
                              eventTest, type='prob')

plotROC(revalue(eventTest$interested, c("no" = 0, "yes" = 1)),
        event.rf.pred.prob$yes, nPoints = 1000)

Figure 5.4 Additional date-time columns extracted from the timestamp column for the event-recommendation dataset

# from events data, get event_id, start_time, lat, lng
# note don't get confused about the user_id in this dataset. it is the user that created the event and is different from the user_id in the train dataset
# also, doing a semi_join so we only work with the event data in the train data
# frame

events <- fread(file.path(dataDir, "events.csv"),
                sep=",",
                colClasses = c("integer64",
                               "NULL",
                               'myDateTime',
                               rep("NULL",4),
                               "numeric",
                               "numeric",
                               rep("NULL",101))) %>%
  semi_join(y = train) %>%
  mutate(datetime = strptime(start_time, format="%Y-%m-%dT%H:%M:%OS", tz="UTC"),
         datetime_year = as.numeric(substr(datetime, 1, 4)), 
         datetime_month_of_year = as.numeric(substr(datetime, 6, 7)), 
         datetime_day_of_month = as.numeric(substr(datetime, 9,10)), 
         datetime_hour_of_day = as.numeric(substr(datetime, 12, 13)), 
         datetime_minute_of_hour = as.numeric(substr(datetime, 15, 16)),
         datetime_second_of_minute = as.numeric(substr(datetime, 18, 19)),
         datetime_day_of_week = as.factor(weekdays(datetime)),
         datetime_week_of_year = week(datetime),
         datetime_quarter_of_year = quarter(datetime),
         datetime_day_of_year = as.POSIXlt(datetime)$yday
  ) %>%
  select(c(-datetime, -start_time))

## 
Read 0.0% of 3137972 rows
Read 23.3% of 3137972 rows
Read 44.9% of 3137972 rows
Read 65.6% of 3137972 rows
Read 87.0% of 3137972 rows
Read 3137972 rows and 4 (of 110) columns from 1.092 GB file in 00:00:08

## Joining, by = "event_id"

kable(head(
  select(events,
         c(datetime_hour_of_day, 
           datetime_day_of_week,
           datetime_day_of_month,
           datetime_day_of_year,
           datetime_month_of_year)),
  5))

datetime_hour_of_day	datetime_day_of_week	datetime_day_of_month	datetime_day_of_year	datetime_month_of_year
13	Friday	26	299	10
7	Saturday	3	307	11
0	Wednesday	31	304	10
16	Thursday	1	305	11
0	Friday	30	334	11

kable(head(
  select(events,
         c(datetime_minute_of_hour,
           datetime_second_of_minute,
           datetime_year,
           datetime_quarter_of_year,
           datetime_week_of_year)), 
  5))

datetime_minute_of_hour	datetime_year	datetime_quarter_of_year	datetime_week_of_year
30	2012	4	43
0	2012	4	45
0	2012	4	44
0	2012	4	44
0	2012	4	48

Figure 5.5 Cross-validated ROC curve for model including date-time features

train.joined <- inner_join(train, users, by=c("user_id")) %>%
  inner_join(events, by=c("event_id")) %>%
  select(-c(user_id,event_id))

dummies <- dummyVars(" ~ .", data = train.joined, fullRank = TRUE, 
                     levelsOnly = TRUE)
train.joined.numeric <- data.frame(predict(dummies, newdata = train.joined)) %>%
  select(c(-female)) %>%
  na.omit()
train.joined.numeric$interested <- factor(train.joined.numeric$interested)
train.joined.numeric$interested <- 
  revalue(train.joined.numeric$interested, c("0" = "no", "1" = "yes"))

trainIndex <- createDataPartition(train.joined.numeric$male, p = .8, 
                                  list = FALSE, 
                                  times = 1)

eventTrain <- train.joined.numeric[ trainIndex,]
eventTest  <- train.joined.numeric[-trainIndex,]


fitControl <- trainControl(method = "repeatedcv", 
                           number = 10, 
                           savePredictions = TRUE,
                           summaryFunction=twoClassSummary, 
                           classProbs=TRUE)

rfFit <- train(interested ~ ., data = eventTrain,
                method = "rf", 
                trControl = fitControl, 
                verbose = FALSE)

## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.

event.rf.pred.prob <- predict(object=rfFit, 
                              eventTest, type='prob')

plotROC(revalue(eventTest$interested, c("no" = 0, "yes" = 1)),
        event.rf.pred.prob$yes, nPoints = 1000)

Figure 5.6 A slice of the bag-of-words data for the event-recommendation example. These numbers are the counts of the top-occurring words in the event descriptions. A large fraction of the cells contain 0, so we call the dataset sparse.

# from events data, get event_id, start_time, lat, lng, c_1 through c_other
# note don't get confused about the user_id in this dataset. it is the user that created the event and is different from the user_id in the train dataset
# also, doing a semi_join so we only work with the event data in the train data
# frame

events <- fread(file.path(dataDir, "events.csv"),
                sep=",",
                colClasses = c("integer64",
                               "NULL",
                               'myDateTime',
                               rep("NULL",4),
                               "numeric",
                               "numeric",
                               rep("integer",101))) %>%
  semi_join(y = train) %>%
  mutate(datetime = strptime(start_time, format="%Y-%m-%dT%H:%M:%OS", tz="UTC"),
         datetime_year = as.numeric(substr(datetime, 1, 4)), 
         datetime_month_of_year = as.numeric(substr(datetime, 6, 7)), 
         datetime_day_of_month = as.numeric(substr(datetime, 9,10)), 
         datetime_hour_of_day = as.numeric(substr(datetime, 12, 13)), 
         datetime_minute_of_hour = as.numeric(substr(datetime, 15, 16)),
         datetime_second_of_minute = as.numeric(substr(datetime, 18, 19)),
         datetime_day_of_week = as.factor(weekdays(datetime)),
         datetime_week_of_year = week(datetime),
         datetime_quarter_of_year = quarter(datetime),
         datetime_day_of_year = as.POSIXlt(datetime)$yday
  ) %>%
  select(c(-datetime, -start_time))

## 
Read 0.0% of 3137972 rows
Read 11.2% of 3137972 rows
Read 22.9% of 3137972 rows
Read 33.8% of 3137972 rows
Read 44.6% of 3137972 rows
Read 55.8% of 3137972 rows
Read 66.9% of 3137972 rows
Read 78.1% of 3137972 rows
Read 89.2% of 3137972 rows
Read 99.7% of 3137972 rows
Read 3137972 rows and 105 (of 110) columns from 1.092 GB file in 00:00:13

## Joining, by = "event_id"

rhandsontable(matrix(data=unlist(events[1:110,4:104]),nrow=11,ncol=10),
              readOnly = TRUE)

Figure 5.7 Cross-validated ROC curve for full model including date-time and text features

train.joined <- inner_join(train, users, by=c("user_id")) %>%
  inner_join(events, by=c("event_id")) %>%
  select(-c(user_id,event_id))

dummies <- dummyVars(" ~ .", data = train.joined, fullRank = TRUE, 
                     levelsOnly = TRUE)
train.joined.numeric <- data.frame(predict(dummies, newdata = train.joined)) %>%
  select(c(-female)) %>%
  na.omit()
train.joined.numeric$interested <- factor(train.joined.numeric$interested)
train.joined.numeric$interested <- 
  revalue(train.joined.numeric$interested, c("0" = "no", "1" = "yes"))

trainIndex <- createDataPartition(train.joined.numeric$male, p = .8, 
                                  list = FALSE, 
                                  times = 1)

eventTrain <- train.joined.numeric[ trainIndex,]
eventTest  <- train.joined.numeric[-trainIndex,]


fitControl <- trainControl(method = "repeatedcv", 
                           number = 10, 
                           savePredictions = TRUE,
                           summaryFunction=twoClassSummary, 
                           classProbs=TRUE)

rfFit <- train(interested ~ ., data = eventTrain,
                method = "rf", 
                trControl = fitControl, 
                verbose = FALSE)

## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.

event.rf.pred.prob <- predict(object=rfFit, 
                              eventTest, type='prob')

plotROC(revalue(eventTest$interested, c("no" = 0, "yes" = 1)),
        event.rf.pred.prob$yes, nPoints = 1000)

Inset: Some algorithms have built-in feature selection

rfImp <- varImp(rfFit, scale = FALSE)
setDT(rfImp$importance, keep.rownames = TRUE)[]

##                            rn    Overall
##   1:                  invited  0.7038939
##   2:                birthyear 83.9764275
##   3:                     male  8.8013301
##   4:                 timezone 26.8073843
##   5:                      lat 35.6837413
##  ---                                    
## 118:                  Tuesday  2.2122801
## 119:                Wednesday  2.9494567
## 120:    datetime_week_of_year 10.0579370
## 121: datetime_quarter_of_year  0.7604578
## 122:     datetime_day_of_year 21.2791525

rfImp$importance <- 
  rfImp$importance[order(rfImp$importance$Overall,
                         decreasing = TRUE),]
rfImp$importance$rn <- reorder(rfImp$importance$rn, rfImp$importance$Overall)
ggplot(rfImp$importance[1:7], aes(x = rn, y = Overall)) + 
  geom_bar(stat = "identity") +
  xlab("Feature") +
  ylab("Importance") +
  theme(axis.text.x=element_blank(),
        axis.ticks.x=element_blank()) +
  theme(text=element_text(size=16, family="Arial")) +
  scale_y_continuous(position="top") +
  #theme(axis.title.y = element_text(angle = 90, vjust = 1.08)) +
  coord_flip()