This notebook contains R code to accompany Chapter 5 of the book “Real-World Machine Learning”, by Henrik Brink, Joseph W. Richards, and Mark Fetherolf. The code was contributed by Paul Adamson.

NOTE: working directory should be set to this file’s location.

Data for event-recommendation model

The data for the examples from this chapter are available at the Kaggle Event Recommendation Engine Challenge. The rules of the challenge prohibit redristribution of the data. To run the example code, login to Kaggle, download the data, and put the train.csv, events.csv, and users.csv files in a folder named ../proprietary_data/.

Figure 5.2 A sample of the datasets used for training the event-recommendations model

dataDir <- "../proprietary_data"
# join on user and event identifiers
#from users data, get user_id, birthyear, gender, timezone
users <- read.csv(file.path(dataDir, "users.csv"),
                  colClasses = c('integer64', 
                                 "NULL", 
                                 "integer",
                                 "factor",
                                 "NULL",
                                 "NULL",
                                 "integer"),
                  na.strings = c("16-Mar", "23-May", "NA", "None"))

# from events data, get event_id, lat, lng
# note don't get confused about the user_id in this dataset. it is the user that created the event and is different from the user_id in the train dataset
events <- fread(file.path(dataDir, "events.csv"),
                sep=",",
                colClasses = c("integer64",
                               rep("NULL",6),
                               "numeric",
                               "numeric",
                               rep("NULL",101)))
## 
Read 0.0% of 3137972 rows
Read 25.2% of 3137972 rows
Read 51.0% of 3137972 rows
Read 78.7% of 3137972 rows
Read 3137972 rows and 3 (of 110) columns from 1.092 GB file in 00:00:08
# from train data, get user, event, invited, interested, not_interested 
# rename user to user_id and event to event_id to match users and events datasets
# also need to drop observations without explicit interested or not_interested selection
# then drop not_interested since we only need one of the categorical variables
train <- read.csv(file.path(dataDir, "train.csv"),
                  colClasses = c("integer64",
                                 "integer64",
                                 "integer",
                                 "NULL",
                                 "integer",
                                 "integer"),
                  col.names = c("user_id",
                                "event_id",
                                "invited",
                                "NULL",
                                "interested",
                                "not_interested")) %>%
  subset(interested == 1 | not_interested == 1) %>%
  select(-not_interested)
train.joined <- inner_join(train, users, by=c("user_id")) %>%
  inner_join(events, by=c("event_id")) %>%
  select(-c(user_id,event_id))
kable(head(train.joined,7))
invited interested birthyear gender timezone lat lng
0 1 1990 male 480 3.567 98.650
0 1 1998 female 240 -20.162 57.499
0 1 1998 female 240 -20.161 57.499
0 1 1998 female 240 -20.245 57.482
0 1 1982 male -360 NA NA
0 1 1991 male 270 NA NA
0 1 1992 female 420 -6.226 106.507

Figure 5.3 Cross-validated ROC curve and AUC metric for the simple event- recommendation model

dummies <- dummyVars(" ~ .", data = train.joined, fullRank = TRUE, 
                     levelsOnly = TRUE)
train.joined.numeric <- data.frame(predict(dummies, newdata = train.joined)) %>%
  select(c(-female)) %>%
  na.omit()
train.joined.numeric$interested <- factor(train.joined.numeric$interested)
train.joined.numeric$interested <- 
  revalue(train.joined.numeric$interested, c("0" = "no", "1" = "yes"))
rocCurve <- function(trueLabels, predictedProbs, nPoints = 100, posClass = 1) {
    thr <- seq(0, 1, length = nPoints)
    tpr <- numeric(nPoints)
    fpr <- numeric(nPoints)
    pos <- trueLabels == posClass
    neg <- !pos
    nPos <- sum(pos, na.rm = TRUE)
    nNeg <- sum(neg, na.rm = TRUE)
    for (i in 2:length(thr)) {
        t <- thr[i]
        meetOrExceedThreshold <- predictedProbs >= t
        tpr[i] <- sum((meetOrExceedThreshold & pos), na.rm = TRUE)/nPos
        fpr[i] <- sum((meetOrExceedThreshold & neg), na.rm = TRUE)/nNeg
    }
    duplicatedFPRs <- duplicated(fpr)
    df <- data.frame(fpr = fpr[!duplicatedFPRs], tpr = tpr[!duplicatedFPRs], 
        thr = thr[!duplicatedFPRs])
    return(df)
}
auc <- function(trueLabels, predictedProbs, nPoints = 100, posClass = 1) {
    auc <- 0
    df <- rocCurve(trueLabels = trueLabels, predictedProbs = predictedProbs, 
        nPoints = nPoints, posClass = posClass)
    idx <- 3:length(df$fpr)
    auc <- as.double((df$fpr[idx - 1] - df$fpr[idx]) %*% (df$tpr[idx] + df$tpr[idx - 
        1]))/2
    return(auc)
}
plotROC <- function(trueLabels, predictedProbs, nPoints=100, posClass=1){
  auc <- auc(trueLabels, predictedProbs, nPoints, posClass)
  df <- rocCurve(trueLabels = trueLabels,
                 predictedProbs = predictedProbs,
                 nPoints = nPoints,
                 posClass = posClass)
  ggplot(df,aes(x=fpr,y=tpr)) +
    geom_step(direction="vh") +
    scale_x_continuous(limits = c(0,1)) +
    scale_y_continuous(limits = c(0,1)) +
    labs(x = "False-positive rate",
         y = "True-positive rate") +
    annotate("text", x=.5,y=.5,label=paste0("Area under the curve: ",round(auc,digits = 2)))
  
}
trainIndex <- createDataPartition(train.joined.numeric$male, p = .8, 
                                  list = FALSE, 
                                  times = 1)

eventTrain <- train.joined.numeric[ trainIndex,]
eventTest  <- train.joined.numeric[-trainIndex,]


fitControl <- trainControl(method = "repeatedcv", 
                           number = 10, 
                           savePredictions = TRUE,
                           summaryFunction=twoClassSummary, 
                           classProbs=TRUE)

rfFit <- train(interested ~ ., data = eventTrain,
                method = "rf", 
                trControl = fitControl, 
                verbose = FALSE)
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
event.rf.pred.prob <- predict(object=rfFit, 
                              eventTest, type='prob')
plotROC(revalue(eventTest$interested, c("no" = 0, "yes" = 1)),
        event.rf.pred.prob$yes, nPoints = 1000)

Figure 5.4 Additional date-time columns extracted from the timestamp column for the event-recommendation dataset

# from events data, get event_id, start_time, lat, lng
# note don't get confused about the user_id in this dataset. it is the user that created the event and is different from the user_id in the train dataset
# also, doing a semi_join so we only work with the event data in the train data
# frame

events <- fread(file.path(dataDir, "events.csv"),
                sep=",",
                colClasses = c("integer64",
                               "NULL",
                               'myDateTime',
                               rep("NULL",4),
                               "numeric",
                               "numeric",
                               rep("NULL",101))) %>%
  semi_join(y = train) %>%
  mutate(datetime = strptime(start_time, format="%Y-%m-%dT%H:%M:%OS", tz="UTC"),
         datetime_year = as.numeric(substr(datetime, 1, 4)), 
         datetime_month_of_year = as.numeric(substr(datetime, 6, 7)), 
         datetime_day_of_month = as.numeric(substr(datetime, 9,10)), 
         datetime_hour_of_day = as.numeric(substr(datetime, 12, 13)), 
         datetime_minute_of_hour = as.numeric(substr(datetime, 15, 16)),
         datetime_second_of_minute = as.numeric(substr(datetime, 18, 19)),
         datetime_day_of_week = as.factor(weekdays(datetime)),
         datetime_week_of_year = week(datetime),
         datetime_quarter_of_year = quarter(datetime),
         datetime_day_of_year = as.POSIXlt(datetime)$yday
  ) %>%
  select(c(-datetime, -start_time))
## 
Read 0.0% of 3137972 rows
Read 23.3% of 3137972 rows
Read 44.9% of 3137972 rows
Read 65.6% of 3137972 rows
Read 87.0% of 3137972 rows
Read 3137972 rows and 4 (of 110) columns from 1.092 GB file in 00:00:08
## Joining, by = "event_id"
kable(head(
  select(events,
         c(datetime_hour_of_day, 
           datetime_day_of_week,
           datetime_day_of_month,
           datetime_day_of_year,
           datetime_month_of_year)),
  5))
datetime_hour_of_day datetime_day_of_week datetime_day_of_month datetime_day_of_year datetime_month_of_year
13 Friday 26 299 10
7 Saturday 3 307 11
0 Wednesday 31 304 10
16 Thursday 1 305 11
0 Friday 30 334 11
kable(head(
  select(events,
         c(datetime_minute_of_hour,
           datetime_second_of_minute,
           datetime_year,
           datetime_quarter_of_year,
           datetime_week_of_year)), 
  5))
datetime_minute_of_hour datetime_second_of_minute datetime_year datetime_quarter_of_year datetime_week_of_year
30 0 2012 4 43
0 0 2012 4 45
0 0 2012 4 44
0 0 2012 4 44
0 0 2012 4 48

Figure 5.5 Cross-validated ROC curve for model including date-time features

train.joined <- inner_join(train, users, by=c("user_id")) %>%
  inner_join(events, by=c("event_id")) %>%
  select(-c(user_id,event_id))

dummies <- dummyVars(" ~ .", data = train.joined, fullRank = TRUE, 
                     levelsOnly = TRUE)
train.joined.numeric <- data.frame(predict(dummies, newdata = train.joined)) %>%
  select(c(-female)) %>%
  na.omit()
train.joined.numeric$interested <- factor(train.joined.numeric$interested)
train.joined.numeric$interested <- 
  revalue(train.joined.numeric$interested, c("0" = "no", "1" = "yes"))

trainIndex <- createDataPartition(train.joined.numeric$male, p = .8, 
                                  list = FALSE, 
                                  times = 1)

eventTrain <- train.joined.numeric[ trainIndex,]
eventTest  <- train.joined.numeric[-trainIndex,]


fitControl <- trainControl(method = "repeatedcv", 
                           number = 10, 
                           savePredictions = TRUE,
                           summaryFunction=twoClassSummary, 
                           classProbs=TRUE)

rfFit <- train(interested ~ ., data = eventTrain,
                method = "rf", 
                trControl = fitControl, 
                verbose = FALSE)
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
event.rf.pred.prob <- predict(object=rfFit, 
                              eventTest, type='prob')
plotROC(revalue(eventTest$interested, c("no" = 0, "yes" = 1)),
        event.rf.pred.prob$yes, nPoints = 1000)

Figure 5.6 A slice of the bag-of-words data for the event-recommendation example. These numbers are the counts of the top-occurring words in the event descriptions. A large fraction of the cells contain 0, so we call the dataset sparse.

# from events data, get event_id, start_time, lat, lng, c_1 through c_other
# note don't get confused about the user_id in this dataset. it is the user that created the event and is different from the user_id in the train dataset
# also, doing a semi_join so we only work with the event data in the train data
# frame

events <- fread(file.path(dataDir, "events.csv"),
                sep=",",
                colClasses = c("integer64",
                               "NULL",
                               'myDateTime',
                               rep("NULL",4),
                               "numeric",
                               "numeric",
                               rep("integer",101))) %>%
  semi_join(y = train) %>%
  mutate(datetime = strptime(start_time, format="%Y-%m-%dT%H:%M:%OS", tz="UTC"),
         datetime_year = as.numeric(substr(datetime, 1, 4)), 
         datetime_month_of_year = as.numeric(substr(datetime, 6, 7)), 
         datetime_day_of_month = as.numeric(substr(datetime, 9,10)), 
         datetime_hour_of_day = as.numeric(substr(datetime, 12, 13)), 
         datetime_minute_of_hour = as.numeric(substr(datetime, 15, 16)),
         datetime_second_of_minute = as.numeric(substr(datetime, 18, 19)),
         datetime_day_of_week = as.factor(weekdays(datetime)),
         datetime_week_of_year = week(datetime),
         datetime_quarter_of_year = quarter(datetime),
         datetime_day_of_year = as.POSIXlt(datetime)$yday
  ) %>%
  select(c(-datetime, -start_time))
## 
Read 0.0% of 3137972 rows
Read 11.2% of 3137972 rows
Read 22.9% of 3137972 rows
Read 33.8% of 3137972 rows
Read 44.6% of 3137972 rows
Read 55.8% of 3137972 rows
Read 66.9% of 3137972 rows
Read 78.1% of 3137972 rows
Read 89.2% of 3137972 rows
Read 99.7% of 3137972 rows
Read 3137972 rows and 105 (of 110) columns from 1.092 GB file in 00:00:13
## Joining, by = "event_id"
rhandsontable(matrix(data=unlist(events[1:110,4:104]),nrow=11,ncol=10),
              readOnly = TRUE)

Figure 5.7 Cross-validated ROC curve for full model including date-time and text features

train.joined <- inner_join(train, users, by=c("user_id")) %>%
  inner_join(events, by=c("event_id")) %>%
  select(-c(user_id,event_id))

dummies <- dummyVars(" ~ .", data = train.joined, fullRank = TRUE, 
                     levelsOnly = TRUE)
train.joined.numeric <- data.frame(predict(dummies, newdata = train.joined)) %>%
  select(c(-female)) %>%
  na.omit()
train.joined.numeric$interested <- factor(train.joined.numeric$interested)
train.joined.numeric$interested <- 
  revalue(train.joined.numeric$interested, c("0" = "no", "1" = "yes"))

trainIndex <- createDataPartition(train.joined.numeric$male, p = .8, 
                                  list = FALSE, 
                                  times = 1)

eventTrain <- train.joined.numeric[ trainIndex,]
eventTest  <- train.joined.numeric[-trainIndex,]


fitControl <- trainControl(method = "repeatedcv", 
                           number = 10, 
                           savePredictions = TRUE,
                           summaryFunction=twoClassSummary, 
                           classProbs=TRUE)

rfFit <- train(interested ~ ., data = eventTrain,
                method = "rf", 
                trControl = fitControl, 
                verbose = FALSE)
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
event.rf.pred.prob <- predict(object=rfFit, 
                              eventTest, type='prob')
plotROC(revalue(eventTest$interested, c("no" = 0, "yes" = 1)),
        event.rf.pred.prob$yes, nPoints = 1000)

Inset: Some algorithms have built-in feature selection

rfImp <- varImp(rfFit, scale = FALSE)
setDT(rfImp$importance, keep.rownames = TRUE)[]
##                            rn    Overall
##   1:                  invited  0.7038939
##   2:                birthyear 83.9764275
##   3:                     male  8.8013301
##   4:                 timezone 26.8073843
##   5:                      lat 35.6837413
##  ---                                    
## 118:                  Tuesday  2.2122801
## 119:                Wednesday  2.9494567
## 120:    datetime_week_of_year 10.0579370
## 121: datetime_quarter_of_year  0.7604578
## 122:     datetime_day_of_year 21.2791525
rfImp$importance <- 
  rfImp$importance[order(rfImp$importance$Overall,
                         decreasing = TRUE),]
rfImp$importance$rn <- reorder(rfImp$importance$rn, rfImp$importance$Overall)
ggplot(rfImp$importance[1:7], aes(x = rn, y = Overall)) + 
  geom_bar(stat = "identity") +
  xlab("Feature") +
  ylab("Importance") +
  theme(axis.text.x=element_blank(),
        axis.ticks.x=element_blank()) +
  theme(text=element_text(size=16, family="Arial")) +
  scale_y_continuous(position="top") +
  #theme(axis.title.y = element_text(angle = 90, vjust = 1.08)) +
  coord_flip()