This notebook contains R code to accompany Chapter 5 of the book “Real-World Machine Learning”, by Henrik Brink, Joseph W. Richards, and Mark Fetherolf. The code was contributed by Paul Adamson.
NOTE: working directory should be set to this file’s location.
The data for the examples from this chapter are available at the Kaggle Event Recommendation Engine Challenge. The rules of the challenge prohibit redristribution of the data. To run the example code, login to Kaggle, download the data, and put the train.csv
, events.csv
, and users.csv
files in a folder named ../proprietary_data/
.
dataDir <- "../proprietary_data"
# join on user and event identifiers
#from users data, get user_id, birthyear, gender, timezone
users <- read.csv(file.path(dataDir, "users.csv"),
colClasses = c('integer64',
"NULL",
"integer",
"factor",
"NULL",
"NULL",
"integer"),
na.strings = c("16-Mar", "23-May", "NA", "None"))
# from events data, get event_id, lat, lng
# note don't get confused about the user_id in this dataset. it is the user that created the event and is different from the user_id in the train dataset
events <- fread(file.path(dataDir, "events.csv"),
sep=",",
colClasses = c("integer64",
rep("NULL",6),
"numeric",
"numeric",
rep("NULL",101)))
##
Read 0.0% of 3137972 rows
Read 25.2% of 3137972 rows
Read 51.0% of 3137972 rows
Read 78.7% of 3137972 rows
Read 3137972 rows and 3 (of 110) columns from 1.092 GB file in 00:00:08
# from train data, get user, event, invited, interested, not_interested
# rename user to user_id and event to event_id to match users and events datasets
# also need to drop observations without explicit interested or not_interested selection
# then drop not_interested since we only need one of the categorical variables
train <- read.csv(file.path(dataDir, "train.csv"),
colClasses = c("integer64",
"integer64",
"integer",
"NULL",
"integer",
"integer"),
col.names = c("user_id",
"event_id",
"invited",
"NULL",
"interested",
"not_interested")) %>%
subset(interested == 1 | not_interested == 1) %>%
select(-not_interested)
train.joined <- inner_join(train, users, by=c("user_id")) %>%
inner_join(events, by=c("event_id")) %>%
select(-c(user_id,event_id))
kable(head(train.joined,7))
invited | interested | birthyear | gender | timezone | lat | lng |
---|---|---|---|---|---|---|
0 | 1 | 1990 | male | 480 | 3.567 | 98.650 |
0 | 1 | 1998 | female | 240 | -20.162 | 57.499 |
0 | 1 | 1998 | female | 240 | -20.161 | 57.499 |
0 | 1 | 1998 | female | 240 | -20.245 | 57.482 |
0 | 1 | 1982 | male | -360 | NA | NA |
0 | 1 | 1991 | male | 270 | NA | NA |
0 | 1 | 1992 | female | 420 | -6.226 | 106.507 |
dummies <- dummyVars(" ~ .", data = train.joined, fullRank = TRUE,
levelsOnly = TRUE)
train.joined.numeric <- data.frame(predict(dummies, newdata = train.joined)) %>%
select(c(-female)) %>%
na.omit()
train.joined.numeric$interested <- factor(train.joined.numeric$interested)
train.joined.numeric$interested <-
revalue(train.joined.numeric$interested, c("0" = "no", "1" = "yes"))
rocCurve <- function(trueLabels, predictedProbs, nPoints = 100, posClass = 1) {
thr <- seq(0, 1, length = nPoints)
tpr <- numeric(nPoints)
fpr <- numeric(nPoints)
pos <- trueLabels == posClass
neg <- !pos
nPos <- sum(pos, na.rm = TRUE)
nNeg <- sum(neg, na.rm = TRUE)
for (i in 2:length(thr)) {
t <- thr[i]
meetOrExceedThreshold <- predictedProbs >= t
tpr[i] <- sum((meetOrExceedThreshold & pos), na.rm = TRUE)/nPos
fpr[i] <- sum((meetOrExceedThreshold & neg), na.rm = TRUE)/nNeg
}
duplicatedFPRs <- duplicated(fpr)
df <- data.frame(fpr = fpr[!duplicatedFPRs], tpr = tpr[!duplicatedFPRs],
thr = thr[!duplicatedFPRs])
return(df)
}
auc <- function(trueLabels, predictedProbs, nPoints = 100, posClass = 1) {
auc <- 0
df <- rocCurve(trueLabels = trueLabels, predictedProbs = predictedProbs,
nPoints = nPoints, posClass = posClass)
idx <- 3:length(df$fpr)
auc <- as.double((df$fpr[idx - 1] - df$fpr[idx]) %*% (df$tpr[idx] + df$tpr[idx -
1]))/2
return(auc)
}
plotROC <- function(trueLabels, predictedProbs, nPoints=100, posClass=1){
auc <- auc(trueLabels, predictedProbs, nPoints, posClass)
df <- rocCurve(trueLabels = trueLabels,
predictedProbs = predictedProbs,
nPoints = nPoints,
posClass = posClass)
ggplot(df,aes(x=fpr,y=tpr)) +
geom_step(direction="vh") +
scale_x_continuous(limits = c(0,1)) +
scale_y_continuous(limits = c(0,1)) +
labs(x = "False-positive rate",
y = "True-positive rate") +
annotate("text", x=.5,y=.5,label=paste0("Area under the curve: ",round(auc,digits = 2)))
}
trainIndex <- createDataPartition(train.joined.numeric$male, p = .8,
list = FALSE,
times = 1)
eventTrain <- train.joined.numeric[ trainIndex,]
eventTest <- train.joined.numeric[-trainIndex,]
fitControl <- trainControl(method = "repeatedcv",
number = 10,
savePredictions = TRUE,
summaryFunction=twoClassSummary,
classProbs=TRUE)
rfFit <- train(interested ~ ., data = eventTrain,
method = "rf",
trControl = fitControl,
verbose = FALSE)
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
event.rf.pred.prob <- predict(object=rfFit,
eventTest, type='prob')
plotROC(revalue(eventTest$interested, c("no" = 0, "yes" = 1)),
event.rf.pred.prob$yes, nPoints = 1000)
# from events data, get event_id, start_time, lat, lng
# note don't get confused about the user_id in this dataset. it is the user that created the event and is different from the user_id in the train dataset
# also, doing a semi_join so we only work with the event data in the train data
# frame
events <- fread(file.path(dataDir, "events.csv"),
sep=",",
colClasses = c("integer64",
"NULL",
'myDateTime',
rep("NULL",4),
"numeric",
"numeric",
rep("NULL",101))) %>%
semi_join(y = train) %>%
mutate(datetime = strptime(start_time, format="%Y-%m-%dT%H:%M:%OS", tz="UTC"),
datetime_year = as.numeric(substr(datetime, 1, 4)),
datetime_month_of_year = as.numeric(substr(datetime, 6, 7)),
datetime_day_of_month = as.numeric(substr(datetime, 9,10)),
datetime_hour_of_day = as.numeric(substr(datetime, 12, 13)),
datetime_minute_of_hour = as.numeric(substr(datetime, 15, 16)),
datetime_second_of_minute = as.numeric(substr(datetime, 18, 19)),
datetime_day_of_week = as.factor(weekdays(datetime)),
datetime_week_of_year = week(datetime),
datetime_quarter_of_year = quarter(datetime),
datetime_day_of_year = as.POSIXlt(datetime)$yday
) %>%
select(c(-datetime, -start_time))
##
Read 0.0% of 3137972 rows
Read 23.3% of 3137972 rows
Read 44.9% of 3137972 rows
Read 65.6% of 3137972 rows
Read 87.0% of 3137972 rows
Read 3137972 rows and 4 (of 110) columns from 1.092 GB file in 00:00:08
## Joining, by = "event_id"
kable(head(
select(events,
c(datetime_hour_of_day,
datetime_day_of_week,
datetime_day_of_month,
datetime_day_of_year,
datetime_month_of_year)),
5))
datetime_hour_of_day | datetime_day_of_week | datetime_day_of_month | datetime_day_of_year | datetime_month_of_year |
---|---|---|---|---|
13 | Friday | 26 | 299 | 10 |
7 | Saturday | 3 | 307 | 11 |
0 | Wednesday | 31 | 304 | 10 |
16 | Thursday | 1 | 305 | 11 |
0 | Friday | 30 | 334 | 11 |
kable(head(
select(events,
c(datetime_minute_of_hour,
datetime_second_of_minute,
datetime_year,
datetime_quarter_of_year,
datetime_week_of_year)),
5))
datetime_minute_of_hour | datetime_second_of_minute | datetime_year | datetime_quarter_of_year | datetime_week_of_year |
---|---|---|---|---|
30 | 0 | 2012 | 4 | 43 |
0 | 0 | 2012 | 4 | 45 |
0 | 0 | 2012 | 4 | 44 |
0 | 0 | 2012 | 4 | 44 |
0 | 0 | 2012 | 4 | 48 |
train.joined <- inner_join(train, users, by=c("user_id")) %>%
inner_join(events, by=c("event_id")) %>%
select(-c(user_id,event_id))
dummies <- dummyVars(" ~ .", data = train.joined, fullRank = TRUE,
levelsOnly = TRUE)
train.joined.numeric <- data.frame(predict(dummies, newdata = train.joined)) %>%
select(c(-female)) %>%
na.omit()
train.joined.numeric$interested <- factor(train.joined.numeric$interested)
train.joined.numeric$interested <-
revalue(train.joined.numeric$interested, c("0" = "no", "1" = "yes"))
trainIndex <- createDataPartition(train.joined.numeric$male, p = .8,
list = FALSE,
times = 1)
eventTrain <- train.joined.numeric[ trainIndex,]
eventTest <- train.joined.numeric[-trainIndex,]
fitControl <- trainControl(method = "repeatedcv",
number = 10,
savePredictions = TRUE,
summaryFunction=twoClassSummary,
classProbs=TRUE)
rfFit <- train(interested ~ ., data = eventTrain,
method = "rf",
trControl = fitControl,
verbose = FALSE)
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
event.rf.pred.prob <- predict(object=rfFit,
eventTest, type='prob')
plotROC(revalue(eventTest$interested, c("no" = 0, "yes" = 1)),
event.rf.pred.prob$yes, nPoints = 1000)
# from events data, get event_id, start_time, lat, lng, c_1 through c_other
# note don't get confused about the user_id in this dataset. it is the user that created the event and is different from the user_id in the train dataset
# also, doing a semi_join so we only work with the event data in the train data
# frame
events <- fread(file.path(dataDir, "events.csv"),
sep=",",
colClasses = c("integer64",
"NULL",
'myDateTime',
rep("NULL",4),
"numeric",
"numeric",
rep("integer",101))) %>%
semi_join(y = train) %>%
mutate(datetime = strptime(start_time, format="%Y-%m-%dT%H:%M:%OS", tz="UTC"),
datetime_year = as.numeric(substr(datetime, 1, 4)),
datetime_month_of_year = as.numeric(substr(datetime, 6, 7)),
datetime_day_of_month = as.numeric(substr(datetime, 9,10)),
datetime_hour_of_day = as.numeric(substr(datetime, 12, 13)),
datetime_minute_of_hour = as.numeric(substr(datetime, 15, 16)),
datetime_second_of_minute = as.numeric(substr(datetime, 18, 19)),
datetime_day_of_week = as.factor(weekdays(datetime)),
datetime_week_of_year = week(datetime),
datetime_quarter_of_year = quarter(datetime),
datetime_day_of_year = as.POSIXlt(datetime)$yday
) %>%
select(c(-datetime, -start_time))
##
Read 0.0% of 3137972 rows
Read 11.2% of 3137972 rows
Read 22.9% of 3137972 rows
Read 33.8% of 3137972 rows
Read 44.6% of 3137972 rows
Read 55.8% of 3137972 rows
Read 66.9% of 3137972 rows
Read 78.1% of 3137972 rows
Read 89.2% of 3137972 rows
Read 99.7% of 3137972 rows
Read 3137972 rows and 105 (of 110) columns from 1.092 GB file in 00:00:13
## Joining, by = "event_id"
rhandsontable(matrix(data=unlist(events[1:110,4:104]),nrow=11,ncol=10),
readOnly = TRUE)
train.joined <- inner_join(train, users, by=c("user_id")) %>%
inner_join(events, by=c("event_id")) %>%
select(-c(user_id,event_id))
dummies <- dummyVars(" ~ .", data = train.joined, fullRank = TRUE,
levelsOnly = TRUE)
train.joined.numeric <- data.frame(predict(dummies, newdata = train.joined)) %>%
select(c(-female)) %>%
na.omit()
train.joined.numeric$interested <- factor(train.joined.numeric$interested)
train.joined.numeric$interested <-
revalue(train.joined.numeric$interested, c("0" = "no", "1" = "yes"))
trainIndex <- createDataPartition(train.joined.numeric$male, p = .8,
list = FALSE,
times = 1)
eventTrain <- train.joined.numeric[ trainIndex,]
eventTest <- train.joined.numeric[-trainIndex,]
fitControl <- trainControl(method = "repeatedcv",
number = 10,
savePredictions = TRUE,
summaryFunction=twoClassSummary,
classProbs=TRUE)
rfFit <- train(interested ~ ., data = eventTrain,
method = "rf",
trControl = fitControl,
verbose = FALSE)
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
event.rf.pred.prob <- predict(object=rfFit,
eventTest, type='prob')
plotROC(revalue(eventTest$interested, c("no" = 0, "yes" = 1)),
event.rf.pred.prob$yes, nPoints = 1000)
rfImp <- varImp(rfFit, scale = FALSE)
setDT(rfImp$importance, keep.rownames = TRUE)[]
## rn Overall
## 1: invited 0.7038939
## 2: birthyear 83.9764275
## 3: male 8.8013301
## 4: timezone 26.8073843
## 5: lat 35.6837413
## ---
## 118: Tuesday 2.2122801
## 119: Wednesday 2.9494567
## 120: datetime_week_of_year 10.0579370
## 121: datetime_quarter_of_year 0.7604578
## 122: datetime_day_of_year 21.2791525
rfImp$importance <-
rfImp$importance[order(rfImp$importance$Overall,
decreasing = TRUE),]
rfImp$importance$rn <- reorder(rfImp$importance$rn, rfImp$importance$Overall)
ggplot(rfImp$importance[1:7], aes(x = rn, y = Overall)) +
geom_bar(stat = "identity") +
xlab("Feature") +
ylab("Importance") +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
theme(text=element_text(size=16, family="Arial")) +
scale_y_continuous(position="top") +
#theme(axis.title.y = element_text(angle = 90, vjust = 1.08)) +
coord_flip()