library(gbm)
library(MLmetrics)
library(vip)
library(dplyr)
library(ggplot2)
library(caret)
#remotes::install_github("rstudio/reticulate")
library(reticulate)
#Data used:
## Wine Quality for regression:
# https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/
# For later use, define a custom ggplot theme for all plots generated in R code:
# For later use: a custom ggplot theme applied to every plot generated in R.
# NOTE(review): newer ggplot2 (>= 3.4.0) prefers `linewidth` over `size` for
# element_rect()/element_line(); kept as `size` to match the original session.
ggplot2::theme_set(
  theme(
    plot.background = element_rect(fill = 'white'),
    plot.title = element_text(hjust = .5),
    panel.background = element_rect(color = 'black', fill = 'white'),
    panel.border = element_rect(color = 'black', fill = NA, size = .9),
    panel.grid = element_line(color = 'lightgrey', size = .2)
  )
)
# Download the red and white wine quality datasets from the UCI ML repository.
uci_base <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality"
# The red wine dataset:
download.file(
  url = paste0(uci_base, "/winequality-red.csv"),
  destfile = "~/red_wine_qual.csv"
)
# The white wine dataset:
download.file(
  url = paste0(uci_base, "/winequality-white.csv"),
  destfile = "~/white_wine_qual.csv"
)
# Read in [via read.csv()] both datasets inside a call to rbind(), so they are
# read and immediately stacked atop one another into one dataset.
all_wine <- rbind(
  read.csv(
    # File path/name of the red-wine data:
    file = "~/red_wine_qual.csv",
    # The downloaded data are semicolon-separated, not comma-separated:
    sep = ";",
    # Read character variables as strings, rather than levels of a factor
    # (if the two data frames had different factor levels, rbind would fail):
    stringsAsFactors = FALSE
  ),
  # Same for the white-wine dataset:
  read.csv("~/white_wine_qual.csv", sep = ";", stringsAsFactors = FALSE)
) # close the rbind
# Check out the structure of the combined dataset:
str(all_wine)
## 'data.frame': 6497 obs. of 12 variables:
## $ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : int 5 5 5 6 5 5 5 7 7 5 ...
We’ll take a quick look at the distribution of the response variable
# Quick look at the distribution of the response variable `quality`.
ggplot(all_wine) +
  geom_histogram(
    aes(x = quality),
    fill = 'grey', color = 'black',
    # One bin per observed quality score, since the response is effectively
    # categorical (integer scores):
    bins = length(unique(all_wine$quality))
  )
set.seed(1) # make the random split reproducible
# Sample 80% of the row indices at random (seq_len is safe even for 0 rows):
train.idx <- sample(seq_len(nrow(all_wine)), size = nrow(all_wine) * .8)
wine_train <- all_wine[train.idx, ]  # subset by the sampled row indices
wine_test <- all_wine[-train.idx, ]  # negate the indexer to take all other rows
# Save these out as csv files on your computer so that we can later
# access them directly when we run the python code (much later)
write.csv(all_wine, "~/all_wine_quality.csv",
          # don't add a column enumerating the rows:
          row.names = FALSE)
write.csv(wine_train, "~/wine_train.csv", row.names = FALSE)
write.csv(wine_test, "~/wine_test.csv", row.names = FALSE)
Using `gbm` to predict wine quality scores. For now we will hold hyperparameters constant (rather than tuning them). However, the hyperparameters often tuned for a GBM fit with a `gbm::gbm` model are: `shrinkage`, the proportion by which to scale down the new predictions from each sequential tree; `n.minobsinnode`, the minimum size of a terminal leaf or node (values greater than 1 will help attenuate overfitting); and `interaction.depth`, the number of splits each tree is allowed to make. `n.trees`, `cv.folds`, or `bag.fraction` usually don't affect model performance, but can interfere with our ability to find the optimal model if we set a bad value. Use `?gbm`
for details. More on hyperparameter tuning below.
wine_gbm <- gbm(
  # Model formula in the format "response ~ predictors"; "." = all other columns:
  formula = quality ~ .,
  # The training data:
  data = wine_train,
  # Assumed distribution of the response variable (squared-error loss):
  distribution = 'gaussian',
  # The number of trees to build in total:
  n.trees = 500,
  # The proportion (from 0 to 1) of the residual predictions from each new tree
  # used to increment our cumulative prediction (the learning rate):
  shrinkage = .1, # this is the default
  # The fraction of the training data randomly sampled to fit each new tree:
  bag.fraction = .7,
  # The number of cross-validation folds used to assess each boosting iteration:
  cv.folds = 5
)
# Use gbm's gbm.perf() function to find the best boosting iteration.
gbm.perf(
  # Supply the fitted model:
  wine_gbm,
  # Compute performance with respect to the cross-validation folds:
  method = 'cv',
  # Visualize the error at each boosting iteration (at each tree):
  plot.it = TRUE
)
## [1] 487
Note: this also printed out a number: 487. That is the boosting iteration, or the number of the tree at which cross-validation error from the 5-fold cross-validation was minimized.
We'll use two functions imported from the `MLmetrics` package: `R2_Score` and `MSE`.
# The boosting iteration that minimized 5-fold CV error (no plot this time):
best_tree <- gbm.perf(wine_gbm, plot.it = FALSE, method = 'cv')
# Predict once on the held-out test set at the best iteration, then reuse the
# predictions for both metrics (the original called predict() twice):
gbm_test_pred <- predict(wine_gbm, newdata = wine_test, n.trees = best_tree)
# Set up a little data frame with a column for the R2 and MSE computed with the
# test data so that it prints out in a clear way. R2_Score()/MSE() come from
# MLmetrics and take the predicted and true values of the response.
cross_val_score <- data.frame(
  R2 = R2_Score(y_pred = gbm_test_pred, y_true = wine_test$quality),
  MSE = MSE(y_pred = gbm_test_pred, y_true = wine_test$quality)
)
cross_val_score
## R2 MSE
## 1 0.3408006 0.5014888
# Fit an ordinary linear regression as a baseline for comparison:
reg_mod <- lm(quality ~ ., data = wine_train)
# Score the model with the test data (predict once, reuse for both metrics):
lm_test_pred <- predict(object = reg_mod, newdata = wine_test)
data.frame(
  R2 = R2_Score(y_pred = lm_test_pred, y_true = wine_test$quality),
  MSE = MSE(y_pred = lm_test_pred, y_true = wine_test$quality)
)
## R2 MSE
## 1 0.2889758 0.5409148
Using `vip::vi_permute()` to compute cross-validated permutation variable importance.
# (required by `vip`) Write a function with which to get our predicted values.
# For compatibility, the arguments of the function must be 'object' and 'newdata'
# Prediction wrapper required by vip::vi_permute(): vip calls this with the
# fitted model and new data; we pin n.trees to the best CV iteration found above.
# (For compatibility, the arguments must be named 'object' and 'newdata'.)
predict_from_best_tree <- function(object, newdata) {
  predict(object, newdata, n.trees = best_tree)
}
# Calculate cross-validated permutation variable importance.
permutation_imp <- vi_permute(
  # 1. Supply the fitted model (any model type works):
  object = wine_gbm,
  # 2. Names of the predictors whose importance we wish to compute:
  feature_names = wine_train %>% select(-quality) %>% names(),
  # 3. Supply the TEST data, instead of the TRAIN data (a small hack here!):
  train = wine_test,
  # 4. Supply the corresponding response variable:
  target = wine_test$quality,
  # 5. The metric with which to assess model performance before/after permutation:
  metric = 'mse',
  # 6. Define importance as a ratio of MSE with vs. without permutation:
  type = 'ratio',
  # 7. For stability, permute each variable 100 times:
  nsim = 100,
  keep = TRUE, # retain per-permutation results (needed for the boxplots below)
  pred_wrapper = predict_from_best_tree # our wrapper around predict()
)
# Print the table of variable importances, and the St.Dev from the 100 permutations
permutation_imp
## # A tibble: 11 x 3
## Variable Importance StDev
## <chr> <dbl> <dbl>
## 1 fixed.acidity 1.01 0.00213
## 2 volatile.acidity 1.15 0.0151
## 3 citric.acid 1.02 0.00586
## 4 residual.sugar 1.06 0.00934
## 5 chlorides 1.00 0.00439
## 6 free.sulfur.dioxide 1.12 0.0150
## 7 total.sulfur.dioxide 1.11 0.0133
## 8 density 1.00 0.00258
## 9 pH 1.01 0.00323
## 10 sulphates 1.05 0.00680
## 11 alcohol 1.46 0.0328
# Visualize permutation importance: one boxplot per predictor, spanning the
# 100 permutations kept by vi_permute().
vip(
  permutation_imp,                    # the result of vi_permute()
  num_features = ncol(wine_test) - 1, # plot every predictor (all but `quality`)
  mapping = aes(fill = Variable),     # passed along to ggplot()
  geom = 'boxplot'                    # boxplots (other options in ?vip)
)
# Compare with the coefficient table of the baseline linear model:
summary(reg_mod)
##
## Call:
## lm(formula = quality ~ ., data = wine_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.5014 -0.4661 -0.0418 0.4705 2.9675
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.479e+01 1.297e+01 4.996 6.03e-07 ***
## fixed.acidity 7.323e-02 1.717e-02 4.265 2.03e-05 ***
## volatile.acidity -1.302e+00 8.553e-02 -15.218 < 2e-16 ***
## citric.acid -4.306e-02 8.866e-02 -0.486 0.627
## residual.sugar 4.679e-02 5.676e-03 8.244 < 2e-16 ***
## chlorides -5.217e-01 3.644e-01 -1.432 0.152
## free.sulfur.dioxide 4.929e-03 8.311e-04 5.930 3.22e-09 ***
## total.sulfur.dioxide -2.455e-03 3.100e-04 -7.917 2.95e-15 ***
## density -6.408e+01 1.323e+01 -4.842 1.32e-06 ***
## pH 4.854e-01 1.005e-01 4.831 1.40e-06 ***
## sulphates 7.425e-01 8.425e-02 8.813 < 2e-16 ***
## alcohol 2.536e-01 1.831e-02 13.848 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7356 on 5185 degrees of freedom
## Multiple R-squared: 0.2922, Adjusted R-squared: 0.2907
## F-statistic: 194.6 on 11 and 5185 DF, p-value: < 2.2e-16
# Same permutation-importance procedure for the linear model, scored on the
# test set. Compute the importances first, then plot (easier to read than the
# original fully-nested call).
reg_mod_imp <- vi_permute(
  object = reg_mod,
  feature_names = wine_train %>% select(-quality) %>% names(),
  train = wine_test,
  target = wine_test$quality,
  metric = 'mse',
  type = 'ratio',
  nsim = 100,
  keep = TRUE, # retain per-permutation results for the boxplots
  pred_wrapper = function(object, newdata) predict(object, newdata)
)
vip(
  object = reg_mod_imp,
  num_features = ncol(wine_test) - 1,
  mapping = aes(fill = Variable),
  geom = 'boxplot'
)
Hyperparameter tuning with `caret`: we use the `trainControl` function to pass along to the `train` function various options that will guide our hyperparameter tuning process. We also supply `train` a 'tune grid' or 'hyperparameter grid', which specifies all of the combinations of hyperparameter settings to train the model on (of which one will minimize cross-validation error and determine the settings of the final model). In the `train` function, we supply our data, the formula of the model, and also instruct `caret` of which model we are using to do the modeling. Importantly, `caret::train` doesn't really fit a model itself; it requires us to specify (as an argument supplied to `method`) a model from another R package that it knows how to look for. For example, `caret` has in-built codes/names for many of the models built by the many published R packages, so that, for example, if we say `method = "rf"`, `caret` knows to build a random forest model by calling (under the hood) `randomForest()` from the package `randomForest`. The list of models available can be found in the `caret::train` documentation. Below, we fit a GBM from the same `gbm` package as above, using `method = "gbm"`.
# Options controlling how caret::train assesses each candidate model:
train_ctrl <- trainControl(
  # Set the method for calculating prediction error to be 'cross-validation':
  method = 'cv',
  # Set the number of cross-validation folds to be 5:
  number = 5
)
# Define the values of the hyperparameters over which to tune the model.
# expand.grid() builds one row per combination (3 * 2 * 2 * 1 = 12 candidates).
param_grid <- expand.grid(
  interaction.depth = c(1, 2, 3), # splits allowed per tree
  n.minobsinnode = c(5, 10),      # minimum observations in a terminal node
  shrinkage = c(.1, .8),          # learning rate
  # Note, if we don't supply caret ALL of the hyperparameters it is looking for,
  # we would get an error. So even though we won't tune over multiple values
  # for n.trees, we'll supply it here anyway:
  n.trees = 100 # to save time, set to 100 (may want a larger value in practice)
)
# Now run train() with the above settings:
caret_gbm <- caret::train(
  form = quality ~ ., # supply the formula to model
  data = wine_train,  # supply the data
  method = 'gbm',     # request a gradient boosted machine (the gbm package)
  trControl = train_ctrl, # the CV settings defined above
  tuneGrid = param_grid,  # the hyperparameter grid defined above
  verbose = FALSE # don't print results at every boosting iteration!
)
Let’s look at the output
caret_gbm
## Stochastic Gradient Boosting
##
## 5197 samples
## 11 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 4157, 4157, 4158, 4158, 4158
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.minobsinnode RMSE Rsquared MAE
## 0.1 1 5 0.7277587 0.3107758 0.5795559
## 0.1 1 10 0.7282995 0.3101599 0.5795152
## 0.1 2 5 0.7027605 0.3547411 0.5536905
## 0.1 2 10 0.7038352 0.3526403 0.5555297
## 0.1 3 5 0.6946449 0.3681259 0.5462374
## 0.1 3 10 0.6943401 0.3691544 0.5463812
## 0.8 1 5 0.7260031 0.3145063 0.5674162
## 0.8 1 10 0.7242945 0.3163422 0.5672335
## 0.8 2 5 0.7343802 0.3193877 0.5703222
## 0.8 2 10 0.7343035 0.3155772 0.5723601
## 0.8 3 5 0.7557079 0.3089381 0.5894348
## 0.8 3 10 0.7553857 0.3041155 0.5849650
##
## Tuning parameter 'n.trees' was held constant at a value of 100
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 100, interaction.depth =
## 3, shrinkage = 0.1 and n.minobsinnode = 10.
As the final line of the output tells us, the best values for our hyperparameters are 3 splits per tree, allowing no less than 10 observations/subjects in a terminal node, and a shrinkage of 0.1. Note, our hyperparameter grid was very imprecise (we only supplied a handful of values for each hyperparameter), so we likely haven't yet found the absolute best model, but that's okay for now.
We can also extract this information directly:
# The row of the tuning grid that won (lowest cross-validated RMSE):
best_params <- caret_gbm$bestTune
best_params
## n.trees interaction.depth shrinkage n.minobsinnode
## 6 100 3 0.1 10
And the best model itself for use with other functions such as predict
or vip
final_model <- caret_gbm$finalModel
# Score the tuned model on the test set with the same functions we used above.
# Predict once at the tuned number of trees, then reuse for both metrics
# (the original called predict() twice with identical arguments):
final_pred <- predict(object = final_model,
                      newdata = wine_test,
                      n.trees = best_params$n.trees)
data.frame(
  R2 = R2_Score(y_pred = final_pred, y_true = wine_test$quality),
  MSE = MSE(y_pred = final_pred, y_true = wine_test$quality)
)
## R2 MSE
## 1 0.3714385 0.478181
By trying out different values for the hyperparameters, we found a model that is better at predicting the quality of the held-out data than the default values did.
python
This gives us a chance to re-run the analysis in environments that might have python
, but not R
, installed. In addition, we can compare the results between the two (more specifically the packages used in each language), which we'll expect to be similar, but not exactly the same due to stochasticity in the analysis or implementation differences.
# Third-party imports for the python re-analysis.
# (The original imported numpy twice; the duplicate has been removed and the
# sklearn imports grouped together.)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
Read in the data
# Load the combined red+white wine dataset we wrote out from the R session.
wine_df = pd.read_csv('~/all_wine_quality.csv')
Check the variable names
wine_df.columns  # confirm the column names survived the CSV round trip
## Index(['fixed.acidity', 'volatile.acidity', 'citric.acid', 'residual.sugar',
## 'chlorides', 'free.sulfur.dioxide', 'total.sulfur.dioxide', 'density',
## 'pH', 'sulphates', 'alcohol', 'quality'],
## dtype='object')
Looks like everything ran in as expected.
X = wine_df.drop(columns = 'quality') # select all columns except 'quality'
y = wine_df['quality'] # select just quality
# BUG FIX: train_test_split returns (X_train, X_test, y_train, y_test) in that
# order. The original unpacked it as X_train, y_train, X_test, y_test, which
# silently swapped y_train with X_test.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1, test_size = .2)
But wait, we already saved out our train and test data! The above is just for demonstration. For reproducibility, we’ll actually read in the same exact train and test split of the data that we used and wrote out above.
# Reproducibility: load the exact train/test split written out from R.
train_df = pd.read_csv("~/wine_train.csv")
test_df = pd.read_csv("~/wine_test.csv")
# Peel the response column off each frame with pop() (removes the column and
# returns it); what remains are the predictors.
y_train = train_df.pop('quality')
X_train = train_df
y_test = test_df.pop('quality')
X_test = test_df
# Check the dimensions for sanity
print(X_train.shape)
## (5197, 11)
y_test.shape  # number of held-out test rows
## (1300,)
Hyperparameter tuning with `sklearn`. Just like in `caret`, we conduct a sequential and exhaustive (to the extent that we make it so) search of the possible hyperparameters by creating a parameter "grid" over which to iteratively build models. There are two main steps to tuning hyperparameters with `sklearn`: first, choose an estimator implemented in `sklearn`, such as support vector machines, logistic regressions, random forests, neural nets, etc.; second, use `GridSearchCV()`, which will take as an argument the grid-like object containing all of the hyperparameter values (specific to the type of estimator) that we would like to assess. Here we define the parameter grid in the way compatible with the `GridSearchCV` function. The parameter grid is going to be a type of `python` object called a `dict` or "dictionary". It will be nothing more than a character string representing the name of a hyperparameter, separated by a `:` from a list of possible values, then another character string for the name of the next hyperparameter, and so on. Like so:
# Hyperparameter grid for GridSearchCV. Keys must match the estimator's
# constructor arguments; each value is a list of candidate settings.
params = {
    'n_estimators': [100],         # number of boosting trees (cf. n.trees)
    'max_depth': [1, 3],           # splits per tree (cf. interaction.depth)
    'min_samples_split': [5, 10],  # min samples required to split a node
                                   # (rough analogue of n.minobsinnode)
    'learning_rate': [.1, .8],     # step size (cf. shrinkage)
    'loss': ['ls'],                # least-squares loss, i.e. minimize MSE
}
Now we “instantiate” the model, by running sklearn
’s own GBM function, but we won’t supply any hyperparameters or data just yet.
# "Instantiate" the estimator: fix the RNG seed, but supply no data or
# hyperparameters yet (GridSearchCV will set those).
GBM = GradientBoostingRegressor(random_state = 1) # setting a seed for reproducibility here as well
Now we supply our model to GridSearchCV
, along with the parameters to again instantiate an object, this time a grid-search-cross-validation object. Note, we are just “instantiating” again because we have not supplied data. Here, we supply those options to control the grid search, such as how many folds in our cross-validation tests, the estimator to use, and the parameters over which to tune (that we defined above).
# Bundle the estimator, the hyperparameter grid, and 5-fold CV into one object.
gbm_gridCV = GridSearchCV(estimator = GBM, param_grid = params, cv = 5)
Now we have created an object, called gbm_gridCV
, which contains a model and a parameter grid. It is created with the sklearn
function GridSearchCV
, and so it has its own functions or methods
that we can call from it. One is fit
, which we will finally use to supply the data, and fit each model, as in caret::train
.
# Fit every hyperparameter combination on the training data; the GridSearchCV
# object updates itself in place (no reassignment needed).
gbm_gridCV.fit(X_train, y_train)
## GridSearchCV(cv=5, estimator=GradientBoostingRegressor(random_state=1),
## param_grid={'learning_rate': [0.1, 0.8], 'loss': ['ls'],
## 'max_depth': [1, 3], 'min_samples_split': [5, 10],
## 'n_estimators': [100]})
Note, we didn’t have to save the above line or assign a new object using “=
”. With the GridSearchCV
class of objects, it simply updates itself when we run fit
.
Now we can view the results from running the grid search cross-validation.
# Pull the full grid-search results (a dict of arrays) into a DataFrame.
# It holds a lot of info (see the GridSearchCV documentation); keep only the
# tuned hyperparameter values plus the mean CV score.
results_table = gbm_gridCV.cv_results_
cols_of_interest = [
    'param_learning_rate',
    'param_max_depth',
    'param_min_samples_split',
    'mean_test_score',
]
pd.DataFrame(results_table)[cols_of_interest]
## param_learning_rate param_max_depth param_min_samples_split mean_test_score
## 0 0.1 1 5 0.296276
## 1 0.1 1 10 0.296276
## 2 0.1 3 5 0.378865
## 3 0.1 3 10 0.379663
## 4 0.8 1 5 0.323059
## 5 0.8 1 10 0.323059
## 6 0.8 3 5 0.297600
## 7 0.8 3 10 0.295266
Note that sklearn
created these columns for us automatically, by just prepending the name of each parameter with "param_". As we can see, the best iteration occurred when the learning rate was set to 0.1, the maximum depth was 3, and the minimum number of samples required to split a node was 10.
To access these values directly, you can use gbm_gridCV.best_params_
gbm_gridCV.best_params_  # the winning hyperparameter combination
## {'learning_rate': 0.1, 'loss': 'ls', 'max_depth': 3, 'min_samples_split': 10, 'n_estimators': 100}
And as above, we can score our final model with the completely unseen test data. To do so, we can use another function like those above that take the predicted and true values of a given response, to calculate error. Here, when we call the predict
method directly from the gbm_gridCV
object that we fit, it will generate predictions automatically from the best model in the grid search based on the mean cross-validation score.
# Score the tuned model on the held-out test data. gbm_gridCV.predict()
# automatically uses the best estimator found by the grid search; predict once
# and reuse (the original recomputed predictions for each metric).
test_pred = gbm_gridCV.predict(X_test)
pd.DataFrame({'R2': [r2_score(y_test, test_pred)],
              'MSE': [mean_squared_error(y_test, test_pred)]})
## R2 MSE
## 0 0.39737 0.458453
sklearn
Finally, sklearn
has the same capacity to generate permutation variable importance that are calculated on held-out test data.
# Permutation importance computed on the held-out TEST set: 100 shuffles per
# feature, using the best estimator from the grid search.
result = permutation_importance(
    gbm_gridCV.best_estimator_,
    X_test,
    y_test,
    n_repeats=100,
    random_state=1,
    n_jobs=2,
)
# Order features by mean importance so the boxplot reads bottom-to-top.
sorted_idx = result.importances_mean.argsort()
fig = plt.figure(figsize=(12, 6))
plt.boxplot(
    result.importances[sorted_idx].T,
    vert=False,
    labels=np.array(X_test.columns)[sorted_idx],
)
plt.title("Permutation Importance (test set)")
fig.tight_layout()
plt.show()