Hyperparameter Tuning Comparison and Tutorial

updated on 2020/04/29

# import libraries
import numpy as np
import pandas as pd
from scipy.stats import randint

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')

from sklearn.datasets import load_diabetes
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from xgboost import XGBRegressor

import pickle
import time
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import hyperopt.pyll.stochastic
# load regression data
dataset = load_diabetes()
X = dataset.data
y = dataset.target
print (X.shape, y.shape)
(442, 10) (442,)
# plot feature distribution
fig, axes = plt.subplots(2, 5, figsize = (15, 6))
axes = axes.ravel()

for i in range(X.shape[1]):
    sns.violinplot(X[:,i], ax = axes[i])
    axes[i].set_xlabel('X[{}]'.format(i))

plt.tight_layout()
plt.show()
# plot target distribution
sns.violinplot(y)
plt.show()
# set up training specs
random_state = 42
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, shuffle=True, random_state = random_state)
cv = KFold(n_splits = 5, shuffle=True, random_state = random_state)
model = XGBRegressor(random_state=random_state, n_jobs = -2)
# default performance
s = cross_val_score(model, train_X, train_y, cv = cv, scoring = 'r2', n_jobs = -1)
print ('score from five validation sets: ',s, '\nmean score:', s.mean())
score from five validation sets:  [0.36663975 0.40737173 0.26931265 0.41021941 0.42056251] 
mean score: 0.3748212106452563

1. GridSearch

Grid search tries the parameter combinations one by one, exhaustively. Its drawbacks:

1. Slow: the search is exhaustive, so cost multiplies with every parameter added (a quick count is sketched below).
2. Discrete values only: optimal values that fall between grid points are missed.
3. Combining the two, prior knowledge of where the optimal combination sits is required for the search to be efficient.
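As a rough sanity check on point 1 (a back-of-the-envelope count based on the grid and the 5-fold CV defined in the next cells, not measured timing):

# every grid combination is fitted once per CV fold
n_fits = 6 * 4 * 5 * 5  # 6 learning rates x 4 depths x 5 column fractions x 5 folds
print (n_fits)  # 600 model fits for one exhaustive pass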

# set parameter grid
param_grid = {'learning_rate': [0.001, 0.003, 0.01, 0.03, 0.1, 0.3],
              'max_depth': np.linspace(2, 20, 4, dtype = int),
              'colsample_bytree': np.linspace(0.6, 1.0, 5)}
gs = GridSearchCV(model, param_grid, scoring='r2', n_jobs = -2, cv = cv)
gs.fit(train_X, train_y)
print('best_score: ', gs.best_score_, '\nbest_parameter: ', gs.best_params_)
[19:00:41] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
best_score:  0.42352938503280263 
best_parameter:  {'colsample_bytree': 0.6, 'learning_rate': 0.1, 'max_depth': 2}
gs_results_df = pd.DataFrame(np.transpose([gs.cv_results_['mean_test_score'], 
                                         gs.cv_results_['param_learning_rate'].data,
                                         gs.cv_results_['param_colsample_bytree'].data,
                                         gs.cv_results_['param_max_depth'].data]), 
                                         columns=['score', 'learning_rate', 'colsample_bytree', 'max_depth'])
gs_results_df.plot(subplots=True, figsize=(10,10))
plt.show()

As observed, the score doesn't follow a trend; each iteration simply corresponds to one point on the grid. Learning rate appears to have the largest impact on the score, and as colsample_bytree increases, the impact of max_depth grows as well.

2. RandomSearch

Advantages:

1. Every parameter varies on each draw, so no time is wasted sweeping values of an unimportant parameter.
2. Generally faster for the same budget.
3. Can sample continuous distributions, not just discrete grid points.

Disadvantages:

1. Draws are independent steps that don't learn from previous results.
2. May fail to find the global optimum.

The sketch below shows what the sampler actually draws.
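Illustrative only: this mirrors how RandomizedSearchCV draws candidates, sampling uniformly from a list of values and calling .rvs() on a scipy distribution; it is not part of the search itself.

# draw a few example configurations from the distributions used below
rng = np.random.RandomState(random_state)
for _ in range(3):
    print ({'learning_rate': rng.choice(np.logspace(-5, 0, 100)),
            'max_depth': int(randint(2, 20).rvs(random_state=rng)),
            'colsample_bytree': rng.choice(np.linspace(0.5, 1.0, 100))})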

param_grid_rand = {'learning_rate': np.logspace(-5, 0, 100),
                  'max_depth': randint(2, 20),
                  'colsample_bytree': np.linspace(0.5, 1.0, 100)}
rs = RandomizedSearchCV(model, param_grid_rand, scoring='r2',
                        n_iter = 120, cv = cv, n_jobs = -1, random_state = random_state)
rs.fit(train_X, train_y)

print('rs_best_score: ', rs.best_score_, '\nrs_best_parameter: ', rs.best_params_)
[18:55:51] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
rs_best_score:  0.3910418215681818 
rs_best_parameter:  {'colsample_bytree': 0.5404040404040404, 'learning_rate': 0.2477076355991709, 'max_depth': 2}
rs_result_df = pd.DataFrame(np.transpose([rs.cv_results_['mean_test_score'], 
                                          rs.cv_results_['param_colsample_bytree'].data,
                                         rs.cv_results_['param_learning_rate'].data,
                                         rs.cv_results_['param_max_depth'].data]),
                           columns = ['score', 'colsample_bytree', 'learning_rate', 'max_depth'])
rs_result_df.plot(subplots = True, figsize=(10,10))
plt.show()

With the same number of trials, randomized search improved upon the default model but didn't outperform grid search here. Since the sampling is random, there is no trend in score progression.

3. Hyperopt

# function to minimize: negated validation score of a model
def objective(params, random_state = random_state, cv = cv, X = train_X, y = train_y):
    # hyperopt passes max_depth as a float, so cast it back to int
    params = {'colsample_bytree': params['colsample_bytree'],
             'max_depth': int(params['max_depth']),
             'learning_rate': params['learning_rate']}
    model = XGBRegressor(random_state = random_state, **params)
    # negate r2 because fmin minimizes the objective
    score = -cross_val_score(model, X, y, cv=cv, 
                             scoring = 'r2', 
                             n_jobs = -1).mean()
    return score

TPE (Tree-structured Parzen Estimator)

A Bayesian approach to optimization:

  1. Sample a random initial point x0.
  2. Evaluate the objective function F(xt).
  3. Fit a probability model P(F|xt) to the observations so far.
  4. Choose the next point xt+1 based on the probability model and an acquisition function.
  5. Repeat steps 2-4 until a stopping criterion is satisfied.

TPE specifically inverts step 3: rather than modeling P(F|x) directly, it splits the trials observed so far into a best fraction and the rest, estimates the parameter densities l(x) and g(x) of the two groups, and proposes the candidate that maximizes l(x)/g(x); the toy sketch below illustrates that idea.
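A toy sketch of the l(x)/g(x) idea (illustrative only, not hyperopt's implementation; the 1-D quadratic objective and all variable names here are made up):

from scipy.stats import gaussian_kde

# pretend we've already evaluated 50 random points of loss = x^2
rng = np.random.RandomState(random_state)
xs = rng.uniform(-10, 10, 50)
losses = xs ** 2
# split into the best 25% of trials and the rest
cut = np.percentile(losses, 25)
l_kde = gaussian_kde(xs[losses <= cut])   # density of "good" points
g_kde = gaussian_kde(xs[losses > cut])    # density of the rest
# propose the candidate where good points are most likely relative to bad
candidates = rng.uniform(-10, 10, 100)
print (candidates[np.argmax(l_kde(candidates) / g_kde(candidates))])

With the intuition in place, define the search space for the diabetes model: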
space = {'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1.0, 0.01),
        'max_depth': hp.quniform('max_depth', 2, 20, 1),
        'learning_rate': hp.loguniform('learning_rate', -5, 0)}
trials = Trials()
n_iter = 120
best = fmin(fn = objective,
           space = space,
           algo = tpe.suggest, #optimization algorithm
           max_evals = n_iter, #max number of iterations
           trials = trials, #logging
           rstate = np.random.RandomState(random_state) #reproducibility
           )
100%|██████████| 120/120 [00:28<00:00,  4.21it/s, best loss: -0.43141847283141077]
best
{'colsample_bytree': 0.58,
 'learning_rate': 0.06440882735652695,
 'max_depth': 2.0}
model = XGBRegressor(random_state = random_state, 
                     n_jobs = -2,
                     colsample_bytree = best['colsample_bytree'],
                     max_depth = int(best['max_depth']),
                     learning_rate = best['learning_rate'])
model.fit(train_X, train_y)
[18:27:12] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.58, gamma=0,
             importance_type='gain', learning_rate=0.06440882735652695,
             max_delta_step=0, max_depth=2, min_child_weight=1, missing=None,
             n_estimators=100, n_jobs=-2, nthread=None, objective='reg:linear',
             random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)
tpe_test_score = mean_squared_error(test_y, model.predict(test_X))
objective(best)
-0.43141847283141077
tpe_results = np.array([[x['result']['loss'],
                        x['misc']['vals']['learning_rate'][0],
                        x['misc']['vals']['max_depth'][0],
                        x['misc']['vals']['colsample_bytree'][0]]
                       for x in trials.trials])
tpe_results_df = pd.DataFrame(tpe_results, columns = ['score', 'learning_rate','max_depth', 'colsample_bytree'])
tpe_results_df.plot(subplots = True, figsize = (10, 10))
plt.show()

The score clearly trends down as hyperopt learns more about the objective function's landscape. More interesting is the shrinking oscillation across all hyperparameters, most visibly in learning_rate. This is consistent with the earlier observation that learning rate has the most impact on the score, so narrowing it down reduces the loss the most.

4. Hyperopt Exploration

# hyperopt search setup
def objective(x):
    return {
        'loss': x ** 2,
        'status': STATUS_OK,
        # -- store other results like this
        'eval_time': time.time(),
        'other_stuff': {'type': None, 'value': [0, 1, 2]},
        # -- attachments are handled differently
        'attachments':
            {'time_module': pickle.dumps(time.time)}
        }
trials = Trials()
best = fmin(objective, # evaluation function
            space = hp.uniform('x', -10, 10), # hyperparameter space
            algo = tpe.suggest, # sampling method
            max_evals = 100, # iterations
            trials = trials) # where to save the result

print (best)
100%|██████████| 100/100 [00:00<00:00, 283.95it/s, best loss: 1.2068264187180965e-05]
{'x': -0.003473940728795033}
# about trials object
print (type(trials.trials))
trials.trials[0]
<class 'list'>
{'state': 2,
 'tid': 0,
 'spec': None,
 'result': {'loss': 86.87642335949964,
  'status': 'ok',
  'eval_time': 1588213480.033823,
  'other_stuff': {'type': None, 'value': [0, 1, 2]}},
 'misc': {'tid': 0,
  'cmd': ('domain_attachment', 'FMinIter_Domain'),
  'workdir': None,
  'idxs': {'x': [0]},
  'vals': {'x': [-9.320752295791346]}},
 'exp_key': None,
 'owner': None,
 'version': 0,
 'book_time': datetime.datetime(2020, 4, 30, 2, 24, 40, 33000),
 'refresh_time': datetime.datetime(2020, 4, 30, 2, 24, 40, 33000)}
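The 'time_module' attachment stored by the objective can be read back through the Trials API (a minimal sketch following the pattern in the hyperopt documentation):

# retrieve the attachment of the first trial and unpickle it
msg = trials.trial_attachments(trials.trials[0])['time_module']
time_module = pickle.loads(msg)
print (time_module)

# the best trial is also tracked directly on the Trials object
print (trials.best_trial['result']['loss'])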
# minimum setup, only cares about the argmin
def objective(x):
    return {'loss': x ** 2, 'status': STATUS_OK }

best = fmin(objective,
            space=hp.uniform('x', -10, 10),
            algo=tpe.suggest,
            max_evals=100)

print (best)
100%|██████████| 100/100 [00:00<00:00, 293.24it/s, best loss: 4.9954120966897265e-06]
{'x': 0.002235041855690789}
# check sampling result: quniform draws uniformly from [9.5, 3000.5]
# and rounds to a multiple of 1
space = hp.quniform('n_estimators', 9.5, 3000.5, 1)
s = hyperopt.pyll.stochastic.sample(space)
print (s)
2837.0
# log-transformed bounds make quniform sample on the log scale
# (roughly [2.25, 8.01]); the draw must be exponentiated before use
space = hp.quniform('n_estimators', np.log(9.5), np.log(3000.5), 1)
s = hyperopt.pyll.stochastic.sample(space)
print (s)
7.0
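The second cell above samples on the log scale, so the draw would need to be exponentiated before use. hp.qloguniform does both steps at once (a small check with the same bounds as above):

# qloguniform returns round(exp(uniform(low, high)) / q) * q,
# i.e. log-uniform samples already in the original 9.5-3000.5 range
space = hp.qloguniform('n_estimators', np.log(9.5), np.log(3000.5), 1)
print (hyperopt.pyll.stochastic.sample(space))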

5. Reference

https://www.kaggle.com/ilialar/hyperparameters-tunning-with-hyperopt