Hyperparameter Tuning Comparison and Tutorial¶
updated on 2020/04/29
# import libraries
import numpy as np
import pandas as pd
from scipy.stats import randint
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')
from sklearn.datasets import load_diabetes
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from xgboost import XGBRegressor
import pickle
import time
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import hyperopt.pyll.stochastic
# load regression data
dataset = load_diabetes()
X = dataset.data
y = dataset.target
print (X.shape, y.shape)
# plot feature distribution
fig, axes = plt.subplots(2, 5, figsize = (15, 6))
axes = axes.ravel()
for i in range(X.shape[1]):
    sns.violinplot(X[:,i], ax = axes[i])
    axes[i].set_xlabel('X[{}]'.format(i))
plt.tight_layout()
plt.show()
# plot target distribution
sns.violinplot(y)
plt.show()
# set up training specs
random_state = 42
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, shuffle=True, random_state = random_state)
cv = KFold(n_splits = 5, shuffle=True, random_state = random_state)
model = XGBRegressor(random_state=random_state, n_jobs = -2)
# default performance
s = cross_val_score(model, train_X, train_y, cv = cv, scoring = 'r2', n_jobs = -1)
print ('scores from the five validation folds: ', s, '\nmean score:', s.mean())
1. GridSearch¶
Grid search tries every parameter combination in the grid, one by one. Drawbacks:
1. Slow: the search is exhaustive (a quick count of the fits for the grid below appears right after this list).
2. Only discrete values: optimal values that fall between grid points are missed.
3. Combining the two above, prior knowledge of where the optimal combination sits is needed to keep the grid small enough to be efficient.
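As an illustration of point 1, the grid used in this section already contains 6 × 4 × 5 = 120 candidate combinations, each refit once per CV fold. A small sketch of that count, assuming the same grid and 5-fold CV as below:
# sketch: count the fits grid search will run for the grid defined below
from sklearn.model_selection import ParameterGrid
n_candidates = len(ParameterGrid({'learning_rate': [0.001, 0.003, 0.01, 0.03, 0.1, 0.3],
                                  'max_depth': np.linspace(2, 20, 4, dtype = int),
                                  'colsample_bytree': np.linspace(0.6, 1.0, 5)}))
print ('candidates:', n_candidates, '\ntotal fits with 5-fold CV:', n_candidates * 5)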
# set parameter grid
param_grid = {'learning_rate': [0.001, 0.003, 0.01, 0.03, 0.1, 0.3],
'max_depth': np.linspace(2, 20, 4, dtype = int),
'colsample_bytree': np.linspace(0.6, 1.0, 5)}
gs=GridSearchCV(model, param_grid, scoring='r2', n_jobs = -2, cv = cv)
gs.fit(train_X, train_y)
print('best_score: ', gs.best_score_, '\nbest_parameter: ', gs.best_params_)
gs_results_df = pd.DataFrame(np.transpose([gs.cv_results_['mean_test_score'],
gs.cv_results_['param_learning_rate'].data,
gs.cv_results_['param_colsample_bytree'].data,
gs.cv_results_['param_max_depth'].data]),
columns=['score', 'learning_rate', 'colsample_bytree', 'max_depth'])
gs_results_df.plot(subplots=True, figsize=(10,10))
plt.show()
As observed, the score does not follow a trend; each iteration simply corresponds to one point on the grid. Learning rate appears to have the most impact on the score, and as colsample_bytree increases, the impact of max_depth increases as well.
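One quick way to sanity-check this is to average the CV score over each value of every hyperparameter and see which one moves the mean the most (a sketch on the gs_results_df built above):
# sketch: mean CV score grouped by each hyperparameter
for col in ['learning_rate', 'colsample_bytree', 'max_depth']:
    print (gs_results_df.groupby(col)['score'].mean(), '\n')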
2. RandomSearch¶
Advantages:
1. All parameters vary on every draw, so no time is wasted stepping through values of unimportant parameters.
2. Generally faster.
3. Can sample continuous distributions (see the sampling sketch after this list).
Disadvantages:
1. Draws are independent steps that do not learn from previous results.
2. May fail to find the global optimum.
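Point 3 is what the distributions below provide: lists are drawn from uniformly, while scipy distributions are sampled through their .rvs() method. A small sketch of the kind of draws RandomizedSearchCV makes, using the same distributions as param_grid_rand below:
# sketch: example draws from the search distributions used below
rng = np.random.RandomState(random_state)
print ('learning_rate draw: ', rng.choice(np.logspace(-5, 0, 100)))
print ('max_depth draws: ', randint(2, 20).rvs(size = 5, random_state = rng))
print ('colsample_bytree draw: ', rng.choice(np.linspace(0.5, 1.0, 100)))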
param_grid_rand = {'learning_rate': np.logspace(-5, 0, 100),
'max_depth': randint(2, 20),
'colsample_bytree': np.linspace(0.5, 1.0, 100)}
rs = RandomizedSearchCV(model, param_grid_rand, scoring='r2',
n_iter = 120, cv = cv, n_jobs = -1, random_state = random_state)
rs.fit(train_X, train_y)
print('rs_best_score: ', rs.best_score_, '\nrs_best_parameter: ', rs.best_params_)
rs_result_df = pd.DataFrame(np.transpose([rs.cv_results_['mean_test_score'],
rs.cv_results_['param_colsample_bytree'].data,
rs.cv_results_['param_learning_rate'].data,
rs.cv_results_['param_max_depth'].data]),
columns = ['score', 'colsample_bytree', 'learning_rate', 'max_depth'])
rs_result_df.plot(subplots = True, figsize=(10,10))
plt.show()
Randomized search, with the same number of trials, improved upon the default model but did not outperform grid search here. Because the sampling is random, there is no trend in the score progression.
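A direct comparison of the two searches, with both fitted objects still in memory:
# compare best CV scores of grid search and randomized search
print ('grid search best r2: ', gs.best_score_, '\nrandomized search best r2: ', rs.best_score_)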
3. Hyperopt¶
# function to minimize: validation scores of a model
def objective(params, random_state = random_state, cv = cv, X = train_X, y = train_y):
    params = {'colsample_bytree': params['colsample_bytree'],
              'max_depth': int(params['max_depth']),
              'learning_rate': params['learning_rate']}
    model = XGBRegressor(random_state = random_state, **params)
    # negate the mean r2 so that minimizing this value maximizes r2
    score = -cross_val_score(model, X, y, cv = cv,
                             scoring = 'r2',
                             n_jobs = -1).mean()
    return score
TPE (Tree-structured Parzen Estimator)¶
Bayesian approach to optimization (a schematic sketch of this loop follows after the list):
- pick a random parameter x0
- evaluate the objective function F(xt)
- update a probability model P(F|xt)
- choose the next xt+1 based on the probability model and an acquisition function
- repeat steps 2-4 until a stopping criterion is satisfied.
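The sketch below is not hyperopt's TPE implementation; it only illustrates the loop above on a 1-D toy function, using a Gaussian-process surrogate and a lower-confidence-bound acquisition as stand-ins (TPE instead models the densities with Parzen estimators):
# schematic Bayesian optimization loop: surrogate model + acquisition function (toy example)
from sklearn.gaussian_process import GaussianProcessRegressor

def F(x):                                             # toy objective to minimize
    return (x - 2.0) ** 2

rng = np.random.RandomState(random_state)
X_obs = rng.uniform(-5, 5, size = (3, 1))             # step 1: random starting points
y_obs = F(X_obs).ravel()
for _ in range(20):
    gp = GaussianProcessRegressor().fit(X_obs, y_obs)  # step 3: probability model for F
    cand = rng.uniform(-5, 5, size = (200, 1))
    mu, sigma = gp.predict(cand, return_std = True)
    acq = mu - 1.96 * sigma                            # step 4: acquisition (lower confidence bound)
    x_next = cand[np.argmin(acq)].reshape(1, 1)
    X_obs = np.vstack([X_obs, x_next])                 # step 2: evaluate F at the chosen point
    y_obs = np.append(y_obs, F(x_next).ravel())
print ('estimated minimum near x =', X_obs[np.argmin(y_obs), 0])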
space = {'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1.0, 0.01),
'max_depth': hp.quniform('max_depth', 2, 20, 1),
'learning_rate': hp.loguniform('learning_rate', -5, 0)}
trials = Trials()
n_iter = 120
best = fmin(fn = objective,
space = space,
algo = tpe.suggest, #optimization algorithm
max_evals = n_iter, #max number of iterations
trials = trials, #logging
rstate = np.random.RandomState(random_state) #reproducibility
)
best
model = XGBRegressor(random_state = random_state,
n_jobs = -2,
colsample_bytree = best['colsample_bytree'],
max_depth = int(best['max_depth']),
learning_rate = best['learning_rate'])
model.fit(train_X, train_y)
tpe_test_score = mean_squared_error(test_y, model.predict(test_X))
objective(best)
tpe_results = np.array([[x['result']['loss'],
x['misc']['vals']['learning_rate'][0],
x['misc']['vals']['max_depth'][0],
x['misc']['vals']['colsample_bytree'][0]]
for x in trials.trials])
tpe_results_df = pd.DataFrame(tpe_results, columns = ['score', 'learning_rate','max_depth', 'colsample_bytree'])
tpe_results_df.plot(subplots = True, figsize = (10, 10))
plt.show()
The score clearly trends down as hyperopt learns more about the objective function space. What is more interesting is the decrease in oscillation across all hyperparameters, with learning_rate being the most obvious. This is consistent with the earlier observation that learning rate has the most impact, so narrowing it down reduces the loss the most.
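The downward trend is easier to see on the running best loss (a quick sketch using the tpe_results_df built above):
# running best (cumulative minimum) of the TPE loss over iterations
tpe_results_df['score'].cummin().plot(figsize = (10, 4))
plt.xlabel('iteration')
plt.ylabel('best loss so far')
plt.show()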
4. Hyperopt exploration¶
# hyperopt search setup
def objective(x):
    return {
        'loss': x ** 2,
        'status': STATUS_OK,
        # -- store other results like this
        'eval_time': time.time(),
        'other_stuff': {'type': None, 'value': [0, 1, 2]},
        # -- attachments are handled differently
        'attachments':
            {'time_module': pickle.dumps(time.time)}
    }
trials = Trials()
best = fmin(objective, # evaluation function
space = hp.uniform('x', -10, 10), # hyperparameter space
algo = tpe.suggest, # sampling method
max_evals = 100, # iterations
trials = trials) # where to save the result
print (best)
# about trials object
print (type(trials.trials))
trials.trials[0]
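Besides indexing trials.trials directly, the Trials object exposes a few convenience accessors (using the trials object fitted in the cell above):
# convenience accessors on the Trials object
print ('number of trials: ', len(trials.trials))
print ('best loss: ', min(trials.losses()))
print ('best trial vals: ', trials.best_trial['misc']['vals'])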
# minimum setup, only cares about the argmin
def objective(x):
    return {'loss': x ** 2, 'status': STATUS_OK}
best = fmin(objective,
space=hp.uniform('x', -10, 10),
algo=tpe.suggest,
max_evals=100)
print (best)
# check sampling result: quniform samples uniformly between the bounds
space = hp.quniform('n_estimators', 9.5, 3000.5, 1)
s = hyperopt.pyll.stochastic.sample(space)
print (s)
# qloguniform with log-scale bounds samples the same 10-3000 range on a log scale
space = hp.qloguniform('n_estimators', np.log(9.5), np.log(3000.5), 1)
s = hyperopt.pyll.stochastic.sample(space)
print (s)
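Drawing many samples from each space makes the difference visible; a sketch comparing quniform and qloguniform over the same range (the histogram shapes are the point, not the exact values):
# compare quniform and qloguniform by sampling each 1000 times
space_q = hp.quniform('n_estimators_q', 9.5, 3000.5, 1)
space_qlog = hp.qloguniform('n_estimators_qlog', np.log(9.5), np.log(3000.5), 1)
samples_q = [hyperopt.pyll.stochastic.sample(space_q) for _ in range(1000)]
samples_qlog = [hyperopt.pyll.stochastic.sample(space_qlog) for _ in range(1000)]
fig, axes = plt.subplots(1, 2, figsize = (12, 4))
axes[0].hist(samples_q, bins = 50)
axes[0].set_title('quniform')
axes[1].hist(samples_qlog, bins = 50)
axes[1].set_title('qloguniform')
plt.show()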
5. Reference¶
https://www.kaggle.com/ilialar/hyperparameters-tunning-with-hyperopt