algorithm - classification - exploration

Explores: - Logistic Regression - Random Forest - XGBoost

import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')

1. Data

Simulated conditional dataset: if x > 0.5, y = 1, else y = 0

# Noise-free target: a step that is False up to x = 0.5 and True above it.
x = np.linspace(0, 1, num=101)
y = x > 0.5
plt.plot(x, y, c='k')
plt.show()

# Noisy target: shift every x by uniform noise drawn from [-0.2, 0.2) before
# thresholding, then store the labels as 0/1 integers. The seed keeps the
# simulated labels identical from run to run.
np.random.seed(555)
noise = np.random.uniform(low=-0.2, high=0.2, size=x.size)
y = (x + noise > 0.5).astype(int)
plt.scatter(x, y)
plt.show()

2. Logistic Regression

from sklearn.linear_model import LogisticRegression, LinearRegression
# scikit-learn expects a 2-D feature matrix, so turn x into a single column.
X = x.reshape(-1, 1)

# Fit an ordinary linear regression to the 0/1 labels and overlay its
# predictions (red dashed line) on the data (black points).
model = LinearRegression().fit(X, y)
linear_pred = model.predict(X)

plt.scatter(X, y, c='k')
plt.plot(X, linear_pred, 'r--')
plt.show()
def fit_and_plot_classification(model, C):
    """Fit ``model`` on the notebook-level ``X``/``y`` and plot its probabilities.

    The data are read from the module-level names ``X`` and ``y``; they are
    not passed in as arguments.

    Parameters
    ----------
    model
        Classifier exposing ``fit`` and ``predict_proba``.
    C
        Value shown in the plot title (``"C = ..."``). It is only used for
        the title; the function does not set it on ``model``.

    Returns
    -------
    tuple
        ``(model, pred)`` where ``model`` is the fitted estimator and
        ``pred`` is the second column of ``model.predict_proba(X)``.
    """
    model.fit(X, y)

    # Second predict_proba column, drawn as the red dashed curve.
    second_class_proba = model.predict_proba(X)[:, 1]

    # Black points: observed labels.
    plt.scatter(X, y, c='k')
    plt.plot(X, second_class_proba, 'r--')
    plt.title('C = {}'.format(C))
    plt.show()

    return model, second_class_proba
# Sweep LogisticRegression's C parameter over several orders of magnitude and
# plot the fitted probability curve for each value. The helper's return value
# is not needed here, so it is discarded rather than unpacked into `_`
# (which would rebind `_` at notebook scope for no benefit).
for C in [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100, 300, 1000]:
    fit_and_plot_classification(LogisticRegression(C=C), C)

3. Random Forest

from sklearn.ensemble import RandomForestClassifier
# Bare expression: when run as the last statement of a notebook cell, the
# estimator's repr (its default hyperparameters) is echoed as cell output.
RandomForestClassifier()
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators='warn',
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
# Random forest with default settings and a fixed seed; plot the second
# predict_proba column (red dashed) over the data (black points).
model = RandomForestClassifier(random_state=123).fit(X, y)
forest_proba = model.predict_proba(X)[:, 1]

plt.scatter(X, y, c='k')
plt.plot(X, forest_proba, 'r--')
plt.show()
# Changing up n_estimators. The seed is pinned to the same value the other
# random-forest cells use, so reruns reproduce the same plots and the curves
# differ only because of the number of trees.
for n in [50, 100, 200, 400, 800]:
    print('no. of trees:', n)
    model = RandomForestClassifier(n_estimators=n, random_state=123)
    model.fit(X, y)
    plt.scatter(X, y, c='k')
    plt.plot(X, model.predict_proba(X)[:, 1], 'r--')
    plt.show()
no. of trees: 50
no. of trees: 100
no. of trees: 200
no. of trees: 400
no. of trees: 800
# Sweep min_samples_leaf with a fixed seed and plot the fitted probability
# curve for each setting.
for n in (2, 5, 15, 20):
    print('minimum leaf size:', n)
    model = RandomForestClassifier(min_samples_leaf=n, random_state=123).fit(X, y)

    plt.scatter(X, y, c='k')
    plt.plot(X, model.predict_proba(X)[:, 1], 'r--')
    plt.show()
minimum leaf size: 2
minimum leaf size: 5
minimum leaf size: 15
minimum leaf size: 20

4. XGBoost

from xgboost import XGBClassifier
# Bare expression: when run as the last statement of a notebook cell, the
# classifier's repr (its default hyperparameters) is echoed as cell output.
XGBClassifier()
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)
# XGBoost classifier with default settings; plot the second predict_proba
# column (red dashed) over the data (black points).
model = XGBClassifier().fit(X, y)
boosted_proba = model.predict_proba(X)[:, 1]

plt.scatter(X, y, c='k')
plt.plot(X, boosted_proba, 'r--')
plt.show()
# Refit with n_estimators=10. The fit call is left as a bare trailing
# expression so that a notebook cell echoes the fitted estimator's parameters.
model = XGBClassifier(n_estimators = 10)
model.fit(X, y)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=10, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)
# Sweep n_estimators, leaving every other XGBoost setting at its default, and
# plot the fitted probability curve for each value.
for n in (10, 20, 50, 200):
    print('n_estimators:', n)
    model = XGBClassifier(n_estimators=n).fit(X, y)
    pred = model.predict_proba(X)[:, 1]  # second predict_proba column

    plt.scatter(X, y, c='k')
    plt.plot(X, pred, 'r--')
    plt.show()
n_estimators: 10
n_estimators: 20
n_estimators: 50
n_estimators: 200
# Sweep max_depth, leaving every other XGBoost setting at its default, and
# plot the fitted probability curve for each value.
for n in (1, 3, 5, 7, 9):
    print('max_depth:', n)
    model = XGBClassifier(max_depth=n).fit(X, y)
    pred = model.predict_proba(X)[:, 1]  # second predict_proba column

    plt.scatter(X, y, c='k')
    plt.plot(X, pred, 'r--')
    plt.show()
max_depth: 1
max_depth: 3
max_depth: 5
max_depth: 7
max_depth: 9