This is our second-generation model. The first-generation analysis was summarized on October 17, 2017, and a lot has happened since then. This analysis was done on June 9, 2018, and we have achieved a major improvement in the results. Please see the Summary below.
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
from imblearn.over_sampling import SMOTE
from tabulate import tabulate
import itertools
from copy import copy
from sklearn.externals import joblib
import tpot
from tpot import TPOTClassifier
from tpot.operator_utils import TPOTOperatorClassFactory
from tpot.config.classifier import classifier_config_dict
from tpot.builtins import StackingEstimator, CombineDFs
from tpot import TPOTRegressor
from tpot.export_utils import export_pipeline, generate_import_code, _indent, generate_pipeline_code, get_by_name
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import ShuffleSplit
from sklearn.kernel_approximation import Nystroem
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.pipeline import make_pipeline, make_union
from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve, auc, classification_report
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error
from sklearn.metrics import f1_score, precision_score, recall_score
def LoadData():
    global feature_names, response_name, n_features
    model_full = pd.read_csv("H:/ML/BlogExperiments/model-13-1.csv")
    response_name = ['Altitude']
    feature_names = ['BoxRatio', 'Thrust', 'Velocity', 'OnBalRun', 'vwapGain']
    n_features = len(feature_names)
    mask = feature_names + response_name
    model = model_full[mask]
    print('Model dataset:\n', model.head(5))
    # print('\nDescription of model dataset:\n', model[feature_names].describe(include='all'))
    # Correlation_plot(model)
    X = model[feature_names].values
    y = model[response_name].values.ravel()
    # Balance the classes with SMOTE oversampling (fit_sample is named fit_resample in newer imblearn releases)
    sm = SMOTE(random_state=12)
    X_resampled, y_resampled = sm.fit_sample(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X_resampled,
                                                        y_resampled,
                                                        test_size=0.3,
                                                        random_state=0)
    print('Size of resampled data:')
    print(' train shape... ', X_train.shape, y_train.shape)
    print(' test shape.... ', X_test.shape, y_test.shape)
    return X_train, y_train, X_test, y_test, X_resampled, y_resampled
def ROC_Curve():
    # Plot the test-set ROC curve; false_positive_rate, true_positive_rate and roc_auc are globals computed below
    plt.figure()
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(false_positive_rate, true_positive_rate, color='darkorange', label='Random Forest')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve (area = %0.7f)' % roc_auc)
    plt.legend(loc='best')
    plt.show()
    #path = "C:/Users/Charles/Desktop/ROC Curve.png"
    #plt.savefig(path)
    #plt.cla()
    #plt.close('all')
def Plot_predictor_importance(best_model, feature_names):
    # Note: because the model is fit on a feature union (see the exported pipeline below), feature_importances_
    # can have more entries than there are named predictors (the five raw features plus the Nystroem components).
    feature_importance = best_model.feature_importances_
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    y_pos = np.arange(sorted_idx.shape[0]) + .5
    fig, ax = plt.subplots()
    ax.barh(y_pos,
            feature_importance[sorted_idx],
            align='center',
            color='green',
            ecolor='black',
            height=0.5)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(feature_names)
    ax.invert_yaxis()
    ax.set_xlabel('Relative Importance')
    ax.set_title('Predictor Importance')
    plt.show()
def Print_model_metrics(exported_pipeline, X_train, X_test, cm_train, cm_test, auc_train, auc_test):
    # Metrics derived directly from the confusion matrices
    true_negative_train = cm_train[0, 0]
    true_positive_train = cm_train[1, 1]
    false_negative_train = cm_train[1, 0]
    false_positive_train = cm_train[0, 1]
    total_train = true_negative_train + true_positive_train + false_negative_train + false_positive_train
    accuracy_train = (true_positive_train + true_negative_train)/total_train
    precision_train = (true_positive_train)/(true_positive_train + false_positive_train)
    recall_train = (true_positive_train)/(true_positive_train + false_negative_train)
    misclassification_rate_train = (false_positive_train + false_negative_train)/total_train
    F1_train = (2*true_positive_train)/(2*true_positive_train + false_positive_train + false_negative_train)
    true_negative_test = cm_test[0, 0]
    true_positive_test = cm_test[1, 1]
    false_negative_test = cm_test[1, 0]
    false_positive_test = cm_test[0, 1]
    total_test = true_negative_test + true_positive_test + false_negative_test + false_positive_test
    accuracy_test = (true_positive_test + true_negative_test)/total_test
    precision_test = (true_positive_test)/(true_positive_test + false_positive_test)
    recall_test = (true_positive_test)/(true_positive_test + false_negative_test)
    misclassification_rate_test = (false_positive_test + false_negative_test)/total_test
    F1_test = (2*true_positive_test)/(2*true_positive_test + false_positive_test + false_negative_test)
    # Metrics recomputed with sklearn from the pipeline's predictions (y_train and y_test are globals)
    y_predict_train = exported_pipeline.predict(X_train)
    y_predict_test = exported_pipeline.predict(X_test)
    mse_train = metrics.mean_squared_error(y_train, y_predict_train)
    mse_test = metrics.mean_squared_error(y_test, y_predict_test)
    logloss_train = metrics.log_loss(y_train, y_predict_train)
    logloss_test = metrics.log_loss(y_test, y_predict_test)
    accuracy_train = metrics.accuracy_score(y_train, y_predict_train)
    accuracy_test = metrics.accuracy_score(y_test, y_predict_test)
    precision_test = precision_score(y_test, y_predict_test, average='binary')
    recall_test = recall_score(y_test, y_predict_test, average='binary')
    F1_train = metrics.f1_score(y_train, y_predict_train)
    F1_test = metrics.f1_score(y_test, y_predict_test)
    r2_train = metrics.r2_score(y_train, y_predict_train)
    r2_test = metrics.r2_score(y_test, y_predict_test)
    auc_train = metrics.roc_auc_score(y_train, y_predict_train)
    auc_test = metrics.roc_auc_score(y_test, y_predict_test)
    header = ["Metric", "Train", "Test"]
    table = [["accuracy", accuracy_train, accuracy_test],
             ["precision", precision_train, precision_test],
             ["recall", recall_train, recall_test],
             ["misclassification rate", misclassification_rate_train, misclassification_rate_test],
             ["F1", F1_train, F1_test],
             ["r2", r2_train, r2_test],
             ["AUC", auc_train, auc_test],
             ["mse", mse_train, mse_test],
             ["logloss", logloss_train, logloss_test]
            ]
    print(tabulate(table, header, tablefmt="fancy_grid"))
def Plot_confusion_matrix(cm, classes, title, normalize=False):
    cmap = plt.cm.Blues
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
X_train, y_train, X_test, y_test, X, y = LoadData()
Model dataset:
BoxRatio Thrust Velocity OnBalRun vwapGain Altitude
0 0.215 -0.499 1.654 4.250 1.559 0
1 0.133 -0.493 1.002 2.777 0.947 0
2 0.198 -0.489 0.741 1.139 0.458 0
3 0.169 -0.264 2.056 4.856 2.093 0
4 0.346 -0.473 0.711 1.925 0.559 0
Size of resampled data:
train shape... (23060, 5) (23060,)
test shape.... (9884, 5) (9884,)
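The pipeline defined below has the structure TPOT produces when it exports its best pipeline; the search itself is not shown in this post. For readers who want to reproduce that step, here is a minimal sketch using the TPOTClassifier imported above. The generations, population_size, scoring and cv values are illustrative assumptions, not the settings of the actual run:
# Illustrative TPOT search (assumed settings, not the ones that produced the exported pipeline below)
tpot_search = TPOTClassifier(generations=10,
                             population_size=50,
                             scoring='accuracy',
                             cv=5,
                             random_state=0,
                             verbosity=2)
tpot_search.fit(X_train, y_train)
print(tpot_search.score(X_test, y_test))
tpot_search.export('tpot_exported_pipeline.py')   # writes code equivalent to exported_pipeline below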
# Exported pipeline: a feature union of the raw predictors and a Nystroem kernel map, feeding a random forest
exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        Nystroem(gamma=0.45,
                 kernel="linear",
                 n_components=9)
    ),
    RandomForestClassifier(bootstrap=False,
                           criterion="gini",
                           max_features=0.25,
                           min_samples_leaf=1,
                           min_samples_split=2,
                           n_estimators=100)
)
exported_pipeline.fit(X_train, y_train)
print('\nScore: \n', exported_pipeline.score(X_test, y_test))
best_model = exported_pipeline._final_estimator
print("best model:\n", best_model)
joblib.dump(exported_pipeline, 'C:/sm/trained_models/tpot_model.pkl')
y_predicted_train = exported_pipeline.predict(X_train)
y_predicted_test = exported_pipeline.predict(X_test)
# Since our target is (0,1), the classifier produces a probability matrix of (N,2).
# The first column refers to the probability that our data belong to class 0 (failure to reach altitude),
# and the second column refers to the probability that the data belong to class 1 (it's a bottle rocket).
# Therefore, let's take the second column to compute the 'auc' metric.
y_probabilities_test = exported_pipeline.predict_proba(X_test)
y_probabilities_success = y_probabilities_test[:, 1]
false_positive_rate, true_positive_rate, threshold = roc_curve(y_test, y_probabilities_success)
roc_auc = auc(false_positive_rate, true_positive_rate)  # area under the test-set ROC curve plotted by ROC_Curve()
Score:
0.9354512343180899
best model:
RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
max_depth=None, max_features=0.25, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
oob_score=False, random_state=None, verbose=0,
warm_start=False)
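Because the fitted pipeline is persisted with joblib above, it can be reloaded later and used for scoring without refitting. A minimal sketch, assuming the same path used in the dump call (restored_pipeline is an illustrative name):
# Reload the persisted pipeline and confirm it reproduces the test score
restored_pipeline = joblib.load('C:/sm/trained_models/tpot_model.pkl')
print(restored_pipeline.score(X_test, y_test))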
# reshape so we can append the columns
y_test_ = y_test.reshape(y_test.shape[0], 1)
y_predicted_test_ = y_predicted_test.reshape(y_predicted_test.shape[0], 1)
predicted_name = ['predicted']
probability_names = ['prob 1', 'prob 2']
# Build a results table for the test set: features, true label, prediction and class probabilities
df = pd.DataFrame(X_test, columns=feature_names)
df[response_name] = pd.DataFrame(y_test_).astype(int)
df[predicted_name] = pd.DataFrame(y_predicted_test_).astype(int)
df[probability_names] = pd.DataFrame(y_probabilities_test, columns=probability_names)
print('\ntest_dataset:\n', df.head(), end='')
df.to_csv("test_dataset.csv")
test_dataset:
BoxRatio Thrust Velocity OnBalRun vwapGain Altitude predicted \
0 0.251000 -0.278000 0.520000 1.287000 0.442000 0 0
1 0.078176 -0.034186 0.613033 2.632448 0.824791 1 1
2 1.668955 2.428611 1.642349 2.932859 0.658687 1 1
3 0.237893 -0.342475 0.780336 1.674925 0.430937 1 0
4 2.164000 2.330000 2.500000 6.473000 4.360000 0 1
prob 1 prob 2
0 0.99 0.01
1 0.33 0.67
2 0.23 0.77
3 0.70 0.30
4 0.49 0.51
# LIME (Local Interpretable Model-agnostic Explanations): explain individual predictions of the fitted pipeline
import lime
import lime.lime_tabular
train, test, labels_train, labels_test = train_test_split(X, y, train_size=0.80, test_size=0.20)
explainer = lime.lime_tabular.LimeTabularExplainer(train,
                                                   feature_names=feature_names,
                                                   class_names=response_name,
                                                   discretize_continuous=True)
# Explain three randomly selected test instances
for _ in range(3):
    i = np.random.randint(0, test.shape[0])
    exp = explainer.explain_instance(test[i], exported_pipeline.predict_proba, num_features=2, top_labels=1)
    exp.show_in_notebook(show_table=True, show_all=False)
ROC_Curve()
print('\nFeature Importances:', best_model.feature_importances_)
Plot_predictor_importance(best_model, feature_names)
Feature Importances: [0.06142848 0.06943697 0.08282327 0.10006172 0.09828068 0.07206218
0.04960417 0.07309812 0.05043623 0.09924978 0.06053183 0.07064098
0.06037296 0.05197263]
Note that fourteen importances are reported: the random forest in the exported pipeline is fit on the feature union of the five original predictors plus the nine Nystroem components, not on the raw features alone. The class label “0” means the profit goal was not met; class “1” means the goal of at least 2% was met.
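As a quick sanity check on this encoding, the class balance of the held-out test set can be inspected directly; a minimal sketch using numpy, whose counts should match the support column of the classification report shown further down:
# Count test-set rows in each class (0 = profit goal not met, 1 = goal of at least 2% met)
unique_classes, class_counts = np.unique(y_test, return_counts=True)
print(dict(zip(unique_classes, class_counts)))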
cm_train = confusion_matrix(y_train, y_predicted_train)
auc_train = roc_auc_score(y_train, y_predicted_train)
cm_test = confusion_matrix(y_test, y_predicted_test)
auc_test = roc_auc_score(y_test, y_predicted_test)
class_names = [0, 1]
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
Plot_confusion_matrix(cm_train, classes=class_names, title='Confusion matrix (Train)')
plt.subplot(1, 2, 2)
Plot_confusion_matrix(cm_test, classes=class_names, title='Confusion matrix (Test)')
plt.show()
c_report = classification_report(y_test, y_predicted_test)
print('\nClassification report:\n', c_report)
ntotal = len(y_test)
correct = y_test == y_predicted_test
numCorrect = sum(correct)
percent = round( (100.0*numCorrect)/ntotal, 6)
print("\nCorrect classifications on test data: {0:d}/{1:d} {2:8.3f}%".format(numCorrect,
ntotal,
percent))
prediction_score = 100.0*exported_pipeline.score(X_test, y_test)
assert (round(percent,3) == round(prediction_score, 3)), "prediction scores do not agree"
Classification report:
precision recall f1-score support
0 0.95 0.92 0.93 4933
1 0.92 0.95 0.94 4951
avg / total 0.94 0.94 0.94 9884
Correct classifications on test data: 9246/9884 93.545%
Print_model_metrics(exported_pipeline, X_train, X_test, cm_train, cm_test, auc_train, auc_test)
╒════════════════════════╤═════════════╤═══════════╕
│ Metric │ Train │ Test │
╞════════════════════════╪═════════════╪═══════════╡
│ accuracy │ 1 │ 0.935451 │
├────────────────────────┼─────────────┼───────────┤
│ precision │ 1 │ 0.922264 │
├────────────────────────┼─────────────┼───────────┤
│ recall │ 1 │ 0.951323 │
├────────────────────────┼─────────────┼───────────┤
│ misclassification rate │ 0 │ 0.0645488 │
├────────────────────────┼─────────────┼───────────┤
│ F1 │ 1 │ 0.936568 │
├────────────────────────┼─────────────┼───────────┤
│ r2 │ 1 │ 0.741804 │
├────────────────────────┼─────────────┼───────────┤
│ AUC │ 1 │ 0.935422 │
├────────────────────────┼─────────────┼───────────┤
│ mse │ 0 │ 0.0645488 │
├────────────────────────┼─────────────┼───────────┤
│ logloss │ 9.99201e-16 │ 2.22947 │
╘════════════════════════╧═════════════╧═══════════╛
The major improvements are a result of implementing:
The implementation of these two suggestions is responsible for the greatly improved results.