
TPOT-Optimized Classification

04 Aug 2018

Analysis of the Bottle Rocket pattern in the stock market

This is our second-generation model. The first-generation models were summarized on October 17, 2017, and a lot has happened since then. This analysis was done on August 4, 2018 and achieves a major improvement in the results; please see the Summary below.

import sys
import platform
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline
from imblearn.over_sampling import SMOTE
from tabulate import tabulate
import itertools
from copy import copy

import tpot
from tpot import TPOTClassifier
from tpot.builtins import StackingEstimator
from tpot.export_utils import export_pipeline
from tpot.export_utils import generate_pipeline_code, get_by_name
from tpot.operator_utils import TPOTOperatorClassFactory

import sklearn
from sklearn.externals import joblib
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import Normalizer, FunctionTransformer
from sklearn.kernel_approximation import Nystroem
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve, classification_report
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error
from sklearn.metrics import f1_score, precision_score, recall_score, log_loss, r2_score
from sklearn.metrics import precision_recall_curve
print('Operating system version....', platform.platform())
print("Python version is........... %s.%s.%s" % sys.version_info[:3])
print('scikit-learn version is.....', sklearn.__version__)
print('pandas version is...........', pd.__version__)
print('numpy version is............', np.__version__)
print('matplotlib version is.......', matplotlib.__version__)
print('tpot version is.............', tpot.__version__)
Operating system version.... Windows-10-10.0.17134-SP0
Python version is........... 3.6.5
scikit-learn version is..... 0.19.1
pandas version is........... 0.23.0
numpy version is............ 1.14.2
matplotlib version is....... 2.2.2
tpot version is............. 0.9.3

The LoadData routine reads the Bottle Rocket dataset and creates the training and testing datasets.

def LoadData():
    global feature_names, response_name, n_features

    pth = 'https://raw.githubusercontent.com/CBrauer/CypressPoint.github.io/master/model-13-1.csv'
    model_full = pd.read_csv(pth)

    response_name = ['Altitude']
    feature_names = ['BoxRatio', 'Thrust', 'Velocity', 'OnBalRun', 'vwapGain']
    n_features = len(feature_names)
    mask = feature_names + response_name

    model = model_full[mask]
    print('Model dataset:\n', model.head(5))
    # print('\nDescription of model dataset:\n', model[feature_names].describe(include='all'))

    # Correlation_plot(model)

    X = model[feature_names].values
    y = model[response_name].values.ravel()

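    # Balance the classes by oversampling the minority class with SMOTE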
    sm = SMOTE(random_state=12)
    X_resampled, y_resampled = sm.fit_sample(X, y)

    X_train, X_test, y_train, y_test = train_test_split(X_resampled,
                                                        y_resampled,
                                                        test_size = 0.3,
                                                        random_state = 0)
    print('Size of resampled data:')
    print(' train shape... ', X_train.shape, y_train.shape)
    print(' test shape.... ', X_test.shape, y_test.shape)

    return X_train, y_train, X_test, y_test, X_resampled, y_resampled

Plot the ROC curve and the Precision-Recall curve.

def Plot_ROC_Precision_Recall():
    class_names = [0, 1]
    fig, (left, right) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

    plt.subplot(1, 2, 1)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(false_positive_rate, true_positive_rate, color='darkorange', label='Random Forest')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve (area = %0.7f)' % auc)
    plt.legend(loc='best')

    plt.subplot(1, 2, 2)
    precision, recall, _ = precision_recall_curve(y_test, y_probabilities_success)

    plt.step(recall, precision, color='b', alpha=0.2, where='post')
    plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recall curve: AP={0:0.2f}'.format(average_precision))

    plt.tight_layout()
    plt.show()

This routine plots the predictor (a.k.a. feature) importances.

def Plot_Predictor_Importance(best_model, feature_names):
    feature_importance = best_model.feature_importances_
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    y_pos = np.arange(sorted_idx.shape[0]) + .5
    fig, ax = plt.subplots()
    fig.set_size_inches(8, 5)
    ax.barh(y_pos,
            feature_importance[sorted_idx],
            align='center',
            color='green',
            ecolor='black',
            height=0.5)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(feature_names)
    ax.invert_yaxis()
    ax.set_xlabel('Relative Importance')
    ax.set_title('Predictor Importance')
    plt.show()

This routine is used to summarize the metrics for the model.

def Print_Model_Metrics():
    true_negative  = cm[0, 0]
    true_positive  = cm[1, 1]
    false_negative = cm[1, 0]
    false_positive = cm[0, 1]
    total = true_negative + true_positive + false_negative + false_positive

    accuracy_ = (true_positive + true_negative)/total
    precision_ = (true_positive)/(true_positive + false_positive)
    recall_ = (true_positive)/(true_positive + false_negative)
    misclassification_rate = (false_positive + false_negative)/total
    F1_ = (2*true_positive)/(2*true_positive + false_positive + false_negative)
    assert accuracy == accuracy_, "accuracy score does not agree"
    assert precision == precision_, "precision score does not agree"
    assert recall == recall_, "recall score does not agree"
    assert round(F1,6) == round(F1_,6), "F1: " + str(F1) + " != F1_: " + str(F1_)

    header = ["Metric", "Test"]
    table = [["accuracy",               accuracy],
             ["precision",              precision],
             ["recall",                 recall],
             ["misclassification rate", misclassification_rate],
             ["F1",                     F1],
             ["r2",                     r2],
             ["AUC",                    auc],
             ["mse",                    mse],
             ["logloss",                logloss]
            ]

    print(tabulate(table, header, tablefmt="fancy_grid"))

This routine plots the confusion matrix.

def Plot_Confusion_Matrix():
    cmap = plt.cm.Blues
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    title='Confusion matrix (on test data)'
    classes = [0, 1]
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

    c_report = classification_report(y_test, y_predicted_test)
    print('\nClassification report:\n', c_report)

    ntotal = len(y_test)
    correct = y_test == y_predicted_test
    numCorrect = sum(correct)
    percent = round( (100.0*numCorrect)/ntotal, 6)
    print("\nCorrect classifications on test data: {0:d}/{1:d} {2:8.3f}%".format(numCorrect,
                                                                                 ntotal,
                                                                                 percent))
    prediction_score = 100.0*exported_pipeline.score(X_test, y_test)
    assert (round(percent,3) == round(prediction_score, 3)), "prediction score does not agree"

Load the training and test datasets.

X_train, y_train, X_test, y_test, X, y = LoadData()
Model dataset:
    BoxRatio   Thrust  Velocity  OnBalRun  vwapGain  Altitude
0     0.831   -0.076     0.381     1.006     0.444         0
1     0.497    0.333     0.489     1.453     0.411         0
2     0.667   -0.127     0.740     2.157     0.455         0
3     0.171   -0.428     0.454     0.940     0.451         0
4   265.390  183.215     8.967    29.467    20.560         0
Size of resampled data:
 train shape...  (23083, 5) (23083,)
 test shape....  (9893, 5) (9893,)

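The pipeline in the next cell is the one TPOT exported. The optimization run itself is not shown; the sketch below illustrates how such a pipeline is typically obtained with TPOTClassifier (the generations, population_size, scoring, and cv values are illustrative assumptions, not the settings used for this analysis).

# Illustrative settings only; not the actual search configuration used for this analysis
tpot_search = TPOTClassifier(generations=10,
                             population_size=50,
                             scoring='roc_auc',
                             cv=5,
                             random_state=0,
                             verbosity=2)
tpot_search.fit(X_train, y_train)
print('Holdout score:', tpot_search.score(X_test, y_test))

# Write the best pipeline found as Python code; the exported_pipeline below comes from such an export
tpot_search.export('tpot_exported_pipeline.py')
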
Run the model obtained from the TPOT optimization

# Score on the training set was: 0.9377722249192809
exported_pipeline = make_pipeline(
    make_union(
        # Pass the original features through unchanged
        FunctionTransformer(copy),
        make_union(
            # Approximate a cosine-kernel feature map
            Nystroem(gamma=0.8500000000000001,
                     kernel="cosine",
                     n_components=6),
            make_union(
                make_pipeline(
                    # Append the ExtraTrees predictions and class probabilities as synthetic features
                    StackingEstimator(estimator=ExtraTreesClassifier(bootstrap=False,
                                                                     criterion="entropy",
                                                                     max_features=0.3,
                                                                     min_samples_leaf=8,
                                                                     min_samples_split=11,
                                                                     n_estimators=100)),
                    Normalizer(norm="l1")
                ),
                FunctionTransformer(copy)
            )
        )
    ),
    # Final classifier trained on the union of all engineered features
    ExtraTreesClassifier(bootstrap=False,
                         criterion="gini",
                         max_features=0.7000000000000001,
                         min_samples_leaf=1,
                         min_samples_split=2,
                         n_estimators=500)
)

exported_pipeline.fit(X_train, y_train)
print('\nScore: \n', exported_pipeline.score(X_test, y_test))

best_model = exported_pipeline._final_estimator
print("Final Estimator:\n", best_model)

joblib.dump(exported_pipeline, 'C:/sm/trained_models/tpot_model.pkl')

y_predicted_test  = exported_pipeline.predict(X_test)

# Since our target is (0,1), the classifier produces a probability matrix of (N,2).
# The first column refers to the probability that our data belong to class 0 (failure to reach altitude),
# and the second column refers to the probability that the data belong to class 1 (it's a bottle rocket).
# Therefore, let's take the second column to compute the 'auc' metric.
y_probabilities_test = exported_pipeline.predict_proba(X_test)
y_probabilities_success = y_probabilities_test[:, 1]

from sklearn.metrics import average_precision_score
average_precision = average_precision_score(y_test, y_probabilities_success)
print('Average precision-recall score: {0:0.2f}'.format(average_precision))

false_positive_rate, true_positive_rate, threshold = roc_curve(y_test, y_probabilities_success)

mse        = mean_squared_error(y_test, y_predicted_test)
logloss    = log_loss(y_test, y_predicted_test)
accuracy   = accuracy_score(y_test, y_predicted_test)
precision  = precision_score(y_test, y_predicted_test, average='binary')
recall     = recall_score(y_test, y_predicted_test, average='binary')
F1         = f1_score(y_test, y_predicted_test)
r2         = r2_score(y_test, y_predicted_test)
auc        = roc_auc_score(y_test, y_predicted_test)
cm         = confusion_matrix(y_test, y_predicted_test)
Score:
 0.9392499747296068
Final Estimator:
 ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features=0.7000000000000001,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)
Average precision-recall score: 0.99
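
With these metrics and the confusion matrix computed, the helper routines defined earlier can now be called to summarize and plot the results (their output is not reproduced here):

Print_Model_Metrics()
Plot_Confusion_Matrix()
Plot_ROC_Precision_Recall()
Plot_Predictor_Importance(best_model, feature_names)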

Now let's create a test dataset that has the predicted values appended to it. This will be used by other programs to verify the results.

# reshape so we can append the columns
y_test_ = y_test.reshape(y_test.shape[0], 1)
y_predicted_test_ = y_predicted_test.reshape(y_predicted_test.shape[0], 1)

predicted_name = ['predicted']
probability_names = ['prob 1', 'prob 2']
headers = feature_names + response_name + predicted_name + probability_names

df = pd.DataFrame(X_test, columns=feature_names)
df[response_name] = pd.DataFrame(y_test_).astype(int)
df[predicted_name] = pd.DataFrame(y_predicted_test_).astype(int)
df[probability_names] = pd.DataFrame(y_probabilities_test, columns=probability_names)
df = df[headers]  # keep the columns in the intended order
print('\ntest_dataset:\n', df.head(), end='')
df.to_csv("test_dataset.csv")
test_dataset:
    BoxRatio    Thrust  Velocity  OnBalRun  vwapGain  Altitude  predicted  \
0  0.051000 -0.419000  0.351000  1.173000  0.571000         0          0   
1  3.832137  0.933219  0.918407  3.290789  0.501965         1          1   
2  1.252387  2.489781  1.461396  4.268061  1.055923         1          1   
3  0.000000 -0.814201  0.883223  1.960900  0.269550         1          1   
4  0.222000 -0.296000  1.057000  2.714000  0.808000         0          0   

   prob 1  prob 2  
0   1.000   0.000  
1   0.000   1.000  
2   0.004   0.996  
3   0.000   1.000  
4   0.880   0.120  
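
Since test_dataset.csv is meant to be consumed by other programs, a quick self-consistency check is to read it back and recompute the accuracy from the Altitude and predicted columns. A minimal sketch (the index column comes from the to_csv call above):

verify = pd.read_csv("test_dataset.csv", index_col=0)
# Fraction of rows where the prediction matches the true label
agreement = (verify['Altitude'] == verify['predicted']).mean()
print('Accuracy recomputed from test_dataset.csv: {0:.6f}'.format(agreement))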

Show a few of the predictions, and how they were obtained.

import lime
import lime.lime_tabular

train, test, labels_train, labels_test = train_test_split(X, y, train_size=0.80, test_size=0.20)
explainer = lime.lime_tabular.LimeTabularExplainer(train,
                                                   feature_names=feature_names,
                                                   class_names=response_name,
                                                   discretize_continuous=True)
i = np.random.randint(0, test.shape[0])
exp = explainer.explain_instance(test[i], exported_pipeline.predict_proba, num_features=2, top_labels=1)
exp.show_in_notebook(show_table=True, show_all=False)

i = np.random.randint(0, test.shape[0])
exp = explainer.explain_instance(test[i], exported_pipeline.predict_proba, num_features=2, top_labels=1)
exp.show_in_notebook(show_table=True, show_all=False)

i = np.random.randint(0, test.shape[0])
exp = explainer.explain_instance(test[i], exported_pipeline.predict_proba, num_features=2, top_labels=1)
exp.show_in_notebook(show_table=True, show_all=False)