
TPOT Optimized Classification

12 Nov 2019

Analysis of the Bottle Rocket pattern in the stock market

This is our third-generation model. The first-generation analysis was summarized on October 17, 2017, the second-generation model on August 3, 2018, and this third-generation model was completed on November 12, 2019.

We have achieved a major improvement in the results by using TPOT together with a dataset computed by the HedgeTools Watch program.

Please see the Summary below.

Charles R. Brauer (CBrauer@CypressPoint.com).

import itertools
import sys

import joblib
import matplotlib
import numpy as np
import pandas as pd
import platform
import seaborn as sns
import sklearn
import tpot
import warnings
from datetime import date
from matplotlib import gridspec, pyplot as plt
from pylab import rcParams
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE, SelectFromModel
from sklearn.metrics import (
    accuracy_score,
    auc,
    average_precision_score,
    classification_report,
    cohen_kappa_score,
    confusion_matrix,
    f1_score,
    log_loss,
    mean_squared_error,
    precision_score,
    r2_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
from IPython.core.display import display, HTML

warnings.simplefilter('ignore')

%matplotlib inline

Our computing environment for this analysis is:

print('date........................', date.today())
print('Operating system version....', platform.platform())
print("Python version is........... %s.%s.%s" % sys.version_info[:3])
print('scikit-learn version is.....', sklearn.__version__)
print('pandas version is...........', pd.__version__)
print('numpy version is............', np.__version__)
print('matplotlib version is.......', matplotlib.__version__)
print('tpot version is.............', tpot.__version__)
date........................ 2019-11-14
Operating system version.... Windows-10-10.0.18362-SP0
Python version is........... 3.7.3
scikit-learn version is..... 0.21.3
pandas version is........... 0.25.2
numpy version is............ 1.17.3
matplotlib version is....... 3.1.2
tpot version is............. 0.10.2

Here we load the Bottle Rocket dataset and create the training and verification datasets.

try:
    model_smote = pd.read_csv('H:/HedgeTools/Datasets/rocket-train-classify-smote.csv') 
except FileNotFoundError:
    print('file not found')

response_name = ['Altitude']
feature_names = ['BoxRatio', 'Thrust', 'Acceleration', 'Velocity', 'OnBalRun', 'vwapGain']
headers = feature_names + response_name

# model_smote = model_full[headers]
model_smote = shuffle(model_smote)
pd.set_option('display.expand_frame_repr', False)
print('Model dataset:\n', model_smote.head(5))
print('\nDescription of model dataset:\n', model_smote[feature_names].describe(include='all'))

X = model_smote[feature_names].values
y = model_smote[response_name].values.ravel()

X_train, X_verify, y_train, y_verify = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=7)

print('\nSize of Dataset:')
print(' train shape..... ', X_train.shape, y_train.shape)
print(' verify shape.... ', X_verify.shape, y_verify.shape)
Model dataset:
        BoxRatio    Thrust  Acceleration  Velocity  OnBalRun  vwapGain  Altitude
3035    0.80360  0.154400       1.52810  2.362500  0.642400 -0.931900         0
7362   -0.14750 -0.221900       1.00820  1.073200  0.341500 -0.603900         0
7612   -1.70190  0.177300       1.00190  1.025900  0.151900 -0.928600         0
10907  -0.50635  0.187522       0.65996  0.469045  0.646353 -0.285353         1
11083  -0.63990 -1.565400       0.63420  0.440200  0.511000 -0.290600         0

Description of model dataset:
            BoxRatio        Thrust  Acceleration      Velocity      OnBalRun      vwapGain
count  13965.000000  13965.000000  13965.000000  13965.000000  13965.000000  13965.000000
mean       0.083432     -0.375001      1.138810      1.704769      0.747328     -0.072832
std        1.195185      1.358033      0.651726      2.188283      0.484117      0.584789
min       -4.073700     -6.907800      0.010000     -2.431500     -0.141600     -0.936200
25%       -0.659500     -1.158400      0.726068      0.540300      0.375700     -0.520565
50%       -0.023700     -0.307900      1.066300      1.141700      0.684600     -0.193798
75%        0.770400      0.556200      1.461300      2.154400      1.089200      0.241559
max        4.940900      3.783000      5.322058     30.493565      2.804900      2.610900

Size of Dataset:
 train shape.....  (11172, 6) (11172,)
 verify shape....  (2793, 6) (2793,)

Here we show the distribution of the data before and after SMOTE upsampling.
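The SMOTE-balanced training file itself was produced outside this notebook. For reference, here is a minimal sketch of how such a resampled dataset could be generated with the imbalanced-learn package; this is an assumption about tooling, not the actual HedgeTools procedure:

from imblearn.over_sampling import SMOTE

# Hypothetical upsampling step: balance the minority class (Altitude == 1) by
# synthesizing new rows with SMOTE. The random_state value is illustrative.
raw = pd.read_csv('H:/HedgeTools/Datasets/rocket-train-classify.csv')
smote = SMOTE(random_state=7)
X_balanced, y_balanced = smote.fit_resample(raw[feature_names], raw['Altitude'])
print('Class counts after SMOTE:\n', pd.Series(y_balanced).value_counts())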

try:
    model = pd.read_csv('H:/HedgeTools/Datasets/rocket-train-classify.csv')
except FileNotFoundError:
    print('model not found')

count = pd.value_counts(model['Altitude'], sort=True)
ratio = int(count[0] / count[1])
print('The training dataset is out-of-balance by a factor of ', ratio)
print('goal failed: ', count[0], ' goal met: ', count[1])

resample = ratio > 2
if resample:  
    fig, (left, right) = plt.subplots(nrows=1, ncols=2, figsize=(8, 4))
    
    datas = [{'label':'No',  'color': 'r', 'height': count[0]},
             {'label':'Yes', 'color': 'g', 'height': count[1]}]
    # plot 1 ____________________________________________________________________
    plt.subplot(1, 2, 1)
    i = 0
    for data in datas:
        plt.bar(i, data['height'], align='center', color=data['color'], width=0.4)
        i += 1

    labels = [data['label'] for data in datas]
    pos = [i for i in range(len(datas)) ]
    font_size = 14
    plt.xticks(pos, labels, size=font_size)
    plt.yticks(size=font_size)
    plt.ylabel('Frequency', size=font_size)
    plt.title('Reached altitude before SMOTE', size=font_size)
    plt.rc('legend',**{'fontsize':font_size})
    plt.legend(labels)
    plt.tight_layout()
     # plot 2 ____________________________________________________________________
    plt.subplot(1, 2, 2)
    count = pd.value_counts(model_smote['Altitude'], sort = True)

    datas = [{'label':'No',  'color': 'r', 'height': count[0]},
             {'label':'Yes', 'color': 'g', 'height': count[1]}]
    i = 0
    for data in datas:
        plt.bar(i, data['height'], align='center', color=data['color'], width=0.4)
        i += 1

    labels = [data['label'] for data in datas]
    pos = [i for i in range(len(datas)) ]
    font_size = 14
    plt.xticks(pos, labels, size=font_size)
    plt.yticks(size=font_size)
    plt.ylabel('Frequency', size=font_size)
    plt.title('Reached altitude after SMOTE', size=font_size)
    plt.rc('legend',**{'fontsize':font_size})
    plt.legend(labels)
    plt.tight_layout()
    plt.show()
else:
    count = pd.value_counts(model_smote['Altitude'], sort = True)
    datas = [{'label':'No',  'color': 'r', 'height': count[0]},
             {'label':'Yes', 'color': 'g', 'height': count[1]}]
    i = 0
    for data in datas:
        plt.bar(i, data['height'], align='center', color=data['color'], width=0.4)
        i += 1

    labels = [data['label'] for data in datas]
    pos = [i for i in range(len(datas)) ]
    font_size = 14
    plt.xticks(pos, labels, size=font_size)
    plt.yticks(size=font_size)
    plt.ylabel('Frequency', size=font_size)
    plt.title('Reached altitude', size=font_size)
    plt.rc('legend',**{'fontsize':font_size})
    plt.legend(labels)
    plt.tight_layout()
    plt.show()
The training dataset is out-of-balance by a factor of  8
goal failed:  6994  goal met:  779

[Figure: class balance before and after SMOTE upsampling]

Identify Highly Correlated Features

Thanks to Chris Albon (https://chrisalbon.com/machine_learning/feature_selection/drop_highly_correlated_features/) for this code. When we print the correlation list we see that the highest correlation is 0.883630. Since this is well below the 0.95 threshold, we can keep all the features.

# Create correlation matrix
corr_matrix = model.corr().abs()

d = model.drop(['Altitude'], axis=1)
df = pd.DataFrame([[(i, j), d.corr().loc[i, j]]
                   for i, j in list(itertools.combinations(d.corr(), 2))],
                  columns=['pairs', 'corr'])
print(df.sort_values(by='corr', ascending=False))

# Select upper triangle of correlation matrix
upper = corr_matrix.where(
    np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
if not to_drop:
    print('Features to drop: None.')
else:
    print('Features to drop: ', to_drop)
                       pairs      corr
9   (Acceleration, Velocity)  0.883630
0         (BoxRatio, Thrust)  0.571252
14      (OnBalRun, vwapGain)  0.569123
10  (Acceleration, OnBalRun)  0.458538
13      (Velocity, vwapGain)  0.407995
11  (Acceleration, vwapGain)  0.381272
12      (Velocity, OnBalRun)  0.379095
4       (BoxRatio, vwapGain)  0.252447
7         (Thrust, OnBalRun)  0.167866
8         (Thrust, vwapGain)  0.166920
2       (BoxRatio, Velocity)  0.165867
3       (BoxRatio, OnBalRun)  0.165848
1   (BoxRatio, Acceleration)  0.154044
5     (Thrust, Acceleration)  0.108360
6         (Thrust, Velocity)  0.102975
Features to drop: None.

Plot of the distribution of each feature.

Good news: the above correlation analysis shows that we can keep all the features. Rather than use a Seaborn “pairplot” to show this, I wanted to show each distribution in greater detail.

warnings.filterwarnings('ignore')

# For each feature, plot the raw values (left) and the estimated distribution (right).
for feature in feature_names:
    v = model[feature].values
    plt.figure(figsize=(10, 4))
    plt.subplot(1, 2, 1)
    sns.scatterplot(data=v)
    plt.subplot(1, 2, 2)
    sns.distplot(v, axlabel=feature)
    plt.tight_layout()
    plt.show()

[Figures: scatter and distribution plots for BoxRatio, Thrust, Acceleration, Velocity, OnBalRun, and vwapGain]

Here we show Pearson’s correlation for each pair of features.
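As a quick numeric check before the plots, Pearson's r for a single pair can also be computed directly with scipy; the pair chosen here is simply the most correlated one from the table above:

# Direct Pearson correlation for the most correlated pair found above.
r, p = stats.pearsonr(model['Acceleration'], model['Velocity'])
print('Pearson r(Acceleration, Velocity) = %.6f  (p-value = %.3g)' % (r, p))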

warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
sns.set(font_scale=1.2)


class SeabornFig2Grid():

    def __init__(self, seaborngrid, fig,  subplot_spec):
        self.fig = fig
        self.sg = seaborngrid
        self.subplot = subplot_spec
        if isinstance(self.sg, sns.axisgrid.FacetGrid) or \
                isinstance(self.sg, sns.axisgrid.PairGrid):
            self._movegrid()
        elif isinstance(self.sg, sns.axisgrid.JointGrid):
            self._movejointgrid()
        self._finalize()

    def _movegrid(self):
        """ Move PairGrid or Facetgrid """
        self._resize()
        n = self.sg.axes.shape[0]
        m = self.sg.axes.shape[1]
        self.subgrid = gridspec.GridSpecFromSubplotSpec(
            n, m, subplot_spec=self.subplot)
        for i in range(n):
            for j in range(m):
                self._moveaxes(self.sg.axes[i, j], self.subgrid[i, j])

    def _movejointgrid(self):
        """ Move Jointgrid """
        h = self.sg.ax_joint.get_position().height
        h2 = self.sg.ax_marg_x.get_position().height
        r = int(np.round(h/h2))
        self._resize()
        self.subgrid = gridspec.GridSpecFromSubplotSpec(
            r+1, r+1, subplot_spec=self.subplot)

        self._moveaxes(self.sg.ax_joint, self.subgrid[1:, :-1])
        self._moveaxes(self.sg.ax_marg_x, self.subgrid[0, :-1])
        self._moveaxes(self.sg.ax_marg_y, self.subgrid[1:, -1])

    def _moveaxes(self, ax, gs):
        ax.remove()
        ax.figure = self.fig
        self.fig.axes.append(ax)
        self.fig.add_axes(ax)
        ax._subplotspec = gs
        ax.set_position(gs.get_position(self.fig))
        ax.set_subplotspec(gs)

    def _finalize(self):
        plt.close(self.sg.fig)
        self.fig.canvas.mpl_connect("resize_event", self._resize)
        self.fig.canvas.draw()

    def _resize(self, evt=None):
        self.sg.fig.set_size_inches(self.fig.get_size_inches())


# The ten feature pairs to plot as regression joint plots.
pairs = [("BoxRatio", "Thrust"),
         ("Velocity", "vwapGain"),
         ("Velocity", "OnBalRun"),
         ("OnBalRun", "vwapGain"),
         ("BoxRatio", "Velocity"),
         ("BoxRatio", "vwapGain"),
         ("Thrust", "OnBalRun"),
         ("BoxRatio", "OnBalRun"),
         ("Thrust", "Velocity"),
         ("Thrust", "vwapGain")]

grids = []
for x_name, y_name in pairs:
    g = sns.jointplot(x_name, y_name,
                      data=model,
                      fit_reg=True,
                      kind='reg',
                      height=7,
                      ratio=3,
                      color="b",
                      scatter_kws={"s": 5})
    g.annotate(stats.pearsonr)   # print Pearson's r and the p-value on each joint plot
    grids.append(g)

# Arrange the ten joint plots in a 5 x 2 grid on a single figure.
fig = plt.figure(figsize=(11, 20))
gs = gridspec.GridSpec(5, 2)
for i, g in enumerate(grids):
    SeabornFig2Grid(g, fig, gs[i])

gs.tight_layout(fig)

plt.show()

[Figure: joint plots with Pearson correlations for the ten feature pairs]

Run the model obtained from the TPOT optimization
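The pipeline in the next cell is the code TPOT exported. The search itself was run separately and is not reproduced in this post; the following is only a minimal sketch of how such a search might be launched (generations, population_size, and cv here are illustrative assumptions, not the settings actually used):

from tpot import TPOTClassifier

# Illustrative TPOT search; the parameter values are assumptions, not the ones
# used to produce the exported pipeline below.
tpot_search = TPOTClassifier(generations=10,
                             population_size=50,
                             cv=5,
                             random_state=7,
                             verbosity=2,
                             n_jobs=-1)
tpot_search.fit(X_train, y_train)
print('Hold-out score: ', tpot_search.score(X_verify, y_verify))
tpot_search.export('tpot_exported_pipeline.py')  # writes the winning pipeline as Python source

The exported source is what appears below, including the cross-validation score TPOT records as a comment.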

# Average CV score on the training set was:0.9501523778671478
exported_pipeline = make_pipeline(
    make_union(
        PCA(iterated_power=10, 
            svd_solver="randomized"),
        RFE(estimator=ExtraTreesClassifier(criterion="entropy",
                                           max_features=0.5,
                                           n_estimators=100),
            step=0.3)
    ),
    MinMaxScaler(),
    ExtraTreesClassifier(bootstrap=False,
                         criterion="gini",
                         max_features=0.9000000000000001,
                         min_samples_leaf=1,
                         min_samples_split=2,
                         n_estimators=100)
)

exported_pipeline.fit(X_train, y_train)
score = exported_pipeline.score(X_verify, y_verify)
print('\nScore: ', score)

joblib.dump(exported_pipeline, 'tpot_classify.pkl')
Score:  0.9570354457572503

['tpot_classify.pkl']

Compute some metrics

Note that here I load a separate test dataset; this data has not been seen during training.

new_pipeline = joblib.load('tpot_classify.pkl')

try:
    model_test = pd.read_csv('H:/HedgeTools/Datasets/rocket-test-classify.csv') 
except FileNotFoundError:
    print('file not found')
    
X_test = model_test[feature_names].values
y_test = model_test[response_name].values.ravel()

y_predicted_test = new_pipeline.predict(X_test)

try:
    mse = mean_squared_error(y_test, y_predicted_test)
    logloss = log_loss(y_test, y_predicted_test)
    accuracy = accuracy_score(y_test, y_predicted_test)
    precision = precision_score(y_test, y_predicted_test, average='binary')
    recall = recall_score(y_test, y_predicted_test, average='binary')
    F1 = f1_score(y_test, y_predicted_test)
    r2 = r2_score(y_test, y_predicted_test)
    auc = roc_auc_score(y_test, y_predicted_test)
    cm = confusion_matrix(y_test, y_predicted_test)
    y_predicted_train = new_pipeline.predict(X_train)
    y_predicted_test = new_pipeline.predict(X_test)
    print('Test accuracy: ', accuracy_score(y_test, y_predicted_test))
    y_predicted_train_cv = cross_val_predict(new_pipeline, X_train, y_train, cv=10)
    y_predicted_test_cv = cross_val_predict(new_pipeline, X_test, y_test, cv=10)
    print('Test cross validated accuracy: ', accuracy_score(y_test, y_predicted_test_cv))
    print('Test Cohen Kappa score: ', cohen_kappa_score(y_test, y_predicted_test))
except Exception:
    print("Cannot compute metrics: ", sys.exc_info()[0])

ntotal = len(y_test)
correct = y_test == y_predicted_test
numCorrect = sum(correct)
percent = round((100.0*numCorrect)/ntotal, 6)
print("Correct classifications on test data: {0:d}/{1:d} {2:8.3f}%".format(numCorrect, ntotal, percent))

# Since our target is (0, 1), the classifier's predict_proba returns an (N, 2) matrix.
# The first column is the probability that a row belongs to class 0 (failure to reach
# altitude), and the second column is the probability of class 1 (it's a bottle rocket).
# We take the second column to compute the average precision and the ROC curve.
try:
    y_probabilities_test = new_pipeline.predict_proba(X_test)
    y_probabilities_success = y_probabilities_test[:, 1]
    average_precision = average_precision_score(y_test, y_probabilities_success)
    print('Average precision-recall score: {0:0.2f}'.format(average_precision))
    false_positive_rate, true_positive_rate, threshold = roc_curve(y_test, y_probabilities_success)
except Exception:
    print("Cannot compute probabilities: ", sys.exc_info()[0])

auc_test = roc_auc_score(y_test, y_predicted_test)
print('auc_test: ', auc_test)

score_new = new_pipeline.score(X_test, y_test)  # accuracy of the reloaded pipeline on the test set
print('Pipeline score on the verification dataset: ', score)
y_predicted_test_new = new_pipeline.predict(X_test)
print('type(X_test)', type(X_test))
print('X_test.shape: ', X_test.shape)

ntotal = len(y_test)
correct_new = y_test == y_predicted_test_new

len_correct_new = len(correct_new)
len_correct = len(correct)
assert len_correct_new == len_correct, "lengths do not agree"
for k in range(len_correct):
    if correct[k] != correct_new[k]:
        print("{0}: {1} != {2}", k, correct[k], correct_new[k])
Test accuracy:  0.9705882352941176
Test cross validated accuracy:  0.9057623049219687
Test Cohen Kappa score:  0.8526536564938956
Correct classifications on test data: 3234/3332   97.059%
Average precision-recall score: 0.99
auc_test:  0.9742911646586345
Pipeline score on the verification dataset:  0.9570354457572503
type(X_test) <class 'numpy.ndarray'>
X_test.shape:  (3332, 6)

Plot of the ROC curve

This is the best ROC curve I have been able to achieve on my dataset.

try:
    plt.figure()
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(false_positive_rate, true_positive_rate, color='darkorange', label='TPOT estimator')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')

    print('type(X_test)', type(X_test))
    print('X_test.shape: ', X_test.shape)

    auc_test = roc_auc_score(y_test, y_predicted_test)
    plt.title('ROC curve on test dataset: %f' % auc_test)
    plt.legend(loc='best')
    plt.show()
    plt.close()
except Exception:
    print("Cannot compute ROC curve: ", sys.exc_info()[0])
type(X_test) <class 'numpy.ndarray'>
X_test.shape:  (3332, 6)

[Figure: ROC curve on the test dataset]

Plot of the feature importances

Notice that TPOT engineered additional features for the model (the union of the PCA components and the RFE-selected columns), so our raw predictors end up carrying less of the importance. Oh well, that’s feature engineering for you!
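A quick look at the pipeline's first step makes this concrete. The sketch below assumes the structure of the exported pipeline above, where make_union concatenates the PCA components with the RFE-selected original columns:

# The first step of the exported pipeline is the FeatureUnion built by make_union.
union = exported_pipeline.steps[0][1]
engineered = union.transform(X_train[:5])
print('raw feature count........ ', X_train.shape[1])
print('engineered feature count. ', engineered.shape[1])

Because the final ExtraTreesClassifier is trained on this wider engineered matrix, its feature_importances_ array has more entries than feature_names.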

try:
    best_model = exported_pipeline._final_estimator
    feature_importances = best_model.feature_importances_
    importances = 100.0 * (feature_importances / feature_importances.max())
    sorted_idx = np.argsort(importances)
    y_pos = np.arange(sorted_idx.shape[0]) + .5
    fig, ax = plt.subplots()
    fig.set_size_inches(6.0, 6.0)
    ax.barh(y_pos,
            feature_importances[sorted_idx],
            align='center',
            color='green',
            ecolor='black',
            height=0.5)
    ax.set_yticks(y_pos)
    # The final estimator sees the engineered features from the FeatureUnion, so there
    # are more bars than raw feature names; the six labels below are therefore only approximate.
    ax.set_yticklabels(feature_names)
    ax.invert_yaxis()
    ax.set_xlabel('Relative Importance')
    ax.set_title('Predictor Importance')
    plt.show()
except Exception:
    print('Could not display the feature importances.')
plt.close()

[Figure: predictor importance]

# shap crashes this notebook. Too bad. 

# import shap
# df = pd.DataFrame(X_test, columns=feature_names)
# shap_values = shap.TreeExplainer(best_model).shap_values(df)
# shap.summary_plot(shap_values, df)

# shap.dependence_plot("vwapGain", shap_values, df)

Plot of confusion matrices and print the correct classification on the test dataset

The class name “0” means that the profit goal was not met. Class “1” means the goal of at least 1.5% per day-trade was met.

Note that I use the Confusion Matrix as defined by Wikipedia: https://en.wikipedia.org/wiki/Confusion_matrix. The matrix is laid out as (True_Positive, False_Positive) / (False_Negative, True_Negative), which is why the code below rearranges scikit-learn's default (TN, FP) / (FN, TP) ordering.

tn, fp, fn, tp = confusion_matrix(y_test, y_predicted_test).ravel()
print('true positive: ', tp, ', false positive: ', fp, ', false negative: ', fn, ', true negative: ', tn)

cm_test = np.array([[tp, fp], [fn, tn]])
print('\nTest Confusion matrix:\n', cm_test)

c_report = classification_report(y_test, y_predicted_test)
print('\nClassification report:\n', c_report)

ntotal = len(y_test)
correct = y_test == y_predicted_test
numCorrect = sum(correct)
percent = round((100.0*numCorrect)/ntotal, 6)
print("\nCorrect classifications on test data: {0:d}/{1:d} {2:8.3f}%".format(numCorrect,
                                                                             ntotal,
                                                                             percent))
prediction_score = 100.0*new_pipeline.score(X_test, y_test)
assert (round(percent, 3) == round(prediction_score, 3)), "prediction score does not agree"
true positive:  325 , false positive:  91 , false negative:  7 , true negative:  2909

Test Confusion matrix:
 [[ 325   91]
 [   7 2909]]

Classification report:
               precision    recall  f1-score   support

           0       1.00      0.97      0.98      3000
           1       0.78      0.98      0.87       332

    accuracy                           0.97      3332
   macro avg       0.89      0.97      0.93      3332
weighted avg       0.98      0.97      0.97      3332


Correct classifications on test data: 3234/3332   97.059%
plt.clf()
plt.figure(figsize=(5, 5), clear=True)
cmap = plt.cm.Blues
plt.imshow(cm_test, interpolation='nearest', cmap=cmap)
title = 'Confusion matrix (on test data)'
classes = [1, 0]
plt.title(title)
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=0)
plt.yticks(tick_marks, classes)

thresh = cm_test.max() / 2.
for i, j in itertools.product(range(cm_test.shape[0]), range(cm_test.shape[1])):
    plt.text(j, i, cm_test[i, j],
             ha="center", va="center",
             color="white" if cm_test[i, j] > thresh else "black")

plt.ylabel('True Altitude')
plt.xlabel('Predicted Altitude')
plt.show()
<Figure size 432x288 with 0 Axes>

[Figure: confusion matrix on test data]

Using the same Wikipedia layout, we now recompute the core metrics by hand from the confusion matrix, compute the same metrics with scikit-learn, and tabulate the current results next to the previous generation's performance.

true_positive_test = cm_test[0, 0]
false_positive_test = cm_test[0, 1]
false_negative_test = cm_test[1, 0]
true_negative_test = cm_test[1, 1]

total_test = true_positive_test + false_positive_test + false_negative_test + true_negative_test

accuracy_test_ = (true_positive_test + true_negative_test)/total_test
precision_test_ = (true_positive_test) / (true_positive_test + false_positive_test)
recall_test_ = (true_positive_test)/(true_positive_test + false_negative_test)
misclassification_rate_test = (false_positive_test + false_negative_test)/total_test
F1_test_ = (2*true_positive_test)/(2*true_positive_test + false_positive_test + false_negative_test)

y_predict_test = new_pipeline.predict(X_test)
mse_test = mean_squared_error(y_test, y_predict_test)
logloss_test = log_loss(y_test, y_predict_test)
accuracy_test = accuracy_score(y_test, y_predict_test)
precision_test = precision_score(y_test, y_predict_test)
recall_test = recall_score(y_test, y_predict_test)
F1_test = f1_score(y_test, y_predict_test)
r2_test = r2_score(y_test, y_predict_test)
auc_test = roc_auc_score(y_test, y_predict_test)

header = ["Metric", "Test dataset"]
          
table1 = [["accuracy",               accuracy_test],
          ["precision",              precision_test],
          ["recall",                 recall_test],
          ["misclassification rate", misclassification_rate_test],
          ["F1",                     F1_test],
          ["r2",                     r2_test],
          ["AUC",                    auc_test],
          ["mse",                    mse_test],
          ["logloss",                logloss_test]
          ]

table2 = [['accuracy',               0.91943799],
          ['precision',              0.89705603],
          ['recall',                 0.94877461],
          ['misclassification rate', 0.08056201],
          ['F1',                     0.92219076],
          ['r2',                     0.67773888],
          ['AUC',                    0.91924997],
          ['mse',                    0.08056201],
          ['logloss',                2.78255718]]

def display_side_by_side(dfs:list, captions:list):
    output = ""
    combined = dict(zip(captions, dfs))
    styles = [dict(selector="caption", 
        props=[("text-align", "center"),
        ("font-size", "120%"),
        ("color", 'black')])]   
    for caption, df in combined.items():
        output += df.style.set_table_attributes("style='display:inline; font-size:100%' ").set_caption(caption).set_table_styles(styles)._repr_html_()
        output += "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
    display(HTML(output))

df1 = pd.DataFrame(table1, columns=header)
df2 = pd.DataFrame(table2, columns=header)

display_side_by_side([df1, df2], ['Current Performance', 'Previous Performance'])

[Tables: current vs. previous performance metrics]

Summary

This is the best performance so far, and it is now integrated into HedgeTools. The improvement is due to TPOT. I would like to thank Dr. Randal Olson and his team for their great work (http://www.randalolson.com/2015/11/15/introducing-tpot-the-data-science-assistant/).

