This is our second-generation model. The first-generation analysis was summarized on October 17, 2017, and a lot has happened since then. This analysis was done on June 9, 2018, and we have achieved a major improvement in the results. Please see the Summary below.
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
from imblearn.over_sampling import SMOTE
from tabulate import tabulate
import itertools
from copy import copy
from sklearn.externals import joblib
import tpot
from tpot import TPOTClassifier
from tpot.operator_utils import TPOTOperatorClassFactory
from tpot.config.classifier import classifier_config_dict
from tpot.builtins import StackingEstimator, CombineDFs
from tpot import TPOTRegressor
from tpot.export_utils import export_pipeline, generate_import_code, _indent, generate_pipeline_code, get_by_name
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import ShuffleSplit
from sklearn.kernel_approximation import Nystroem
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.pipeline import make_pipeline, make_union
from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve, auc, classification_report
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error
from sklearn.metrics import f1_score, precision_score, recall_score
def LoadData():
    global feature_names, response_name, n_features
    model_full = pd.read_csv("H:/ML/BlogExperiments/model-13-1.csv")
    response_name = ['Altitude']
    feature_names = ['BoxRatio', 'Thrust', 'Velocity', 'OnBalRun', 'vwapGain']
    n_features = len(feature_names)
    mask = feature_names + response_name
    model = model_full[mask]
    print('Model dataset:\n', model.head(5))
    # print('\nDescription of model dataset:\n', model[feature_names].describe(include='all'))
    # Correlation_plot(model)
    X = model[feature_names].values
    y = model[response_name].values.ravel()
    # Balance the classes with SMOTE oversampling (fit_sample is named fit_resample in newer imblearn releases)
    sm = SMOTE(random_state=12)
    X_resampled, y_resampled = sm.fit_sample(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X_resampled,
                                                        y_resampled,
                                                        test_size=0.3,
                                                        random_state=0)
    print('Size of resampled data:')
    print(' train shape... ', X_train.shape, y_train.shape)
    print(' test shape.... ', X_test.shape, y_test.shape)
    return X_train, y_train, X_test, y_test, X_resampled, y_resampled
def ROC_Curve():
    # Plot the test-set ROC curve; false_positive_rate, true_positive_rate and roc_auc are globals computed below
    plt.figure()
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(false_positive_rate, true_positive_rate, color='darkorange', label='Random Forest')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve (area = %0.7f)' % roc_auc)
    plt.legend(loc='best')
    plt.show()
    #path = "C:/Users/Charles/Desktop/ROC Curve.png"
    #plt.savefig(path)
    #plt.cla()
    #plt.close('all')
def Plot_predictor_importance(best_model, feature_names):
    # Note: because the model is fit on a feature union (see the exported pipeline below), feature_importances_
    # can have more entries than there are named predictors (the five raw features plus the Nystroem components).
    feature_importance = best_model.feature_importances_
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    y_pos = np.arange(sorted_idx.shape[0]) + .5
    fig, ax = plt.subplots()
    ax.barh(y_pos,
            feature_importance[sorted_idx],
            align='center',
            color='green',
            ecolor='black',
            height=0.5)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(feature_names)
    ax.invert_yaxis()
    ax.set_xlabel('Relative Importance')
    ax.set_title('Predictor Importance')
    plt.show()
def Print_model_metrics(exported_pipeline, X_train, X_test, cm_train, cm_test, auc_train, auc_test):
    # Metrics derived directly from the confusion matrices
    true_negative_train = cm_train[0, 0]
    true_positive_train = cm_train[1, 1]
    false_negative_train = cm_train[1, 0]
    false_positive_train = cm_train[0, 1]
    total_train = true_negative_train + true_positive_train + false_negative_train + false_positive_train
    accuracy_train = (true_positive_train + true_negative_train)/total_train
    precision_train = (true_positive_train)/(true_positive_train + false_positive_train)
    recall_train = (true_positive_train)/(true_positive_train + false_negative_train)
    misclassification_rate_train = (false_positive_train + false_negative_train)/total_train
    F1_train = (2*true_positive_train)/(2*true_positive_train + false_positive_train + false_negative_train)
    true_negative_test = cm_test[0, 0]
    true_positive_test = cm_test[1, 1]
    false_negative_test = cm_test[1, 0]
    false_positive_test = cm_test[0, 1]
    total_test = true_negative_test + true_positive_test + false_negative_test + false_positive_test
    accuracy_test = (true_positive_test + true_negative_test)/total_test
    precision_test = (true_positive_test)/(true_positive_test + false_positive_test)
    recall_test = (true_positive_test)/(true_positive_test + false_negative_test)
    misclassification_rate_test = (false_positive_test + false_negative_test)/total_test
    F1_test = (2*true_positive_test)/(2*true_positive_test + false_positive_test + false_negative_test)
    # Metrics recomputed with sklearn from the pipeline's predictions (y_train and y_test are globals)
    y_predict_train = exported_pipeline.predict(X_train)
    y_predict_test = exported_pipeline.predict(X_test)
    mse_train = metrics.mean_squared_error(y_train, y_predict_train)
    mse_test = metrics.mean_squared_error(y_test, y_predict_test)
    logloss_train = metrics.log_loss(y_train, y_predict_train)
    logloss_test = metrics.log_loss(y_test, y_predict_test)
    accuracy_train = metrics.accuracy_score(y_train, y_predict_train)
    accuracy_test = metrics.accuracy_score(y_test, y_predict_test)
    precision_test = precision_score(y_test, y_predict_test, average='binary')
    recall_test = recall_score(y_test, y_predict_test, average='binary')
    F1_train = metrics.f1_score(y_train, y_predict_train)
    F1_test = metrics.f1_score(y_test, y_predict_test)
    r2_train = metrics.r2_score(y_train, y_predict_train)
    r2_test = metrics.r2_score(y_test, y_predict_test)
    auc_train = metrics.roc_auc_score(y_train, y_predict_train)
    auc_test = metrics.roc_auc_score(y_test, y_predict_test)
    header = ["Metric", "Train", "Test"]
    table = [["accuracy", accuracy_train, accuracy_test],
             ["precision", precision_train, precision_test],
             ["recall", recall_train, recall_test],
             ["misclassification rate", misclassification_rate_train, misclassification_rate_test],
             ["F1", F1_train, F1_test],
             ["r2", r2_train, r2_test],
             ["AUC", auc_train, auc_test],
             ["mse", mse_train, mse_test],
             ["logloss", logloss_train, logloss_test]
            ]
    print(tabulate(table, header, tablefmt="fancy_grid"))
def Plot_confusion_matrix(cm, classes, title, normalize=False):
    cmap = plt.cm.Blues
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
X_train, y_train, X_test, y_test, X, y = LoadData()
Model dataset:
BoxRatio Thrust Velocity OnBalRun vwapGain Altitude
0 0.215 -0.499 1.654 4.250 1.559 0
1 0.133 -0.493 1.002 2.777 0.947 0
2 0.198 -0.489 0.741 1.139 0.458 0
3 0.169 -0.264 2.056 4.856 2.093 0
4 0.346 -0.473 0.711 1.925 0.559 0
Size of resampled data:
train shape... (23060, 5) (23060,)
test shape.... (9884, 5) (9884,)
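The pipeline defined below has the structure TPOT produces when it exports its best pipeline; the search itself is not shown in this post. For readers who want to reproduce that step, here is a minimal sketch using the TPOTClassifier imported above. The generations, population_size, scoring and cv values are illustrative assumptions, not the settings of the actual run:
# Illustrative TPOT search (assumed settings, not the ones that produced the exported pipeline below)
tpot_search = TPOTClassifier(generations=10,
                             population_size=50,
                             scoring='accuracy',
                             cv=5,
                             random_state=0,
                             verbosity=2)
tpot_search.fit(X_train, y_train)
print(tpot_search.score(X_test, y_test))
tpot_search.export('tpot_exported_pipeline.py')   # writes code equivalent to exported_pipeline below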
# Exported pipeline: a feature union of the raw predictors and a Nystroem kernel map, feeding a random forest
exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        Nystroem(gamma=0.45,
                 kernel="linear",
                 n_components=9)
    ),
    RandomForestClassifier(bootstrap=False,
                           criterion="gini",
                           max_features=0.25,
                           min_samples_leaf=1,
                           min_samples_split=2,
                           n_estimators=100)
)
exported_pipeline.fit(X_train, y_train)
print('\nScore: \n', exported_pipeline.score(X_test, y_test))
best_model = exported_pipeline._final_estimator
print("best model:\n", best_model)
joblib.dump(exported_pipeline, 'C:/sm/trained_models/tpot_model.pkl')
y_predicted_train = exported_pipeline.predict(X_train)
y_predicted_test = exported_pipeline.predict(X_test)
# Since our target is (0,1), the classifier produces a probability matrix of (N,2).
# The first column refers to the probability that our data belong to class 0 (failure to reach altitude),
# and the second column refers to the probability that the data belong to class 1 (it's a bottle rocket).
# Therefore, let's take the second column to compute the 'auc' metric.
y_probabilities_test = exported_pipeline.predict_proba(X_test)
y_probabilities_success = y_probabilities_test[:, 1]
false_positive_rate, true_positive_rate, threshold = roc_curve(y_test, y_probabilities_success)
roc_auc = auc(false_positive_rate, true_positive_rate)  # area under the test-set ROC curve plotted by ROC_Curve()
Score:
0.9354512343180899
best model:
RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
max_depth=None, max_features=0.25, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
oob_score=False, random_state=None, verbose=0,
warm_start=False)
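Because the fitted pipeline is persisted with joblib above, it can be reloaded later and used for scoring without refitting. A minimal sketch, assuming the same path used in the dump call (restored_pipeline is an illustrative name):
# Reload the persisted pipeline and confirm it reproduces the test score
restored_pipeline = joblib.load('C:/sm/trained_models/tpot_model.pkl')
print(restored_pipeline.score(X_test, y_test))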
# reshape so we can append the columns
y_test_ = y_test.reshape(y_test.shape[0], 1)
y_predicted_test_ = y_predicted_test.reshape(y_predicted_test.shape[0], 1)
predicted_name = ['predicted']
probability_names = ['prob 1', 'prob 2']
# Build a results table for the test set: features, true label, prediction and class probabilities
df = pd.DataFrame(X_test, columns=feature_names)
df[response_name] = pd.DataFrame(y_test_).astype(int)
df[predicted_name] = pd.DataFrame(y_predicted_test_).astype(int)
df[probability_names] = pd.DataFrame(y_probabilities_test, columns=probability_names)
print('\ntest_dataset:\n', df.head(), end='')
df.to_csv("test_dataset.csv")
test_dataset:
BoxRatio Thrust Velocity OnBalRun vwapGain Altitude predicted \
0 0.251000 -0.278000 0.520000 1.287000 0.442000 0 0
1 0.078176 -0.034186 0.613033 2.632448 0.824791 1 1
2 1.668955 2.428611 1.642349 2.932859 0.658687 1 1
3 0.237893 -0.342475 0.780336 1.674925 0.430937 1 0
4 2.164000 2.330000 2.500000 6.473000 4.360000 0 1
prob 1 prob 2
0 0.99 0.01
1 0.33 0.67
2 0.23 0.77
3 0.70 0.30
4 0.49 0.51
# LIME (Local Interpretable Model-agnostic Explanations): explain individual predictions of the fitted pipeline
import lime
import lime.lime_tabular
train, test, labels_train, labels_test = train_test_split(X, y, train_size=0.80, test_size=0.20)
explainer = lime.lime_tabular.LimeTabularExplainer(train,
                                                   feature_names=feature_names,
                                                   class_names=response_name,
                                                   discretize_continuous=True)
# Explain three randomly selected test instances
for _ in range(3):
    i = np.random.randint(0, test.shape[0])
    exp = explainer.explain_instance(test[i], exported_pipeline.predict_proba, num_features=2, top_labels=1)
    exp.show_in_notebook(show_table=True, show_all=False)
ROC_Curve()
print('\nFeature Importances:', best_model.feature_importances_)
Plot_predictor_importance(best_model, feature_names)
Feature Importances: [0.06142848 0.06943697 0.08282327 0.10006172 0.09828068 0.07206218
0.04960417 0.07309812 0.05043623 0.09924978 0.06053183 0.07064098
0.06037296 0.05197263]
Note that fourteen importances are reported: the random forest in the exported pipeline is fit on the feature union of the five original predictors plus the nine Nystroem components, not on the raw features alone. The class label “0” means the profit goal was not met; class “1” means the goal of at least 2% was met.
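As a quick sanity check on this encoding, the class balance of the held-out test set can be inspected directly; a minimal sketch using numpy, whose counts should match the support column of the classification report shown further down:
# Count test-set rows in each class (0 = profit goal not met, 1 = goal of at least 2% met)
unique_classes, class_counts = np.unique(y_test, return_counts=True)
print(dict(zip(unique_classes, class_counts)))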
cm_train = confusion_matrix(y_train, y_predicted_train)
auc_train = roc_auc_score(y_train, y_predicted_train)
cm_test = confusion_matrix(y_test, y_predicted_test)
auc_test = roc_auc_score(y_test, y_predicted_test)
class_names = [0, 1]
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
Plot_confusion_matrix(cm_train, classes=class_names, title='Confusion matrix (Train)')
plt.subplot(1, 2, 2)
Plot_confusion_matrix(cm_test, classes=class_names, title='Confusion matrix (Test)')
plt.show()
c_report = classification_report(y_test, y_predicted_test)
print('\nClassification report:\n', c_report)
ntotal = len(y_test)
correct = y_test == y_predicted_test
numCorrect = sum(correct)
percent = round( (100.0*numCorrect)/ntotal, 6)
print("\nCorrect classifications on test data: {0:d}/{1:d} {2:8.3f}%".format(numCorrect,
ntotal,
percent))
prediction_score = 100.0*exported_pipeline.score(X_test, y_test)
assert (round(percent,3) == round(prediction_score, 3)), "prediction scores do not agree"
Classification report:
precision recall f1-score support
0 0.95 0.92 0.93 4933
1 0.92 0.95 0.94 4951
avg / total 0.94 0.94 0.94 9884
Correct classifications on test data: 9246/9884 93.545%
Print_model_metrics(exported_pipeline, X_train, X_test, cm_train, cm_test, auc_train, auc_test)
╒════════════════════════╤═════════════╤═══════════╕
│ Metric │ Train │ Test │
╞════════════════════════╪═════════════╪═══════════╡
│ accuracy │ 1 │ 0.935451 │
├────────────────────────┼─────────────┼───────────┤
│ precision │ 1 │ 0.922264 │
├────────────────────────┼─────────────┼───────────┤
│ recall │ 1 │ 0.951323 │
├────────────────────────┼─────────────┼───────────┤
│ misclassification rate │ 0 │ 0.0645488 │
├────────────────────────┼─────────────┼───────────┤
│ F1 │ 1 │ 0.936568 │
├────────────────────────┼─────────────┼───────────┤
│ r2 │ 1 │ 0.741804 │
├────────────────────────┼─────────────┼───────────┤
│ AUC │ 1 │ 0.935422 │
├────────────────────────┼─────────────┼───────────┤
│ mse │ 0 │ 0.0645488 │
├────────────────────────┼─────────────┼───────────┤
│ logloss │ 9.99201e-16 │ 2.22947 │
╘════════════════════════╧═════════════╧═══════════╛
The major improvements are a result of implementing:
The implementation of these two suggestions is responsible for the greatly improved results.