import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import h2o
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators.random_forest import H2ORandomForestEstimator
import seaborn as sns
import time, sys
def printf(format, *args):
    """Write ``format % args`` to standard output, C ``printf`` style.

    Nothing is appended to the text, so include a newline in ``format``
    when one is wanted.
    """
    rendered = format % args
    sys.stdout.write(rendered)
def Get_Model_Data(data_path="C:/sm/BottleRockets/model-8-1.csv",
                   split_ratios=(.7, .15),
                   split_seed=None):
    """Load the model CSV into H2O, plot it, and publish train/valid/test splits.

    The function imports the CSV, draws a seaborn pair plot of the modelling
    columns coloured by the response, splits the frame three ways, prints the
    first five rows of each split, and stores the results in module-level
    globals for the rest of the script.

    Parameters
    ----------
    data_path : str
        Location of the CSV handed to ``h2o.import_file``.
    split_ratios : sequence of two floats
        Fractions for the train and validation splits; the remainder becomes
        the test split. Exactly two values are required because the result is
        unpacked into three frames.
    split_seed : int or None
        Seed forwarded to ``H2OFrame.split_frame``. ``None`` (the default)
        keeps the original unseeded behaviour; pass an integer to make the
        split reproducible.

    Globals written
    ---------------
    feature_columns, response_column : column names used for modelling.
    x_train, x_valid, x_test : feature-only frames for each split.
    y_train, y_valid, y_test : response column of each split, as a factor.
    train, valid, test : feature + response frames with a factor response.
    """
    global x_train, y_train, x_valid, y_valid, x_test, y_test
    global train, valid, test
    global feature_columns, response_column

    frame = h2o.import_file(path=data_path)

    response_column = 'Altitude'
    feature_columns = ['BoxRatio', 'Thrust', 'Velocity', 'OnBalRun', 'vwapGain']
    # Columns kept for plotting and modelling: the features, then the response.
    mask = feature_columns + [response_column]

    # --- Exploratory pair plot -------------------------------------------
    plot_frame = frame[mask].as_data_frame()
    plt.ioff()
    red_green = ["#ff0000", "#00ff00"]
    sns.set_palette(red_green)
    # Silences numpy divide/invalid warnings; this setting stays in effect
    # after the function returns.
    np.seterr(divide='ignore', invalid='ignore')
    # NOTE(review): newer seaborn releases renamed ``size`` to ``height`` -
    # confirm against the installed version before upgrading.
    g = sns.pairplot(plot_frame,
                     diag_kind='kde',
                     hue=response_column,
                     markers=["o", "D"],
                     size=1.5,
                     aspect=1,
                     plot_kws={"s": 10})
    g.fig.subplots_adjust(right=0.9)
    plt.show()

    # --- Train / validation / test split ---------------------------------
    # With the defaults: 70% train, 15% validation, remaining 15% test.
    train_full, valid_full, test_full = frame.split_frame(
        ratios=list(split_ratios), seed=split_seed)

    # Preview the first rows of each split.
    for label, split in (('train_', train_full),
                         ('valid_', valid_full),
                         ('test_', test_full)):
        preview = split[mask].as_data_frame(use_pandas=True, header=True)
        print(label + ': \n', preview.head(5))

    x_train = train_full[feature_columns]
    y_train = train_full[response_column].asfactor()
    x_valid = valid_full[feature_columns]
    y_valid = valid_full[response_column].asfactor()
    x_test = test_full[feature_columns]
    y_test = test_full[response_column].asfactor()

    # Modelling frames: replace the response with its factor version so the
    # estimator is given a categorical target.
    train = train_full[mask]
    train[response_column] = y_train
    valid = valid_full[mask]
    valid[response_column] = y_valid
    test = test_full[mask]
    test[response_column] = y_test
def ROC_Curve(model, df):
    """Plot the ROC curve of ``model`` scored against the frame ``df``.

    The model's performance on ``df`` supplies the false/true positive rates
    and the AUC shown in the legend. A diagonal reference line is drawn
    from (0, 0) to (1, 1). Switches the global matplotlib style to 'ggplot'.
    """
    perf = model.model_performance(df)
    area = perf.auc()
    fpr = perf.fprs
    tpr = perf.tprs

    plt.style.use('ggplot')
    fig = plt.figure()
    axes = fig.gca()

    # The curve is drawn twice, as in the original: a black dashed trace
    # and then the labelled orange line on top of it.
    axes.plot(fpr, tpr, 'k--')
    axes.plot(fpr, tpr,
              color='darkorange',
              lw=2,
              label='ROC curve (area = %0.2f)' % area)
    # Diagonal reference line.
    axes.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

    axes.set_xlim([0.0, 1.0])
    axes.set_ylim([0.0, 1.05])
    axes.set_xlabel('False positive rate')
    axes.set_ylabel('True positive rate')
    axes.set_title('ROC curve')
    axes.legend(loc='best')
    plt.show()
def Plot_predictor_importance(saved_rf):
    """Draw a horizontal bar chart of the model's scaled variable importances.

    Variable names and scaled importances are read from the
    ``variable_importances`` table in the model's JSON output. Bars are
    listed top to bottom in table order.
    """
    _, axes = plt.subplots()

    importance_table = saved_rf._model_json['output']['variable_importances']
    names = importance_table['variable']
    positions = np.arange(len(names))
    scores = importance_table['scaled_importance']

    bar_style = dict(align='center', color='green', ecolor='black', height=0.5)
    axes.barh(positions, scores, **bar_style)

    axes.set_yticks(positions)
    axes.set_yticklabels(names)
    # Flip the axis so the first table row appears at the top.
    axes.invert_yaxis()
    axes.set_xlabel('Scaled Importance')
    axes.set_title('Variable Importance')
    plt.show()
def _collect_metrics(performance):
    """Return the reported metrics of one H2O performance object, by name."""
    # accuracy, precision and F1 each return [threshold, value] pairs;
    # index [0][1] keeps just the value.
    # NOTE(review): called without a threshold, H2O appears to report each at
    # its own metric-maximising threshold, so the three values need not share
    # a threshold - confirm against the H2O metrics documentation.
    return {
        "logloss": performance.logloss(),
        "accuracy": performance.accuracy()[0][1],
        "precision": performance.precision()[0][1],
        "F1": performance.F1()[0][1],
        "r2": performance.r2(),
        "AUC": performance.auc(),
    }


def Print_Metrics():
    """Print validation and test metrics for the saved random forest.

    Reads the module-level globals ``saved_rf``, ``valid``, ``test``,
    ``x_test`` and ``y_test``. Prints the share of test rows whose predicted
    label equals the actual label, the test AUC of the in-cluster model
    registered as "best_rf", and a side-by-side table of logloss, accuracy,
    precision, F1, r2 and AUC for the validation and test frames.
    """
    print('\nModel performance on validate and test data set:')
    valid_metrics = _collect_metrics(saved_rf.model_performance(valid))

    # Fraction of test rows predicted correctly.
    predictions = saved_rf.predict(x_test)
    matches = (predictions['predict'] == y_test).as_data_frame(use_pandas=True)
    accuracy = matches.mean()
    # .iloc[0] selects the single column's mean by position; plain [0] on a
    # label-indexed Series relies on a deprecated positional fallback.
    print('Percent correct predictions on test set (accuracy): ', accuracy.iloc[0])

    test_metrics = _collect_metrics(saved_rf.model_performance(test))

    test_auc = h2o.get_model("best_rf").model_performance(test_data=test).auc()
    print('Best model performance based on auc: ', test_auc)

    header = ["Metric", "Validate", "Test"]
    row_order = ("logloss", "accuracy", "precision", "F1", "r2", "AUC")
    table = [[name, valid_metrics[name], test_metrics[name]]
             for name in row_order]
    h2o.display.H2ODisplay(table, header)
# Wall-clock start, in whole seconds, for the run-time report at the end.
start_time = int(time.time())

# Start (or attach to) the local H2O server on the default port.
localH2O = h2o.init(
    ip="localhost",
    port=54321,
    max_mem_size="24G",
    nthreads=2,
)

# Turn off progress bars, then remove everything held by the cluster so
# the run starts from a clean state.
h2o.no_progress()
h2o.remove_all()
Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 25.151-b12, mixed mode)
Starting server from C:\Users\Charles\Anaconda3\lib\site-packages\h2o\backend\bin\h2o.jar
Ice root: C:\Users\Charles\AppData\Local\Temp\tmpkx4xc57w
JVM stdout: C:\Users\Charles\AppData\Local\Temp\tmpkx4xc57w\h2o_Charles_started_from_python.out
JVM stderr: C:\Users\Charles\AppData\Local\Temp\tmpkx4xc57w\h2o_Charles_started_from_python.err
Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.
H2O cluster uptime: 03 secs
H2O cluster version: 3.14.0.6
H2O cluster version age: 19 days
H2O cluster name: H2O_from_python_Charles
H2O cluster total nodes: 1
H2O cluster free memory: 21.33 Gb
H2O cluster total cores: 4
H2O cluster status: accepting new members, healthy
H2O connection url: http://127.0.0.1:54321
H2O connection proxy: None
H2O internal security: False
H2O API Extensions: Algos, AutoML, Core V3, Core V4
Python version: 3.6.2 final
# Load the data, show the pair plot, and publish the train/valid/test frames
# and column-name lists as module-level globals used by the cells below.
Get_Model_Data()
train_:
BoxRatio Thrust Velocity OnBalRun vwapGain Altitude
0 0.166 0.166 0.317 0.455 -0.068 0
1 0.071 0.068 0.170 0.482 -0.231 0
2 -0.031 -0.031 0.109 0.531 0.115 0
3 -0.186 -0.193 0.344 0.548 0.111 0
4 -0.147 -0.147 0.326 0.597 0.157 1
valid_:
BoxRatio Thrust Velocity OnBalRun vwapGain Altitude
0 0.023 0.023 0.182 0.711 0.131 1
1 0.173 0.257 -0.015 0.812 0.213 0
2 0.330 0.249 -0.031 0.816 -0.006 0
3 0.276 0.229 -0.012 0.821 0.152 0
4 0.185 0.142 -0.014 0.822 0.498 0
test_:
BoxRatio Thrust Velocity OnBalRun vwapGain Altitude
0 0.910 -0.030 -0.006 0.818 0.159 0
1 0.921 1.181 -0.001 0.825 0.109 0
2 1.893 2.146 -0.004 0.825 0.044 0
3 0.634 0.550 -0.016 0.836 0.538 0
4 0.500 1.362 -0.011 0.837 0.170 0
# Base random forest whose settings are shared by every grid member.
estimator = H2ORandomForestEstimator(
    # Stops fitting new trees when 10-tree rolling average is within 0.00001
    stopping_rounds=10,
    stopping_tolerance=0.00001,
    stopping_metric='auto',
    score_each_iteration=True,
    balance_classes=True,
    seed=7)

# Candidate values explored by the search.
hyper_parameters = {'ntrees': [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                    'max_depth': [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}

# Random search over the grid, with its own early-stopping settings.
criteria = {"strategy": "RandomDiscrete",
            "stopping_rounds": 10,
            "stopping_tolerance": 0.00001,
            "stopping_metric": "misclassification"}

grid_search = H2OGridSearch(model=estimator,
                            hyper_params=hyper_parameters,
                            search_criteria=criteria)
grid_search.train(x=feature_columns,
                  y=response_column,
                  training_frame=train,
                  validation_frame=valid)

# Higher AUC is better: sort descending so row 0 is the best model.
sorted_grid = grid_search.get_grid(sort_by='auc', decreasing=True)
# print('Best model sorted by auc:\n', sorted_grid.models[0])
auc_table = sorted_grid.sorted_metric_table()
best_max_depth = auc_table['max_depth'][0]
best_ntrees = auc_table['ntrees'][0]
best_auc = auc_table['auc'][0]

# Lower MSE is better: sort ascending so row 0 is the smallest MSE.
# (Sorting descending would report the worst MSE as the "best".)
sorted_grid_mse = grid_search.get_grid(sort_by='mse', decreasing=False)
best_mse = sorted_grid_mse.sorted_metric_table()['mse'][0]

print('Best max_depth.....', best_max_depth)
print('Best ntrees........', best_ntrees)
print('Best auc...........', best_auc)
print('Best mse...........', best_mse)
Best max_depth..... 5
Best ntrees........ 10
Best auc........... 0.9246937169832528
Best mse........... 0.073014147897621
# Refit a single forest with the best hyper-parameters from the grid search
# and register it in the cluster under the id "best_rf".
best_rf = H2ORandomForestEstimator(
    model_id="best_rf",
    ntrees=int(best_ntrees),
    max_depth=int(best_max_depth),
    stopping_rounds=5,
    score_each_iteration=True,
    seed=7)
best_rf.train(x=feature_columns,
              y=response_column,
              training_frame=train,
              validation_frame=valid)

# NOTE(review): the ROC curve is scored on the training frame, not on the
# validation or test frame - confirm that this is intended.
ROC_Curve(best_rf, train)

# The matrix is requested for the max-accuracy threshold, so label it so.
print('Confusion matrix computed by accuracy:\n', best_rf.confusion_matrix(metrics='accuracy'))
Confusion matrix computed by precision:
Confusion Matrix (Act/Pred) for max accuracy @ threshold = 0.6785714328289032:
0 | 1 | Error | Rate | |
0 | 1226.0 | 14.0 | 0.0113 | (14.0/1240.0) |
1 | 133.0 | 24.0 | 0.8471 | (133.0/157.0) |
Total | 1359.0 | 38.0 | 0.1052 | (147.0/1397.0) |
# Save the fitted forest to disk (overwriting any earlier copy) and load it
# back, so the reports below run against the saved model.
model_path = h2o.save_model(
    model=best_rf,
    path="C:/sm/BottleRockets/rf_model",
    force=True,
)
saved_rf = h2o.load_model(model_path)

# Variable-importance chart, then the validation/test metric table.
Plot_predictor_importance(saved_rf)
Print_Metrics()
Model performance on validate and test data set:
Percent correct predictions on test set (accuracy): 0.878378378378
Best model performance based on auc: 0.8740918803418803
Metric | Validate | Test |
logloss | 0.1929474 | 0.2353532 |
accuracy | 0.9150943 | 0.9054054 |
precision | 1.0 | 1.0 |
F1 | 0.6197183 | 0.6741573 |
r2 | 0.3329372 | 0.3543782 |
AUC | 0.9159267 | 0.8740919 |
# Report total wall-clock run time since start_time, broken into
# days / hours / minutes / seconds.
end_time = int(time.time())
elapsed = end_time - start_time
days, remainder = divmod(elapsed, 86400)
hours, remainder = divmod(remainder, 3600)
minutes, seconds = divmod(remainder, 60)
print('%d days, %d hours, %d minutes, %d seconds' % (days, hours, minutes, seconds))
0 days, 0 hours, 1 minutes, 31 seconds