H2O Classification: Random Forest with Grid Search

19 Oct 2017

Analyze the Bottle Rocket dataset using Random Forest and Grid Search

import numpy as np
from matplotlib import pyplot as plt

%matplotlib inline
import h2o
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators.random_forest import H2ORandomForestEstimator
import seaborn as sns
import time, sys

def printf(format, *args):
    """Write a C-style formatted message to standard output.

    ``format`` is a ``%``-style template and ``args`` supplies the values
    interpolated into it. No newline is appended.
    """
    message = format % args
    sys.stdout.write(message)

Define a function that loads the data set and splits it into training, validation, and test sets

def Get_Model_Data(path="C:/sm/BottleRockets/model-8-1.csv", seed=None):
    """Load the Bottle Rocket data, show a pairs plot, and publish the splits.

    The CSV at ``path`` is imported into H2O, the selected columns are drawn
    as a seaborn pairs plot coloured by the response column, and the frame is
    split into train/validation/test parts with ratios 0.7 / 0.15 / remainder.
    The first five rows of each part are printed. Nothing is returned; the
    results are published through module-level globals.

    Args:
        path: Location of the CSV file to import. Defaults to the file the
            notebook was originally written against.
        seed: Optional seed forwarded to ``split_frame``. The default
            ``None`` leaves the split unseeded, as in the original code;
            pass an integer to make the split reproducible.

    Globals written:
        feature_columns, response_column: column names used for modelling.
        x_train, x_valid, x_test: feature-only H2O frames.
        y_train, y_valid, y_test: response column converted to a factor.
        train, valid, test: feature + response frames with the response
            replaced by its factor version.
    """
    global x_train, y_train, x_valid, y_valid, x_test, y_test
    global train, valid, test
    global feature_columns, response_column

    model = h2o.import_file(path=path)

    # Single source of truth for the column selection: the features plus the
    # response, in the same order as before.
    feature_columns = ['BoxRatio', 'Thrust', 'Velocity', 'OnBalRun', 'vwapGain']
    response_column = 'Altitude'
    mask = feature_columns + [response_column]

    # Pairs plot of the selected columns, coloured by the response.
    df_new = model[mask].as_data_frame()
    plt.ioff()
    red_green = ["#ff0000", "#00ff00"]
    sns.set_palette(red_green)
    # Silence numpy divide/invalid warnings; this setting stays in effect
    # after the function returns.
    np.seterr(divide='ignore', invalid='ignore')
    # NOTE(review): newer seaborn releases name the `size` argument `height`;
    # confirm against the installed seaborn version before changing it.
    g = sns.pairplot(df_new,
                     diag_kind='kde',
                     hue=response_column,
                     markers=["o", "D"],
                     size=1.5,
                     aspect=1,
                     plot_kws={"s": 10})
    g.fig.subplots_adjust(right=0.9)
    plt.show()

    # Split the data into Train/Validation/Test with Train having 70% and
    # test and validation 15% each.
    train_full, valid_full, test_full = model.split_frame(ratios=[.7, .15],
                                                          seed=seed)

    # Show the first rows of each split as a quick sanity check.
    train_ = train_full[mask].as_data_frame(use_pandas=True, header=True)
    valid_ = valid_full[mask].as_data_frame(use_pandas=True, header=True)
    test_ = test_full[mask].as_data_frame(use_pandas=True, header=True)
    print('train_: \n', train_.head(5))
    print('valid_: \n', valid_.head(5))
    print('test_: \n', test_.head(5))

    # Feature frames, and the response converted to a categorical (factor)
    # column so H2O treats the problem as classification.
    x_train = train_full[feature_columns]
    y_train = train_full[response_column].asfactor()
    x_valid = valid_full[feature_columns]
    y_valid = valid_full[response_column].asfactor()
    x_test = test_full[feature_columns]
    y_test = test_full[response_column].asfactor()

    # Combined frames whose response column is the factor version.
    train = train_full[mask]
    train[response_column] = y_train
    valid = valid_full[mask]
    valid[response_column] = y_valid
    test = test_full[mask]
    test[response_column] = y_test

Plot the ROC curve

def ROC_Curve(model, df):
    """Plot the ROC curve of ``model`` evaluated on the frame ``df``.

    Args:
        model: A trained H2O model exposing ``model_performance``.
        df: The H2O frame to score; the resulting metrics must provide
            ``auc()``, ``fprs`` and ``tprs``.

    The figure is shown with ``plt.show()``; nothing is returned. The
    'ggplot' matplotlib style is activated with ``plt.style.use``, so it
    also applies to figures created afterwards.
    """
    performance = model.model_performance(df)
    auc = performance.auc()
    false_positive_rate = performance.fprs
    true_positive_rate = performance.tprs

    plt.style.use('ggplot')
    plt.figure()
    # Draw the curve once. The original also drew the same points first as
    # a black dashed line, which the orange line then covered.
    plt.plot(false_positive_rate,
             true_positive_rate,
             color='darkorange',
             lw=2,
             label='ROC curve (area = %0.2f)' % auc)
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    plt.show()

Plot the scaled importance of each predictor

def Plot_predictor_importance(saved_rf):
    """Draw a horizontal bar chart of the model's scaled variable importances.

    Args:
        saved_rf: A trained H2O model. Its importances are read through the
            public ``varimp()`` accessor instead of the private
            ``_model_json`` structure.

    If the model reports no variable importances, a message is printed and
    no figure is drawn.
    """
    # Each row is (variable, relative_importance, scaled_importance, percentage).
    importances = saved_rf.varimp()
    if not importances:
        print('No variable importances available for this model.')
        return

    variables = [row[0] for row in importances]
    scaled_importance = [row[2] for row in importances]
    y_pos = np.arange(len(variables))

    fig, ax = plt.subplots()
    ax.barh(y_pos,
            scaled_importance,
            align='center',
            color='green',
            ecolor='black',
            height=0.5)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(variables)
    # Put the first-listed variable at the top of the chart.
    ax.invert_yaxis()
    ax.set_xlabel('Scaled Importance')
    ax.set_title('Variable Importance')
    plt.show()

Print the model metrics for the validate and test data sets

def Print_Metrics():
    """Print validation and test metrics for the reloaded model.

    Reads the module-level globals ``saved_rf``, ``valid``, ``test``,
    ``x_test`` and ``y_test``. Prints the share of correct predictions on
    the test set, the test AUC of the cluster model with id "best_rf", and
    a table comparing validation and test metrics.

    MSE is now reported in the table for both frames; the original computed
    it into a single variable that was overwritten and never displayed.
    """
    print('\nModel performance on validate and test data set:')
    performance_valid = saved_rf.model_performance(valid)
    # accuracy, precision and F1 produce two numbers, which are the threshold
    # and the value respectively. We index them to extract just the value.
    mse_valid       = performance_valid.mse()
    logloss_valid   = performance_valid.logloss()
    accuracy_valid  = performance_valid.accuracy()[0][1]
    precision_valid = performance_valid.precision()[0][1]
    F1_valid        = performance_valid.F1()[0][1]
    r2_valid        = performance_valid.r2()
    auc_valid       = performance_valid.auc()

    # Share of test rows whose predicted label matches the actual label.
    predictions = saved_rf.predict(x_test)
    accuracy = (predictions['predict'] == y_test).as_data_frame(use_pandas=True).mean()
    # .iloc[0] selects by position; plain [0] on a label-indexed Series
    # relies on a deprecated positional fallback.
    print('Percent correct predictions on test set (accuracy): ', accuracy.iloc[0])

    performance_test = saved_rf.model_performance(test)
    mse_test       = performance_test.mse()
    logloss_test   = performance_test.logloss()
    accuracy_test  = performance_test.accuracy()[0][1]
    precision_test = performance_test.precision()[0][1]
    F1_test        = performance_test.F1()[0][1]
    auc_test       = performance_test.auc()
    r2_test        = performance_test.r2()

    # Looks the model up in the cluster by its id rather than via saved_rf.
    test_auc       = h2o.get_model("best_rf").model_performance(test_data=test).auc()
    print('Best model performance based on auc: ', test_auc)

    header = ["Metric", "Validate", "Test"]
    table  = [
               ["mse",       mse_valid,       mse_test],
               ["logloss",   logloss_valid,   logloss_test],
               ["accuracy",  accuracy_valid,  accuracy_test],
               ["precision", precision_valid, precision_test],
               ["F1",        F1_valid,        F1_test],
               ["r2",        r2_valid,        r2_test],
               ["AUC",       auc_valid,       auc_test]
             ]
    h2o.display.H2ODisplay(table, header)

Start the h2o server

# Remember when the run started so the total elapsed time can be reported
# at the end of the notebook.
start_time = int(time.time())

# Start (or connect to) an H2O server on localhost:54321, capped at 24 GB of
# memory and two threads.
localH2O = h2o.init(ip="localhost", port=54321, max_mem_size="24G", nthreads=2)

# Turn off progress bars and clear any objects already held by the cluster.
h2o.no_progress()
h2o.remove_all()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 25.151-b12, mixed mode)
  Starting server from C:\Users\Charles\Anaconda3\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\Charles\AppData\Local\Temp\tmpkx4xc57w
  JVM stdout: C:\Users\Charles\AppData\Local\Temp\tmpkx4xc57w\h2o_Charles_started_from_python.out
  JVM stderr: C:\Users\Charles\AppData\Local\Temp\tmpkx4xc57w\h2o_Charles_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.

H2O cluster uptime: 03 secs
H2O cluster version: 3.14.0.6
H2O cluster version age: 19 days
H2O cluster name: H2O_from_python_Charles
H2O cluster total nodes: 1
H2O cluster free memory: 21.33 Gb
H2O cluster total cores: 4
H2O cluster status: accepting new members, healthy
H2O connection url: http://127.0.0.1:54321
H2O connection proxy: None
H2O internal security: False
H2O API Extensions: Algos, AutoML, Core V3, Core V4
Python version: 3.6.2 final

Get the training, validation, and testing datasets, and display a pairs plot

# Load the data, show the pairs plot, and populate the global
# train/valid/test frames plus feature_columns and response_column.
Get_Model_Data()

train_: 
    BoxRatio  Thrust  Velocity  OnBalRun  vwapGain  Altitude
   0.166   0.166     0.317     0.455    -0.068         0
   0.071   0.068     0.170     0.482    -0.231         0
  -0.031  -0.031     0.109     0.531     0.115         0
  -0.186  -0.193     0.344     0.548     0.111         0
  -0.147  -0.147     0.326     0.597     0.157         1
valid_: 
    BoxRatio  Thrust  Velocity  OnBalRun  vwapGain  Altitude
   0.023   0.023     0.182     0.711     0.131         1
   0.173   0.257    -0.015     0.812     0.213         0
   0.330   0.249    -0.031     0.816    -0.006         0
   0.276   0.229    -0.012     0.821     0.152         0
   0.185   0.142    -0.014     0.822     0.498         0
test_: 
    BoxRatio  Thrust  Velocity  OnBalRun  vwapGain  Altitude
   0.910  -0.030    -0.006     0.818     0.159         0
   0.921   1.181    -0.001     0.825     0.109         0
   1.893   2.146    -0.004     0.825     0.044         0
   0.634   0.550    -0.016     0.836     0.538         0
   0.500   1.362    -0.011     0.837     0.170         0

Define the grid parameters

# Base random forest shared by every grid member. Early stopping ends tree
# building once the 10-round rolling average of the stopping metric changes
# by no more than 0.00001.
estimator = H2ORandomForestEstimator(
    stopping_rounds=10,
    stopping_tolerance=0.00001,
    stopping_metric='auto',
    score_each_iteration=True,
    balance_classes=True,
    seed=7,
)

# Both hyper-parameters are searched over 5, 10, 20, ..., 100.
hyper_parameters = {
    'ntrees': [5, *range(10, 101, 10)],
    'max_depth': [5, *range(10, 101, 10)],
}

# Sample the parameter combinations at random and stop the search itself
# once misclassification stops improving.
criteria = dict(
    strategy="RandomDiscrete",
    stopping_rounds=10,
    stopping_tolerance=0.00001,
    stopping_metric="misclassification",
)

grid_search = H2OGridSearch(
    model=estimator,
    hyper_params=hyper_parameters,
    search_criteria=criteria,
)

Now train the grid of random forest models

# Fit every sampled hyper-parameter combination on the training frame,
# passing the validation frame alongside it.
grid_search.train(x = feature_columns,
                  y = response_column,
                  training_frame = train,
                  validation_frame = valid)

Sort the grid models by decreasing AUC

# Rank the grid models by AUC, highest first, and read the winning
# hyper-parameters from the first row. The metric table is fetched once
# instead of being rebuilt for every lookup.
sorted_grid = grid_search.get_grid(sort_by='auc', decreasing=True)
auc_table = sorted_grid.sorted_metric_table()

best_max_depth  = auc_table['max_depth'][0]
best_ntrees     = auc_table['ntrees'][0]
best_auc        = auc_table['auc'][0]

# For MSE lower is better, so sort ascending. With decreasing=True the first
# row would be the largest (worst) MSE, not the best one.
sorted_grid_mse = grid_search.get_grid(sort_by='mse', decreasing=False)
best_mse        = sorted_grid_mse.sorted_metric_table()['mse'][0]

print('Best max_depth.....', best_max_depth)
print('Best ntrees........', best_ntrees)
print('Best auc...........', best_auc)
print('Best mse...........', best_mse)

Best max_depth..... 5
Best ntrees........ 10
Best auc........... 0.9246937169832528
Best mse........... 0.073014147897621

Now let’s use the best parameters

# Refit a single forest with the winning ntrees / max_depth under the fixed
# model id "best_rf".
# NOTE(review): unlike the grid estimator, this model does not set
# balance_classes, stopping_tolerance or stopping_metric, and it uses
# stopping_rounds=5 instead of 10. Confirm that difference is intended.
best_rf = H2ORandomForestEstimator(
    model_id="best_rf",
    ntrees=int(best_ntrees),
    max_depth=int(best_max_depth),
    stopping_rounds=5,
    score_each_iteration=True,
    seed=7,
)

best_rf.train(
    x=feature_columns,
    y=response_column,
    training_frame=train,
    validation_frame=valid,
)

Plot the ROC Curve

# NOTE(review): the curve is computed on the training frame, which the model
# has already seen; consider `valid` or `test` for an out-of-sample view.
ROC_Curve(best_rf, train)

Print the confusion matrix

# The matrix is requested at the max-accuracy threshold, so the label says
# "accuracy" (it previously said "precision").
# NOTE(review): no frame is passed to confusion_matrix; presumably this
# reports training metrics. Confirm against the H2O documentation.
print('Confusion matrix computed by accuracy:\n', best_rf.confusion_matrix(metrics='accuracy'))

Confusion matrix computed by precision:
 Confusion Matrix (Act/Pred) for max accuracy @ threshold = 0.6785714328289032: 

	0	1	Error	Rate
0	1226.0	14.0	0.0113	(14.0/1240.0)
1	133.0	24.0	0.8471	(133.0/157.0)
Total	1359.0	38.0	0.1052	(147.0/1397.0)

Save the model

# Save the trained model under the given directory (force=True is passed so
# an existing file is overwritten) and keep the returned path for reloading.
model_path = h2o.save_model(model=best_rf, path="C:/sm/BottleRockets/rf_model", force=True)

Load the model

# Reload the model from the path returned by save_model; the reporting
# functions below read it through the global `saved_rf`.
saved_rf = h2o.load_model(model_path)

Plot predictor importance

# Bar chart of the scaled variable importances of the reloaded model.
Plot_predictor_importance(saved_rf)

Print the performance of the model

# Report validation and test metrics for the reloaded model (uses globals).
Print_Metrics()

Model performance on validate and test data set:
Percent correct predictions on test set (accuracy):  0.878378378378
Best model performance based on auc:  0.8740918803418803

Metric	Validate	Test
logloss	0.1929474	0.2353532
accuracy	0.9150943	0.9054054
precision	1.0	1.0
F1	0.6197183	0.6741573
r2	0.3329372	0.3543782
AUC	0.9159267	0.8740919

Print the computation time

# Break the elapsed wall-clock time into days, hours, minutes and seconds.
end_time = int(time.time())
elapsed = end_time - start_time
days, remainder = divmod(elapsed, 86400)
hours, remainder = divmod(remainder, 3600)
minutes, seconds = divmod(remainder, 60)
print('%d days, %d hours, %d minutes, %d seconds' % (days, hours, minutes, seconds))

0 days, 0 hours, 1 minutes, 31 seconds