Random Hyperboxes

This example shows how to use a random hyperboxes classifier, in which each base hyperbox-based model is trained on a subset of features and a subset of samples.
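
As a rough sketch of the idea (not the library's internal implementation), each base learner receives a random subset of the rows and a random subset of the columns of the training data:

import numpy as np

# Rough sketch of the per-learner subsampling idea (illustrative only).
rng = np.random.default_rng(0)
X_demo = rng.random((100, 30))                         # 100 samples, 30 features
sample_idx = rng.choice(100, size=50, replace=False)   # e.g. max_samples = 0.5
feature_idx = rng.choice(30, size=15, replace=False)   # e.g. max_features = 0.5
X_sub = X_demo[np.ix_(sample_idx, feature_idx)]        # data seen by one base learner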

[1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from hbbrain.numerical_data.ensemble_learner.random_hyperboxes import RandomHyperboxesClassifier
from hbbrain.numerical_data.incremental_learner.onln_gfmm import OnlineGFMM

Load the dataset.

This example will use the breast cancer dataset available in sklearn to demonstrate how to use this ensemble classifier.

[2]:
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import MinMaxScaler
[3]:
df = load_breast_cancer()
X = df.data
y = df.target
[4]:
# Normalise data into the range of [0, 1] as hyperbox-based models only work in the unit cube
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
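
Note that any data arriving later must be transformed with the same fitted scaler, and values falling outside the training range can be clipped back into the unit cube. A minimal illustration, using a few raw samples only as stand-ins for unseen data:

# Reuse the fitted scaler for later data; clip values that fall outside the
# training range back into [0, 1] (the raw samples below are only stand-ins).
X_unseen = np.clip(scaler.transform(df.data[:5]), 0, 1)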
[5]:
# Split data into training, validation and testing sets
Xtr_val, X_test, ytr_val, y_test = train_test_split(X, y, train_size=0.8, random_state=0)
Xtr, X_val, ytr, y_val = train_test_split(Xtr_val, ytr_val, train_size=0.75, random_state=0)

This example uses the GFMM classifier with its original online learning algorithm as the base learner. However, any other hyperbox-based learning algorithm in this library can also be used to train the base learners.
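
For example, the sketch below configures an alternative base estimator by simply varying the maximum hyperbox size theta; a larger theta yields fewer, coarser hyperboxes. It can be plugged into the ensemble in exactly the same way as the estimator used below.

# An illustrative alternative base learner with a larger maximum hyperbox size;
# any other hyperbox-based estimator from this library could be used instead.
coarse_base_estimator = OnlineGFMM(theta=0.3)
coarse_clf = RandomHyperboxesClassifier(base_estimator=coarse_base_estimator,
                                        n_estimators=10, random_state=0)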

1. Using random subsampling to generate training sets for various base learners

a. The number of features used in each base learner is different and is bounded by a maximum number of features

Training

[6]:
# Initialise parameters
n_estimators = 20 # number of base learners
max_samples = 0.5 # sampling rate for samples
max_features = 0.5 # sampling rate to generate the maximum number of features
class_balanced = False # do not use the class-balanced sampling mode
feature_balanced = False # use different numbers of features for base learners
n_jobs = 4 # number of processes used to build base learners
[7]:
# Init a hyperbox-based model used to train base learners
# Using the GFMM classifier with the original online learning algorithm and a maximum hyperbox size of 0.1
base_estimator = OnlineGFMM(theta=0.1)
[8]:
rh_subsampling_diff_num_features_clf = RandomHyperboxesClassifier(
    base_estimator=base_estimator, n_estimators=n_estimators,
    max_samples=max_samples, max_features=max_features,
    class_balanced=class_balanced, feature_balanced=feature_balanced,
    n_jobs=n_jobs, random_state=0)
rh_subsampling_diff_num_features_clf.fit(Xtr, ytr)
[8]:
RandomHyperboxesClassifier(base_estimator=OnlineGFMM(C=array([], dtype=float64),
                                                     V=array([], dtype=float64),
                                                     W=array([], dtype=float64),
                                                     theta=0.1),
                           max_features=0.5, n_estimators=20, n_jobs=4,
                           random_state=0)
[9]:
print("Training time: %.3f (s)"%(rh_subsampling_diff_num_features_clf.elapsed_training_time))
Training time: 4.155 (s)
[10]:
print('Total number of hyperboxes from all base learners = %d'%rh_subsampling_diff_num_features_clf.get_n_hyperboxes())
Total number of hyperboxes from all base learners = 2212

Prediction

[11]:
y_pred = rh_subsampling_diff_num_features_clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f'Testing accuracy = {acc * 100: .2f}%')
Testing accuracy =  92.11%

Apply pruning to the base learners

[12]:
acc_threshold = 0.5 # minimum validation accuracy required for a hyperbox to be retained
keep_empty_boxes = False # False means hyperboxes not involved in any prediction on the validation set are also eliminated
rh_subsampling_diff_num_features_clf.simple_pruning_base_estimators(X_val, y_val, acc_threshold, keep_empty_boxes)
[12]:
RandomHyperboxesClassifier(base_estimator=OnlineGFMM(C=array([], dtype=float64),
                                                     V=array([], dtype=float64),
                                                     W=array([], dtype=float64),
                                                     theta=0.1),
                           max_features=0.5, n_estimators=20, n_jobs=4,
                           random_state=0)
[13]:
print('Total number of hyperboxes from all base learners after pruning = %d'%rh_subsampling_diff_num_features_clf.get_n_hyperboxes())
Total number of hyperboxes from all base learners after pruning = 1219

Prediction after pruning

[14]:
y_pred_2 = rh_subsampling_diff_num_features_clf.predict(X_test)
acc_pruned = accuracy_score(y_test, y_pred_2)
print(f'Testing accuracy (after pruning) = {acc_pruned * 100: .2f}%')
Testing accuracy (after pruning) =  95.61%
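
The pruning threshold is a tunable knob. The sketch below (threshold values chosen arbitrarily) refits the ensemble for each candidate value, because pruning modifies the model in place:

# Illustrative sweep over the pruning threshold (values chosen arbitrarily).
# The ensemble is refitted each time since pruning modifies it in place.
for threshold in (0.5, 0.6, 0.7):
    clf = RandomHyperboxesClassifier(base_estimator=OnlineGFMM(theta=0.1),
                                     n_estimators=20, max_samples=0.5,
                                     max_features=0.5, n_jobs=4, random_state=0)
    clf.fit(Xtr, ytr)
    clf.simple_pruning_base_estimators(X_val, y_val, threshold, False)
    acc_t = accuracy_score(y_test, clf.predict(X_test))
    print(f'threshold={threshold}: accuracy = {acc_t * 100:.2f}%, '
          f'hyperboxes = {clf.get_n_hyperboxes()}')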

b. The number of features used in each base learner is the same and is equal to the given maximum number of features

[15]:
# Initialise parameters
n_estimators = 20 # number of base learners
max_samples = 0.5 # sampling rate for samples
max_features = 0.5 # sampling rate to generate the maximum number of features
class_balanced = False # do not use the class-balanced sampling mode
# use the same number of features for all base learners, equal to the given maximum number of features
feature_balanced = True
n_jobs = 4 # number of processes used to build base learners
[16]:
# Init a hyperbox-based model used to train base learners
# Using the GFMM classifier with the original online learning algorithm and a maximum hyperbox size of 0.1
base_estimator = OnlineGFMM(theta=0.1)
[17]:
rh_subsampling_same_num_features_clf = RandomHyperboxesClassifier(
    base_estimator=base_estimator, n_estimators=n_estimators,
    max_samples=max_samples, max_features=max_features,
    class_balanced=class_balanced, feature_balanced=feature_balanced,
    n_jobs=n_jobs, random_state=0)
rh_subsampling_same_num_features_clf.fit(Xtr, ytr)
[17]:
RandomHyperboxesClassifier(base_estimator=OnlineGFMM(C=array([], dtype=float64),
                                                     V=array([], dtype=float64),
                                                     W=array([], dtype=float64),
                                                     theta=0.1),
                           feature_balanced=True, max_features=0.5,
                           n_estimators=20, n_jobs=4, random_state=0)
[18]:
print("Training time: %.3f (s)"%(rh_subsampling_same_num_features_clf.elapsed_training_time))
Training time: 0.841 (s)
[19]:
print('Total number of hyperboxes from all base learners = %d'%rh_subsampling_same_num_features_clf.get_n_hyperboxes())
Total number of hyperboxes from all base learners = 3241

Prediction

[20]:
y_pred = rh_subsampling_same_num_features_clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f'Testing accuracy = {acc * 100: .2f}%')
Testing accuracy =  94.74%

Apply pruning to the base learners

[21]:
acc_threshold = 0.5 # minimum validation accuracy required for a hyperbox to be retained
keep_empty_boxes = False # False means hyperboxes not involved in any prediction on the validation set are also eliminated
rh_subsampling_same_num_features_clf.simple_pruning_base_estimators(X_val, y_val, acc_threshold, keep_empty_boxes)
[21]:
RandomHyperboxesClassifier(base_estimator=OnlineGFMM(C=array([], dtype=float64),
                                                     V=array([], dtype=float64),
                                                     W=array([], dtype=float64),
                                                     theta=0.1),
                           feature_balanced=True, max_features=0.5,
                           n_estimators=20, n_jobs=4, random_state=0)

Prediction after pruning

[22]:
y_pred_2 = rh_subsampling_same_num_features_clf.predict(X_test)
acc_pruned = accuracy_score(y_test, y_pred_2)
print(f'Testing accuracy (after pruning) = {acc_pruned * 100: .2f}%')
Testing accuracy (after pruning) =  96.49%

2. Using random undersampling to generate class-balanced training sets for various base learners
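
As a rough illustration of what class-balanced undersampling means here (assuming equal per-class draws, which is an assumption about the sampling mode rather than the library's documented internals), a balanced training subset could be drawn as follows:

# Rough illustration of class-balanced random undersampling (an assumption
# about what class_balanced=True does, not the library's exact internals).
rng = np.random.default_rng(0)
classes = np.unique(ytr)
n_per_class = int(0.5 * len(ytr)) // len(classes)   # e.g. max_samples = 0.5
balanced_idx = np.concatenate([
    rng.choice(np.flatnonzero(ytr == c),
               size=min(n_per_class, np.sum(ytr == c)), replace=False)
    for c in classes
])
Xtr_balanced, ytr_balanced = Xtr[balanced_idx], ytr[balanced_idx]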

a. The number of features used in each base learner is different and is bounded by a maximum number of features

Training

[23]:
# Initialise parameters
n_estimators = 20 # number of base learners
max_samples = 0.5 # sampling rate for samples
max_features = 0.5 # sampling rate to generate the maximum number of features
class_balanced = True # use the class-balanced sampling mode
feature_balanced = False # use different numbers of features for base learners
n_jobs = 4 # number of processes used to build base learners
[24]:
# Init a hyperbox-based model used to train base learners
# Using the GFMM classifier with the original online learning algorithm and a maximum hyperbox size of 0.1
base_estimator = OnlineGFMM(theta=0.1)
[25]:
rh_class_balanced_diff_num_features_clf = RandomHyperboxesClassifier(
    base_estimator=base_estimator, n_estimators=n_estimators,
    max_samples=max_samples, max_features=max_features,
    class_balanced=class_balanced, feature_balanced=feature_balanced,
    n_jobs=n_jobs, random_state=0)
rh_class_balanced_diff_num_features_clf.fit(Xtr, ytr)
[25]:
RandomHyperboxesClassifier(base_estimator=OnlineGFMM(C=array([], dtype=float64),
                                                     V=array([], dtype=float64),
                                                     W=array([], dtype=float64),
                                                     theta=0.1),
                           class_balanced=True, max_features=0.5,
                           n_estimators=20, n_jobs=4, random_state=0)
[26]:
print("Training time: %.3f (s)"%(rh_class_balanced_diff_num_features_clf.elapsed_training_time))
Training time: 4.061 (s)
[27]:
print('Total number of hyperboxes from all base learners = %d'%rh_class_balanced_diff_num_features_clf.get_n_hyperboxes())
Total number of hyperboxes from all base learners = 2288

Prediction

[28]:
y_pred = rh_class_balanced_diff_num_features_clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f'Testing accuracy = {acc * 100: .2f}%')
Testing accuracy =  91.23%

Apply pruning to the base learners

[29]:
acc_threshold = 0.5 # minimum validation accuracy required for a hyperbox to be retained
keep_empty_boxes = False # False means hyperboxes not involved in any prediction on the validation set are also eliminated
rh_class_balanced_diff_num_features_clf.simple_pruning_base_estimators(X_val, y_val, acc_threshold, keep_empty_boxes)
[29]:
RandomHyperboxesClassifier(base_estimator=OnlineGFMM(C=array([], dtype=float64),
                                                     V=array([], dtype=float64),
                                                     W=array([], dtype=float64),
                                                     theta=0.1),
                           class_balanced=True, max_features=0.5,
                           n_estimators=20, n_jobs=4, random_state=0)
[30]:
print('Total number of hyperboxes from all base learners after pruning = %d'%rh_class_balanced_diff_num_features_clf.get_n_hyperboxes())
Total number of hyperboxes from all base learners after pruning = 1546

Prediction after pruning

[31]:
y_pred_2 = rh_class_balanced_diff_num_features_clf.predict(X_test)
acc_pruned = accuracy_score(y_test, y_pred_2)
print(f'Testing accuracy (after pruning) = {acc_pruned * 100: .2f}%')
Testing accuracy (after pruning) =  97.37%

b. The number of features used in each base learner is the same and is equal to the given maximum number of features

[32]:
# Initialise parameters
n_estimators = 20 # number of base learners
max_samples = 0.5 # sampling rate for samples
max_features = 0.5 # sampling rate to generate the maximum number of features
class_balanced = True # use the class-balanced sampling mode
# use the same number of features for all base learners, equal to the given maximum number of features
feature_balanced = True
n_jobs = 4 # number of processes used to build base learners
[33]:
# Init a hyperbox-based model used to train base learners
# Using the GFMM classifier with the original online learning algorithm and a maximum hyperbox size of 0.1
base_estimator = OnlineGFMM(theta=0.1)
[34]:
rh_class_balanced_same_num_features_clf = RandomHyperboxesClassifier(
    base_estimator=base_estimator, n_estimators=n_estimators,
    max_samples=max_samples, max_features=max_features,
    class_balanced=class_balanced, feature_balanced=feature_balanced,
    n_jobs=n_jobs, random_state=0)
rh_class_balanced_same_num_features_clf.fit(Xtr, ytr)
[34]:
RandomHyperboxesClassifier(base_estimator=OnlineGFMM(C=array([], dtype=float64),
                                                     V=array([], dtype=float64),
                                                     W=array([], dtype=float64),
                                                     theta=0.1),
                           class_balanced=True, feature_balanced=True,
                           max_features=0.5, n_estimators=20, n_jobs=4,
                           random_state=0)
[35]:
print("Training time: %.3f (s)"%(rh_class_balanced_same_num_features_clf.elapsed_training_time))
Training time: 0.474 (s)
[36]:
print('Total number of hyperboxes from all base learners = %d'%rh_class_balanced_same_num_features_clf.get_n_hyperboxes())
Total number of hyperboxes from all base learners = 3356

Prediction

[37]:
y_pred = rh_class_balanced_same_num_features_clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f'Testing accuracy = {acc * 100: .2f}%')
Testing accuracy =  91.23%

Apply pruning to the base learners

[38]:
acc_threshold = 0.5 # minimum validation accuracy required for a hyperbox to be retained
keep_empty_boxes = False # False means hyperboxes not involved in any prediction on the validation set are also eliminated
rh_class_balanced_same_num_features_clf.simple_pruning_base_estimators(X_val, y_val, acc_threshold, keep_empty_boxes)
[38]:
RandomHyperboxesClassifier(base_estimator=OnlineGFMM(C=array([], dtype=float64),
                                                     V=array([], dtype=float64),
                                                     W=array([], dtype=float64),
                                                     theta=0.1),
                           class_balanced=True, feature_balanced=True,
                           max_features=0.5, n_estimators=20, n_jobs=4,
                           random_state=0)

Prediction after pruning

[39]:
y_pred_2 = rh_class_balanced_same_num_features_clf.predict(X_test)
acc_pruned = accuracy_score(y_test, y_pred_2)
print(f'Testing accuracy (after pruning) = {acc_pruned * 100: .2f}%')
Testing accuracy (after pruning) =  96.49%