Quickstart

Training a model

Simply use an estimator by initialising, fitting and predicting:

from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from hbbrain.numerical_data.incremental_learner.onln_gfmm import OnlineGFMM
# Load dataset
X, y = load_iris(return_X_y=True)
# Normalise features into the range of [0, 1] because hyperbox-based models only work in a unit range
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Training a model
clf = OnlineGFMM(theta=0.1).fit(X_train, y_train)
# Make prediction
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f'Accuracy = {acc * 100: .2f}%')

In an sklearn Pipeline

Using hyperbox-based estimators in a sklearn Pipeline:

from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from hbbrain.numerical_data.incremental_learner.onln_gfmm import OnlineGFMM

# Load dataset
X, y = load_iris(return_X_y=True)
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Create a GFMM model
onln_gfmm_clf = OnlineGFMM(theta=0.1)
# Create a pipeline
pipe = Pipeline([
   ('scaler', MinMaxScaler()),
   ('onln_gfmm', onln_gfmm_clf)
])
# Training
pipe.fit(X_train, y_train)
# Make prediction
acc = pipe.score(X_test, y_test)
print(f'Testing accuracy = {acc * 100: .2f}%')

Hyper-parameter search

This example shows how to use hyperbox-based models with sklearn random search:

from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from hbbrain.numerical_data.ensemble_learner.random_hyperboxes import RandomHyperboxesClassifier
from hbbrain.numerical_data.incremental_learner.onln_gfmm import OnlineGFMM

# Load dataset
X, y = load_breast_cancer(return_X_y=True)
# Normalise features into the range of [0, 1] because hyperbox-based models only work in a unit range
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Initialise search ranges for hyper-parameters
parameters = {'n_estimators': [20, 30, 50, 100, 200, 500],
           'max_samples': [0.2, 0.3, 0.4, 0.5, 0.6],
           'max_features' : [0.2, 0.3, 0.4, 0.5, 0.6],
           'class_balanced' : [True, False],
           'feature_balanced' : [True, False],
           'n_jobs' : [4],
           'random_state' : [0],
           'base_estimator__theta' : np.arange(0.05, 0.61, 0.05),
           'base_estimator__gamma' : [0.5, 1, 2, 4, 8, 16]}
# Init base learner. This example uses the original online learning algorithm to train a GFMM classifier
base_estimator = OnlineGFMM()
# Using random search with only 40 random combinations of parameters
random_hyperboxes_clf = RandomHyperboxesClassifier(base_estimator=base_estimator)
clf_rd_search = RandomizedSearchCV(random_hyperboxes_clf, parameters, n_iter=40, cv=5, random_state=0)
# Fit model
clf_rd_search.fit(X_train, y_train)
# Print out best scores and hyper-parameters
print("Best average score = ", clf_rd_search.best_score_)
print("Best params: ", clf_rd_search.best_params_)
# Using the best model to make prediction
best_gfmm_rd_search = clf_rd_search.best_estimator_
y_pred_rd_search = best_gfmm_rd_search.predict(X_test)
acc_rd_search = accuracy_score(y_test, y_pred_rd_search)
print(f'Accuracy (random-search) = {acc_rd_search * 100: .2f}%')