{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Integration of Ensemble Models with Hyper-parameter Optimisation in Sklearn" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This example shows how to integrate the random hyperboxes classifier with the Random Search Cross-Validation functionality implemented by scikit-learn.\n", "\n", "Note that this example uses the random hyperboxes model and Random Search for illustration. However, other hyperbox-based ensemble learning algorithms in the library can be used in the same way with any hyper-parameter tuning method." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import warnings\n", "warnings.filterwarnings('ignore')\n", "import os\n", "import pandas as pd\n", "import numpy as np\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.model_selection import RandomizedSearchCV\n", "from sklearn.model_selection import train_test_split\n", "from hbbrain.numerical_data.ensemble_learner.random_hyperboxes import RandomHyperboxesClassifier\n", "from hbbrain.numerical_data.incremental_learner.onln_gfmm import OnlineGFMM" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load dataset, normalize numerical features into the range of [0, 1] and build training and testing datasets.\n", "This example will use the breast cancer dataset in sklearn for illustration purposes."
] },
{
 "cell_type": "code",
 "execution_count": 2,
 "metadata": {},
 "outputs": [],
 "source": [
  "from sklearn.datasets import load_breast_cancer\n",
  "from sklearn.preprocessing import MinMaxScaler"
 ]
},
{
 "cell_type": "code",
 "execution_count": 3,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Pull the feature matrix and the class labels out of the loaded dataset\n",
  "cancer_data = load_breast_cancer()\n",
  "X, y = cancer_data.data, cancer_data.target"
 ]
},
{
 "cell_type": "code",
 "execution_count": 4,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Fit the min-max scaler on X, then map every feature into the range [0, 1]\n",
  "scaler = MinMaxScaler().fit(X)\n",
  "X = scaler.transform(X)"
 ]
},
{
 "cell_type": "code",
 "execution_count": 5,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Keep 80% of the samples for training; the remaining samples form the test set\n",
  "X_train, X_test, y_train, y_test = train_test_split(\n",
  "    X, y, train_size=0.8, random_state=0\n",
  ")"
 ]
},
{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "## Using Random Search with 5-fold cross-validation."
 ]
},
{
 "cell_type": "code",
 "execution_count": 6,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Candidate values for the settings of the random hyperboxes ensemble\n",
  "ensemble_space = {\n",
  "    'n_estimators': [20, 30, 50, 100, 200, 500],\n",
  "    'max_samples': [0.2, 0.3, 0.4, 0.5, 0.6],\n",
  "    'max_features': [0.2, 0.3, 0.4, 0.5, 0.6],\n",
  "    'class_balanced': [True, False],\n",
  "    'feature_balanced': [True, False],\n",
  "    'n_jobs': [4],\n",
  "    'random_state': [0],\n",
  "}\n",
  "# Candidate values for the base learner, addressed with the 'base_estimator__' prefix\n",
  "base_learner_space = {\n",
  "    'base_estimator__theta': np.arange(0.05, 0.61, 0.05),\n",
  "    'base_estimator__gamma': [0.5, 1, 2, 4, 8, 16],\n",
  "}\n",
  "# Merged search space handed to the random search\n",
  "parameters = {**ensemble_space, **base_learner_space}"
 ]
},
{
 "cell_type": "code",
 "execution_count": 7,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Base learner of the ensemble: a GFMM classifier trained with the original online learning algorithm\n",
  "base_estimator = OnlineGFMM()"
 ]
},
{
 "cell_type": "code",
 "execution_count": 8,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Random search that evaluates only 40 randomly drawn parameter combinations\n",
  "random_hyperboxes_clf = RandomHyperboxesClassifier(base_estimator=base_estimator)\n",
  "clf_rd_search = RandomizedSearchCV(\n",
  "    estimator=random_hyperboxes_clf,\n",
  "    param_distributions=parameters,\n",
  "    n_iter=40,\n",
  "    cv=5,\n",
  "    random_state=0,\n",
  ")"
 ]
},
{
 "cell_type": "code",
 "execution_count": 9,
 "metadata": {},
 "outputs": [
  {
   "data": {
    "text/plain": [
     "RandomizedSearchCV(cv=5,\n",
     " estimator=RandomHyperboxesClassifier(base_estimator=OnlineGFMM(C=array([], dtype=float64),\n",
     " V=array([], dtype=float64),\n",
     " W=array([], dtype=float64))),\n",
     " n_iter=40,\n",
     " param_distributions={'base_estimator__gamma': [0.5, 1, 2, 4,\n",
     " 8, 16],\n",
     " 'base_estimator__theta': array([0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 , 0.55,\n",
     " 0.6 ]),\n",
     " 'class_balanced': [True, False],\n",
     " 'feature_balanced': [True, False],\n",
     " 'max_features': [0.2, 0.3, 0.4, 0.5,\n",
     " 0.6],\n",
     " 'max_samples': [0.2, 0.3, 0.4, 0.5,\n",
     " 0.6],\n",
     " 'n_estimators': [20, 30, 50, 100, 200,\n",
     " 500],\n",
     " 'n_jobs': [4], 'random_state': [0]},\n",
     " random_state=0)"
    ]
   },
   "execution_count": 9,
   "metadata": {},
   "output_type": "execute_result"
  }
 ],
 "source": [
  "# Run the cross-validated random search on the training split\n",
  "clf_rd_search.fit(X_train, y_train)"
 ]
},
{
 "cell_type": "code",
 "execution_count": 10,
 "metadata": {},
 "outputs": [
  {
   "name": "stdout",
   "output_type": "stream",
   "text": [
    "Best average score = 0.9714285714285715\n",
    "Best params: {'random_state': 0, 'n_jobs': 4, 'n_estimators': 500, 'max_samples': 0.6, 'max_features': 0.5, 'feature_balanced': True, 'class_balanced': False, 'base_estimator__theta': 0.15000000000000002, 'base_estimator__gamma': 16}\n"
   ]
  }
 ],
 "source": [
  "# Report the best cross-validation score and the parameter combination that achieved it\n",
  "print(\"Best average score = \", clf_rd_search.best_score_)\n",
  "print(\"Best params: \", clf_rd_search.best_params_)"
 ]
},
{
 "cell_type": "code",
 "execution_count": 12,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Estimator exposed by the random search for its best parameter combination\n",
  "best_gfmm_rd_search = clf_rd_search.best_estimator_"
 ]
},
{
 "cell_type": "code",
 "execution_count": 13,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Testing the performance on the test set\n",
  "y_pred_rd_search = best_gfmm_rd_search.predict(X_test)"
 ]
},
{
 "cell_type": "code",
 "execution_count": 14,
 "metadata": {},
 "outputs": [
  {
   "name": "stdout",
   "output_type": "stream",
   "text": [
    "Accuracy (random-search) = 96.49%\n"
   ]
  }
 ],
 "source": [
  "# Share of test samples whose predicted label matches the true label\n",
  "acc_rd_search = accuracy_score(y_test, y_pred_rd_search)\n",
  "# Use ':.2f' rather than ': .2f': the space sign flag would insert an extra blank before the number\n",
  "print(f'Accuracy (random-search) = {acc_rd_search * 100:.2f}%')"
 ]
}
],
"metadata": {
 "kernelspec": {
  "display_name": "Python 3",
  "language": "python",
  "name": "python3"
 },
 "language_info": {
  "codemirror_mode": {
   "name": "ipython",
   "version": 3
  },
  "file_extension": ".py",
  "mimetype": "text/x-python",
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
  "version": "3.6.7"
 }
},
"nbformat": 4,
"nbformat_minor": 4
}