# -*- coding: utf-8 -*-
# License: BSD-3-Clause
# Author: LKouadio <etanoyau@gmail.com>
from __future__ import annotations
import re
import sys
import inspect
import itertools
import numpy as np
from collections import defaultdict
from warnings import warn
from ._watexlog import watexlog
from ._docstring import (
DocstringComponents,
_core_docs
)
from ._typing import (
List,
Optional,
DataFrame ,
Tuple
)
from .exlib.sklearn import (
clone,
LabelEncoder,
_name_estimators ,
BaseEstimator,
ClassifierMixin,
accuracy_score,
recall_score,
precision_score,
roc_auc_score,
train_test_split
)
from .exceptions import NotFittedError
from .utils.coreutils import _is_readable
from .utils.funcutils import (
_assert_all_types,
repr_callable_obj,
smart_strobj_recognition,
smart_format ,
sanitize_frame_cols
)
from .utils._dependency import import_optional_dependency
from .utils.validator import (
array_to_frame,
check_array,
check_X_y,
get_estimator_name
)
# Public API of this module: the names exported by a star-import.
# NOTE(review): several of these names are not defined in the part of the
# file visible here -- confirm each one exists further down.
__all__=[
    "Data",
    "Missing",
    "AdalineGradientDescent",
    "AdalineStochasticGradientDescent",
    "SequentialBackwardSelection",
    "MajorityVoteClassifier",
    "GreedyPerceptron",
    "existfeatures",
    "selectfeatures" ,
    "get_params"
]
# +++ add base documentations +++
_base_params = dict (
axis="""
axis: {0 or 'index', 1 or 'columns'}, default 0
Determine if rows or columns which contain missing values are
removed.
* 0, or 'index' : Drop rows which contain missing values.
* 1, or 'columns' : Drop columns which contain missing value.
Changed in version 1.0.0: Pass tuple or list to drop on multiple
axes. Only a single axis is allowed.
""",
columns="""
columns: str or list of str
columns to replace which contain the missing data. Can use the axis
equals to '1'.
""",
name="""
name: str, :attr:`pandas.Series.name`
A singluar column name. If :class:`pandas.Series` is given, 'name'
denotes the attribute of the :class:`pandas.Series`. Preferably `name`
must correspond to the label name of the target.
""",
sample="""
sample: int, Optional,
Number of row to visualize or the limit of the number of sample to be
able to see the patterns. This is usefull when data is composed of
many rows. Skrunked the data to keep some sample for visualization is
recommended. ``None`` plot all the samples ( or examples) in the data
""",
kind="""
kind: str, Optional
type of visualization. Can be ``dendrogramm``, ``mbar`` or ``bar``.
``corr`` plot for dendrogram , :mod:`msno` bar, :mod:`plt`
and :mod:`msno` correlation visualization respectively:
* ``bar`` plot counts the nonmissing data using pandas
* ``mbar`` use the :mod:`msno` package to count the number
of nonmissing data.
* dendrogram`` show the clusterings of where the data is missing.
leaves that are the same level predict one onother presence
(empty of filled). The vertical arms are used to indicate how
different cluster are. short arms mean that branch are
similar.
* ``corr` creates a heat map showing if there are correlations
where the data is missing. In this case, it does look like
the locations where missing data are corollated.
* ``None`` is the default vizualisation. It is useful for viewing
contiguous area of the missing data which would indicate that
the missing data is not random. The :code:`matrix` function
includes a sparkline along the right side. Patterns here would
also indicate non-random missing data. It is recommended to limit
the number of sample to be able to see the patterns.
Any other value will raise an error.
""",
inplace="""
inplace: bool, default False
Whether to modify the DataFrame rather than creating a new one.
"""
)
# Bundle the shared ("core") and module-specific ("base") parameter
# descriptions so docstring templates can reference them as
# ``{params.core.<name>}`` and ``{params.base.<name>}``.
_param_docs = DocstringComponents.from_nested_components(
    core=_core_docs["params"],
    base = DocstringComponents(_base_params)
)
# +++ end base documentations +++
# Module-level logger obtained from the project's logging helper.
_logger = watexlog().get_watex_logger(__name__)
class _Base:
"""Base class for all classes in watex for parameters retrievals
Notes
-----
All class defined should specify all the parameters that can be set
at the class level in their ``__init__`` as explicit keyword
arguments (no ``*args`` or ``**kwargs``).
"""
@classmethod
def _get_param_names(cls):
"""Get parameter names for the estimator"""
# fetch the constructor or the original constructor before
# deprecation wrapping if any
init = getattr(cls.__init__, "deprecated_original", cls.__init__)
if init is object.__init__:
# No explicit constructor to introspect
return []
# introspect the constructor arguments to find the model parameters
# to represent
init_signature = inspect.signature(init)
# Consider the constructor parameters excluding 'self'
parameters = [
p
for p in init_signature.parameters.values()
if p.name != "self" and p.kind != p.VAR_KEYWORD
]
for p in parameters:
if p.kind == p.VAR_POSITIONAL:
raise RuntimeError(
"watex classes should always "
"specify their parameters in the signature"
" of their __init__ (no varargs)."
" %s with constructor %s doesn't "
" follow this convention." % (cls, init_signature)
)
# Extract and sort argument names excluding 'self'
return sorted([p.name for p in parameters])
def get_params(self, deep=True):
"""
Get parameters for this estimator.
Parameters
----------
deep : bool, default=True
If True, will return the parameters for this class and
contained subobjects.
Returns
-------
params : dict
Parameter names mapped to their values.
"""
out = dict()
for key in self._get_param_names():
value = getattr(self, key)
if deep and hasattr(value, "get_params"):
deep_items = value.get_params().items()
out.update((key + "__" + k, val) for k, val in deep_items)
out[key] = value
return out
def set_params(self, **params):
"""Set the parameters of this estimator.
The method works on simple classes as well as on nested objects
(such as :class:`~sklearn.pipeline.Pipeline`). The latter have
parameters of the form ``<component>__<parameter>`` so that it's
possible to update each component of a nested object.
Parameters
----------
**params : dict
Estimator parameters.
Returns
-------
self : estimator instance
Estimator instance.
"""
if not params:
# Simple optimization to gain speed (inspect is slow)
return self
valid_params = self.get_params(deep=True)
nested_params = defaultdict(dict) # grouped by prefix
for key, value in params.items():
key, delim, sub_key = key.partition("__")
if key not in valid_params:
local_valid_params = self._get_param_names()
raise ValueError(
f"Invalid parameter {key!r} for estimator {self}. "
f"Valid parameters are: {local_valid_params!r}."
)
if delim:
nested_params[key][sub_key] = value
else:
setattr(self, key, value)
valid_params[key] = value
for key, sub_params in nested_params.items():
valid_params[key].set_params(**sub_params)
return self
[docs]
class Data:
    # The class docstring is attached after the class body through
    # ``Data.__doc__ = ...`` so that it can be built from shared templates.
    def __init__ (self, verbose: int =0):
        # Logger named after the concrete class, so a subclass logs under
        # its own name.
        self._logging= watexlog().get_watex_logger(self.__class__.__name__)
        self.verbose=verbose
        # Parsed dataset; stays None until data is assigned (``inspect``
        # treats None as "not fitted").
        self.data_=None
@property
def data (self ):
    """Return the stored dataset (``self.data_``); None if nothing was set."""
    return self.data_
@data.setter
def data (self, d):
    """Store `d` after passing it through ``_is_readable``.

    NOTE(review): ``_is_readable`` is defined elsewhere; presumably it
    reads a file path or validates a frame -- confirm.
    """
    self.data_ = _is_readable (d)
@property
def describe (self):
    """Return summary statistics of the stored data.

    This simply delegates to ``self.data.describe()``. The original note
    states that, by default, only numeric columns are reported; for full
    control call ``describe`` on the frame yourself.
    """
    return self.data.describe()
[docs]
def fit(self, data: str | DataFrame=None):
    """Read, assert and fit the data.

    After validation, every column of the (sanitized) frame is exposed as
    an attribute of the instance, e.g. ``obj.<column_name>``.

    Parameters
    ------------
    data: Dataframe or shape (M, N) from :class:`pandas.DataFrame`
        Dataframe containing samples M and features N. When None, the
        data already stored on the instance is used.

    Returns
    ---------
    :class:`Data` instance
        Returns ``self`` for easy method chaining.
    """
    if data is not None:
        self.data = data
    # Called for validation only: the return value is not used.
    check_array(
        self.data,
        force_all_finite='allow-nan',
        dtype=object,
        input_name='Data',
        to_frame=True
    )
    # For consistency, if not a frame, set it to a frame.
    self.data = array_to_frame(
        self.data, to_frame=True, input_name='col_', force=True
    )
    data = sanitize_frame_cols(self.data, fill_pattern='_')
    for col in data.columns:
        # Never let a column name clobber the class API: a column called
        # 'data' would go through the property setter and replace the
        # dataset, 'fit'/'drop' would shadow methods, and read-only
        # properties such as 'describe' would raise on assignment.
        # Non-string labels cannot be attribute names at all.
        if not isinstance(col, str) or hasattr(type(self), col):
            if self.verbose:
                warn(f"Column {col!r} is not exposed as an attribute of"
                     f" {self.__class__.__name__!r}: the name is reserved"
                     " or invalid.")
            continue
        setattr(self, col, data[col])
    return self
[docs]
def shrunk (self,
            columns: list[str],
            data: str | DataFrame = None,
            **kwd
            ):
    """Reduce the data to the given features.

    Parameters
    ------------
    columns: str or list of str
        Columns or features to keep in the datasets
    data: Dataframe or shape (M, N) from :class:`pandas.DataFrame`
        Dataframe containing samples M and features N. When given, it
        replaces the data stored on the instance before the selection.
    kwd: dict,
        additional keywords arguments from
        :func:`watex.utils.mlutils.selectfeatures`

    Returns
    ---------
    :class:`Data` instance
        Returns ``self`` for easy method chaining.

    Raises
    ------
    NotFittedError
        If no data is stored and none is passed.
    """
    # Honour the `data` argument (it used to be silently ignored), the same
    # way the other methods of this class do.
    if data is not None:
        self.data = data
    self.inspect
    self.data = selectfeatures(
        self.data, features=columns, **kwd)
    return self
@property
def inspect(self):
    """Check that data is available on the instance.

    Returns
    -------
    int
        ``1`` when ``self.data_`` is set.

    Raises
    ------
    NotFittedError
        When ``self.data_`` is None, i.e. no data was assigned yet.
    """
    if self.data_ is not None:
        return 1
    template = ("{dobj.__class__.__name__} instance is not fitted yet."
                " Call 'fit' with appropriate arguments before using"
                " this method")
    raise NotFittedError(template.format(dobj=self))
[docs]
def profilingReport (self, data: str | DataFrame= None, **kwd):
    """Generate a report in a notebook.

    It will summarize the types of the columns and allow you to view
    details of quantiles statistics, a histogram, common values and
    extreme values.

    Parameters
    ------------
    data: Dataframe or shape (M, N) from :class:`pandas.DataFrame`
        Dataframe containing samples M and features N. When None, the
        data already stored on the instance is used.
    kwd: dict
        Additional keyword arguments passed to
        ``pandas_profiling.ProfileReport``.

    Returns
    ---------
    ``pandas_profiling.ProfileReport``
        The report built from the stored data.

    Raises
    ------
    NotFittedError
        If no data is stored and none is passed.
    ImportError
        If 'pandas_profiling' cannot be imported.

    Examples
    ---------
    >>> from watex.base import Data
    >>> Data().fit(data).profilingReport()
    """
    extra_msg =("'Data.profilingReport' method uses 'pandas-profiling'"
                " as a dependency.")
    import_optional_dependency("pandas_profiling", extra=extra_msg )
    # Store the data *before* checking the fitted state, and test against
    # None explicitly: ``data or self.data`` raises on a DataFrame because
    # its truth value is ambiguous.
    if data is not None:
        self.data = data
    self.inspect
    try :
        from pandas_profiling import ProfileReport
    except ImportError as err:
        msg=(f"Missing of 'pandas_profiling package. {extra_msg}"
             " Cannot plot profiling report. Install it using pip"
             " or conda.")
        warn(msg)
        raise ImportError (msg) from err
    return ProfileReport( self.data , **kwd)
[docs]
def rename (self,
            data: str | DataFrame= None,
            columns: List[str]=None,
            pattern:Optional[str] = None
            ):
    """
    Rename columns of the dataframe: names are put in lowercase and the
    characters matched by `pattern` are replaced by underscores.

    Parameters
    -----------
    data: Dataframe of shape (M, N) from :class:`pandas.DataFrame`
        Dataframe containing samples M and features N. When None, the
        data already stored on the instance is used.
    columns: str or list of str, Optional
        The specific columns to rename; the other columns only get their
        surrounding whitespace stripped. When None, every column is
        renamed. If a column is not in the dataframe, an error raises.
    pattern: str, Optional,
        Regular expression pattern of the characters to replace. By
        default, the pattern is ``'[ -@*#&+/]'``.

    Return
    -------
    ``self``: :class:`~watex.base.Data` instance
        returns ``self`` for easy method chaining.

    Raises
    ------
    NotFittedError
        If no data is stored and none is passed.
    """
    pattern = str (pattern)
    if pattern =='None':
        # NOTE(review): inside the character class, `` -@`` is a *range*
        # (space .. '@') which also matches digits and most ASCII
        # punctuation -- confirm this is intended.
        pattern = r'[ -@*#&+/]'
    regex =re.compile (pattern, flags=re.IGNORECASE)

    if data is not None:
        self.data = data
    self.inspect
    self.data.columns= self.data.columns.str.strip()

    if columns is None:
        self.data.columns = self.data.columns.str.lower().map(
            lambda o: regex.sub('_', o))
        return self

    if isinstance(columns, str):
        columns = [columns]
    existfeatures(self.data, columns, 'raise')
    # Rename on the frame itself. Assigning to
    # ``self.data[columns].columns`` only modified a temporary copy, so
    # the selective renaming had no effect.
    mapping = {name: regex.sub('_', name.lower()) for name in columns}
    self.data.rename(columns=mapping, inplace=True)
    return self
#XXX TODO # use logical and to quick merge two frames
[docs]
def merge (self) :
    """Merge two series whatever the type with operator `&&`.

    When a series has dtype object or holds non-numeric values, dtypes
    should be changed into object.

    Not implemented yet: the body only holds this docstring, so the
    method currently returns None.
    """
    # TODO: implement; the draft intended to alias the result as
    # ``__and__ = __rand__ = merge``.
[docs]
def drop (
    self,
    labels: list[str |int] = None,
    columns: List[str]=None,
    inplace:bool = False,
    axis:int = 0 , **kws
    ):
    """Drop specified labels from rows or columns.

    Thin wrapper around ``self.data.drop``: remove rows or columns by
    specifying label names and corresponding axis, or by specifying
    directly index or column names. When using a multi-index, labels on
    different levels can be removed by specifying the level.

    Parameters
    -----------
    labels: single label or list-like
        Index or column labels to drop. A tuple will be used as a single
        label and not treated as a list-like.
    columns: single label or list-like
        Alternative to specifying axis
        (labels, axis=1 is equivalent to columns=labels)
    inplace: bool, default False
        Forwarded to the underlying ``drop`` call.
    axis: {0 or 'index', 1 or 'columns'}, default 0
        Whether to drop labels from the index (0 or 'index')
        or columns (1 or 'columns').
    kws: dict,
        Additional keywords arguments passed to :meth:`pd.DataFrame.drop`.

    Returns
    ----------
    DataFrame or None
        DataFrame without the removed index or column labels or
        None if `inplace` equals to ``True``. Note that, unlike most
        methods of this class, ``self`` is not returned.

    Raises
    ------
    NotFittedError
        If no data is stored on the instance.
    """
    self.inspect
    data = self.data.drop(labels= labels, inplace = inplace,
                          columns = columns , axis =axis , **kws )
    return data
def __repr__(self):
    """Return the representation built by ``repr_callable_obj``.

    The helper is called with ``skip='y'``.
    NOTE(review): presumably this hides attributes named/starting with
    'y' -- confirm against ``repr_callable_obj``.
    """
    return repr_callable_obj (self, skip ='y')
def __getattr__(self, name):
    """Fallback for failed attribute lookups; always raises.

    Raises
    ------
    NotFittedError
        For ``data_`` / ``X_`` when they are not set on the instance.
    AttributeError
        For any other name, with a "Do you mean ...?" hint when
        ``smart_strobj_recognition`` finds a close match among the
        instance attributes.
    """
    fitted_only = ('data_', 'X_')
    if (name.endswith('_')
            and name not in self.__dict__.keys()
            and name in fitted_only):
        raise NotFittedError(
            f'Fit the {self.__class__.__name__!r} object first'
        )

    suggestion = smart_strobj_recognition(name, self.__dict__, deep=True)
    if suggestion is None:
        hint = ""
    else:
        hint = f'. Do you mean {suggestion!r}' + "?"
    raise AttributeError(
        f'{self.__class__.__name__!r} object has no attribute {name!r}'
        + hint
    )
# Class docstring of ``Data``, assembled from the shared parameter and
# return templates. The section layout (blank lines, indented directive
# body) is required for the text to render as numpydoc/RST.
Data.__doc__="""\
Data base class

Typically, we train a model with a matrix of data. Note that pandas Dataframe
is the most used because it is very nice to have columns labels even though
Numpy arrays work as well.

For supervised Learning for instance, such as regression or classification,
our intent is to have a function that transforms features into a label. If we
were to write this as an algebra formula, it would look like:

.. math::

    y = f(X)

:code:`X` is a matrix. Each row represent a `sample` of data or information
about individual. Every columns in :code:`X` is a `feature`. The output of
our function, :code:`y`, is a vector that contains labels (for classification)
or values (for regression).

In Python, by convention, we use the variable name :code:`X` to hold the
sample data even though the capitalization of variable is a violation of
standard naming convention (see PEP8).

Parameters
-----------
{params.core.data}
{params.base.columns}
{params.base.axis}
{params.base.sample}
{params.base.kind}
{params.base.inplace}
{params.core.verbose}

Returns
-------
{returns.self}

Examples
--------
.. include:: ../docs/data.rst
""".format(
    params=_param_docs,
    returns=_core_docs["returns"],
)
[docs]
class Missing (Data) :
    """Deal with missing values in Data.

    Most algorithms will not work with missing data. Notable exceptions are
    the recent boosting libraries such as the XGBoost
    (:doc:`watex.documentation.xgboost.__doc__`), CatBoost and LightGBM.
    As with many things in machine learning, there are no hard answers for
    how to treat missing data. Also, missing data could represent different
    situations. There are various ways to handle missing data:

    * Remove any row with missing data
    * Remove any columns with missing data
    * Impute missing values
    * Create an indicator column to indicate that data was missing

    Parameters
    -----------
    in_percent: bool,
        give the statistic of missing data in percentage if set to ``True``.
    sample: int, Optional,
        Number of row to visualize or the limit of the number of sample to
        be able to see the patterns. This is useful when data is composed of
        many rows. Shrinking the data to keep some sample for visualization
        is recommended. ``None`` plot all the samples (or examples) in the
        data.
    kind: str, Optional
        type of visualization. Can be ``dendrogramm``, ``mbar`` or ``bar``.
        ``corr`` plot for dendrogram , :mod:`msno` bar, :mod:`plt`
        and :mod:`msno` correlation visualization respectively:

        * ``bar`` plot counts the nonmissing data using pandas
        * ``mbar`` use the :mod:`msno` package to count the number
          of nonmissing data.
        * ``dendrogram`` show the clusterings of where the data is missing.
          leaves that are the same level predict one another presence
          (empty of filled). The vertical arms are used to indicate how
          different cluster are. short arms mean that branch are
          similar.
        * ``corr`` creates a heat map showing if there are correlations
          where the data is missing. In this case, it does look like
          the locations where missing data are correlated.
        * ``None`` is the default visualization. It is useful for viewing
          contiguous area of the missing data which would indicate that
          the missing data is not random. The :code:`matrix` function
          includes a sparkline along the right side. Patterns here would
          also indicate non-random missing data. It is recommended to limit
          the number of sample to be able to see the patterns.

        Any other value will raise an error.
    drop_columns: list of str, Optional
        Columns removed by :meth:`drop`. When None, :meth:`drop` removes
        the rows or columns holding missing values instead.
    kws: dict
        Additional keyword arguments passed to the parent constructor.

    Examples
    --------
    >>> from watex.base import Missing
    >>> data ='data/geodata/main.bagciv.data.csv'
    >>> ms= Missing().fit(data)
    >>> ms.plot(figsize = (12, 4 ) )
    """
    def __init__(self,
                 in_percent = False,
                 sample = None,
                 kind = None,
                 drop_columns: List[str]=None,
                 **kws):
        self.in_percent = in_percent
        self.kind = kind
        self.sample= sample
        self.drop_columns=drop_columns
        # Cache of the last result computed by the ``isnull`` property.
        self.isnull_ = None
        super().__init__(**kws)
@property
def isnull(self):
    """Return the share of missing values per column.

    The result is the column-wise mean of ``self.data.isnull()``,
    multiplied by 100 when ``in_percent`` is set. It is also cached in
    ``self.isnull_``.
    """
    if self.in_percent:
        ratio = self.data.isnull().mean() * 1e2
    else:
        ratio = self.data.isnull().mean()
    self.isnull_ = ratio
    return self.isnull_
[docs]
def plot(self, figsize:Tuple [int] = None, **kwd ):
    """
    Visualize patterns in the missing data.

    The kind of plot and the number of rows shown are taken from the
    instance attributes ``kind`` and ``sample`` set at construction; see
    the class documentation for their accepted values.

    Parameters
    ------------
    figsize: tuple of int, Optional
        Figure size, passed to ``ExPlot`` as ``fig_size``.
    kwd: dict
        Additional keywords arguments forwarded to
        ``ExPlot.plotmissing``.

    Return
    -------
    ``self``: :class:`~watex.base.Missing` instance
        returns ``self`` for easy method chaining.

    Raises
    ------
    NotFittedError
        If no data is stored on the instance.

    Examples
    --------
    >>> from watex.base import Missing
    >>> data ='data/geodata/main.bagciv.data.csv'
    >>> ms= Missing().fit(data)
    >>> ms.plot(figsize = (12, 4 ) )
    """
    self.inspect
    # Imported inside the method rather than at module level.
    # NOTE(review): presumably to avoid a circular import or a hard
    # plotting dependency at import time -- confirm.
    from .view.plot import ExPlot
    ExPlot(fig_size=figsize).fit(self.data).plotmissing(
        kind = self.kind, sample = self.sample,**kwd )
    return self
@property
def get_missing_columns(self):
    """Return the list of column labels holding at least one NaN value."""
    has_missing = self.data.isna().any()
    affected = self.data.columns[has_missing]
    return list(affected)
[docs]
def drop (self,
          data : str | DataFrame =None,
          columns: List[str] = None,
          inplace = False,
          axis = 1 ,
          **kwd
          ):
    """Remove missing data.

    Two modes are available:

    * no column is requested (``columns`` is None and ``drop_columns`` was
      not set): rows or columns holding missing values are removed with
      :meth:`pandas.DataFrame.dropna`;
    * otherwise the requested columns are removed with
      :meth:`pandas.DataFrame.drop`.

    Parameters
    -----------
    data: Dataframe of shape (M, N) from :class:`pandas.DataFrame`
        Dataframe containing samples M and features N. When None, the
        data already stored on the instance is used.
    columns: str or list of str
        columns to drop. When given, it is stored in ``drop_columns`` and
        must exist in the data.
    inplace: bool, default False
        Whether to modify the DataFrame rather than creating a new one.
        In both cases the stored data reflects the removal afterwards.
    axis: {0 or 'index', 1 or 'columns'}, default 1
        Determine if rows or columns which contain missing values are
        removed.

        * 0, or 'index' : Drop rows which contain missing values.
        * 1, or 'columns' : Drop columns which contain missing value.
    kwd: dict
        Additional keyword arguments forwarded to the pandas call. For the
        missing-value mode these are the ``dropna`` options:

        how: {'any', 'all'}, default 'any'
            * 'any': If any NA values are present, drop that row or column.
            * 'all' : If all values are NA, drop that row or column.
        thresh: int, optional
            Require that many non-NA values. Cannot be combined with how.
        subset: column label or sequence of labels, optional
            Labels along other axis to consider, e.g. if you are dropping
            rows these would be a list of columns to include.

    Returns
    -------
    ``self``: :class:`~watex.base.Missing` instance
        returns ``self`` for easy method chaining.

    Raises
    ------
    NotFittedError
        If no data is stored and none is passed.
    """
    if data is not None:
        self.data = data
    self.inspect

    if columns is not None:
        self.drop_columns = columns
        existfeatures(self.data , self.drop_columns, error ='raise')

    if self.drop_columns is None:
        if inplace :
            self.data.dropna (axis = axis , inplace = True, **kwd )
        else :
            self.data = self.data .dropna (
                axis = axis , inplace = False, **kwd )
    else:
        if inplace :
            self.data.drop (columns = self.drop_columns ,
                            axis = axis, inplace = True,
                            **kwd)
        else :
            # The previous code read the nonexistent ``self.columns`` and
            # threw the new frame away; drop the requested columns and keep
            # the result, as the ``dropna`` branch does.
            self.data = self.data.drop (
                columns = self.drop_columns , axis = axis ,
                inplace = False , **kwd)
    return self
@property
def sanity_check (self):
    """Tell whether missing values remain in the data.

    Returns a single boolean: ``True`` if at least one cell of the
    DataFrame is missing, ``False`` once every missing value has been
    dealt with.
    """
    return self.data.isna().any().any()
[docs]
def replace (self,
             data:str |DataFrame = None ,
             columns: List[str] = None,
             fill_value: float = None ,
             new_column_name: str= None,
             return_non_null: bool = False,
             **kwd):
    """
    Replace the missing values to consider.

    With ``return_non_null``, use the :code:`coalesce` function of
    :mod:`pyjanitor`. It takes a dataframe and a list of columns to
    consider. This is similar to a functionality found in Excel and SQL
    databases. It returns the first non null value of each row.

    Parameters
    -----------
    data: Dataframe of shape (M, N) from :class:`pandas.DataFrame`
        Dataframe containing samples M and features N. When None, the
        data already stored on the instance is used.
    columns: str or list of str
        columns to consider. When given, they must exist in the data.
    fill_value: float, Optional
        Value used to fill the missing values of the stored data.
    new_column_name: str
        Name of the column receiving the coalesced values. Required when
        `return_non_null` is True.
    return_non_null: bool, default False
        If True, return the frame produced by pyjanitor's ``coalesce``
        instead of ``self``.
    kwd: dict
        Additional keyword arguments passed to
        :meth:`pandas.DataFrame.fillna`.

    Returns
    -------
    ``self``: :class:`~watex.base.Missing` instance
        returns ``self`` for easy method chaining, or the coalesced
        DataFrame when `return_non_null` is True.

    Raises
    ------
    NotFittedError
        If no data is stored and none is passed.
    ModuleNotFoundError
        If `return_non_null` is True and pyjanitor is not installed.
    """
    if data is not None:
        self.data = data
    self.inspect
    # Same guard as in ``drop``: only validate columns that were given.
    if columns is not None:
        existfeatures(self.data , columns )

    if return_non_null :
        new_column_name = _assert_all_types(new_column_name, str )
        # pyjanitor is imported as ``janitor``. Probing ``sys.modules`` for
        # 'pyjanitor' only told whether a module of that name had already
        # been imported, so the check could never succeed.
        try :
            import janitor as jn
        except ImportError as err:
            raise ModuleNotFoundError(" 'pyjanitor' is missing.Install it"
                                      " mannualy using conda or pip."
                                      ) from err
        if columns is None:
            names = list(self.data.columns)
        elif isinstance(columns, str):
            names = [columns]
        else :
            names = list(columns)
        # NOTE(review): signature of recent pyjanitor releases; older
        # releases took the column list and ``new_column_name`` -- confirm
        # against the pinned version.
        return jn.coalesce (self.data , *names,
                            target_column_name = new_column_name)

    if fill_value is not None:
        # Fill missing values with a particular value.
        try :
            self.data = self.data .fillna(fill_value , **kwd)
        except Exception as err:
            # Best-effort fallback, kept from the original design: try
            # pyjanitor, and report instead of silently swallowing the
            # failure when it is not available.
            try :
                import janitor as jn
            except ImportError:
                warn(f"Unable to fill the missing values: {err}")
            else :
                jn.fill_empty (
                    self.data ,
                    list(self.data.columns) if columns is None else columns,
                    fill_value
                )
    return self
[docs]
class SequentialBackwardSelection (_Base ):
    r"""
    Sequential Backward Selection (SBS) is a feature selection algorithm
    which aims to reduce dimensionality of the initial feature subspace with
    a minimum decay in the performance of the classifier to improve upon
    computational efficiency. In certain cases, SBS can even improve the
    predictive power of the model if a model suffers from overfitting.

    The idea behind the SBS is simple: it sequentially removes features
    from the full feature subset until the new feature subspace contains the
    desired number of features. In order to determine which feature is to be
    removed at each stage, the criterion function :math:`J` is needed [1]_.

    Indeed, the criterion calculated from the criterion function can simply
    be the difference in performance of the classifier before and after the
    removal of this particular feature. Then, the feature to be removed at
    each stage can simply be defined as the feature that maximizes this
    criterion; or in more simple terms, at each stage, the feature that
    causes the least performance loss after removal is eliminated. Based on
    the preceding definition of SBS, the algorithm can be outlined with a
    few steps:

    1. Initialize the algorithm with :math:`k=d`, where :math:`d` is the
       dimensionality of the full feature space, :math:`X_d`.
    2. Determine the feature :math:`x^{-}` that maximizes the criterion:
       :math:`x^{-}= argmax J(X_k-x)`, where :math:`x\in X_k`.
    3. Remove the feature :math:`x^{-}` from the feature set
       :math:`X_{k-1}= X_k -x^{-}; k=k-1`.
    4. Terminate if :math:`k` equals to the number of desired features;
       otherwise go to the step 2. [2]_

    Parameters
    -----------
    estimator: callable or instantiated object,
        callable or instance object that has a fit method.
    k_features: int, default=1
        the number of features to keep at the end of the selection. It must
        not exceed the number of features in the training set, otherwise it
        does not make sense.
    scoring: callable or str , default='accuracy'
        metric for scoring. Available metrics are 'precision', 'recall',
        'roc_auc' or 'accuracy'. Any other metric will raise an error.
    test_size : float or int, default=0.25
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples.
    random_state : int, RandomState instance or None, default=42
        Controls the shuffling applied to the data before applying the
        split. Pass an int for reproducible output across multiple function
        calls.

    References
    -----------
    .. [1] Raschka, S., Mirjalili, V., 2019. Python Machine Learning, 3rd ed.
        Packt.
    .. [2] Ferri F., Pudil F., Hatef M., and Kittler J., Comparative study of
        the techniques for Large-scale feature selection, pages 403-413,
        1994.

    Attributes
    -----------
    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.
    indices_: tuple of int
        Column indices of the best feature subset found for the last
        evaluated dimension.
    subsets_: list,
        list of the successive `indices_`
    scores_: list,
        Collection of the scores of the best subset of each dimension.
    k_score_: float,
        The score obtained with the desired number of features.

    Examples
    --------
    >>> from watex.exlib.sklearn import KNeighborsClassifier , train_test_split
    >>> from watex.datasets import fetch_data
    >>> from watex.base import SequentialBackwardSelection
    >>> X, y = fetch_data('bagoue analysed') # data already standardized
    >>> Xtrain, Xt, ytrain, yt = train_test_split(X, y)
    >>> knn = KNeighborsClassifier(n_neighbors=5)
    >>> sbs= SequentialBackwardSelection (knn)
    >>> sbs.fit(Xtrain, ytrain )
    """
    # Supported metrics: the name accepted by `scoring` mapped to the
    # scoring function that is called as ``scorer(y_true, y_pred)``.
    _scorers = dict (accuracy = accuracy_score , recall = recall_score ,
                     precision = precision_score, roc_auc= roc_auc_score
                     )
    def __init__ (self, estimator=None , k_features=1 ,
                  scoring ='accuracy', test_size = .25 ,
                  random_state = 42 ):
        self.estimator=estimator
        self.k_features=k_features
        self.scoring=scoring
        self.test_size=test_size
        self.random_state=random_state
[docs]
def fit(self, X, y) :
    """Fit the training data.

    Note that SBS splits the dataset into a test and a training part
    inside the fit function. :math:`X` is still fed to the algorithm.
    Indeed, SBS will then create new subsets for testing (validation) and
    training, which is why this test set is also called the validation
    dataset. This approach is necessary to prevent our original test set
    from becoming part of the training data.

    Parameters
    ----------
    X: Ndarray ( M x N matrix where ``M=m-samples``, & ``N=n-features``)
        Training set; Denotes data that is observed at training and
        prediction time, used as independent variables in learning.
    y: array-like, shape (M, ) ``M=m-samples``,
        train target; Denotes data that may be observed at training time
        as the dependent variable in learning, but which is unavailable
        at prediction time, and is usually the target of prediction.

    Returns
    --------
    self: `SequentialBackwardSelection` instance
        returns ``self`` for easy method chaining.
    """
    X, y = check_X_y(
        X,
        y,
        estimator = get_estimator_name(self ),
        to_frame= True,
    )
    self._check_sbs_args(X)

    if hasattr(X, 'columns'):
        # Documented (scikit-learn style) attribute name ...
        self.feature_names_in_ = list(X.columns )
        # ... and the historical name, kept for backward compatibility.
        self.feature_names_in = self.feature_names_in_
        X = X.values

    Xtr, Xt, ytr, yt = train_test_split(X, y , test_size=self.test_size,
                                        random_state=self.random_state
                                        )
    dim = Xtr.shape [1]
    self.indices_= tuple (range (dim))
    self.subsets_= [self.indices_]
    # Baseline: score obtained with the full feature set.
    score = self._compute_score(Xtr, Xt, ytr, yt, self.indices_)
    self.scores_=[score]

    # At each step evaluate every subset holding one feature less and
    # keep the best one, until `k_features` features remain.
    while dim > self.k_features:
        scores , subsets = [], []
        for p in itertools.combinations(self.indices_, r=dim-1):
            score = self._compute_score(Xtr, Xt, ytr, yt, p)
            scores.append (score)
            subsets.append (p)

        best = np.argmax (scores)
        self.indices_= subsets [best]
        self.subsets_.append(self.indices_)
        dim -=1
        self.scores_.append (scores[best])

    # Score obtained with the requested number of features.
    self.k_score_= self.scores_[-1]
    return self
def _compute_score (self, Xtr, Xt, ytr, yt, indices):
""" Compute score from splitting `X` and indices """
self.estimator.fit(Xtr[:, indices], ytr)
y_pred = self.estimator.predict (Xt [:, indices])
score = self.scoring (yt, y_pred)
return score
def _check_sbs_args (self, X):
""" Assert SBS main arguments """
if not hasattr(self.estimator, 'fit'):
raise TypeError ("Estimator must have a 'fit' method.")
try :
self.k_features = int (self.k_features)
except Exception as err:
raise TypeError ("Expect an integer for number of feature k,"
f" got {type(self.k_features).__name__!r}"
) from err
if self.k_features > X.shape [1] :
raise ValueError ("Too many number of features."
f" Expect max-features={X.shape[1]}")
if (
callable(self.scoring)
or inspect.isfunction ( self.scoring )
):
self.scoring = self.scoring.__name__.replace ('_score', '')
if self.scoring not in self._scorers.keys():
raise ValueError (
f"Accept only scorers {list (self._scorers.keys())}"
f"for scoring, not {self.scoring!r}")
self.scoring = self._scorers[self.scoring]
self.scorer_name_ = self.scoring.__name__.replace (
'_score', '').title ()
def __repr__(self):
""" Represent the Sequential Backward Selection class """
get_params = self.get_params()
get_params.pop('scoring')
if hasattr (self, 'scorer_name_'):
get_params ['scoring'] =self.scorer_name_
tup = tuple (f"{key}={val}".replace ("'", '') for key, val in
get_params.items() )
return self.__class__.__name__ + str(tup).replace("'", "")
[docs]
class GreedyPerceptron(_Base):
    r"""Perceptron classifier.

    Inspired by Rosenblatt's perceptron rule. Rosenblatt published the
    first concept of the perceptron learning rule based on the MCP
    (McCulloch-Pitts) neuron model. With that rule he proposed an algorithm
    that automatically learns the optimal weight coefficients, which are
    then multiplied by the input features in order to decide whether a
    neuron fires (transmits a signal) or not. In the context of supervised
    learning and classification, such an algorithm can be used to predict
    whether a new data point belongs to one class or the other.

    Rosenblatt's initial perceptron rule can be summarized by the following
    steps:

    - initialize the weights at 0 or small random numbers;
    - for each training example :math:`x^{(i)}`:

      - compute the output value :math:`\hat{y}`;
      - update the weights.

    The update of each weight :math:`w_j` can be formally written as:

    .. math::

        w_j := w_j + \Delta w_j

    Parameters
    -----------
    eta: float,
        Learning rate between (0. and 1.)
    n_iter: int,
        Number of passes over the training set.
    random_state: int, default is 42
        Random number generator seed for random weight initialization.

    Attributes
    ----------
    w_: Array-like,
        Weights after fitting; ``w_[0]`` is the bias unit.
    errors_: list
        Number of misclassifications (updates) in each epoch.

    References
    ------------
    .. [1] Rosenblatt F, 1957, The perceptron: A perceiving and Recognizing
        Automaton, Cornell Aeronautical Laboratory 1957
    .. [2] McCulloch W.S and W. Pitts, 1943. A logical calculus of Idea of
        Immanent in Nervous Activity, Bulletin of Mathematical Biophysics,
        5(4): 115-133, 1943.
    """

    def __init__(self, eta: float = .01, n_iter: int = 50,
                 random_state: int = 42):
        super().__init__()
        self.eta = eta
        self.n_iter = n_iter
        self.random_state = random_state

    def fit(self, X, y):
        """Fit the training data.

        Parameters
        ----------
        X: Ndarray ( M x N matrix where ``M=m-samples``, & ``N=n-features``)
            Training set; data observed at training and prediction time,
            used as independent variables in learning.
        y: array-like, shape (M, ) ``M=m-samples``,
            Training target; the dependent variable in learning.

        Returns
        --------
        self: `GreedyPerceptron` instance
            returns ``self`` for easy method chaining.
        """
        X, y = check_X_y(
            X,
            y,
            estimator=get_estimator_name(self),
            to_frame=True,
        )
        # NOTE(review): `to_frame=True` presumably lets the validator hand
        # back pandas objects. Iterating a DataFrame yields column labels,
        # not rows, so the sample loop below needs a plain ndarray.
        X = np.asarray(X)
        rgen = np.random.RandomState(self.random_state)
        # One weight per feature plus the bias unit stored in w_[0].
        self.w_ = rgen.normal(loc=0., scale=.01, size=1 + X.shape[1])
        self.errors_ = []
        for _ in range(self.n_iter):
            errors = 0
            for xi, target in zip(X, y):
                update = self.eta * (target - self.predict(xi))
                self.w_[1:] += update * xi
                self.w_[0] += update
                # A non-zero update means the sample was misclassified.
                errors += int(update != 0.)
            self.errors_.append(errors)
        return self

    def predict(self, X):
        """Predict the class label after the unit step.

        Parameters
        ----------
        X : Ndarray ( M x N matrix where ``M=m-samples``, & ``N=n-features``)
            Samples to classify.

        Returns
        -------
        ypred: predicted class label after the unit step (1, or -1)

        Raises
        ------
        NotFittedError
            If the estimator has no ``w_`` attribute yet.
        """
        if not hasattr(self, 'w_'):
            raise NotFittedError("Can't call 'predict' method with estimator"
                                 " not fitted yet. Fit estimator by calling"
                                 " the 'fit' method first.")
        # `net_input` is not defined in this class; presumably it is
        # provided by `_Base` -- TODO confirm.
        return np.where(self.net_input(X) >= .0, 1, -1)

    def __repr__(self):
        """Represent the estimator as ``ClassName(param=value, ...)``."""
        tup = tuple(f"{key}={val}".replace("'", '') for key, val in
                    self.get_params().items())
        return self.__class__.__name__ + str(tup).replace("'", "")
[docs]
class MajorityVoteClassifier(BaseEstimator, ClassifierMixin):
    r"""A majority vote ensemble classifier.

    Combine different classification algorithms associated with individual
    weights for confidence. The goal is to build a stronger meta-classifier
    that balances out the individual classifiers' weaknesses on a particular
    dataset. In more precise mathematical terms, the weighted majority
    vote can be expressed as follows:

    .. math::

        \hat{y} = \arg\max_{i} \sum_{j=1}^{m} w_j \chi_A (C_j(x)=i)

    where :math:`w_j` is a weight associated with a base classifier,
    :math:`C_j`; :math:`\hat{y}` is the predicted class label of the
    ensemble; :math:`A` is the set of unique class labels; :math:`\chi_A`
    is the characteristic (indicator) function which returns 1 if the
    predicted class of the j-th classifier matches :math:`i`
    (:math:`C_j(x)=i`). For equal weights, the equation simplifies to:

    .. math::

        \hat{y} = mode \{C_1(x), C_2(x), ... , C_m(x)\}

    Parameters
    ------------
    clfs: {array_like}, shape (n_classifiers)
        Different classifiers for the ensemble. A single estimator is
        wrapped into a one-element list at fit time.
    vote: str , ['classlabel', 'probability'], default is {'classlabel'}
        If 'classlabel' the prediction is based on the argmax of the class
        label. Otherwise, if 'probability', the argmax of the sum of the
        probabilities is used to predict the class label. Note it is
        recommended for calibrated classifiers.
    weights: {array-like}, shape (n_classifiers, ), Optional, default=None
        If a list of `int` or `float` values is provided, the classifiers
        are weighted by importance; uniform weights are used if 'weights'
        is ``None``.

    Attributes
    ------------
    classes_: array_like
        Class labels found by the label encoder during ``fit``.
    classifiers_: list,
        List of fitted classifiers.

    Examples
    ---------
    >>> from watex.exlib.sklearn import (
    ...     LogisticRegression, DecisionTreeClassifier, KNeighborsClassifier,
    ...     Pipeline, cross_val_score, train_test_split, StandardScaler,
    ...     SimpleImputer)
    >>> from watex.datasets import fetch_data
    >>> from watex.base import MajorityVoteClassifier
    >>> from watex.base import selectfeatures
    >>> data = fetch_data('bagoue original').get('data=dfy1')
    >>> X0 = data.iloc [:, :-1]; y0 = data ['flow'].values
    >>> # exclude the categorical value for demonstration
    >>> # binarize the target y
    >>> y = np.asarray (list(map (lambda x: 0 if x<=1 else 1, y0)))
    >>> X = selectfeatures (X0, include ='number')
    >>> X = SimpleImputer().fit_transform (X)
    >>> X, Xt , y, yt = train_test_split(X, y)
    >>> clf1 = LogisticRegression(penalty ='l2', solver ='lbfgs')
    >>> clf2= DecisionTreeClassifier(max_depth =1 )
    >>> clf3 = KNeighborsClassifier( p =2 , n_neighbors=1)
    >>> pipe1 = Pipeline ([('sc', StandardScaler()),
    ...                    ('clf', clf1)])
    >>> pipe3 = Pipeline ([('sc', StandardScaler()),
    ...                    ('clf', clf3)])

    (1) -> Test each classifier taken individually

    >>> clf_labels =['Logit', 'DTC', 'KNN']
    >>> # test the results without using the MajorityVoteClassifier
    >>> for clf , label in zip ([pipe1, clf2, pipe3], clf_labels):
    ...     scores = cross_val_score(clf, X, y , cv=10 , scoring ='roc_auc')
    ...     print("ROC AUC: %.2f (+/- %.2f) [%s]" %(scores.mean(),
    ...                                              scores.std(),
    ...                                              label))
    ROC AUC: 0.91 (+/- 0.05) [Logit]
    ROC AUC: 0.73 (+/- 0.07) [DTC]
    ROC AUC: 0.77 (+/- 0.09) [KNN]

    (2) -> Implement the MajorityVoteClassifier

    >>> # test the results with Majority vote
    >>> mv_clf = MajorityVoteClassifier(clfs = [pipe1, clf2, pipe3])
    >>> clf_labels += ['Majority voting']
    >>> all_clfs = [pipe1, clf2, pipe3, mv_clf]
    >>> for clf , label in zip (all_clfs, clf_labels):
    ...     scores = cross_val_score(clf, X, y , cv=10 , scoring ='roc_auc')
    ...     print("ROC AUC: %.2f (+/- %.2f) [%s]" %(scores.mean(),
    ...                                              scores.std(), label))
    ROC AUC: 0.91 (+/- 0.05) [Logit]
    ROC AUC: 0.73 (+/- 0.07) [DTC]
    ROC AUC: 0.77 (+/- 0.09) [KNN]
    ROC AUC: 0.92 (+/- 0.06) [Majority voting] # give good score & less errors
    """

    def __init__(self, clfs, weights=None, vote='classlabel'):
        self.clfs = clfs
        self.weights = weights
        self.vote = vote
        # Filled by `_check_clfs_vote_and_weights` when `fit` is called.
        self.classifier_names_ = {}

    def fit(self, X, y):
        """Fit every classifier of the ensemble.

        Parameters
        ----------
        X: Ndarray ( M x N matrix where ``M=m-samples``, & ``N=n-features``)
            Training set; data observed at training and prediction time,
            used as independent variables in learning.
        y: array-like, shape (M, ) ``M=m-samples``
            Training target; the dependent variable in learning.

        Returns
        --------
        self: `MajorityVoteClassifier` instance
            returns ``self`` for easy method chaining.
        """
        X, y = check_X_y(
            X,
            y,
            estimator=get_estimator_name(self),
            to_frame=True,
        )
        self._check_clfs_vote_and_weights()
        # Encode labels so that classes start at 0, which the np.argmax /
        # np.bincount calls in `predict` rely on.
        self._labenc = LabelEncoder()
        self._labenc.fit(y)
        self.classes_ = self._labenc.classes_
        # The encoded target is identical for every classifier: encode once.
        y_encoded = self._labenc.transform(y)
        self.classifiers_ = [
            clone(clf).fit(X, y_encoded) for clf in self.clfs]
        return self

    @property
    def inspect(self):
        """Return 1 if fitted, raise `NotFittedError` otherwise."""
        msg = ("{obj.__class__.__name__} instance is not fitted yet."
               " Call 'fit' with appropriate arguments before using"
               " this method"
               )
        if not hasattr(self, 'classifiers_'):
            raise NotFittedError(msg.format(obj=self))
        return 1

    def predict(self, X):
        """Predict the class label of X.

        Parameters
        ----------
        X: Ndarray ( M x N matrix where ``M=m-samples``, & ``N=n-features``)
            Samples to classify.

        Returns
        -------
        maj_vote: {array_like}, shape (n_examples, )
            Predicted class label array.
        """
        self.inspect
        # Normalise again here so that a `vote` changed after `fit` (e.g.
        # back to 'classlabel') cannot leave `maj_vote` undefined.
        vote = self._normalize_vote(self.vote)
        if vote == 'proba':
            maj_vote = np.argmax(self.predict_proba(X), axis=1)
        else:
            # One column per classifier, one row per sample.
            preds = np.asarray(
                [clf.predict(X) for clf in self.classifiers_]).T
            maj_vote = np.apply_along_axis(
                lambda row: np.argmax(
                    np.bincount(row, weights=self.weights)),
                axis=1,
                arr=preds
            )
        # Map the encoded winners back to the original labels.
        return self._labenc.inverse_transform(maj_vote)

    def predict_proba(self, X):
        """Return the (weighted) average class probabilities.

        Useful when computing the receiver operating characteristic area
        under the curve (ROC AUC).

        Parameters
        ----------
        X: Ndarray ( M x N matrix where ``M=m-samples``, & ``N=n-features``)
            Samples to score.

        Returns
        -------
        avg_proba: {array_like }, shape (n_examples, n_classes)
            Weighted average probabilities for each class per example.
        """
        self.inspect
        probas = np.asarray(
            [clf.predict_proba(X) for clf in self.classifiers_])
        return np.average(probas, axis=0, weights=self.weights)

    def get_params(self, deep=True):
        """Get the ensemble parameters, e.g. for a grid search.

        With ``deep=False`` the parent implementation is used. With
        ``deep=True`` the result maps each registered classifier name to
        its estimator, plus ``<name>__<param>`` entries for the nested
        parameters.

        NOTE(review): the names are only registered by
        `_check_clfs_vote_and_weights`, which `fit` calls, so the deep
        dictionary is empty on an unfitted instance.
        """
        if not deep:
            return super().get_params(deep=False)
        out = self.classifier_names_.copy()
        for name, step in self.classifier_names_.items():
            for key, value in step.get_params(deep=True).items():
                out['%s__%s' % (name, key)] = value
        return out

    @staticmethod
    def _normalize_vote(vote):
        """Map a user supplied vote string to ``'label'`` or ``'proba'``."""
        found = re.search(r'(class|label|target)|(proba)', vote)
        if found is None:
            raise ValueError("Vote argument must be 'probability' or "
                             "'classlabel', got %r" % (vote,))
        return 'label' if found.group(1) is not None else 'proba'

    def _check_clfs_vote_and_weights(self):
        """Validate the classifiers, the vote type and the weights.

        Side effects: a single estimator in ``clfs`` is wrapped in a list,
        ``classifier_names_`` is filled and ``vote`` is rewritten to its
        normalised form (``'label'`` or ``'proba'``).

        Raises
        ------
        TypeError
            If no classifier is given or an item is not an estimator-like
            object.
        ValueError
            If ``vote`` is not recognised or the number of weights differs
            from the number of classifiers.
        """
        doc_url = ("https://scikit-learn.org/stable/modules/generated/"
                   "sklearn.base.BaseEstimator.html")
        if self.clfs is None:
            raise TypeError("Expect at least one classifiers. ")
        # A lone estimator (an object carrying a __dict__, unlike list or
        # tuple containers) is wrapped so the rest can treat clfs uniformly.
        if hasattr(self.clfs, '__dict__'):
            self.clfs = [self.clfs]
        candidates = list(self.clfs)
        if not candidates:
            # Previously an empty sequence surfaced as an IndexError.
            raise TypeError("Expect at least one classifiers. ")
        invalid = [o for o in candidates if not hasattr(o, '__dict__')]
        if invalid:
            raise TypeError(
                "Classifier should be a class object, not {0!r}. Please refer"
                " to Scikit-Convention to write your own estimator <{1!r}>."
                .format(type(invalid[0]).__name__, doc_url)
            )
        self.classifier_names_ = dict(_name_estimators(self.clfs))
        self.vote = self._normalize_vote(self.vote)
        # `is not None` keeps ndarray weights from raising on truthiness.
        if self.weights is not None and (
                len(self.weights) != len(self.clfs)):
            raise ValueError(" Number of classifier must be consistent with "
                             " the weights. got {0} and {1} respectively."
                             .format(len(self.clfs), len(self.weights))
                             )
[docs]
class AdalineStochasticGradientDescent(_Base):
    r"""Adaptive Linear Neuron classifier with stochastic gradient descent.

    Stochastic gradient descent is a popular alternative algorithm which is
    sometimes also called iterative or online gradient descent [1]_.
    Instead of updating the weights based on the sum of the accumulated
    errors over all training examples :math:`x^{(i)}`:

    .. math::

        \Delta w = \eta \sum_{i} (y^{(i)} - \phi(z^{(i)})) x^{(i)}

    the weights are updated incrementally for each training example:

    .. math::

        \Delta w = \eta (y^{(i)} - \phi(z^{(i)})) x^{(i)}

    Parameters
    -----------
    eta: float,
        Learning rate between (0. and 1.)
    n_iter: int,
        Number of passes over the training set.
    shuffle: bool,
        Shuffle training data every epoch if True to prevent cycles.
    random_state: int, default is 42
        Random number generator seed for random weight initialization.

    Attributes
    ----------
    w_: Array-like,
        Weights after fitting; ``w_[0]`` is the bias unit.
    cost_: list
        Average of the per-sample half squared errors in each epoch.

    See also
    ---------
    AdalineGradientDescent: :class:`~watex.base.AdalineGradientDescent`

    References
    -----------
    .. [1] Widrow et al., 1960. An Adaptive "Adaline" Neuron Using Chemical
        "Memistors", Technical reports Number, 1553-2, B Widrow et al.,
        Stanford Electron labs, Stanford, CA, October 1960.
    """

    def __init__(self, eta: float = .01, n_iter: int = 50, shuffle=True,
                 random_state: int = 42):
        super().__init__()
        self.eta = eta
        self.n_iter = n_iter
        self.shuffle = shuffle
        self.random_state = random_state
        self.w_initialized = False

    def fit(self, X, y):
        """Fit the training data.

        Parameters
        ----------
        X: Ndarray ( M x N matrix where ``M=m-samples``, & ``N=n-features``)
            Training set; data observed at training and prediction time,
            used as independent variables in learning.
        y: array-like, shape (M, ) ``M=m-samples``,
            Training target; the dependent variable in learning.

        Returns
        --------
        self: `AdalineStochasticGradientDescent` instance
            returns ``self`` for easy method chaining.
        """
        X, y = check_X_y(
            X,
            y,
            estimator=get_estimator_name(self),
        )
        self._init_weights(X.shape[1])
        self.cost_ = []
        for _ in range(self.n_iter):
            if self.shuffle:
                X, y = self._shuffle(X, y)
            cost = [self._update_weights(xi, target)
                    for xi, target in zip(X, y)]
            self.cost_.append(sum(cost) / len(y))
        return self

    @property
    def inspect(self):
        """Return 1 if fitted, raise `NotFittedError` otherwise."""
        msg = ("{obj.__class__.__name__} instance is not fitted yet."
               " Call 'fit' with appropriate arguments before using"
               " this method"
               )
        if not hasattr(self, 'w_'):
            raise NotFittedError(msg.format(obj=self))
        return 1

    def partial_fit(self, X, y):
        """Fit training data without reinitialising the weights.

        Parameters
        ----------
        X: Ndarray ( M x N matrix where ``M=m-samples``, & ``N=n-features``)
            Training samples; a single sample is accepted as well.
        y: array-like, shape (M, ) ``M=m-samples``,
            Training target.

        Returns
        --------
        self: `AdalineStochasticGradientDescent` instance
            returns ``self`` for easy method chaining.
        """
        X, y = check_X_y(
            X,
            y,
            estimator=get_estimator_name(self),
        )
        # Always walk the samples row by row: a single sample used to be
        # forwarded as a 2-D block, which `_update_weights` cannot align
        # with its error term.
        X = np.atleast_2d(X)
        y = np.ravel(y)
        if not self.w_initialized:
            self._init_weights(X.shape[1])
        for xi, target in zip(X, y):
            self._update_weights(xi, target)
        return self

    def _shuffle(self, X, y):
        """Return `X` and `y` reordered by one random permutation.

        The permutation is drawn from ``self.rgen``, which is created by
        `_init_weights`.
        """
        order = self.rgen.permutation(len(y))
        return X[order], y[order]

    def _init_weights(self, m):
        """Initialize weights with small random numbers.

        Parameters
        ----------
        m : int
            Number of weights to draw in addition to the bias unit
            (``w_`` gets ``1 + m`` entries).
        """
        self.rgen = np.random.RandomState(self.random_state)
        self.w_ = self.rgen.normal(loc=.0, scale=.01, size=1 + m)
        self.w_initialized = True

    def _update_weights(self, X, y):
        """Apply the Adaline learning rule for one training sample.

        Parameters
        ----------
        X : array-like, shape (n_features,)
            Feature vector of the sample.
        y : scalar
            Target of the sample.

        Returns
        -------
        cost: half of the squared error of the sample.
        """
        # `net_input` is not defined in this class; presumably it is
        # provided by `_Base` -- TODO confirm.
        output = self.activation(self.net_input(X))
        errors = (y - output)
        self.w_[1:] += self.eta * X.dot(errors)
        # The bias unit has to follow the error too (it was never updated
        # before), mirroring AdalineGradientDescent.fit.
        self.w_[0] += self.eta * np.sum(errors)
        return errors ** 2 / 2.

    def activation(self, X):
        """Compute the linear activation, i.e. return `X` unchanged."""
        return X

    def predict(self, X):
        """Predict the class label after the unit step.

        Parameters
        ----------
        X: Ndarray ( M x N matrix where ``M=m-samples``, & ``N=n-features``)
            Samples to classify.

        Returns
        -------
        ypred: predicted class label after the unit step (1, or -1)

        Raises
        ------
        NotFittedError
            If the estimator has no ``w_`` attribute yet.
        """
        self.inspect
        return np.where(self.activation(self.net_input(X)) >= 0., 1, -1)

    def __repr__(self):
        """Represent the estimator as ``ClassName(param=value, ...)``."""
        tup = tuple(f"{key}={val}".replace("'", '') for key, val in
                    self.get_params().items())
        return self.__class__.__name__ + str(tup).replace("'", "")
[docs]
class AdalineGradientDescent(_Base):
    r"""Adaptive Linear Neuron classifier.

    ADAptive LInear NEuron (Adaline) was published by Bernard Widrow and
    his doctoral student Ted Hoff only a few years after Rosenblatt's
    perceptron algorithm. It can be considered an improvement of the
    latter (Widrow et al., 1960).

    Adaline illustrates the key concepts of defining and minimizing a
    continuous cost function. This lays the groundwork for understanding
    more advanced machine learning algorithms for classification, such as
    Logistic Regression, Support Vector Machines, and Regression models.

    The key difference between the Adaline rule (also known as the
    Widrow-Hoff rule) and Rosenblatt's perceptron is that the weights are
    updated based on a linear activation function rather than a unit step
    function like in the perceptron. In Adaline, this linear activation
    function :math:`\phi(z)` is simply the identity function of the net
    input so that:

    .. math::

        \phi (w^Tx)= w^Tx

    While the linear activation function is used for learning the weights,
    a threshold at zero is still applied to produce the final prediction.

    Parameters
    -----------
    eta: float,
        Learning rate between (0. and 1.)
    n_iter: int,
        Number of passes over the training set.
    random_state: int, default is 42
        Random number generator seed for random weight initialization.

    Attributes
    ----------
    w_: Array-like,
        Weights after fitting; ``w_[0]`` is the bias unit.
    cost_: list
        Sum of squares cost function value (halved) in each epoch.

    References
    -----------
    .. [1] Widrow et al., 1960. An Adaptive "Adaline" Neuron Using Chemical
        "Memistors", Technical reports Number, 1553-2, B Widrow et al.,
        Stanford Electron labs, Stanford, CA, October 1960.
    """

    def __init__(self, eta: float = .01, n_iter: int = 50,
                 random_state: int = 42):
        super().__init__()
        self.eta = eta
        self.n_iter = n_iter
        self.random_state = random_state

    def fit(self, X, y):
        """Fit the training data with full-batch gradient descent.

        Parameters
        ----------
        X: Ndarray ( M x N matrix where ``M=m-samples``, & ``N=n-features``)
            Training set; data observed at training and prediction time,
            used as independent variables in learning.
        y: array-like, shape (M, ) ``M=m-samples``,
            Training target; the dependent variable in learning.

        Returns
        --------
        self: `AdalineGradientDescent` instance
            returns ``self`` for easy method chaining.
        """
        X, y = check_X_y(
            X,
            y,
            estimator=get_estimator_name(self),
        )
        rgen = np.random.RandomState(self.random_state)
        # One weight per feature plus the bias unit stored in w_[0].
        self.w_ = rgen.normal(loc=0., scale=.01, size=1 + X.shape[1])
        self.cost_ = []
        for _ in range(self.n_iter):
            # `net_input` is not defined in this class; presumably it is
            # provided by `_Base` -- TODO confirm.
            output = self.activation(self.net_input(X))
            errors = (y - output)
            # Every epoch uses the errors of the whole training set.
            self.w_[1:] += self.eta * X.T.dot(errors)
            self.w_[0] += self.eta * errors.sum()
            self.cost_.append((errors ** 2).sum() / 2.)
        return self

    @property
    def inspect(self):
        """Return 1 if fitted, raise `NotFittedError` otherwise."""
        msg = ("{obj.__class__.__name__} instance is not fitted yet."
               " Call 'fit' with appropriate arguments before using"
               " this method"
               )
        if not hasattr(self, 'w_'):
            raise NotFittedError(msg.format(obj=self))
        return 1

    def activation(self, X):
        """Compute the linear activation, i.e. return `X` unchanged."""
        return X

    def predict(self, X):
        """Predict the class label after the unit step.

        Parameters
        ----------
        X: Ndarray ( M x N matrix where ``M=m-samples``, & ``N=n-features``)
            Samples to classify.

        Returns
        -------
        ypred: predicted class label after the unit step (1, or -1)

        Raises
        ------
        NotFittedError
            If the estimator has no ``w_`` attribute yet.
        """
        # Fail with the dedicated error instead of whatever the missing
        # weights would trigger further down.
        self.inspect
        return np.where(self.activation(self.net_input(X)) >= 0., 1, -1)

    def __repr__(self):
        """Represent the estimator as ``ClassName(param=value, ...)``."""
        tup = tuple(f"{key}={val}".replace("'", '') for key, val in
                    self.get_params().items())
        return self.__class__.__name__ + str(tup).replace("'", "")
[docs]
def get_params(obj: object) -> dict:
    """
    Get object parameters.

    For a callable, the parameters of its signature are returned with their
    default values; a parameter without default, or whose default is
    ``...``, maps to ``None``. For any other object carrying a
    ``__dict__``, its attributes are returned except those whose name
    starts or ends with an underscore. Anything else yields an empty dict.

    :param obj: object, can be callable or instance
    :return: dict of parameters values
    :examples:

    >>> from sklearn.svm import SVC
    >>> from watex.base import get_params
    >>> sigmoid= SVC (
    ...     **{
    ...         'C': 512.0,
    ...         'coef0': 0,
    ...         'degree': 1,
    ...         'gamma': 0.001953125,
    ...         'kernel': 'sigmoid',
    ...         'tol': 1.0
    ...         }
    ...     )
    >>> pvalues = get_params( sigmoid)
    >>> pvalues
    {'decision_function_shape': 'ovr',
     'break_ties': False,
     'kernel': 'sigmoid',
     'degree': 1,
     'gamma': 0.001953125,
     'coef0': 0,
     'tol': 1.0,
     'C': 512.0,
     'nu': 0.0,
     'epsilon': 0.0,
     'shrinking': True,
     'probability': False,
     'cache_size': 200,
     'class_weight': None,
     'verbose': False,
     'max_iter': -1,
     'random_state': None
     }
    """
    if callable(obj):
        signature = inspect.signature(obj)
        # Identity checks on purpose: `empty or ...` always evaluated to
        # `empty`, so Ellipsis defaults were never mapped to None, and `==`
        # would be ambiguous for array-valued defaults.
        return {
            name: (None
                   if (param.default is inspect.Parameter.empty
                       or param.default is Ellipsis)
                   else param.default)
            for name, param in signature.parameters.items()
        }
    if hasattr(obj, '__dict__'):
        return {key: value for key, value in obj.__dict__.items()
                if not (key.endswith('_') or key.startswith('_'))}
    # Neither callable nor attribute holder: nothing to report.
    return {}
[docs]
def existfeatures(df, features, error='raise'):
    """Control whether the features exist or not.

    :param df: a dataframe for features selections
    :param features: feature name or list of features to look for in the
        columns of `df`. Repeated names are checked once.
    :param error: str - any value containing ``raise`` makes a missing
        feature raise an error; every other value is treated as
        ``ignore``. *default* is ``raise``.
    :return: bool
        ``True`` when every feature is a column of `df`, ``False``
        otherwise (only reachable with ``error='ignore'``).
    :raises ValueError: if a feature is missing and `error` is ``raise``.
    """
    error = 'raise' if error.lower().strip().find('raise') >= 0 else 'ignore'
    if isinstance(features, str):
        features = [features]
    features = _assert_all_types(features, list, tuple, np.ndarray)
    columns = set(df.columns)
    # De-duplicate while keeping the requested order: comparing lengths of
    # the raw list and of the matched set flagged repeated names as missing.
    requested = list(dict.fromkeys(features))
    missing = [name for name in requested if name not in columns]
    if not missing:
        return True
    if error == 'raise':
        nmiss = len(missing)
        msg = f"Feature{'s' if nmiss > 1 else ''}"
        if len(missing) == len(requested):
            # None of the requested features is available.
            raise ValueError(f"{msg} {smart_format(missing)} "
                             f"{'does not' if nmiss < 2 else 'dont'}"
                             " exist in the dataframe")
        raise ValueError(f"{msg} {smart_format(missing)} not found in"
                         " the dataframe.")
    return False
[docs]
def selectfeatures(
    df: DataFrame,
    features: List[str] = None,
    include=None,
    exclude=None,
    coerce: bool = False,
    **kwd
):
    """Select features and return a new dataframe.

    :param df: a dataframe for features selections
    :param features: list of features to select. They must be in the
        dataframe otherwise an error occurs.
    :param include: the type of data to retrieve from the dataframe `df`.
        Can be ``number``.
    :param exclude: type of the data to exclude from the dataframe `df`.
        Can be ``number`` i.e. only non-numeric data is kept in the result.
    :param coerce: return the whole dataframe after the numeric
        conversion. No selection is done and no error is raised when both
        `include` and `exclude` are empty.
        *default* is ``False``
    :param kwd: additional keywords arguments from `pd.astype` function

    :ref: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.astype.html
    """
    has_features = features is not None
    if has_features:
        # Fail early when a requested column is absent.
        existfeatures(df, features, error='raise')
    # Convert what can be converted to float; the rest is left as is.
    converted = df.astype(float, errors='ignore', **kwd)
    if has_features:
        return converted[features]
    if coerce:
        # Skips select_dtypes, which raises when both `include` and
        # `exclude` are empty.
        return converted
    return converted.select_dtypes(include, exclude)