# -*- coding: utf-8 -*-
# License: BSD-3-Clause
# Author: LKouadio <etanoyau@gmail.com>
# created on Thu Oct 13 14:52:26 2022
import itertools
import numpy as np
import pandas as pd
from .._typing import List
# [docs]
class Boxspace(dict):
    """Container object exposing its keys as attributes.

    Boxspace objects are sometimes used as an output for functions and
    methods. They extend dictionaries by enabling values to be accessed by
    key, `Boxspace["value_key"]`, or by attribute, `Boxspace.value_key`.
    Attributes can be read, assigned and deleted; every operation acts on
    the underlying dictionary item of the same name.

    Another option is to use ``namedtuple`` from the ``collections`` module::

        from collections import namedtuple
        Boxspace = namedtuple('Boxspace', [<attribute names>])

    However, an explicit class that inherits from the built-in ``dict``
    makes attributes easy to handle and avoids the errors raised when a
    given name does not match the expected attributes to fetch.

    Examples
    --------
    >>> from watex.utils.box import Boxspace
    >>> bs = Boxspace(pkg='watex', objective ='give water', version ='0.1.dev')
    >>> bs['pkg']
    ... 'watex'
    >>> bs.pkg
    ... 'watex'
    >>> bs.objective
    ... 'give water'
    >>> bs.version
    ... '0.1.dev'
    """

    def __init__(self, **kws):
        super().__init__(kws)

    def __setattr__(self, key, value):
        # Attribute assignment is stored as a dictionary item.
        self[key] = value

    def __delattr__(self, key):
        # Mirror __setattr__: deleting an attribute removes the item.
        try:
            del self[key]
        except KeyError:
            raise AttributeError(key) from None

    def __dir__(self):
        # Only the stored keys are advertised as attributes.
        return self.keys()

    def __getattr__(self, key):
        # Called only when normal attribute lookup fails; fall back to the
        # dictionary and translate a missing key into AttributeError so
        # that hasattr()/getattr() behave as usual.
        try:
            return self[key]
        except KeyError:
            raise AttributeError(key) from None

    def __setstate__(self, state):
        # Overriding __setstate__ to be a noop has the effect of
        # ignoring the pickled __dict__
        pass
class _Group:
""" Group of Aquifer is mostly related to area information after multiple
boreholes collected.
However when predicted 'k' with a missing k-values using the Mixture
Learning Strategy (MXS), we intend to solve this problem by creating
a Naive Group of Aquifer (NGA) to compensate the missing k-values in the
dataset. This could be a good idea to avoid introducing a lot of bias since
the group of aquifer is mostly tied to the permeability coefficient 'k'.
To do this, an unsupervised learning is used to predict the NGA labels then
the NGA labels are used in turn to fill the missing k-values. The best
strategy for operting this trick is to seek for some importances between
the true k-values with their corresponding aquifer groups at each depth,
and find the most representative group. Once the most representative group
is found for each true label 'k', the group of aquifer can be renamed as
the naive similarity with the true k-label. For instance if true k-value
is the label 1 and label 1 is most representative with the group of aquifer
'IV', therefore this group can be replaced throughout the column
with 'k1'+'IV=> i.e. 'k14'. This becomes a new label created and is used to
fill the true label 'y_true' to become a MXS target ( include NGA label).
Note that the true label with valid 'k-value' remained intach and unchanged.
The same process is done for label 2, 3 and so on. The selection of MXS
label from NGA strongly depends on its preponderance or importance rate in
the whole dataset.
The following example is the demonstration to how to compute the group
representativity in datasets.
Parameters
----------
g:dict,
Dictionnary compose of occurence between the true labels
and the group of aquifer as a function of occurence and
repesentativity
Example
--------
>>> from watex.utils import naive_imputer, read_data , reshape
>>> from watex.datasets import load_hlogs
>>> from watex.utils.hydroutils import classify_k, find_aquifer_groups
>>> b= load_hlogs () #just taking the target names
>>> data = read_data ('data/boreholes/hf.csv') # read complete data
>>> y = data [b.target_names]
>>> # impute the missing values found in aquifer group columns
>>> # reshape 1d array along axis 0 for imputation
>>> agroup_imputed = naive_imputer ( reshape (y.aquifer_group, axis =0 ) ,
strategy ='most_frequent')
>>> # reshape back to array_like 1d
>>> y.aquifer_group =reshape (agroup_imputed)
>>> # categorize the 'k' continous value in 'y.k' using the default
>>> # 'k' mapping func
>>> y.k = classify_k (y.k , default_func =True)
>>> # get the group obj
>>> group_obj = find_aquifer_groups(y.k, y.aquifer_group, )
>>> group_obj
... _Group(Label=[' 1 ',
Preponderance( rate = '53.141 %',
[('Groups', {'V': 0.32, 'IV': 0.266,
'II': 0.236, 'III': 0.158,
'IV&V': 0.01, 'II&III': 0.005,
'III&IV': 0.005}),
('Representativity', ( 'V', 0.32)),
('Similarity', 'V')])],
Label=[' 2 ',
Preponderance( rate = ' 19.11 %',
[('Groups', {'III': 0.274, 'II': 0.26,
'V': 0.26, 'IV': 0.178,
'III&IV': 0.027}),
('Representativity', ( 'III', 0.27)),
('Similarity', 'III')])],
Label=[' 3 ',
Preponderance( rate = '27.749 %',
[('Groups', {'V': 0.443, 'IV': 0.311,
'III': 0.245}),
('Representativity', ( 'V', 0.44)),
('Similarity', 'V')])],
)
"""
def __init__ (self, g=None, / ):
self.g_ = g
@property
def g(self):
return self.g_
@property
def similarity (self):
"""return label similarities with NGA labels """
return (
(label, list(rep_val [1])[0] )
for label, rep_val in self.g_.items()
)
@property
def preponderance (self):
""" Returns label occurences in the datasets """
return (
(label, rep_val[0])
for label, rep_val in self.g_.items()
)
@property
def representativity (self):
""" Returns the representativity of each labels"""
return ( (label, round(rep_val[1].get(list(rep_val [1])[0]), 2))
for label, rep_val in self.g_.items()
)
@property
def groups (self):
"""Return groups for each label """
return ((label, {k: v for k, v in repr_val[1].items()})
for label, repr_val in self.g_.items ()
)
def __repr__ (self ) :
return self.__class__.__name__ + "(" + self._format (
self.g) + "{:>13}".format(")")
def _format (self, gdict):
""" Format representativity of Aquifer groups
Parameters
----------
gdict: dict,
Dictionnary compose of occurence of the group as a function
of aquifer group repesentativity
"""
ag=[]
for k, (label, repr_val ) in enumerate ( gdict.items() ):
prep , g = repr_val
ag+=["{:5}=['{:^3}', \n".format(
"Label" if k==0 else "{:>17}".format("Label"), label
)
]
ag +=["{:>32}( rate = '{:^7} %', \n".format(
"Preponderance", round (prep *100, 3 )
)]
ag += ["{:>34}'Groups', {}),\n".format("[(",
# str({ k: "{:>5}".format(round (v, 3)) for k , v in g.items()})
str({ k: round (v, 3) for k , v in g.items()})
)
]
ag +=["{:>34}'Representativity', ( '{}', {})),\n".format("(",
list(g)[0], round ( g.get(list(g)[0]), 2))
]
ag += ["{:>34}'Similarity', '{}')])],\n ".format("(", list(g)[0] )
]
# ag+=['{:>30}'.format("])],\n ")]
#ag+=["{:>7}".format(")")]
return ''.join (ag)
# [docs]
def data2Box(
    data, /,
    name: str = None,
    use_colname: bool =False,
    keep_col_data: bool =True,
    columns: List [str] =None
    ):
    """ Transform each data row into a Boxspace object.

    Parameters
    -----------
    data: DataFrame or iterable
        Data to transform as an object. A DataFrame is used as is; any
        other iterable (list, tuple, dict, array...) is first passed to
        the ``pandas.DataFrame`` constructor.

    columns: list of str,
        List of str items used to construct the dataframe if a tuple or
        list is passed. When `data` is already a DataFrame, `columns`
        selects the columns if all of them exist in `data`; otherwise the
        columns of `data` are renamed positionally, so the number of
        names must match the number of columns. In every case where
        `columns` is given, the frame is rebuilt from its array values,
        which resets the index to a default range.

    name: str, optional
        The object name. When a string argument is given, the index value
        of the data is appended to the name to label each object, unless
        `use_colname` is set to ``True``. When no name is given, the
        prefix ``'obj'`` is used.

    use_colname: bool, default=False
        If ``True`` the name must be in columns. Otherwise an error raises.
        However, when ``use_colname=True``, it is recommended to make sure
        that each item in the column data is distinct i.e. is unique,
        otherwise some data will be erased: the number of objects will be
        less than the data size along the rows axis.

    keep_col_data: bool, default=True
        Keep in the data the column that is used to construct the object
        name. Otherwise, the column whose values name the objects is
        dropped. The frame given by the caller is never modified.

    Return
    --------
    Object: :class:`.Boxspace`, n_objects = number of rows
        Object composed of many other objects, one per data row.

    Raises
    ------
    TypeError
        If `data` is neither a DataFrame nor an iterable.
    ValueError
        If ``use_colname=True`` and `name` is not a column of the data.

    Examples
    ---------
    >>> from watex.utils.box import data2Box
    >>> o = data2Box ([2, 3, 4], name = 'borehole')
    >>> o.borehole0
    {'0': 2}
    >>> o = data2Box ({"x": [2, 3, 4], "y":[8, 7, 5]}, name = 'borehole')
    >>> o.borehole0.y
    8
    >>> from watex.utils.box import data2Box
    >>> o = data2Box ([2, 3, 4], name = 'borehole', columns ='id')
    >>> o.borehole0.id
    2
    >>> o = data2Box ({"x": [2, 3, 4], "y":[8, 7, 5],
                       "code": ['h2', 'h7', 'h12'] }, name = 'borehole')
    >>> o.borehole1.code
    'h7'
    >>> o = data2Box ({"x": [2, 3, 4], "y":[8, 7, 5], "code": ['h2', 'h7', 'h12'] },
                      name = 'code', use_colname= True )
    >>> o.h7.code
    'h7'
    >>> o = data2Box ({"x": [2, 3, 4], "y":[8, 7, 5], "code": ['h2', 'h7', 'h12']
                       }, name = 'code', use_colname= True, keep_col_data= False )
    >>> o.h7.code # code attribute does no longer exist
    AttributeError: code
    """
    from .validator import _is_numeric_dtype
    from .funcutils import is_iterable

    if columns is not None:
        # NOTE(review): presumably wraps a single string into a list and
        # leaves other iterables untouched -- confirm in funcutils.
        columns = is_iterable (
            columns, exclude_string= True , transform =True )

    if isinstance (data, pd.DataFrame):
        # Passing an existing frame with unknown column labels to the
        # DataFrame constructor reindexes it and yields all-NaN data, so
        # only select here; renaming is handled by the rebuild below.
        if columns is not None and all (c in data.columns for c in columns):
            data = data [list (columns)]
    elif hasattr (data, '__iter__'):
        data = pd.DataFrame ( data, columns = columns )

    if not hasattr(data, '__array__'):
        raise TypeError (
            f"Object accepts only DataFrame. Got {type(data).__name__}")

    if columns is not None:
        # rename columns if given
        data = pd.DataFrame(np.array( data), columns = columns )

    if name is not None and use_colname:
        # Name must exist in the dataframe.
        if name not in data.columns:
            raise ValueError (
                f"Name {name!r} must exist in the data columns.")
        labels = data [name]
        if not keep_col_data:
            # drop() returns a new frame: the caller's data stays intact.
            data = data.drop (columns = name)
        name = labels

    # make name column if not series
    if not hasattr ( name, 'name'):
        # check whether index is numeric then prefix with index
        index = data.index
        if _is_numeric_dtype(index, to_array= True ):
            index = index.astype (str)
        prefix = 'obj' if name is None else name
        name = [''.join ((prefix, idx)) for idx in index]

    # for consistency # reconvert name to str
    name = np.array (name ).astype ( str )

    colnames = data.columns.astype (str)
    obj = {}
    for i, key in enumerate (name):
        # Rows sharing the same key overwrite each other (last one wins).
        obj [key] = Boxspace( **dict ( zip ( colnames, data.iloc [i].values )))

    return Boxspace( **obj )