Source code for watex.utils.box

# -*- coding: utf-8 -*-
#   License: BSD-3-Clause
#   Author: LKouadio <etanoyau@gmail.com>
#   created on Thu Oct 13 14:52:26 2022

import itertools
import numpy as np
import pandas as pd 
from .._typing import List 

class Boxspace(dict):
    """Dictionary whose keys are also exposed as attributes.

    Boxspace objects are sometimes used as the output of functions and
    methods. They extend plain dictionaries by letting a value be reached
    either by key, ``Boxspace["value_key"]``, or by attribute,
    ``Boxspace.value_key``.

    An alternative is :func:`collections.namedtuple`::

        >>> from collections import namedtuple
        >>> Boxspace = namedtuple('Boxspace', [< attribute names >])

    An explicit class inheriting from the built-in ``dict`` is however
    easier to handle: attributes can be added freely and there is no fixed
    list of field names that a caller could get wrong.

    Parameters
    ----------
    **kws : dict
        Initial key/value pairs stored in the container.

    Examples
    --------
    >>> from watex.utils.box import Boxspace
    >>> bs = Boxspace(pkg='watex', objective='give water', version='0.1.dev')
    >>> bs['pkg']
    'watex'
    >>> bs.pkg
    'watex'
    >>> bs.objective
    'give water'
    >>> bs.version
    '0.1.dev'
    >>> del bs.version
    >>> 'version' in bs
    False
    """

    def __init__(self, **kws):
        super().__init__(kws)

    def __setattr__(self, key, value):
        # Attribute assignment is redirected to the mapping, so the
        # instance ``__dict__`` stays empty.
        self[key] = value

    def __delattr__(self, key):
        # Counterpart of ``__setattr__``: since attributes are stored as
        # mapping items, deleting one must remove the item as well.
        try:
            del self[key]
        except KeyError:
            raise AttributeError(key) from None

    def __dir__(self):
        # Only the stored keys are advertised.
        return self.keys()

    def __getattr__(self, key):
        # Only invoked when normal attribute lookup fails; fall back to the
        # mapping and translate a missing key into the expected
        # ``AttributeError`` (the KeyError context is noise for callers).
        try:
            return self[key]
        except KeyError:
            raise AttributeError(key) from None

    def __setstate__(self, state):
        # Overriding __setstate__ to be a noop has the effect of
        # ignoring the pickled __dict__
        pass
class _Group: """ Group of Aquifer is mostly related to area information after multiple boreholes collected. However when predicted 'k' with a missing k-values using the Mixture Learning Strategy (MXS), we intend to solve this problem by creating a Naive Group of Aquifer (NGA) to compensate the missing k-values in the dataset. This could be a good idea to avoid introducing a lot of bias since the group of aquifer is mostly tied to the permeability coefficient 'k'. To do this, an unsupervised learning is used to predict the NGA labels then the NGA labels are used in turn to fill the missing k-values. The best strategy for operting this trick is to seek for some importances between the true k-values with their corresponding aquifer groups at each depth, and find the most representative group. Once the most representative group is found for each true label 'k', the group of aquifer can be renamed as the naive similarity with the true k-label. For instance if true k-value is the label 1 and label 1 is most representative with the group of aquifer 'IV', therefore this group can be replaced throughout the column with 'k1'+'IV=> i.e. 'k14'. This becomes a new label created and is used to fill the true label 'y_true' to become a MXS target ( include NGA label). Note that the true label with valid 'k-value' remained intach and unchanged. The same process is done for label 2, 3 and so on. The selection of MXS label from NGA strongly depends on its preponderance or importance rate in the whole dataset. The following example is the demonstration to how to compute the group representativity in datasets. 
Parameters ---------- g:dict, Dictionnary compose of occurence between the true labels and the group of aquifer as a function of occurence and repesentativity Example -------- >>> from watex.utils import naive_imputer, read_data , reshape >>> from watex.datasets import load_hlogs >>> from watex.utils.hydroutils import classify_k, find_aquifer_groups >>> b= load_hlogs () #just taking the target names >>> data = read_data ('data/boreholes/hf.csv') # read complete data >>> y = data [b.target_names] >>> # impute the missing values found in aquifer group columns >>> # reshape 1d array along axis 0 for imputation >>> agroup_imputed = naive_imputer ( reshape (y.aquifer_group, axis =0 ) , strategy ='most_frequent') >>> # reshape back to array_like 1d >>> y.aquifer_group =reshape (agroup_imputed) >>> # categorize the 'k' continous value in 'y.k' using the default >>> # 'k' mapping func >>> y.k = classify_k (y.k , default_func =True) >>> # get the group obj >>> group_obj = find_aquifer_groups(y.k, y.aquifer_group, ) >>> group_obj ... 
_Group(Label=[' 1 ', Preponderance( rate = '53.141 %', [('Groups', {'V': 0.32, 'IV': 0.266, 'II': 0.236, 'III': 0.158, 'IV&V': 0.01, 'II&III': 0.005, 'III&IV': 0.005}), ('Representativity', ( 'V', 0.32)), ('Similarity', 'V')])], Label=[' 2 ', Preponderance( rate = ' 19.11 %', [('Groups', {'III': 0.274, 'II': 0.26, 'V': 0.26, 'IV': 0.178, 'III&IV': 0.027}), ('Representativity', ( 'III', 0.27)), ('Similarity', 'III')])], Label=[' 3 ', Preponderance( rate = '27.749 %', [('Groups', {'V': 0.443, 'IV': 0.311, 'III': 0.245}), ('Representativity', ( 'V', 0.44)), ('Similarity', 'V')])], ) """ def __init__ (self, g=None, / ): self.g_ = g @property def g(self): return self.g_ @property def similarity (self): """return label similarities with NGA labels """ return ( (label, list(rep_val [1])[0] ) for label, rep_val in self.g_.items() ) @property def preponderance (self): """ Returns label occurences in the datasets """ return ( (label, rep_val[0]) for label, rep_val in self.g_.items() ) @property def representativity (self): """ Returns the representativity of each labels""" return ( (label, round(rep_val[1].get(list(rep_val [1])[0]), 2)) for label, rep_val in self.g_.items() ) @property def groups (self): """Return groups for each label """ return ((label, {k: v for k, v in repr_val[1].items()}) for label, repr_val in self.g_.items () ) def __repr__ (self ) : return self.__class__.__name__ + "(" + self._format ( self.g) + "{:>13}".format(")") def _format (self, gdict): """ Format representativity of Aquifer groups Parameters ---------- gdict: dict, Dictionnary compose of occurence of the group as a function of aquifer group repesentativity """ ag=[] for k, (label, repr_val ) in enumerate ( gdict.items() ): prep , g = repr_val ag+=["{:5}=['{:^3}', \n".format( "Label" if k==0 else "{:>17}".format("Label"), label ) ] ag +=["{:>32}( rate = '{:^7} %', \n".format( "Preponderance", round (prep *100, 3 ) )] ag += ["{:>34}'Groups', {}),\n".format("[(", # str({ k: "{:>5}".format(round 
(v, 3)) for k , v in g.items()}) str({ k: round (v, 3) for k , v in g.items()}) ) ] ag +=["{:>34}'Representativity', ( '{}', {})),\n".format("(", list(g)[0], round ( g.get(list(g)[0]), 2)) ] ag += ["{:>34}'Similarity', '{}')])],\n ".format("(", list(g)[0] ) ] # ag+=['{:>30}'.format("])],\n ")] #ag+=["{:>7}".format(")")] return ''.join (ag)
def data2Box(
    data, /,
    name: str = None,
    use_colname: bool = False,
    keep_col_data: bool = True,
    columns: List[str] = None,
):
    """Transform each row of `data` into a :class:`Boxspace` object.

    Parameters
    ----------
    data : DataFrame or anything accepted by ``pandas.DataFrame``
        Data to transform into objects (list, dict of lists, array, ...).

    name : str, optional
        The object name. When a string is given, the index value of each
        row is appended to it to build the object name (``'obj'`` is used
        as prefix when `name` is ``None``), unless `use_colname` is set to
        ``True``.

    use_colname : bool, default=False
        If ``True``, `name` must be one of the data columns, otherwise a
        ``ValueError`` is raised; the values of that column become the
        object names. Make sure the column values are unique: rows sharing
        the same value overwrite each other, so fewer objects than rows
        are returned.

    keep_col_data : bool, default=True
        Keep, in each row object, the column used to build the object
        names. If ``False`` that column is dropped from the row objects.
        The frame passed by the caller is never modified.

    columns : str or list of str, optional
        Column labels. When every label already exists in `data`, those
        columns are selected (in the given order). Otherwise the labels
        are applied positionally, i.e. the columns are renamed, and their
        number must match the number of columns in `data`. Whenever
        `columns` is given, the row index is reset.

    Returns
    -------
    Boxspace
        Container holding one :class:`Boxspace` per row, keyed by the
        object names.

    Raises
    ------
    TypeError
        If, after conversion, `data` does not expose ``__array__``.
    ValueError
        If `use_colname` is ``True`` and `name` is not a column of `data`.

    Examples
    --------
    >>> from watex.utils.box import data2Box
    >>> o = data2Box([2, 3, 4], name='borehole')
    >>> o.borehole0
    {'0': 2}
    >>> o = data2Box({"x": [2, 3, 4], "y": [8, 7, 5]}, name='borehole')
    >>> o.borehole0.y
    8
    >>> o = data2Box([2, 3, 4], name='borehole', columns='id')
    >>> o.borehole0.id
    2
    >>> o = data2Box({"x": [2, 3, 4], "y": [8, 7, 5],
    ...               "code": ['h2', 'h7', 'h12']}, name='borehole')
    >>> o.borehole1.code
    'h7'
    >>> o = data2Box({"x": [2, 3, 4], "y": [8, 7, 5],
    ...               "code": ['h2', 'h7', 'h12']},
    ...              name='code', use_colname=True)
    >>> o.h7.code
    'h7'
    >>> o = data2Box({"x": [2, 3, 4], "y": [8, 7, 5],
    ...               "code": ['h2', 'h7', 'h12']},
    ...              name='code', use_colname=True, keep_col_data=False)
    >>> o.h7.code  # code attribute does no longer exist
    AttributeError: code
    """
    from .validator import _is_numeric_dtype
    from .funcutils import is_iterable

    if columns is not None:
        # presumably wraps a lone string into a list -- confirm in funcutils
        columns = is_iterable(columns, exclude_string=True, transform=True)

    if not hasattr(data, 'columns') or hasattr(data, '__iter__'):
        # `columns` is deliberately NOT forwarded to the constructor: for
        # dict/DataFrame input pandas would *select* those labels, silently
        # yielding an empty or all-NaN frame when the labels are new names.
        data = pd.DataFrame(data)

    if not hasattr(data, '__array__'):
        raise TypeError(
            f"Object accepts only DataFrame. Got {type(data).__name__}")

    if columns is not None:
        if all(label in data.columns for label in columns):
            # Labels already present: keep the historical selection
            # (and reordering) behaviour.
            data = data[list(columns)]
        # Positional (re)labelling; going through the raw array also resets
        # the row index, as the previous implementation did.
        data = pd.DataFrame(np.array(data), columns=columns)

    if name is not None and use_colname:
        # The name must exist in the dataframe.
        if name not in data.columns:
            raise ValueError(
                f"Name {name!r} must exist in the data columns.")
        labels = data[name]
        if not keep_col_data:
            # `drop` returns a new frame, so the caller's data is untouched.
            data = data.drop(columns=name)
        name = labels

    # Build the names from the index when `name` is not a column of values.
    if not hasattr(name, 'name'):
        index = data.index
        # A numeric index must be turned into text before concatenation.
        if _is_numeric_dtype(index, to_array=True):
            index = index.astype(str)
        if name is None:
            name = 'obj'
        name = [''.join(pair) for pair in zip(itertools.repeat(name), index)]

    # For consistency, names are always strings.
    name = np.array(name).astype(str)

    field_names = data.columns.astype(str)
    boxes = {}
    for pos in range(len(data)):
        boxes[name[pos]] = Boxspace(
            **dict(zip(field_names, data.iloc[pos].values)))

    return Boxspace(**boxes)