# Source code for watex.methods.hydro

# -*- coding: utf-8 -*-
#   License: BSD-3-Clause
#   Author: LKouadio <etanoyau@gmail.com>
"""
:mod:`~watex.methods.hydro` computes the hydrogeological parameters of 
aquifers, which are essential basic data for the design and construction 
of geotechnical engineering works and for groundwater dewatering. 
"""

from __future__ import ( 
    division, 
    annotations 
    )
import warnings 
from abc import ABC, abstractclassmethod, abstractmethod
from .._docstring import ( 
    _core_docs, 
    DocstringComponents 
    )
from ..exceptions import ( 
    NotFittedError, 
    StrataError, 
    kError, 
    AquiferGroupError
    )
from ..utils.hydroutils import (
    find_aquifer_groups, 
    find_similar_labels, 
    get_aquifer_sections, 
    reduce_samples, 
    select_base_stratum,
    make_MXS_labels, 
    predict_NGA_labels
    )
from ..utils.funcutils import ( 
    sanitize_frame_cols, 
    to_numeric_dtypes, 
    smart_strobj_recognition, 
    repr_callable_obj, 
    is_in_if, 
    )
from ..utils.validator import check_array 

from .._watexlog import watexlog 

# Names exported by ``from watex.methods.hydro import *``.
__all__ = [
    "Hydrogeology",
    "AqSection",
    "AqGroup",
    "MXS",
    "Logging",
]
#-----------------------

# Descriptions of the parameters that are specific to this module; each
# value is a ready-to-paste parameter entry for a class docstring.
_base_params = {
    "aqname": """
aqname: str, optional, 
    Name of aquifer group column. `aqname` allows to retrieve the 
    aquifer group `arr_aq` value in  a specific dataframe. Commonly
   `aqname` needs to be supplied when a dataframe is passed as a positional 
    or keyword argument. Note that it is not mandatory to have a group of 
    aquifer in the log data. It is needed only if the label similarity 
    needs to be calculated.    
    """,
    "sname": """
sname: str, optional 
    Name of column in the dataframe that contains the strata values. 
    Dont confuse 'sname' with 'stratum' which is the name of the valid 
    layer/rock in the array/Series of strata.     
    """,
}

# Bundle the package-wide parameter descriptions ("core") with the ones
# declared in this module ("base"); the class docstrings assigned further
# down interpolate entries such as ``{params.core.kname}`` from it.
_param_docs = DocstringComponents.from_nested_components(
    core=_core_docs["params"],
    base=DocstringComponents(_base_params),
)
#------------------------

class HData(ABC):
    # The public class documentation is assigned to ``HData.__doc__`` right
    # after the class body so that shared parameter descriptions can be
    # interpolated into it.

    @abstractmethod
    def __init__(
        self,
        kname=None,
        zname=None,
        aqname=None,
        sname=None,
        verbose=0
        ):
        # Declared as an abstract *instance* method: HData itself cannot be
        # instantiated, while subclasses reach this initializer through
        # ``super().__init__``. A classmethod here would receive the class
        # instead of the instance and store the settings below as class
        # attributes shared by every object.
        self._logging = watexlog.get_watex_logger(self.__class__.__name__)
        self.kname = kname
        self.zname = zname
        self.aqname = aqname
        self.sname = sname
        self.verbose = verbose

    def fit(
        self,
        data,
        **fit_params
        ):
        """
        Fit Hydro-data and populate attributes.

        Note that each column of the dataframe can be retrieved as an
        attribute value. The attribute maker replaces all spaces in the
        column names, if any, with '_'. For instance, retrieving the
        'layer thickness' column should be done with 'layer_thickness'
        (shown here with the concrete subclass
        :class:`~watex.methods.hydro.MXS`, since `HData` is abstract)::

            >>> from watex.datasets import load_hlogs
            >>> from watex.methods.hydro import MXS
            >>> h = MXS().fit(load_hlogs().frame)
            >>> h.layer_thickness # for retrieving 'layer thickness'

        Parameters
        -----------
        data : Dataframe of shape (n_samples, n_features)
            where `n_samples` is the number of data, expected to be the data
            collected at different depths and `n_features` is the number of
            columns (features).

        fit_params: dict,
            Additional keyword arguments passed to
            :func:`~watex.utils.funcutils.to_numeric_dtypes`.

        Returns
        -------
        self: the fitted object, for method chaining.

        Notes
        -----
        After fitting, the object holds ``data_`` (the sanitized frame),
        ``feature_names_in_`` (numerical then categorical feature names),
        ``k_``, ``z_``, ``aq_`` and ``s_`` (the columns matched by `kname`,
        `zname`, `aqname` and `sname`; the raw falsy value or ``None`` when
        the name is not given or not found) and one attribute per column of
        ``data_``.

        """
        data = check_array(
            data,
            force_all_finite="allow-nan",
            dtype=object,
            input_name="Data",
            to_frame=True,
            )
        data = sanitize_frame_cols(data, fill_pattern='_')
        self.data_, nf, cf = to_numeric_dtypes(
            data,
            return_feature_types=True,
            verbose=self.verbose,
            **fit_params
            )
        self.feature_names_in_ = nf + cf

        # sanitize the categorical values
        for c in cf:
            self.data_[c] = self.data_[c].str.strip()

        for name, val in zip(("k", "z", "aq", "s"), (
                self.kname, self.zname, self.aqname, self.sname)):
            if val:
                c = val  # keep the name as supplied for the warning message
                val = is_in_if(list(self.data_.columns), val,
                               error='ignore')
                if val is None and self.verbose:
                    warnings.warn(f" Invalid '{name}name'={c!r}. Name not "
                                  "found in the given dataset. None is set "
                                  "instead.")

            setattr(self, f"{name}_",
                    self.data_[val[0]] if val else val
                    )

        # expose every column as an attribute of the fitted object
        for name in self.data_.columns:
            setattr(self, name, self.data_[name])

        return self

    def squeeze_data(self, strategy="average", **rs_kws):
        """ Compress data by reducing the samples.

        To compress many boreholes data, it is recommended to use
        :func:`get_unique_section`.

        The strata, depth and permeability column names are taken from the
        `sname`, `zname` and `kname` given at initialization.

        Parameters
        ----------
        strategy: str , default='average' or 'mean',
            strategy used to select or compute the numerical data into a
            singular series. It can be ['naive']. In that case , a single
            serie is randomly picked up into the base strata data.

        rs_kws: dict,
            keyword arguments passed to
            :func:`~watex.utils.hydroutils.reduce_samples`

        Returns
        ----------
        sqdat: pandas.dataframes
            new dataframe with reducing samples.

        Raises
        ------
        NotFittedError
            If :meth:`fit` has not been called yet.
        StrataError
            If `sname` is ``None``.

        """
        self.inspect

        if self.sname is None:
            raise StrataError(
                "'sname' cannot be none for data compressing. Refer to"
                " :func:`~watex.utils.hydroutils.reduce_samples` for"
                " pure examples.")

        # only the first item returned by ``reduce_samples`` is kept
        sqdat = reduce_samples(
            self.data_,
            sname=self.sname,
            zname=self.zname,
            kname=self.kname,
            strategy=strategy,
            **rs_kws
            )[0]
        return sqdat

    def get_base_stratum(self, stratum=None):
        """Select the base stratum

        Parameters
        -----------
        stratum: str, optional
            Name of the base stratum. Must be self contain as an item of the
            strata data. Note that if `stratum` is passed, the auto-detection
            of base stratum is not triggered. It returns the same stratum.

        Returns
        ---------
        base_stratum : str
            the most recurrent stratum in the data. The value is also kept
            in the attribute ``base_stratum_``.

        Raises
        ------
        NotFittedError
            If :meth:`fit` has not been called yet.

        """
        self.inspect

        self.base_stratum_ = select_base_stratum(
            self.data_,
            sname=self.sname,
            stratum=stratum,
            return_counts=False,
            return_rate=False,
            )
        return self.base_stratum_

    @property
    def inspect(self):
        """ Inspect object whether is fitted or not.

        Returns 1 when the attribute ``data_`` exists, otherwise raises
        :class:`NotFittedError`.
        """
        msg = ("{obj.__class__.__name__} instance is not fitted yet."
               " Call 'fit' with appropriate arguments before using"
               " this method"
               )

        if not hasattr(self, 'data_'):
            raise NotFittedError(msg.format(
                obj=self)
            )
        return 1

    def __repr__(self):
        """ Pretty format for programmer guidance following the API... """
        names = ("kname", "zname", "aqname", "sname", "verbose")
        params = ', '.join(f"{k}={getattr(self, k)!r}" for k in names)
        # Built directly rather than through ``str.format`` so that braces
        # inside an attribute value cannot be mistaken for format fields.
        return f"<{self.__class__.__name__!r}:{params}>"

    def __getattr__(self, name):
        # Only reached when the normal attribute lookup fails; the handling
        # is delegated to the module-level helper ``_getattr_``.
        _getattr_(self, name)
           
HData.__doc__="""\
Hydro-log data abstract base class. It cannot be instantiated directly. 

Hydro-log data is a mixed data composed of logging data, borehole data 
and geological data. To use only the logging data, it is recommended to 
use :class:`~watex.methods.hydro.Logging` instead. 


Parameters 
------------
{params.core.kname}
{params.core.zname}
{params.base.aqname}
{params.base.sname}
{params.core.verbose}

""".format (params =_param_docs , 
)     
    
    

class AqSection(HData):
    # The public class documentation is assigned to ``AqSection.__doc__``
    # right after the class body.

    def __init__(
            self,
            aqname=None,
            kname=None,
            zname=None,
            **kws
            ):
        # remaining keywords (e.g. ``sname``, ``verbose``) go to HData
        super().__init__(
            kname=kname,
            aqname=aqname,
            zname=zname,
            **kws
            )

    def findSection(
        self,
        z=None,
        depth_unit="m"
        ):
        """ Find aquifer valid section (upper and lower section )

        Parameters
        -----------
        z: array-like 1d, pandas.Series
            Array of depth or a pandas series that contains the depth values.
            Two  dimensional array or more is not allowed. However when `z`
            is given as  a dataframe and `zname` is not supplied, an error
            raises since `zname` is used to fetch and overwritten `z`
            from the dataframe.

        depth_unit: str, default='m'
            Unit label appended to the message printed when `verbose` is
            set. It is only used for display.

        Returns
        --------
        self.section_: list of float
            valid upper and lower section in SI units (m) if depth values are
            given in meters. The value is also kept in the attribute
            ``section_``.

        Raises
        ------
        NotFittedError
            If :meth:`fit` has not been called yet.

        """
        self.inspect

        # only the first item returned by ``get_aquifer_sections`` is kept
        self.section_ = get_aquifer_sections(
            self.data_,
            zname=self.zname,
            kname=self.kname,
            return_data=False,
            return_index=False,
            z=z,
            )[0]
        if self.verbose:
            print("### The valid section of aquifer is {} to {} {}."
                  .format(self.section_[0], self.section_[-1],
                          depth_unit)
                  )
        return self.section_



AqSection.__doc__="""\
Aquifer section class 

Get the section of each aquifer from dataframe. 

The unique section 'upper' and 'lower' is the valid range of the whole 
data to consider as valid data. Indeed, computing the aquifer section 
is necessary to shrink the data of the whole boreholes. Mostly, the data 
from the section is considered the valid data as the predictor Xr. Out of 
the range of the aquifer section, data can be discarded or compressed to 
top Xr. 

Parameters 
------------
{params.base.aqname}
{params.core.kname}
{params.core.zname}

""".format(params =_param_docs )    


class MXS(HData):
    # The public class documentation is assigned to ``MXS.__doc__`` right
    # after the class body.

    def __init__(
        self,
        kname=None,
        aqname=None,
        threshold:float=None,
        method:str="naive",
        trailer:str="*",
        keep_label_0:bool=False,
        random_state:int=42,
        n_groups:int=3,
        sep:str=None,
        prefix=None,
        **kws
        ):
        # remaining keywords (e.g. ``zname``, ``sname``, ``verbose``) go to
        # HData
        super().__init__(
            kname=kname,
            aqname=aqname,
            **kws
            )

        self.threshold = threshold
        self.method = method
        self.n_groups = n_groups
        self.trailer = trailer
        self.keep_label_0 = keep_label_0
        self.random_state = random_state
        self.sep = sep
        self.prefix = prefix

    def predictNGA(
        self,
        n_components:int=2,
        return_label=False,
        **NGA_kws
        ):
        """ Predicts Naive Group of Aquifer from Hydro-Log data.

        The categorical features are removed, the remaining data is imputed,
        scaled and reduced with `nPCA`, then clustered into `n_groups`
        groups. Both steps are seeded with `random_state`.

        Parameters
        ------------
        n_components: int, default=2
            Number of dimension to preserve. If `n_components` is ranged
            between float 0. to 1., it indicates the number of variance
            ratio to preserve. If ``None``, the number of variance to
            preserve is ``95%``.
        return_label: bool, default=False
            If `True`, return the NGA label predicted, otherwise return
            :class:`~.MXS` instanciated object. If ``False``, NGA label
            can be fetched using the attribute
            :attr:`watex.hydro.MXS.yNGA_`

        NGA_kws: dict,
            keyword argument passed to :func:`watex.utils.predict_NGA_labels`

        Returns
        --------
        yNGA_ or self : arraylike-1d of naive group of aquifer or
            :class:`~.MXS` instanciated object. The labels and the cluster
            centers are kept in the attributes ``yNGA_`` and
            ``cluster_centers_``.

        Raises
        ------
        NotFittedError
            If :meth:`fit` has not been called yet.

        Example
        --------
        >>> from watex.datasets import load_hlogs
        >>> from watex.methods.hydro import MXS
        >>> hdata = load_hlogs ().frame
        >>> # drop the 'remark' columns since there is no valid data
        >>> hdata.drop (columns ='remark', inplace=True)
        >>> mxs =MXS (kname ='k').fit(hdata) # specify the 'k' column
        >>> y_pred = mxs.predictNGA(return_label=True )
        >>> y_pred [-12:]
        Out[52]: array([1, 3, 1, 3, 3, 3, 3, 1, 3, 3, 3, 3])
        """
        self.inspect

        # imported lazily, as in the rest of this module
        from ..analysis.dimensionality import nPCA
        from ..utils.mlutils import (
            naive_imputer,
            naive_scaler
        )

        X = to_numeric_dtypes(
            self.data_,
            pop_cat_features=True,
            verbose=self.verbose
            )
        X = nPCA(naive_scaler(naive_imputer(X)),
                 n_components=n_components,
                 random_state=self.random_state,
                 view=False,
                 return_X=True,
                 plot_kws=dict(),
                 )
        self.yNGA_, self.cluster_centers_ = predict_NGA_labels(
            X, n_clusters=self.n_groups,
            return_cluster_centers=True,
            keep_label_0=self.keep_label_0,
            random_state=self.random_state,
            **NGA_kws
            )
        return self.yNGA_ if return_label else self

    def makeyMXS(
        self,
        y_pred=None,
        func:callable=None,
        categorize_k=False,
        default_func=False,
        **mxs_kws
        ):
        r""" Construct the MXS target :math:`y*`

        Parameters
        -----------
        y_pred: Array-like 1d, pandas.Series
            Array composing the valid NGA labels. Note that NGA labels is  a
            predicted labels mostly using the unsupervising learning. If not
            given, the labels stored by :meth:`predictNGA` in ``yNGA_`` are
            used.

            :seealso: :func:`~predict_NGA_labels` for further details.

        func: callable
            Function to specifically map the permeability coefficient column
            in the dataframe of serie. If not given, the default function can
            be enabled instead from param `default_func`.

        categorize_k: bool, default=False
            Forwarded as `categorize_k` to
            :func:`~.watex.utils.make_MXS_labels`; used in the example below
            together with `default_func` to classify 'k' with the default
            mapping.

        default_func: bool, default=False
            Default function for mapping k is setting to ``True``. Note that,
            this could probably not fitted your own data. So  it is
            recommended to provide your own function for mapping 'k'. However
            the default 'k' mapping is given as follow:

            - k0 {0}: k = 0
            - k1 {1}: 0 < k <= .01
            - k2 {2}: .01 < k <= .07
            - k3 {3}: k> .07

        mxs_kws: dict,
            Additional keyword arguments passed to
            :func:`~.watex.utils.make_MXS_labels`.

        Returns
        --------
        MXS.mxs_labels_: array-like 1d
            array like of MXS labels. Every key of the object returned by
            :func:`~.watex.utils.make_MXS_labels` is also set as an
            attribute of the instance.

        Raises
        ------
        NotFittedError
            If :meth:`fit` has not been called yet.
        kError
            If no permeability coefficient data is available (``k_`` is
            ``None``).
        AquiferGroupError
            If `y_pred` is not given and :meth:`predictNGA` has not been
            called.

        Example
        --------
        >>> from watex.datasets import load_hlogs
        >>> from watex.methods.hydro import MXS
        >>> hdata = load_hlogs ().frame
        >>> # drop the 'remark' columns since there is no valid data
        >>> hdata.drop (columns ='remark', inplace=True)
        >>> mxs =MXS (kname ='k').fit(hdata) # specify the 'k'columns
        >>> # we can predict the NGA labels and yMXS with single line
        >>> # of code snippet using the default 'k' classification.
        >>> ymxs = mxs.predictNGA().makeyMXS(categorize_k=True, default_func=True)
        >>> mxs.yNGA_[:7]
        ... array([2, 2, 2, 2, 2, 2, 2])
        >>> ymxs[:7]
        Out[40]: array([22, 22, 22, 22, 22, 22, 22])
        >>> mxs.mxs_group_classes_
        Out[56]: {1: 1, 2: 22, 3: 3} # transform classes
        >>> mxs.mxs_group_labels_
        Out[57]: (2,)
        >>> # **comment:
            # # only the label '2' is tranformed to '22' since
            # it is the only one that has similariry with the true label 2
        """
        self.inspect

        if self.k_ is None:
            raise kError("'k' data for permeability coefficient cannot"
                         " be None. Specify the name of the column 'kname'"
                         " that fits the permeability coefficient values"
                         " in the hydro-log dataset."
                         )

        # Fall back on the labels predicted by ``predictNGA`` only when the
        # caller does not supply any.
        if y_pred is None:
            if not hasattr(self, 'yNGA_'):
                raise AquiferGroupError(
                    "y_pred for Naive Group of Aquifer (NGA) cannot be "
                    " None. Use :meth:`~predictNGA` method or"
                    " :func:`~.watex.utils.predict_NGA_labels` to"
                    " predict NGA labels first."
                    )
            y_pred = self.yNGA_

        mxs_obj = make_MXS_labels(
            self.k_,
            y_pred,
            threshold=self.threshold,
            trailer=self.trailer,
            method=self.method,
            return_groups=False,
            return_obj=True,
            kname=self.kname,
            keep_label_0=self.keep_label_0,
            sep=self.sep,
            prefix=self.prefix,
            inplace=False,
            categorize_k=categorize_k,
            default_func=default_func,
            func=func,
            **mxs_kws
            )
        # expose every item of the returned object on the instance
        for key in mxs_obj.keys():
            setattr(self, key, mxs_obj[key])
        return mxs_obj.mxs_labels_

    def labelSimilarity(
        self,
        func:callable=None,
        categorize_k=False,
        default_func=False,
        **sm_kws
        ):
        """Find label similarities

        The similarity is searched between the permeability coefficient
        data ``k_`` and the aquifer group data ``aq_``, so both `kname` and
        `aqname` must match a column of the fitted data.

        Parameters
        -----------
        func: callable
            Function to specifically map the permeability coefficient column
            in the dataframe of serie.

        categorize_k: bool, default=False
            Whether 'k' should be categorized.

        default_func: bool, default=False
            Whether the default 'k' mapping should be used:

            - k0 {0}: k = 0
            - k1 {1}: 0 < k <= .01
            - k2 {2}: .01 < k <= .07
            - k3 {3}: k> .07

        sm_kws: dict,
            Additional keyword arguments passed to
            :func:`~.watex.utils.find_similar_labels`.

        Returns
        -------
        similar_labels:
            The value returned by :func:`~.watex.utils.find_similar_labels`.

        Raises
        ------
        NotFittedError
            If :meth:`fit` has not been called yet.
        kError
            If ``k_`` is ``None``.
        AquiferGroupError
            If ``aq_`` is ``None``.

        """
        # NOTE(review): `func`, `categorize_k` and `default_func` are
        # accepted but currently not forwarded to `find_similar_labels`;
        # confirm against its signature whether they should be.
        self.inspect

        msg = ("{0!r} data for {1} cannot be None. Specify the name of the "
               "column {2!r} that fits the {1} values in the hydro-log dataset."
               )
        if self.k_ is None:
            raise kError(msg.format("k", "permeability coefficient", "kname"))
        if self.aq_ is None:
            raise AquiferGroupError(msg.format(
                "aq", "aquifer groups", "aqname")
            )

        similar_labels = find_similar_labels(
            self.k_,
            self.aq_,
            threshold=self.threshold,
            keep_label_0=self.keep_label_0,
            method=self.method,
            return_groups=False,
            **sm_kws
            )
        return similar_labels


    
MXS.__doc__="""\
Mixture Learning Strategy (MXS)    

The use of machine learning for k-parameter prediction seems an alternative
way to reduce the cost of data collection thereby saving money. However, 
the borehole data comes with a lot of missing k  since the parameter is 
strongly tied to the aquifer after the pumping test. In other words, the 
k-parameter collection is feasible if the layer in the well is an aquifer. 
Unfortunately, predicting some samples of k in a large set of missing data 
remains an issue using the classical supervised learning methods. We, 
therefore propose an alternative approach called a mixture learning 
strategy (MXS) to solve these double issues. It entails predicting upstream 
a naïve group of aquifers (NGA) combined with the real values k to 
counterbalance the missing values and yield an optimal prediction score. 
The method, first, implies the K-Means and Hierarchical Agglomerative 
Clustering (HAC) algorithms. K-Means and HAC are used for the NGA label 
prediction, which is necessary for the MXS label merging. 


Parameters 
-----------

{params.core.kname} 
{params.base.aqname}

threshold: float, default=None 
    The threshold from which, label in 'k' array can be considered  
    similar than the one in NGA labels 'y_pred'. The default is 'None' which 
    means no rule is considered and the high preponderance or occurrence 
    in the data compared to other labels is considered as the most 
    representative  and similar. Setting the rule instead by fixing 
    the threshold is recommended especially in a huge dataset.

n_groups : int, default=3
    The number of aquifer groups to form as well as the number of
    centroids to generate. If there is an idea about the number of aquifer 
    groups in the area, it should be used instead. However, it is 
    recommended to validate this number using the 'elbow plot' or the 
    'silhouette plot' or the Hierarchical Agglomerative Clustering 
    dendrogram. Refer to :func:`~watex.utils.plot_elbow` or 
    :func:`~.watex.view.plotSilhouette` 
    or :func:`~.watex.view.plotDendrogram` for plotting purpose. 
            
keep_label_0: bool, default=False
    The prediction already include the label 0. However, including 0 in 
    the predicted label refers to 'k=0' i.e. no permeability coefficient 
    equals to 0, which is not True in principle, because all rocks  have 
    a permeability coefficient 'k'. Here we considered 'k=0' as an undefined 
    permeability coefficient. Therefore, '0' , can be excluded since, it can 
    also be considered as a missing 'k'-value. If predicted '0' is in the 
    target it should mean a missing 'k'-value rather than being a concrete 
    label. Therefore, to avoid any confusion, '0' is altered to '1' so the 
    value `+1` is used to move forward all class labels thereby excluding 
    the '0' label. To force include 0 in the label, set `keep_label_0` 
    to ``True``.

random_state: int, default=42
    Seed passed to the dimensionality reduction and to the NGA label 
    prediction when calling :meth:`predictNGA`. 
    
sep: str, default=None 
    Separator between the true labels 'y_true' and predicted NGA labels.
    Sep is used to rewrite the MXS labels. Mostly the MXS labels is a 
    combination of the true label of permeability coefficient 'k' and 
    the label of NGA to compose new similarity labels. For instance 
     
    >>> true_labels=['k1', 'k2', 'k3'] ; NGA_labels =['II', 'I', 'UV']
    >>> # gives 
    >>> MXS_labels= ['k1_II', 'k2_I', 'k3_UV']
 
    where the separator `sep` is set to ``_``. This happens especially 
    when one of the label (NGA or true_labels) is not a numeric datatype 
    and a similarity is found between 'k1' and 'II', 'k2' and 'I' and so on.
     
prefix: str, default=None
    prefix is used to rename the true_labels i.e the true valid-k. For
    instance::
        
        >>> k_valid =[1, 2, ..] -> k_new = [k1, k2, ...]
        
    where 'k' is the prefix. 
     
method: str ['naive', 'strict'], default='naive'
    The kind of strategy to compute the representativity of a label 
    in the predicted array 'y_pred'. It can also be 'strict'. Indeed:
     
    - ``naive`` computes the importance of the label by the number of its
        occurrence for this specific label in the array 'y_true'. It does 
        not take into account the occurrence of other existing labels. This 
        is useful for unbalanced class labels in `y_true`.
    - ``strict`` computes the importance of the label by the number of 
        occurrence in the whole valid `y_true` i.e. under the total of 
        occurrence of all the labels that exist in the whole 'arra_aq'. 
        This can give a suitable analysis result if the data is not 
        unbalanced for each labels in `y_pred`.
         
trailer: str, default='*'
    The Mixture strategy marker to differentiate the existing class label  
    in 'y_true' with the predicted labels 'y_pred' especially when  
    the same class labels are also present in the true label with the 
    same label-identifier name. This is useful to avoid any confusion  for
    both labels  in `y_true` and `y_pred` for better demarcation and 
    distinction. Note that if the `trailer` is set to ``None`` and both 
    `y_true` and `y_pred` are numeric data, the labels in `y_pred` are 
    systematically renamed to be distinct with the ones in the 'y_true'. 
    For instance :: 
         
        >>> true_labels=[1, 2, 3] ; NGA_labels =[0, 1, 2]
        >>> # with trailer , MXS labels should be 
        >>>  MXS_labels= ['0', '1*', '2*', '3'] # 1 and 2 are in true_labels 
        >>> # with no trailer 
        >>> MXS_labels= [0, 4, 5, 3] # 1 and 2 have been changed to [4, 5]
         
{params.core.verbose}

Examples 
---------
>>> from watex.datasets import load_hlogs 
>>> from watex.methods.hydro import MXS 
>>> hdata= load_hlogs (as_frame =True) 
>>> # drop the 'remark' columns since there is no valid data 
>>> hdata.drop (columns ='remark', inplace =True)
>>> mxs = MXS (kname ='k').fit(hdata)
>>> # predict the default NGA 
>>> mxs.predictNGA() # default prediction with n_groups =3 
>>> # make MXS labels using the default 'k' categorization 
>>> ymxs=mxs.makeyMXS(categorize_k=True, default_func=True)
>>> mxs.yNGA_ [62:74] 
Out[43]: array([1, 2, 2, 2, 3, 1, 2, 1, 2, 2, 1, 2])
>>> ymxs[62:74] 
Out[44]: array([ 1, 22, 22, 22,  3,  1, 22,  1, 22, 22,  1, 22]) 
>>> # to get the label similarity, provide the column name of the 
>>> # aquifer group and fit again like 
>>> mxs = MXS (kname ='k', aqname ='aquifer_group').fit(hdata)
>>> sim = mxs.labelSimilarity() 
>>> sim 
Out[47]: [(0, 'II')] # group II and label 0 are very similar 
""" .format(
params =_param_docs 
)   


class Logging:
    """
    Logging class

    Only deals with numerical values. If categorical values are found in
    the logging dataset, they are discarded.

    Parameters
    -----------
    zname: str, default='depth' or 'None'
        The name of the depth column in `data`. If the name 'depth' is not
        specified as the main depth columns, an other name in the columns
        that matches the depth can also be indicated so the function will put
        aside this columm as depth column for plot purpose. If set to
        ``None``, `zname` holds the name ``depth`` and assumes that depth
        exists in `data` columns.

    kname: str, int
        Name of permeability coefficient columns. `kname` allows to retrieve
        the permeability coefficient 'k' in  a specific dataframe. If integer
        is passed, it assumes the index of the dataframe  fits the 'k'
        columns. Note that integer value must not be out the dataframe size
        along axis 1. Commonly `kname` needs to be supplied when a dataframe
        is passed as a positional or keyword argument.

    verbose: int, default=0
        Verbosity level, forwarded to the data conversion done in
        :meth:`fit`.

    Examples
    ----------
    >>> from watex.datasets import load_hlogs
    >>> from watex.methods.hydro import Logging
    >>> # get the logging data
    >>> h = load_hlogs ()
    >>> h.feature_names
    Out[29]:
    ['hole_id',
     'depth_top',
     'depth_bottom',
     'strata_name',
     'rock_name',
     'layer_thickness',
     'resistivity',
     'gamma_gamma',
     'natural_gamma',
     'sp',
     'short_distance_gamma',
     'well_diameter']
    >>> # we can fit to collect the valid logging data
    >>> log= Logging(kname ='k', zname='depth_top' ).fit(h.frame[h.feature_names])
    >>> log.feature_names_in_ # categorical features should be discarded.
    Out[33]:
    ['depth_top',
     'depth_bottom',
     'layer_thickness',
     'resistivity',
     'gamma_gamma',
     'natural_gamma',
     'sp',
     'short_distance_gamma',
     'well_diameter']
    >>> log.plot ()
    Out[34]: Logging(zname= depth_top, kname= k, verbose= 0)
    >>> # plot log including the target y
    >>> log.plot (y = h.frame.k , posiy =0 )# first position
    Logging(zname= depth_top, kname= k, verbose= 0)

    """
    def __init__(
        self,
        zname=None,
        kname=None,
        verbose=0
        ):

        self._logging = watexlog.get_watex_logger(self.__class__.__name__)
        self.zname = zname
        self.kname = kname
        self.verbose = verbose

    def fit(
        self,
        data,
        **fit_params
        )->"Logging":
        """
        Fit logging data and populate attributes

        Parameters
        -----------
        data : Dataframe of shape (n_samples, n_features)
            where `n_samples` is the number of data, expected to be the data
            collected at different depths and `n_features` is the number of
            columns (features) that supposed to be plot.
            Note that `data` must include the ``depth`` columns. If not
            given a relative depth should be created according to the number
            of samples that composes `data`.

        fit_params: dict,
            Additional keyword arguments passed to
            :func:`~.watex.utils.funcutils.to_numeric_dtypes`.

        Returns
        -------
        self: object instanciated for chaining methods.

        Notes
        -----
        After fitting, the object holds ``data_`` (the data converted with
        ``pop_cat_features=True``) and ``feature_names_in_`` (the column
        names of ``data_``).

        """

        data = check_array(
            data,
            force_all_finite="allow-nan",
            dtype=object,
            input_name="data",
            to_frame=True,
            )
        self.data_ = to_numeric_dtypes(
            data, pop_cat_features=True,
            verbose=self.verbose,
            **fit_params
            )
        self.feature_names_in_ = list(self.data_)

        return self

    def plot(
        self,
        normalize=False,
        impute_nan=True,
        log10=False,
        posiy=None,
        fill_value=None,
        **plot_kws
        ):
        """ Plot the logging data

        Parameters
        -----------
        normalize: bool, default = False
            Normalize all the data to be range between (0, 1) except the
            `depth`,

        impute_nan: bool, default=True,
            Replace the NaN values in the dataframe. Note that the default
            behaviour for replacing NaN is the ``mean``. However if the
            argument of `fill_value` is provided,the latter should be used
            to replace 'NaN' in `X`.

        log10: bool, default=False
            Convert values to log10. This can be usefull when using the
            logarithm data. However, it seems not all the data can be used
            this operation, for instance, a negative data. In that case,
            `column_to_skip` argument is usefull to provide so to skip that
            columns when converting values to log10.

        fill_value : str or numerical value, optional
            When strategy == "constant", fill_value is used to replace all
            occurrences of missing_values.
            If left to the default, fill_value will be 0 when imputing
            numerical data and "missing_value" for strings or object data
            types. If not given and `impute_nan` is ``True``, the mean
            strategy is used instead.

        posiy: int, optional
            the position to place the target plot `y` . By default the target
            plot if given is located at the last position behind the logging
            plots.

        plot_kws: dict,
            Additional keyword arguments passed to
            :func:`~watex.utils.plotutils.plot_logging` (for instance the
            target `y` shown in the class example).

        Returns
        -------
        self: object instanciated for chaining methods.

        Raises
        ------
        NotFittedError
            If :meth:`fit` has not been called yet.

        """
        self.inspect

        # imported lazily, as in the rest of this module
        from ..utils.plotutils import plot_logging

        plot_logging(
            self.data_,
            tname=self.kname,
            zname=self.zname,
            normalize=normalize,
            impute_nan=impute_nan,
            log10=log10,
            posiy=posiy,
            fill_value=fill_value,
            **plot_kws
            )

        return self

    def __repr__(self):
        """ Pretty format for programmer guidance following the API... """
        return repr_callable_obj(self)

    def __getattr__(self, name):
        # Only reached when the normal attribute lookup fails; the handling
        # is delegated to the module-level helper ``_getattr_``.
        _getattr_(self, name)

    @property
    def inspect(self):
        """ Inspect object whether is fitted or not.

        Returns 1 when the attribute ``data_`` exists, otherwise raises
        :class:`NotFittedError`.
        """
        msg = ("{obj.__class__.__name__} instance is not fitted yet."
               " Call 'fit' with appropriate arguments before using"
               " this method"
               )

        if not hasattr(self, 'data_'):
            raise NotFittedError(msg.format(
                obj=self)
            )
        return 1

    

[docs]
class AqGroup (HData):
    # NOTE: the public class docstring is attached after the class body 
    # through ``AqGroup.__doc__`` because it is a template that is filled 
    # with the shared parameter descriptions. 
    def __init__ (
            self, 
            kname =None, 
            aqname =None,
            method="naive", 
            keep_label_0=False, 
            **kws
            ): 
        # The column names and any remaining keywords are handled by the 
        # parent initializer. 
        super().__init__(
            kname =kname,
            aqname=aqname, 
            **kws
            )
        self.method=method
        self.keep_label_0=keep_label_0 
        

    def findGroups (
        self , 
        method="naive", 
        default_arr = None, 
        **g_kws 
        ):
        """ Find the existing groups between the permeability coefficient 
        `k` and the group of aquifer. 
        
        It computes the occurrence between the true labels and the group 
        of aquifer as a function of occurrence and representativity. 
        
        Parameters 
        ----------
        method: str ['naive', 'strict'], default='naive'
            The kind of strategy to compute the representativity of a label 
            in the predicted array 'y_pred'. It can also be 'strict'. Indeed:
            
            - ``naive`` computes the importance of the label by the number 
                of its occurrences for this specific label in the array 
                'y_true'. It does not take into account the occurrences of 
                the other existing labels. This is useful for unbalanced 
                class labels in `y_true`.
            - ``strict`` computes the importance of the label by the number 
                of occurrences in the whole valid `y_true` i.e. under the 
                total of occurrences of all the labels that exist in the 
                whole 'arr_aq'. This can give suitable analysis results if 
                the data is not unbalanced for each label in `y_pred`.
                
            This per-call value is the one handed to 
            :func:`find_aquifer_groups`; the `method` stored on the 
            instance by the constructor is not consulted here. 
            
        default_arr: optional 
            Forwarded unchanged to :func:`find_aquifer_groups` when it is 
            not ``None``. Refer to that function for how it is used. 
            
        g_kws: dict 
            Additional keyword arguments passed to 
            :func:`find_aquifer_groups`. The most useful one is: 
                
            keep_label_0: bool 
                Defaults to the `keep_label_0` value given to the 
                constructor (``False`` unless specified). 
                The prediction already includes the label 0. However, 
                including 0 in the predicted label refers to 'k=0' i.e. a 
                permeability coefficient equal to 0, which is not true in 
                principle, because all rocks have a permeability 
                coefficient 'k'. Here 'k=0' is considered as an undefined 
                permeability coefficient. Therefore, '0' can be excluded 
                since it can also be considered as a missing 'k'-value. If 
                the predicted '0' is in the target, it should mean a 
                missing 'k'-value rather than being a concrete label. 
                Therefore, to avoid any confusion, '0' is altered to '1' 
                so the value `+1` is used to move forward all class labels 
                thereby excluding the '0' label. To force including 0 in 
                the labels, set `keep_label_0` to ``True``.
        
        Returns
        --------
        g: _Group: :class:`~.box._Group` class object 
            Use attribute `.groups` to find the group values. 
            
        Raises 
        ------
        kError
            If the permeability coefficient data ``k_`` is ``None``. 
        AquiferGroupError
            If the aquifer group data ``aq_`` is ``None``. 
                 
        """
        # Evaluated for its side effect only: the property is expected to 
        # raise when the object is not fitted yet. 
        self.inspect
        
        msg =("{0!r} data for {1} cannot be None. Specify the name of the "
              "column {2!r} that fits the {1} values in the hydro-log dataset."
              )
        if self.k_ is None: 
            raise kError (msg.format("k","permeability coefficient", "kname" ))
        if self.aq_ is None: 
            raise AquiferGroupError(msg.format(
                "aq", "aquifer groups", "aqname")
            )

        # Honour the option stored by the constructor unless the caller 
        # explicitly overrides it for this call. 
        g_kws.setdefault("keep_label_0", self.keep_label_0)
        # `default_arr` used to be accepted and then silently dropped; 
        # forward it only when supplied so default calls are unchanged. 
        if default_arr is not None: 
            g_kws["default_arr"] = default_arr

        g= find_aquifer_groups(
            self.k_, self.aq_,
            kname=self.kname , 
            aqname = self.aqname,
            method=method, 
            **g_kws
            )
        return g 


    
# The class docstring is assigned here, after the class body, because it is 
# a ``str.format`` template: the ``{params.core.kname}`` and 
# ``{params.base.aqname}`` fields are filled from `_param_docs`. Literal 
# braces in the example output are therefore written doubled (``{{ }}``). 
AqGroup.__doc__="""\
Group of Aquifer is mostly related to area information after multiple 
boreholes collected. 

However when predicted 'k' with a missing k-values using the Mixture 
Learning Strategy (MXS), we intend to solve this problem by creating 
a Naive Group of Aquifer (NGA) to compensate the missing k-values in the 
dataset. This could be a good idea to avoid introducing a lot of bias since 
the group of aquifer is mostly tied to the permeability coefficient 'k'. 
To do this, an unsupervised learning is used to predict the NGA labels then 
the NGA labels are used in turn to fill the missing k-values. The best 
strategy for operting this trick is to  seek for some importances between
the true k-values with their corresponding aquifer groups at each depth, 
and find the most representative group. Once the most representative group 
is found for each true label 'k', the group of aquifer can be renamed as 
the naive similarity with the true k-label. For instance if true k-value 
is the label 1 and label 1 is most representative with the group of aquifer
'IV', therefore this group can be replaced throughout the column 
with 'k1'+'IV=> i.e. 'k14'. This becomes a new label created and is used to 
fill the true label 'y_true' to become a MXS target ( include NGA label). 
Note that the true label with valid 'k-value' remained intact and unchanged.
The same process is done for label 2, 3 and so on. The selection of MXS 
label from NGA strongly depends on its preponderance or importance rate in 
the whole dataset. 

The following example is the demonstration to how to compute the group 
representativity in datasets. 

Parameters 
----------
{params.core.kname}
{params.base.aqname}

g:dict, 
    Dictionnary compose of occurence between the true labels 
    and the group of aquifer  as a function of occurence and
    repesentativity 
Example 
--------
>>> from watex.methods.hydro import AqGroup 
>>> hg = AqGroup (kname ='k', aqname='aquifer_group').fit(hdata ) 
>>> hg.findGroups () 
Out[25]: 
 _Group(Label=[' 0 ', 
                   Preponderance( rate = ' 100.0  %', 
                                [('Groups', {{'II': 1.0}}),
                                 ('Representativity', ( 'II', 1.0)),
                                 ('Similarity', 'II')])],
             )                 
""".format(params = _param_docs)

#XXX TODO 

[docs]
# Imported next to its single use: the module header only brings in `ABC` 
# and the deprecated `abstractclassmethod` from :mod:`abc`. 
from abc import abstractmethod


class Hydrogeology(ABC):
    """ 
    A branch of geology concerned with the occurrence, use, and functions of 
    surface water and groundwater. 
    
    Hydrogeology is the study of groundwater – it is sometimes referred to as
    geohydrology or groundwater hydrology. Hydrogeology deals with how water 
    gets into the ground (recharge), how it flows in the subsurface 
    (through aquifers) and how groundwater interacts with the surrounding soil 
    and rock (the geology).
    
    Indeed, hydrogeologists apply this knowledge to many practical uses. 
    They might:
        
    * Design and construct water wells for drinking water supply, irrigation 
        schemes and other purposes;
    * Try to discover how much water is available to sustain water supplies 
        so that these do not adversely affect the environment – for example, 
        by depleting natural baseflows to rivers and important wetland 
        ecosystems;
    * Investigate the quality of the water to ensure that it is fit for its 
        intended use; 
    * Where the groundwater is polluted, they design schemes to try and 
        clean up this pollution;
    * Design construction dewatering schemes and deal with groundwater 
        problems associated with mining; 
    * Help to harness geothermal energy through groundwater-based heat 
        pumps.
        
    Parameters 
    ----------
    kwd: dict 
        Accepted for signature compatibility; not used by this base 
        initializer. 
        
    Notes 
    -----
    The class is abstract: `__init__` is declared with 
    :func:`abc.abstractmethod`, so a concrete subclass must define its own 
    `__init__`. It may call ``super().__init__()`` to get the logger stored 
    in the ``_logging`` attribute of the instance. 
    """
    # `abstractmethod` rather than `abstractclassmethod`: the latter wraps 
    # the function in a classmethod, so `self` would be bound to the class 
    # (not the instance) and the logger would be named after the metaclass. 
    @abstractmethod 
    def __init__(
        self, 
        **kwd
        ): 
        self._logging = watexlog.get_watex_logger(self.__class__.__name__)

        
#xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
def _getattr_(self, name):
    """ Build and raise the :class:`AttributeError` for a failed lookup. 
    
    The missing `name` is compared with the instance ``__dict__`` through 
    :func:`smart_strobj_recognition`; when that call returns something other 
    than ``None``, it is appended to the message as a "Do you mean" hint. 
    The name ``'yNGA_'`` gets a dedicated hint instead. 
    
    Raises
    ------
    AttributeError
        Always. 
    """
    suggestion = smart_strobj_recognition(name, self.__dict__, deep =True)

    if name =='yNGA_': 
        hint = ". Call 'predictNGA' method to fetch attribute 'yNGA_'"
    elif suggestion is None: 
        hint = ""
    else: 
        hint = f'. Do you mean {suggestion!r}?'

    raise AttributeError (
        f'{self.__class__.__name__!r} object has no attribute {name!r}'
        f'{hint}'
        )