# Source code for watex.datasets.sets

# -*- coding: utf-8 -*-
#   License: BSD-3-Clause
#   Author: LKouadio <etanoyau@gmail.com>

""" 
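Gather all the dataset loaders (``load_<name>``), the ``make_erp`` and 
``make_ves`` helpers and the generic ``fetch_data`` entry point.  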
"""
from warnings import warn 

# Availability flag for the optional ``_fetch_data`` helper: it starts as
# False and is switched to True further down once that import succeeds.
fi=False

# Names (tags) of the datasets known to this module. Each tag has a matching
# ``load_<tag>`` function imported below.
_DTAGS=(
    "bagoue" , 
    "gbalo", 
    "iris", 
    "semien", 
    "tankesse", 
    "boundiali",
    "hlogs",
    "nlogs", 
    "mxs", 
    "huayuan", 
    "edis"
    )

from .dload import (
    load_bagoue , 
    load_gbalo, 
    load_iris, 
    load_semien, 
    load_tankesse , 
    load_boundiali,
    load_hlogs,
    load_nlogs, 
    load_huayuan, 
    load_edis, 
    load_mxs, 
    ) 
from .gdata import ( 
    make_erp , 
    make_ves 
    )
# ``_fetch_data`` is optional: if its import fails, a warning is issued and
# ``fi`` keeps its initial value; otherwise ``fi`` is switched to True so that
# ``fetch_data`` can fall back on it.
try : 
    from ._config import _fetch_data
except ImportError: 
    warn ("'fetch_data' seems not respond. Use 'load_<area name>'"
          " instead.")
else: fi=True 
    

# Public names of this module: the dataset loaders, the generic
# ``fetch_data`` entry point, the ``make_*`` helpers and the ``DATASET``
# description object.
__all__=[ 
         "load_bagoue" ,
         "load_gbalo", 
         "load_iris", 
         "load_semien", 
         "load_tankesse", 
         "load_boundiali",
         "load_hlogs",
         "load_nlogs", 
         "load_huayuan", 
         "fetch_data",
         "load_edis",
         "load_mxs", 
         "make_erp" , 
         "make_ves", 
         "DATASET"
         ]


# [docs]
def fetch_data(tag, **kws):
    # NOTE: the user-facing documentation of this function is attached to
    # ``fetch_data.__doc__`` right after the definition.

    # Sanitize the tag first; only the 'bagoue' dataset may keep a suffix.
    tag = _parse_tags(tag, multi_kind_dataset='bagoue')

    # Dedicated loaders, addressed by the part of their name that follows
    # the 'load_' prefix.
    loaders = (
        load_bagoue,
        load_gbalo,
        load_iris,
        load_semien,
        load_tankesse,
        load_boundiali,
        load_hlogs,
        load_nlogs,
        load_huayuan,
        load_edis,
        load_mxs,
    )
    names = [loader.__name__.replace('load_', '') for loader in loaders]

    if tag in names:
        fetcher = loaders[names.index(tag)]
    else:
        # No dedicated loader: fall back on the optional generic fetcher,
        # which is only usable when its import succeeded (``fi`` is True).
        fetcher = _fetch_data if fi else None

    if not callable(fetcher):
        return None
    return fetcher(tag=tag, data_names=names, **kws)



# Docstring of ``fetch_data``, attached after the function definition.
fetch_data.__doc__ ="""\
Fetch dataset from `tag`. 

A tag corresponds to the name of the area of data collection or to a 
level of data processing. 

Parameters 
----------
tag: str 
    Name of the area of data to fetch. Known names are 'bagoue', 'gbalo', 
    'iris', 'semien', 'tankesse', 'boundiali', 'hlogs', 'nlogs', 'mxs', 
    'huayuan' and 'edis'. For instance, setting the tag to ``bagoue`` 
    will load the bagoue datasets. If the `tag` name is followed by a suffix, 
    the latter specifies the stage of the data processing. As an example, 
    `bagoue original` or `bagoue prepared` will retrieve the original data and 
    the transformed data after applying default transformers respectively. 
    Only the 'bagoue' dataset accepts a suffix; for any other dataset the 
    second word is discarded. 
    
    There are different options to retrieve data such as:
        
    * ['original'] => original or raw data -> returns a dict of details 
        context; combine with the ``get`` method to get the dataframe like::
            
            >>> fetch_data ('bagoue original').get ('data=df')
            
    * ['stratified'] => stratification data
    * ['mid' |'semi'|'preprocess'|'fit']=> data cleaned with 
        attributes experience combinations.
    * ['pipe']=>  default pipeline created during the data preparing.
    * ['analyses'|'pca'|'reduce dimension']=> data with text attributes
        only encoded using the ordinal encoder +  attributes  combinations. 
    * ['test'] => stratified test set data

kws: dict 
    Additional keyword arguments passed to the function that loads the 
    selected dataset. 

Returns
-------
dict, X, y : frame of :class:`~watex.utils.box.Boxspace` object 
    If tag is followed by a suffix in the case of 'bagoue' area, it returns:
        - `data`: Original data 
        - `X`, `y` : Stratified train set and training target 
        - `X0`, `y0`: data cleaned after dropping useless features and combined 
            numerical attributes combinations if ``True``
        - `X_prepared`, `y_prepared`: Data prepared after applying  all the 
            transformation via the transformer (pipeline). 
        - `XT`, `yT` : stratified test set and test label  
        - `_X`: Stratified training set for data analysis. So no sparse
            matrix is contained. The text attributes (categorical) are 
            converted using Ordinal Encoder.  
        - `_pipeline`: the default pipeline. 
    ``None`` is returned when no loading function is available for `tag`. 

Examples 
--------
>>> from watex.datasets import fetch_data 
>>> b = fetch_data('bagoue' ) # no suffix returns 'Boxspace' object
>>> b.tnames 
... array(['flow'], dtype='<U4')
>>> b.feature_names 
... ['num',
     'name',
     'east',
     'north',
     'power',
     'magnitude',
     'shape',
     'type',
     'sfi',
     'ohmS',
     'lwi',
     'geol']
>>> X, y = fetch_data('bagoue prepared' )
>>> X # is transformed  # ready for prediction 
>>> X[0] 
... <1x18 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>
>>> y
... array([2, 1, 2, 2, 1, 0, ... , 3, 2, 3, 3, 2], dtype=int64)

"""

def _parse_tags (tag, multi_kind_dataset ='bagoue'): 
    """ Parse and sanitize tag to match the different type of datasets.
    
    In principle, only the 'Bagoue' datasets is allowed to contain a tag 
    composed of two words i.e. 'Bagoue' + '<kind_of_data>'. For instance 
    ``bagoue pipe`` fetches only the pipeline used for Bagoue case study  
    data preprocessing and so on. 
    However, for other type of dataset, if a second word <kind_of_data> is 
    passed, it is merely discarded. 
    
    Parameters 
    ----------
    tag: str 
        Name of the dataset, optionally followed by the kind of data. It is 
        converted with ``str()`` before being parsed. 
    multi_kind_dataset: str, default='bagoue'
        Name of the only dataset allowed to carry a second word. 
        
    Returns 
    -------
    str 
        The sanitized tag: 
            
        - a single word that is a known dataset name is returned stripped 
          and lower-cased; 
        - a single unknown word is prefixed with `multi_kind_dataset` and 
          a ``FutureWarning`` is issued; 
        - several words whose first one is not `multi_kind_dataset` are 
          reduced to the lower-cased first word; 
        - anything else (multi-kind tag with its suffix, empty tag) is 
          returned unchanged. 
    """ 
    tag = str(tag)
    words = tag.strip().split()

    if len(words) == 1:
        name = words[0].lower()
        if name in _DTAGS:
            # Return the normalized name rather than the raw input so that
            # 'Iris' or ' iris ' resolve like 'iris', consistently with
            # the multi-word branch below which already lower-cases.
            return name

        tag = multi_kind_dataset + ' ' + words[0]

        warn(f"Fetching {multi_kind_dataset.title()!r} data without"
             " explicitly prefixing the kind of data with the area"
             " name will raise an error. In future, the argument"
            f" should be '{tag}' instead.", FutureWarning 
             )
    elif len(words) > 1:
        # only the multi kind dataset is allowed 
        # to contain two words for fetching data 
        if words[0].lower() != multi_kind_dataset:
            tag = words[0].lower()  # skip the second word 
    return tag 

from ..utils.funcutils import listing_items_format

# One "<TAG>: load_<tag>()" entry per known dataset; both fields are
# left-aligned and padded to at least seven characters.
_l = [f"{name.upper():<7}: {'load_' + name:<7}()" for name in _DTAGS]

# Usage text built from the entries above by ``listing_items_format``
# (presumably an introduction, the listed items, then a closing sentence;
# verify against that helper).
_LST = listing_items_format(
    _l,
    "Fetch data using 'load_<type_of_data|area_name>'like",
    " or using ufunc 'fetch_data (<type_of_data|area_name>)'.",
    inline=True,
    verbose=False,
)

# Human-readable overview of the bundled datasets; exposed as ``DATASET.DOC``.
_DDOC="""\
WATex dataset is composed of different kinds of data for software implementation. 
    - ERP data found in 'gbalo', 'boundiali' localities in northern part of 
        Cote d'Ivoire <https://en.wikipedia.org/wiki/Ivory_Coast>
    - VES data collected in 'gbalo', 'semien', 'tankesse' in center and 
        eastern part of Cote d'Ivoire.
    - FLOW RATE FEATURES data computed from Bagoue ERP and VES data. 
        Refer to paper :doi:`https://doi.org/10.1029/2021wr031623`. 
    - COMMON MACHINE LEARNING popular data sets such as IRIS. 
    - EDI datasets from Huayuan county in China, Hunan Province. 
    - HLOGS and MXS data  for Hongliu coal mine raw and preprocessed borehole data. 
"""
    
# Attribute-only class created on the fly to bundle the dataset metadata:
#   KIND -> tuple of the known dataset tags
#   HOW  -> usage text listing the loaders
#   DOC  -> overview text describing the datasets
DATASET= type ("DATASET", (), {"KIND": _DTAGS, 
                               "HOW":_LST, 
                               "DOC":_DDOC, 
                               }
)