# Source code for watex.utils.coreutils

# -*- coding: utf-8 -*-
#   License: BSD-3-Clause
#   Author: LKouadio <etanoyau@gmail.com>
#   Created date: Fri Apr 15 10:46:56 2022

"""
The module gathers the core functionalities that the classes and methods need 
to run successfully. Some functions are provided as shortcuts so that users can 
carry out individual tasks before feeding the data to the main algorithms. 

"""
from __future__ import  annotations 

import os
import re 
import pathlib
import warnings 
import copy 
import itertools
import collections   

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
 
from .._docstring import refglossary 
from .._typing import (
    Any, 
    List ,  
    Union, 
    Tuple,
    Dict,
    Optional,
    NDArray,
    DataFrame, 
    Series,
    ArrayLike, 
    DType, 
    Sub, 
    SP
)
from .._watexlog import watexlog
from ..decorators import refAppender, docSanitizer
from ..property import P , Config
from ..exceptions import ( 
    StationError, 
    HeaderError, 
    ResistivityError,
    ERPError,
    VESError, 
    FileHandlingError
)
from .baseutils import save_or_load
from .funcutils import (
    smart_format as smft,
    _isin , 
    _assert_all_types,
    accept_types,
    read_from_excelsheets,
    to_numeric_dtypes, 
    reshape, 
    is_iterable, 
    is_in_if, 
    ellipsis2false, 
    ) 
from .gistools import (
    assert_lat_value,
    assert_lon_value,
    convert_position_str2float,
    convert_position_float2str,
    utm_to_ll, 
    project_point_ll2utm, 
    project_point_utm2ll, 
    HAS_GDAL, 
    )
from .validator import  (
    _is_arraylike_1d, 
    _check_consistency_size, 
    is_valid_dc_data, 
    array_to_frame, 
    check_y
    )
# Module-level logger obtained from the package logging helper.
_logger = watexlog.get_watex_logger(__name__)


# Names exported on ``from <this module> import *``.
__all__=[
    "vesSelector", 
    "erpSelector", 
    "fill_coordinates", 
    "plotAnomaly", 
    "makeCoords", 
    "parseDCArgs", 
    "defineConductiveZone", 
    "read_data", 
    "_is_readable", 
    "is_erp_series", 
    "is_erp_dataframe"
    ]


# [docs]
@refAppender(refglossary.__doc__)
def vesSelector( 
    data:str | DataFrame[DType[float|int]] = None, 
    *, 
    rhoa: ArrayLike |Series | List [float] = None, 
    AB :ArrayLike |Series = None, 
    MN: ArrayLike|Series | List[float] =None, 
    index_rhoa: Optional[int]  = None, 
    xy_coords: Tuple [float|int]=None, 
    is_utm: bool= False, 
    utm_zone: str =None,
    epsg: int|str=None, 
    **kws
) -> DataFrame : 
    """ Validate |VES| data and return a sanitized dataframe. 
    
    :param data: Path-like object or sounding dataframe. When given, `rhoa`, 
        `AB`, `MN` (and the coordinates, if found) are read from it and 
        replace the values passed explicitly, so the other parameters can 
        keep their ``None`` values. 
        
    :param rhoa: array-like - Apparent resistivities collected during the 
        sounding. 
        
    :param AB: array-like - Investigation distance between the current 
        electrodes. By convention `AB` stands for `AB/2`, i.e. the 
        half-space of the investigation depth.
        
    :param MN: array-like - Potential electrodes distances at each 
        investigation depth. By convention the values are half-space, i.e. 
        `MN/2`. 
        
    :param index_rhoa: int - Index of the resistivity column of the sounding 
        point to retrieve. A sounding file often stores several sounding 
        points collected in the same survey area, for instance:
            
            +------+------+----+----+----+----+----+
            | AB/2 | MN/2 |SE1 | SE2| SE3| ...|SEn |
            +------+------+----+----+----+----+----+
            
        where `SE` columns hold the electrical sounding values and `n` is 
        the number of sounding points. The numbering is commonly arbitrary 
        and does not refer to the best fracture zone selected after the 
        prior interpretation. After transformation the resistivity columns 
        are named `resistivity`:
            
            +----+----+-------------+-------------+-------------+-----+
            | AB | MN |resistivity  | resistivity | resistivity | ... |
            +----+----+-------------+-------------+-------------+-----+
        
        `index_rhoa` picks one of them. For example ``index_rhoa=1`` gives: 
            
            +------+------+----+--------+-----+----+------------+
            | AB/2 | MN/2 |SE2 |  -->   | AB  | MN |resistivity |
            +------+------+----+--------+-----+----+------------+
        
        ``None`` is treated as ``0``, i.e. the first sounding curve.
        
    :param xy_coords: tuple (float, float) 
       Coordinates of the sounding point, either ('longitude', 'latitude') 
       or ('easting', 'northing'). In the latter case set ``is_utm=True`` 
       to trigger the conversion to ('longitude', 'latitude'). Coordinates 
       found in `data` take priority over this argument. Invalid 
       coordinates only raise a warning; the frame is then returned 
       without coordinates. 
       
       .. versionadded:: 0.2.1 
       
    :param is_utm: bool, default= False, 
       Convert `xy_coords` from ('easting', 'northing') to 
       ('longitude', 'latitude'). 
       
    :param utm_zone: str, optional 
       UTM zone needed for the conversion when `xy_coords` is given as 
       ('easting', 'northing'). 
       
    :param epsg: int, str , optional 
       EPSG number defining projection. See http://spatialreference.org/ref/ 
       for more info. Overrides utm_zone if both are provided.
       
    :param kws: dict - Additional keyword arguments forwarded to the data 
        reader.
        
    :return: dataframe - Sanitized |VES| dataframe with `AB`, `MN` and
        `resistivity` columns, plus the coordinates when available. 
    
    :raises VESError: if `AB` or `rhoa` is not one-dimensional, if `AB` is 
        missing, or if `AB` and `rhoa` sizes differ. 
    :raises ResistivityError: if the resistivity values are missing. 
    
    :Example: 
        
        >>> from watex.utils.coreutils import vesSelector 
        >>> df = vesSelector (data='data/ves/ves_gbalo.csv')
        >>> df.head(3)
        ...    AB   MN  resistivity
            0   1  0.4          943
            1   2  0.4         1179
            2   3  0.4         1103
        >>> df = vesSelector ('data/ves/ves_gbalo.csv', index_rhoa=3 )
        >>> df.head(3) 
        ...    AB   MN  resistivity
            0   1  0.4          457
            1   2  0.4          582
            2   3  0.4          558
    """
    missing_ab_error = VESError(
        "Data validation aborted! Current electrodes values"
        " are missing. Specify the deep measurement AB/2")

    # Arrays passed explicitly must be one-dimensional.
    for label, values in (("AB", AB), ("Resistivity", rhoa)):
        if values is None:
            continue
        if isinstance(values, (list, tuple)):
            values = np.array(values)
        if not _is_arraylike_1d(values):
            raise VESError(
                f"{label!r} should be a one-dimensional array.")

    if index_rhoa is None:
        index_rhoa = 0
    index_rhoa = int(_assert_all_types(
        index_rhoa, int, objname="Resistivity column index"))

    # Keep the user coordinates: they are the fallback when `data`
    # carries no coordinates of its own.
    user_xy = copy.deepcopy(xy_coords)
    if data is not None:
        rhoa, AB, MN, xy_coords = _validate_ves_data_if(
            data, index_rhoa, missing_ab_error, **kws)
        if xy_coords is None:
            xy_coords = user_xy

    if rhoa is None:
        raise ResistivityError(
            "Data validation aborted! Missing resistivity values.")
    if AB is None:
        raise missing_ab_error

    AB, MN, rhoa = np.array(AB), np.array(MN), np.array(rhoa)

    if not _check_consistency_size(AB, rhoa, error='ignore'):
        raise VESError(
            " Deep measurement size `AB` ( current electrodes ) "
            " and the resistiviy values `rhoa` must be consistent."
            f" '{len(AB)}' and '{len(rhoa)}' were given."
            )

    sdata = pd.DataFrame(
        {'AB': AB, 'MN': MN, 'resistivity': rhoa},
        index=range(len(rhoa)))

    # Coordinates retrieved from the input frame come back as a
    # frame-like object: append them as columns and stop here.
    if hasattr(xy_coords, 'columns') and hasattr(xy_coords, '__array__'):
        return pd.concat([sdata, xy_coords], axis=1)

    if xy_coords is None:
        return sdata

    xy_coords = is_iterable(
        xy_coords, exclude_string=True, transform=True)

    if len(xy_coords) != 2:
        warnings.warn("Unexpected coordinates xy. xy should be a tuple" 
                      f" of (longitude, latitude) values. Got {xy_coords}")
        return sdata

    try:
        xy_coords = _convert_xy_coordinates(
            *xy_coords,
            is_utm=is_utm,
            utm_zone=utm_zone,
            epsg=epsg,
            )
    except Exception as e:
        warnings.warn(str(e) + ". This error occurs probably because"
                      " you passed wrong coordinates xy or the utm_zone"
                      " is not set while using ('easting', 'northing')"
                      " as sounding coordinates. Unable to convert UTM"
                      " coordinates to longitude/latitude with missing"
                      " EPSG or UTM zone number. Please check your" 
                      " sounding coordinates.")
        return sdata

    if xy_coords is not None:
        sdata['longitude'] = xy_coords[0]
        sdata['latitude'] = xy_coords[-1]

    return sdata

 
def _convert_xy_coordinates ( *xy, is_utm = False, as_frame =False, 
                             utm_zone = '49R', epsg = None ):
    """ Validate sounding coordinates, converting them from UTM first 
    when requested.
    
    `xy` is expected as ('longitude', 'latitude'), or as 
    ('easting', 'northing') together with ``is_utm=True``. Helper isolated 
    from `vesSelector`; refer to its documentation for the parameters. 
    
    :returns: ``(longitude, latitude)`` as validated by `assert_lon_value` 
        and `assert_lat_value`, a one-row dataframe with `longitude` and 
        `latitude` columns when ``as_frame=True``, or ``None`` (after a 
        warning) when the validation raises ``TypeError``/``ValueError``. 
    """
    if is_utm:
        # The pair is reversed before and after the projection call.
        # NOTE(review): `fill_coordinates` calls `project_point_utm2ll`
        # with (easting, northing) un-reversed -- confirm which argument
        # order the projection helper expects.
        reversed_pair = xy[::-1]
        converted = project_point_utm2ll(
            *reversed_pair, utm_zone=utm_zone, epsg=epsg)
        xy = converted[::-1]

    try:
        lon = assert_lon_value(xy[0])
        lat = assert_lat_value(xy[1])
    except (TypeError, ValueError) as e:
        warnings.warn (str(e)+ ' Please check your sounding xy coordinates')
        return None

    if not as_frame:
        return (lon, lat)

    return pd.DataFrame(
        {'longitude': lon, 'latitude': lat}, index=range(1))


# [docs]
@docSanitizer()
def fill_coordinates(
    data: DataFrame =None, 
    lon: ArrayLike = None,
    lat: ArrayLike = None,
    east: ArrayLike = None,
    north: ArrayLike = None, 
    epsg: Optional[int] = None , 
    utm_zone: Optional [str]  = None,
    datum: str  = 'WGS84', 
    verbose:int =0, 
) -> Tuple [DataFrame, str] : 
    """ Assert and recompute coordinates values based on geographical 
    coordinates systems.
    
    Compute the missing couple (easting, northing) or (longitude, latitude) 
    from the other one and store the computed values in the dataframe.
    
    Parameters 
    -----------
    data : dataframe, 
        Dataframe containing the `longitude`, `latitude` and/or `easting`, 
        `northing` columns. Not all of them need to be provided. If valid 
        (`longitude`, `latitude`) and (`easting`, `northing`) are both 
        present, nothing is recomputed. When `data` is ``None`` a frame is 
        built from `lon`, `lat`, `east` and `north`.
        
    lat: array-like float or string (DD:MM:SS.ms)
        Values composing the `latitude` of point. Used only when `data` 
        is ``None``.

    lon: array-like float or string (DD:MM:SS.ms)
        Values composing the `longitude` of point. Used only when `data` 
        is ``None``.
              
    east : array-like float
        Values composing the easting coordinate in meters. Used only when 
        `data` is ``None``.
                 
    north : array-like float
        Values composing the northing coordinate in meters. Used only when 
        `data` is ``None``.

    datum: string
        well known datum ex. WGS84, NAD27, etc.
                
    epsg: int
        epsg number defining projection (see http://spatialreference.org/ref/ 
        for more info). Overrides utm_zone if both are provided.
        
    utm_zone : string
       zone number and 'S' or 'N' e.g. '55S'. 
       
    verbose: int,default=0 
        When non-zero, warn if the UTM zone is not supplied while computing 
        latitude/longitude from easting/northing, if both coordinate couples 
        are already valid, or if a conversion fails. 
                    
    Returns 
    ------- 
        - `data`: Dataframe with the new coordinates columns. A dataframe 
          passed as `data` receives the new columns by assignment. 
        - `utm_zone`: the `utm_zone` argument, returned unchanged. 
        
    Notes 
    ----- 
    A coordinate column is considered valid only if none of its values is 
    zero. Conversion errors are not raised: the frame is returned without 
    the new columns. Computed coordinates are stored as floats so that 
    integer-typed inputs do not truncate the results. 
        
    Examples 
    ----------
    >>> from watex.utils.coreutils import fill_coordinates 
    >>> from watex.utils import read_data 
    >>> data = read_data ('data/erp/l2_gbalo.xlsx') 
    >>> # rename columns 'x' and 'y' to 'easting' and 'northing'  inplace 
    >>> data.rename (columns ={"x":'easting', "y":'northing'} , inplace =True ) 
    >>> # compute latitude/longitude by specifying the utm zone 
    >>> data_include,_ = fill_coordinates (data , utm_zone ='49N' ) 
    >>> # doing the revert action 
    >>> datalalon = data_include[['pk', 'longitude', 'latitude']] 
    >>> data_east_north, _ = fill_coordinates (datalalon ) 
        
    """
    def _get_coordcomps (str_, df):
        """ Return the column `str_` of `df` and whether it holds usable 
        values; ``(None, None)`` if the column is absent. A column 
        containing a zero is treated as not provided. 
        """
        if str_ in df.columns: 
            return df[str_] , np.all(df[str_])!=0 
        return None, None 
    
    def _set_coordinate_values (x, y, *, func ): 
        """ Apply `func` to each pair of `x` and `y` and collect the 
        two converted components.
        
        :param x: iterable values 
        :param y: iterable values 
        :param func: ``project_point_utm2ll`` or ``project_point_ll2utm`` 
        :returns: the two arrays of converted values and the extra items 
            returned by `func` for the last pair. 
        """
        # Always allocate float outputs: `np.zeros_like(x)` would inherit
        # an integer dtype from `x` and truncate the converted values.
        npoints = len(x)
        xx = np.zeros(npoints, dtype=float)
        yy = np.zeros(npoints, dtype=float)
        extra = []
        for ii, (first, second) in enumerate(zip(x, y)):
            c0, c1, *extra = func(
                first, second, utm_zone=utm_zone, datum=datum, epsg=epsg
                )
            xx[ii] = c0
            yy[ii] = c1
        return xx, yy, extra

    if data is None:  
        # NOTE(review): the fixed 4-row index lets scalar inputs build a
        # frame but rejects arrays whose length is not 4 -- confirm intent.
        data = pd.DataFrame (
            dict ( 
                longitude = lon ,
                latitude = lat ,
                easting = east,
                northing=north
                ), 
            index = range(4)  
            )

    data = _assert_all_types(data, pd.DataFrame, objname="Coordinate data")

    lon , lon_isvalid  = _get_coordcomps('longitude', data )
    lat , lat_isvalid = _get_coordcomps('latitude', data )
    east , e_isvalid = _get_coordcomps('easting', data )
    north, n_isvalid  = _get_coordcomps('northing', data )
 
    if lon_isvalid and lat_isvalid: 
        if e_isvalid and n_isvalid: 
            if verbose:
                warnings.warn(
                    "Data contains valid longitude/latitude and "
                    "easting/northing. The latter should be overwritten.")
        else: 
            # Best effort: a failed projection leaves the frame untouched.
            try: 
                east, north, _ = _set_coordinate_values(
                    lat.values, lon.values, func=project_point_ll2utm,
                    )
            except Exception as exc:
                if verbose:
                    warnings.warn(
                        "Unable to compute easting/northing from"
                        f" longitude/latitude: {exc}")
            else: 
                data['easting'] = east
                data['northing'] = north 
            
    elif e_isvalid and n_isvalid: 
        if utm_zone is None and verbose > 0: 
            warnings.warn(
                'Should provide the `UTM` for `latitute` and `longitude`'
                ' calculus. `NoneType` can not be used as UTM zone number.'
                ' Refer to the documentation.')
        # Best effort: a failed projection leaves the frame untouched.
        try: 
            lat, lon, *_ = _set_coordinate_values(
                east.values, north.values,
                func=project_point_utm2ll,
                )
        except Exception as exc:
            if verbose:
                warnings.warn(
                    "Unable to compute longitude/latitude from"
                    f" easting/northing: {exc}")
        else: 
            data['longitude'] = lon
            data['latitude'] = lat 

    return data, utm_zone 


    
def _assert_data (data :DataFrame  ): 
    """ Assert the data and return the property dataframe. 
    
    For a dataframe, keep only the columns whose header contains one of 
    the property names in ``P().isrll`` and rename them to that property 
    name. 
    
    :param data: list, tuple, ndarray, Series or DataFrame. 
    :returns: the dataframe restricted to (and renamed after) the matched 
        properties. Inputs that are not dataframes have no header to check 
        and are returned as validated by `_assert_all_types`. 
    :raises ValueError: if no column header matches a property name. 
    :raises HeaderError: if several headers match the same property name. 
    """
    data = _assert_all_types(
        data, list, tuple, np.ndarray, pd.Series, pd.DataFrame) 
    
    # Only dataframes carry headers; previously other accepted types hit
    # an unbound local at the return statement.
    if not isinstance(data, pd.DataFrame): 
        return data 

    properties = P().isrll
    matched, positions = [], []
    for pos, header in enumerate(data.columns): 
        lowered = str(header).lower()
        for prop in properties: 
            if lowered.find(prop) >= 0: 
                matched.append(prop)
                positions.append(pos)
                break 
                    
    if not matched: 
        raise ValueError (f'Expected {smft(P().isrll)} '
            ' columns, but not found in the given dataframe.'
            )
                
    # A property matched by more than one header is ambiguous.
    dup = {prop for prop, count in collections.Counter(matched).items()
           if count > 1}
    if dup:
        plural = "s" if len(dup) > 1 else ""
        raise HeaderError(
            f'Duplicate column{plural}'
            f' {smft(dup)} found. It seems to be {smft(dup)}'
            f' column{plural}. Please provide'
            '  the right column name in the dataset.'
            )

    # Select by position: headers only need to *contain* the property
    # name, so selecting by the property name itself could fail.
    data_ = data.iloc[:, positions]
    data_.columns = matched
    return data_
 

# [docs]
def is_erp_series (
        data : Series ,
        dipolelength : Optional [float] = None 
        ) -> DataFrame : 
    """ Validate whether the data series is ERP data.  
    
    The series name must contain one of the resistivity property names 
    listed in ``P().iresistivity``; otherwise an error is raised. The 
    series is then wrapped into a two-column frame and handed to 
    :func:`is_erp_dataframe`. 
    
    Parameters 
    -----------
    data : pandas Series object 
        Object of resistivity values. It is not modified: the renaming is 
        done on a copy. 
    
    dipolelength: float
        Distance of dipole during the whole survey line, forwarded to 
        :func:`is_erp_dataframe`. 
        
    Returns 
    --------
    The dataframe returned by :func:`is_erp_dataframe`. 
    
    Raises 
    ------ 
    ResistivityError
        If the series is unnamed or its name does not match a resistivity 
        column name. 
    
    Examples 
    --------
    >>> import numpy as np 
    >>> import pandas as pd 
    >>> from watex.utils.coreutils import is_erp_series 
    >>> data = pd.Series (np.abs (np.random.rand (42)), name ='res') 
    >>> data = is_erp_series (data)
    >>> data = pd.Series (np.abs (np.random.rand (42)), name ='NAN') 
    >>> data = is_erp_series (data)
    ... ResistivityError: Unable to detect the resistivity column: 'NAN'.
    
    """
    data = _assert_all_types(data, pd.Series) 

    # An unnamed series has ``name=None``: treat it as "no match" rather
    # than failing on ``None.lower()``.
    series_name = "" if data.name is None else str(data.name).lower()
    matched = next(
        (p for p in P().iresistivity if series_name.find(p) >= 0), None)

    if matched is None: 
        raise ResistivityError(
            f"Unable to detect the resistivity column: {data.name!r}."
            )

    # ``rename`` returns a new Series, leaving the caller's object intact.
    renamed = data.rename(matched)
    frame = pd.DataFrame(
        {
            matched: renamed, 
            'NAN': np.zeros_like(renamed), 
            }
        )
    return is_erp_dataframe(frame, dipolelength=dipolelength)



# [docs]
def is_erp_dataframe (
        data :DataFrame ,
        dipolelength : Optional[float] = None, 
        force:bool=False, 
        verbose=0. 
        ) -> DataFrame:
    """ Check whether the dataframe contains the electrical resistivity 
    profiling (ERP) index properties. 
    
    The columns whose header matches a property tag are kept, renamed to 
    the property name and reordered to follow the property order; missing 
    properties are filled with ``0.``. If every `station` value is zero 
    and `dipolelength` is not given, the station positions are generated 
    with a ``10.`` meters step.
    
    Parameters 
    ----------
    data : Dataframe object 
        The columns should match the ERP properties such as 
        ``['station','resistivity', 'longitude','latitude']`` 
        or ``['station','resistivity', 'easting','northing']``.
            
    dipolelength: float
        Distance of dipole during the whole survey line. Must be a float 
        or an int when given. 
        
    force: bool, default=False, 
        If vertical electrical sounding (VES) data is passed while 
        expecting ERP data, ``True`` skips the DC-data validity check and 
        only asserts that `data` is a dataframe. This may lead to invalid 
        results when parameters computation is needed.
        
    verbose: int, 
       Outputs more messages when truthy. 
       
    Returns
    --------
    A new dataframe with the index properties as columns.
        
    Raises 
    ------
    HeaderError
        If none of the columns matches the property tags, or if duplicated 
        headers are found. 
    ERPError
        If the frame cannot be reordered to the property columns, or (when 
        ``force=False``) if the data validity check fails. 
    
    Examples
    --------
    >>> import pandas as pd 
    >>> from watex.utils.coreutils import is_erp_dataframe 
    >>> df = pd.read_csv ('data/erp/testunsafedata.csv')
    >>> df.columns 
    ... Index(['x', 'stations', 'resapprho', 'NORTH'], dtype='object')
    >>> df = is_erp_dataframe (df) 
    
    """
    err_msg = ("ERP data must contain 'the resistivity' and the station"
             " position measurement. A sample of ERP data can be found" 
             " in `watex.datasets`. For e.g. 'watex.datasets.load_tankesse'"
             " fetches a 'tankesse' locality dataset and its docstring"
             " `~.load_tankesse.__doc__` can give a furher details about"
             " the ERP data arrangement. {fmsg}"
             )
    
    force_msg= "" if force else (
        "To force reading unsafety data as ERP, set 'force' to ``True``.") 
    
    if force: 
        if verbose: 
            warnings.warn("Force considering unsafety data as ERP data might"
                          " lead to breaking code or invalid results during"
                          " ERP parameters computation. Use at your own risk."
                          )
        data = _assert_all_types(data, pd.DataFrame, 
                 objname="ERP 'resistivity' and station measurement data" )
    else:
        data = is_valid_dc_data( data, exception =ERPError, 
                                extra = err_msg.format(fmsg = force_msg))
    
    def _is_in_properties (h ):
        """ Return the header `h` and the property key whose tags match 
        it, or ``(None, None)`` when no tag is found in `h`. """
        lowered = str(h).lower()
        for key, values in P().idicttags.items() : 
            if any(lowered.find(v) >= 0 for v in values): 
                return h, key 
        return None, None 
    
    def _check_correspondence (pl, dl): 
        """ Collect the property names of `pl` found in the headers `dl`. """
        return [ l for l in pl for d in dl if str(d).lower().find(l)>=0 ]
        
    # create property object
    pObj = P(data.columns)

    # headers that match a property, and the property each one maps to
    cold , c = [], []
    for ckey in data.columns: 
        h , k = _is_in_properties(ckey)
        if h is not None: 
            cold.append(h)
            c.append(k)
        
    if len (cold) ==0: 
        raise HeaderError (
            f'Wrong column headers {list(data.columns)}.'
            f' Unable to find the expected {smft(pObj.isrll)}'
            ' column properties.'
                           )

    # headers appearing more than once among the matched columns
    dup = {h for h, count in collections.Counter(cold).items() if count > 1}
    ress = _check_correspondence(
        pObj() or pObj.idicttags.keys(), dup)
    
    if len(dup) !=0 :
        raise HeaderError(
            f'Duplicate column{"s" if len(dup)>1 else ""}' 
            f' {smft(dup)} {"are" if len(dup)>1 else "is"} '
            f'found. It seems correspond to {smft(ress)}. '
            'Please ckeck your data column names. '
            )
            
    # Keep the property columns, rename them after the property keys and
    # reorder them to follow the property order; absent ones become 0.
    data_ = data[cold] 
    data_.columns = c  
    
    msg = ERPError("Unknown DC-ERP data. ERP data must contain"
                   f" {smft(pObj.idicttags.keys())}")
    try : 
        data_= data_.reindex (columns =pObj.idicttags.keys(), fill_value =0.
                              ) 
    except Exception as exc: 
        raise msg from exc
        
    dipolelength = _assert_all_types(
        dipolelength , float, int) if dipolelength is not None else None 
    
    # Generate the stations only when they are all zero, i.e. missing.
    # The former test ``np.all(station) == 0.`` was true as soon as one
    # station was zero (typically the first), overwriting real positions.
    # NOTE(review): a user-supplied `dipolelength` is never used to build
    # the stations -- confirm whether that is intended.
    if (np.all (data_.station == 0.) 
        and dipolelength is None 
        ): 
        dipolelength = 10.
        # integer range times the step avoids float-step length surprises
        data_['station'] = np.arange (data_.shape[0]) * dipolelength 
        
    return data_




# [docs]
def erpSelector (
    f: str | NDArray | Series | DataFrame ,
    columns: str | List[str] = ..., 
    force:bool= False, 
    utm_zone:str=None, 
    epsg:int | str=None, 
    verbose:int =0., 
    **kws:Any 
) -> DataFrame  : 
    """ Read and sanitize the data collected from the survey. 
    
    `f` should be an array, a dataframe, a series, or a file arranged in 
    ``.csv`` or ``.xlsx`` format. Be sure to provide the header of each 
    column in the worksheet. If a file is given, header columns should be 
    arranged as ``['station','resistivity' ,'longitude', 'latitude']``. Note 
    that coordinates columns (`longitude` and `latitude`) are not compulsory. 
    
    Parameters 
    ----------
    f: Path-like object, ndarray, Series or Dataframe, 
        If a path-like object is given, it is read with `_is_readable`. 
        If an ndarray with more than 4 columns is given, the extra columns 
        are labelled ``'none'``. 
        
    columns: str or list 
        Names of the valuable columns, used to label the columns of an 
        ndarray. A string is split on ``,``, ``:`` or ``;``. It should 
        contain the prefix or the whole name of each item in 
        ``['station','resistivity' ,'longitude', 'latitude']``.
        
    force: bool, default=False, 
        If vertical electrical sounding (VES) data is passed while expecting 
        ERP data, ``True`` will consider the VES data as ERP data. This may 
        lead to invalid results when parameters computation is needed.
        
    verbose: int, 
       Outputs more messages when truthy. 
       
    utm_zone : string, optional
       zone number and 'S' or 'N' e.g. '55S'. If given and the frame holds 
       easting/northing columns while every longitude and latitude value 
       is zero, the longitude/latitude are computed with 
       :func:`fill_coordinates`. 
       
       .. versionadded::  0.2.1
       
    epsg: int
        epsg number defining projection (see http://spatialreference.org/ref/ 
        for more info). Overrides utm_zone if both are provided.
       
    kws: dict
        Additional keyword arguments forwarded to the file reader. Be sure 
        to provide arguments valid for the format of `f`; for instance 
        ``sep=','`` is meant for ``.csv`` files only. 
   
    Returns 
    -------
    DataFrame with valuable column(s). 
    
    Raises 
    ------ 
    ERPError 
        If reading the file raises a ``TypeError``. 
    ValueError 
        If `f` is not a supported type. 
    ResistivityError 
        If every resistivity value is zero, i.e. not supplied. 
    
    Notes
    ------
    The number of expected columns is ``4``. If the header is not specified 
    for an ndarray, the default column arrangement is used; therefore the 
    second column is considered as the ``resistivity`` column. 
     
    Examples
    ---------
    >>> import numpy as np 
    >>> from watex.utils.coreutils import erpSelector
    >>> df = erpSelector ('data/erp/testsafedata.csv')
    >>> df = erpSelector ('data/erp/testunsafedata.xlsx') 
    >>> df = erpSelector(np.random.randn(7, 7)) 
    
    """
    if columns is ...: columns=None 
    if isinstance(columns, str):
        columns = columns.replace(':', ',').replace(';', ',')
        if ',' in columns: 
            columns = columns.split(',')
            
    if isinstance(f, (str,  pathlib.PurePath)):
        try : 
            f = _is_readable(f, **kws)
        except TypeError as typError: 
            raise ERPError (str(typError))
            
    if isinstance( f, np.ndarray): 
        name = copy.deepcopy(columns)
        columns = P().isrll if columns is None else columns 
        colnum = 1 if f.ndim ==1 else f.shape[1]
     
        if colnum==1: 
            if isinstance (name, list) and len(name) ==1: 
                name = name[0]
            # ravel so that a (n, 1) array is accepted like a 1d array
            f = is_erp_series (
                pd.Series (np.ravel(f), name = name or columns[1])
                ) 
    
        elif colnum==2 : 
            f= pd.DataFrame (f, columns = columns[:2]) 
      
        elif colnum==3: 
            warnings.warn("One missing column `longitude|latitude` value."
                          "If the `longitude` and `latitude` data are"
                          f" not available. Use {smft(P().isrll[:2])} "
                          "columns instead.", UserWarning)
            columns = name or columns [:colnum]
            f= pd.DataFrame (f[:, :len(columns)],
                              columns =columns )

        elif colnum==4:
            f =pd.DataFrame (f, columns =columns)
            
        elif colnum > 4: 
            # label the remaining columns 'none'
            f =pd.DataFrame (
                f, columns = columns  + [
                    'none' for i in range(colnum-4)]
                )
                
    if isinstance(f, pd.DataFrame): 
        f = is_erp_dataframe( f, force = force , verbose =verbose 
                             )
 
    elif isinstance(f , pd.Series ): 
        f = is_erp_series(f)
    else : 
        amsg = smft(accept_types (
            pd.Series, pd.DataFrame, np.ndarray) + ['*.xls', '*.csv'])
        raise ValueError (f" Unsupports data. Expects only {amsg}."
                          )  

    # Resistivity is "not supplied" when every value is zero. The former
    # ``np.all(x) == 0`` test fired as soon as a single value was zero.
    if np.all(f.resistivity == 0): 
        raise ResistivityError('Resistivity values need to be supply.')

    if utm_zone is not None: 
        # compute the longitude/latitude from easting/northing when the
        # former are missing (all zero) and a utm_zone is given. 
        has_all_coords = all(
            col in f.columns 
            for col in ('easting', 'northing', 'longitude', 'latitude'))
        if (
                has_all_coords 
                and np.all(f['longitude'] == 0) 
                and np.all(f['latitude'] == 0)
                ): 
            f, _ = fill_coordinates(f, utm_zone = utm_zone , 
                                    epsg = epsg )
                
    return f 


def _fetch_prefix_index (
    arr:NDArray [DType[float]] = None,
    col: List[str]  = None,
    df : DataFrame = None, 
    prefixs: List [str ]  =None
) -> Tuple [int | int]: 
    """ Retrieve index at specific column. 
    
    Use the given station positions collected on the field to 
    compute the dipole length during the whole survey. 
    
    :param arr: array. Ndarray of data where one colum must the 
            positions values. 
    :param col: list. The list should be considered as the head of array. Each 
        position in the list sould fit the column data in the array. It raises 
        an error if the number of item in the list is different to the size 
        of array in axis=1. 
    :param df: dataframe. When supply, the `arr` and `col` is not 
        compulsory. 
        
    :param prefixs: list. Contains specific column prefixs to 
        fetch the corresponding data. For instance::
            
            - Station prefix : ['pk','sta','pos']
            - Easting prefix : ['east', 'x', 'long'] 
            - Northing prefix: ['north', 'y', 'lat']
   :returns: 
       - index of the position columns in the data 
       - station position array-like. 
       
    :Example: 
        >>> from numpy as np 
        >>> from watex.utils.coreutils import _assert_positions
        >>> array1 = np.c_[np.arange(0, 70, 10), np.random.randn (7,3)]
        >>> col = ['pk', 'x', 'y', 'rho']
        >>> index, = _fetch_prefix_index (array1 , col = ['pk', 'x', 'y', 'rho'], 
        ...                         prefixs = EASTPREFIX)
        ... 1
        >>> index, _fetch_prefix_index (array1 , col = ['pk', 'x', 'y', 'rho'], 
        ...                         prefixs = NOTHPREFIX )
        ... 2
    """
    if prefixs is None: 
        raise ValueError('Please specify the list of items to compose the '
                         'prefix to fetch the columns data. For instance'
                         f' `station prefix` can  be `{P().istation}`.')

    if arr is None and df is None :
        raise TypeError ( 'Expected and array or a dataframe not'
                         ' a Nonetype object.'
                        )
    elif df is None and col is None: 
        raise StationError( 'Column list is missing.'
                         ' Could not detect the position index.') 
        
    if isinstance( df, pd.DataFrame): 
        # collect the resistivity from the index 
        # if a dataFrame is given 
        arr, col = df.values, df.columns 

    if arr.ndim ==1 : 
        # Here return 0 as colIndex
        return  0, arr 
    if isinstance(col, str): col =[col] 
    if len(col) != arr.shape[1]: 
        raise ValueError (
            f'Column should match the array shape in axis =1 <{arr.shape[1]}>.'
            f' But {"was" if len(col)==1 else "were"} given')
        
    # convert item in column in lowercase 
    comsg = col.copy()
    col = list(map(lambda x: x.lower(), col)) 
    colIndex = [col.index (item) for item in col 
             for pp in prefixs if item.find(pp) >=0]   

    if len(colIndex) is None or len(colIndex) ==0: 
        raise ValueError (f'Unable to detect the position in `{smft(comsg)}`'
                          ' columns. Columns must contain at least'
                          f' `{smft(prefixs)}`.')
 
    return colIndex[0], arr 

def _assert_station_positions(
    arr: SP = None,
    prefixs: List [str] =...,
    **kws
) -> Tuple [NDArray, int]: 
    """ Assert positions and compute dipole length. 
    
    Use the given station positions collected on the field to 
    detect the dipole length during the whole survey. 
    
    :param arr: array. Ndarray of data where one column must hold the 
            positions values. 
    :param prefixs: list. Contains all the station column names prefixes to 
        fetch the corresponding data. When left to its default (``...``) or 
        set to ``None``, ``P().istation`` is used. 
    :param kws: dict. Additional keywords passed to 
        :func:`_fetch_prefix_index`, mainly: 
            
        - `col`: list. The list should be considered as the head of array. 
          Each position in the list should fit the column data in the array. 
        - `df`: dataframe. When supplied, `arr` and `col` are not needed.

    :returns: 
        - positions: new positions starting from 0 and regularly spaced by 
          the dipole length.    
        - dipolelength:  recomputed dipole value, truncated to an integer. 
        
    :raises StationError: if the smallest position is not the first item or 
        the largest position is not the last item. 
        
    :Example: 
        
        >>> import numpy as np 
        >>> from watex.utils.coreutils import _assert_station_positions
        >>> array1 = np.c_[np.arange(0, 70, 10), np.random.randn (7,3)]
        >>> col = ['pk', 'x', 'y', 'rho']
        >>> _assert_station_positions(array1, col=col)
        ... (array([ 0, 10, 20, 30, 40, 50, 60]), 10)
        >>> array1 = np.c_[np.arange(30, 240, 30), np.random.randn (7,3)]
        >>> _assert_station_positions(array1, col=col)
        ... (array([  0,  30,  60,  90, 120, 150, 180]), 30)
    
    """
    # `None or ...` evaluates to Ellipsis only: both sentinels must be 
    # tested explicitly for ``prefixs=None`` to fall back on the default. 
    if prefixs is None or prefixs is ...: 
        prefixs = P().istation 
    
    colIndex, arr =_fetch_prefix_index( arr=arr, prefixs = prefixs, **kws )
    # a 1d array is the positions column itself (index 0 is returned for it).
    positions = arr if arr.ndim ==1 else arr[:, colIndex]
    # assert the position is arranged from lower to higher 
    # i.e. the numbering starts at the first station and ends at the last. 
    fsta = np.argmin(positions) 
    lsta = np.argmax (positions)
    if int(fsta) !=0 or int(lsta) != len(positions)-1: 
        raise StationError(
            'Wrong numbering! Please number the position from first station '
            'to the last station. Check your array positionning numbers.')
    
    # mean spacing between two consecutive stations, truncated to an integer.
    dipoleLength = int(np.abs (positions.min() - positions.max ()
                           ) / (len(positions)-1)) 
    # renamed positions  
    positions = np.arange(0 , len(positions) *dipoleLength ,
                          dipoleLength ) 
    
    return  positions, dipoleLength 


[docs]
@refAppender(refglossary.__doc__)
def plotAnomaly(
    erp: ArrayLike | List[float],
    cz: Optional [Sub[ArrayLike], List[float]] = None, 
    station: Optional [str] = None, 
    fig_size: Tuple [int, int] = (10, 4),
    fig_dpi: int = 300 ,
    savefig: str | None = None, 
    show_fig_title: bool = True,
    style: str = 'seaborn', 
    fig_title_kws: Dict[str, str|Any] = ...,
    czkws: Dict [str , str|Any] = ..., 
    legkws: Dict [Any , str|Any] = ...,
    how:Optional[str]='py',
    **kws, 
): 

    """ Plot the whole |ERP| line and selected conductive zone. 
    
    Conductive zone can be supplied manually as a subset of the `erp` or by 
    specifying the station expected for drilling location. For instance 
    ``S07`` for the seventh station. Furthermore, for automatic detection, one 
    should set the `station` argument to ``auto``. However, it is recommended 
    to provide the `cz` or the `station` to have full control. The conductive 
    zone is overlaid on the whole |ERP| survey. User can customize the `cz` 
    plot by filling `Matplotlib pyplot`_ additional keyword arguments 
    through the keyword argument `czkws`. 

    Parameters 
    -----------
    erp: array_like 1d
        the |ERP| survey line. The line is an array of resistivity values. 
        Note that if a dataframe is passed, be sure that the frame matches 
        the DC resistivity data (ERP), otherwise an error occurs. At least,
        the frame columns includes the resistivity and stations. 
        
    cz: array_like 1d 
        the selected conductive zone. If ``None``, only the `erp` should be 
        displayed. Note that `cz` is a subset of `erp` array. It is 
        recomputed, hence ignored, when `station` is given. 
        
    station: str, optional
        The station location given as string (e.g. ``station="S10"``) 
        or as a station number (indexing; e.g ``station=10``). If value is 
        set to ``"auto"``, the station is found automatically and `cz` is 
        fetched as well. 
        
    fig_size: tuple, default =(10, 4)
        Tuple value of figure size. Refer to the web resources `Matplotlib figure`_. 
        
    fig_dpi: int , default=300, 
        figure resolution "dot per inch" used when the figure is saved. 
        Refer to `Matplotlib figure`_.
        
    savefig: str, optional, 
        save the figure. Refer  to `Matplotlib figure`_. When given, the 
        figure is closed rather than shown. 
    
    show_fig_title: bool, default =True
        display the title of the figure. 
    
    fig_title_kws: dict, 
        Keywords arguments of figure suptitle. Refer to 
        `Matplotlib figsuptitle`_. If the title text ``t`` is missing, the 
        default title is used. 
        
    style: str - the style for customizing visualization. For instance to 
        get the first seven available styles in pyplot, one can run 
        the script below:: 
        
            plt.style.available[:7]
            
        If the style name is unknown to the running Matplotlib while its 
        ``seaborn-v0_8`` counterpart exists, the latter is used. 
        Further details can be found in Webresources below or click on 
        `GeekforGeeks`_. 
    how: str, default='py'
        By default (``how='py'``), the station is named following the 
        Python indexing. Station is counted from station 00(S00). Any other
        values will start the station naming from 1.
        
    czkws: dict, 
        keywords `Matplotlib pyplot`_ additional arguments to customize 
        the `cz` plot.
        
    legkws: dict, 
        Additional keywords Matplotlib legend arguments. 
        
    kws: dict, 
        additional keywords argument to customize the `erp` plot. The 
        following keys are read: ``color``, ``ls`` (or ``linestyle``), 
        ``lw`` (or ``linewidth``), ``label``, ``xlabel``, ``ylabel`` and 
        ``rotate`` (tick labels rotation). Other keys are ignored. 
        
    Return 
    ---------
    ax: Matplotlib.pyplot.Axis
        Axis 
       
    Examples
    ---------
    >>> import numpy as np 
    >>> from watex.utils import plotAnomaly, defineConductiveZone 
    >>> test_array = np.abs (np.random.randn (10)) *1e2
    >>> selected_cz ,*_ = defineConductiveZone(test_array, 7) 
    >>> plotAnomaly(test_array, selected_cz )
    >>> plotAnomaly(test_array, selected_cz , station= 5)
    >>> plotAnomaly(test_array, station= 's02')
    >>> plotAnomaly(test_array)
        
    Note
    -----
    :func:`plotAnomaly` does not imply the use of constraints. The conductive
    detection can only be used if and only if there is no constraint 
    applicable to the survey site, otherwise use :func:`erpSmartDetector` 
    by triggering the `view` parameter to ``True``.
    In addition, if `cz` is given, no need to worry about the 
    `station`. `station` can still keep its default value ``None``. 
    
    See Also
    ---------
    watex.erpSmartDetector: 
            Detection conductive zone applying the constraint. Set the
            ``view=True`` for constraints visualization. 
        
    References   
    -----------
    See Matplotlib Axes: https://matplotlib.org/stable/api/_as_gen/matplotlib.axes.Axes.tick_params.html
    GeekforGeeks: https://www.geeksforgeeks.org/style-plots-using-matplotlib/#:~:text=Matplotlib%20is%20the%20most%20popular,without%20using%20any%20other%20GUIs.
    
    """
    
    def _unset (value): 
        """ Tell whether an optional argument was left to ``...`` or set 
        to ``None``. """
        return value is None or value is ...
    
    def _pick (default, *keys): 
        """ Return the first non-None value of `kws` among `keys`, 
        `default` otherwise. """
        for key in keys: 
            if kws.get(key) is not None: 
                return kws[key]
        return default 
    
    def station_label (ix): 
        """ Name the station at position `ix` according to `how`. """
        return 'S{:02}'.format(
            int(ix) if str(how).lower()=='py' else int(ix) + 1)
    
    def format_ticks (value, tick_number):
        """ Format tick parameter with 'FuncFormatter(func)'
        rather than using:: 
            
        axi.xaxis.set_major_locator (plt.MaxNLocator(3))
        
        ax.xaxis.set_major_formatter (plt.FuncFormatter(format_ticks))
        """
        # label one tick out of `nskip`. `nskip` is at least 1, otherwise 
        # the modulo below divides by zero for short lines (14 stations). 
        nskip = max (1, len(erp ) * 7 // 100 )
        
        if value % nskip ==0: 
            return station_label (value)
        # an empty string leaves the tick unlabeled. 
        return ''
        
    if isinstance (erp, pd.DataFrame): 
        erp = is_valid_dc_data(erp).resistivity 
        
    erp = _assert_all_types( 
        erp, tuple, list , np.ndarray , pd.Series)
    if cz is not None: 
        cz = _assert_all_types(
            cz, tuple, list , np.ndarray , pd.Series)
        cz = np.array (cz)
        
    erp =np.array (erp) 
    
    try: 
        plt.style.use (style)
    except OSError: 
        # NOTE(review): recent Matplotlib releases renamed the bundled 
        # seaborn styles to 'seaborn-v0_8*'; confirm against the 
        # Matplotlib release notes. 
        fallback = (style.replace ('seaborn', 'seaborn-v0_8', 1) 
                    if isinstance (style, str) else None)
        if fallback is None or fallback not in plt.style.available: 
            raise 
        plt.style.use (fallback)

    # read the axis customization first: these keys are not line properties
    # and must be fetched before the line properties are assembled. 
    xlabel = kws.get('xlabel')
    ylabel = kws.get('ylabel')
    rotate = kws.get ('rotate')
    
    line_kws =dict (
        color=_pick (P().frcolortags.get('fr1'), 'color'), 
        linestyle=_pick ('-', 'ls', 'linestyle'),
        linewidth=_pick (2., 'lw', 'linewidth'),
        label = _pick ('Electrical resistivity profiling', 'label')
                  )

    if _unset (czkws):
        czkws =dict (color=P().frcolortags.get('fr3'), 
                      linestyle='-',
                      linewidth=3,
                      label = 'Conductive zone'
                      )
    else: 
        # work on a copy: the caller dictionary must stay untouched. 
        czkws = dict (czkws)
    
    if czkws.get('color') is None: 
        czkws['color']= P().frcolortags.get('fr3')
        
    fig, ax = plt.subplots(1,1, figsize =fig_size)
    
    leg =[]

    zl, = ax.plot(np.arange(len(erp)), erp, 
                  **line_kws 
                  )
    leg.append(zl)
    
    if station =='' : 
        station= None  # for consistency 

    if station is not None:
        auto = isinstance (station , str) and station.lower()=='auto'
        if auto: 
            station =None # the station is detected from the data 
        cz , _ , _, ix = defineConductiveZone(
           erp,
           station = station, 
           auto = auto,
           index=how, 
           )
        station = station_label (ix)

    if cz is not None: 
        if not _isin (erp, cz ): 
            raise ValueError ('Expected a conductive zone to be a subset of '
                              ' the resistivity profiling line.')
        # hide every value of the line that does not belong to `cz` 
        # so that only the conductive zone is drawn on top of the line. 
        sample_masked = np.ma.array(
            erp, mask = ~np.isin(erp, cz ) )

        czl, = ax.plot(
            np.arange(len(erp)), sample_masked, 'o',
            **czkws)
        leg.append(czl)
        
    ax.tick_params (labelrotation = 0. if rotate is None else rotate)
    ax.set_xticks(range(len(erp)),
                  )

    if len(erp ) >= 14 : 
        # long line: label a subset of the ticks only. 
        ax.xaxis.set_major_formatter (plt.FuncFormatter(format_ticks))
    else : 
        ax.set_xticklabels(
            [station_label (i) for i in range(len(erp))],
            rotation =0. if rotate is None else rotate ) 

    if _unset (legkws): 
        legkws =dict() 
    
    ax.set_xlabel ('Stations' if xlabel is None else xlabel)
    ax.set_ylabel ('Resistivity (Ω.m)' if ylabel is None else ylabel)

    ax.legend( handles = leg, 
              **legkws )

    if show_fig_title: 
        title = 'Plot ERP: SVES = {0}'.format(station if station is not None else '')
        if station is None: 
            title = title.replace (': SVES =', '')
        if _unset (fig_title_kws): 
            fig_title_kws = dict (
                t = title, 
                style ='italic', 
                bbox =dict(boxstyle='round',facecolor ='lightgrey'))
        else: 
            # the title text is compulsory for `suptitle`. 
            fig_title_kws = dict (fig_title_kws)
            fig_title_kws.setdefault ('t', title)
            
        plt.tight_layout()
        fig.suptitle(**fig_title_kws, 
                      )
    if savefig is not None :
        plt.savefig(savefig,
                    dpi=fig_dpi,
                    )
        
    plt.close () if savefig is not None else plt.show() 
    
    return ax 

  

[docs]
def erpSmartDetector(
        constr: list |dict,  
        erp: ArrayLike, 
        station:str=None, 
        coerce:bool=False, 
        return_cz:bool=False, 
        view:bool=False, 
        raise_warn: bool=True, 
        **plot_kws
        ): 
    """ 
    Automatically detect the drilling location by involving the 
    constraints observed in the survey area. 
    
    Consider the constraints on the survey area and detect the suitable
    drilling location. Commonly the `station` is not needed when using 
    the constraints since the station indicates that the user is aware 
    about the reason to select this station. However in the case 
    doubts raise, user can set the parameter `coerce` to 
    ``True``. 
    
    Parameters 
    -----------
    constr: list, dict
        List of restricted stations. The constraint or restricted stations are 
        the stations to ignore when selecting the best drilling location. 
        Indeed, this is useful since in :term:`DWSC`, not all the stations are 
        presumed to be suitable to propose the drilling in technical view. 
        For instance, if some stations are close to the household waste site,
        the stations must be listed and ignored. 
        
        If the `constr` is passed in a dictionary, it might contain the 
        key for the restricted stations and the value for the reason why the 
        station is restricted. Only the keys are used. For instance:: 
            
            constr = {"s02": "station close to the household waste", 
                      "S25": "station is located in a marsh area."
                      }
    erp: array-like 1d
        DC profiling :term:`ERP` resistivity values 
        
    station: str, optional
        The station of the presumed location for drilling operations. Commonly 
        the station is not needed when using the constraints. If the station 
        is given whereas ``coerce=False`` an error is raised to warn the user. 
        To force considering the station in the auto-detection, ``coerce`` must
        be set to ``True``. 
        
    coerce:bool, default=False, 
        Allow the station to be considered in the auto-detection. 
        
    return_cz: bool, default=False, 
        If ``True``, returns the conductive zone and the restricted station 
        positions in addition to the station. 
        
    raise_warn: bool, default=True, 
         warn the user whether a suitable location is found or not. Returns 
         ``None`` otherwise. 
         
    view: bool, default=False, 
        Plot the conductive zone and restricted stations.
    plot_kws:dict, 
        Additional plotting keywords arguments passed to 
        :func:`plotAnomaly`. 
        
    Return 
    -------
    (station |None) or (station, cz, cs) : str, 
        station for the drilling operations detected automatically. 
        If no station is detected, will return ``None``. 
        if `return_cz` is ``True``, station and the conductive zone are 
        returned as well as the restricted station position number. 
        
    Raises 
    -------
    ERPError 
        If `station` is given while ``coerce=False``. 
    StationError 
        If `station` holds no position number or if its position is outside 
        of the line. 
        
    See Also
    ----------
    watex.plotAnomaly: Plot DC profiling :term:`ERP` and conductive zone. 
    
    Examples
    --------
    >>> import numpy as np 
    >>> from watex.datasets import make_erp 
    >>> from watex.utils.coreutils import erpSmartDetector 
    >>> resistivity = make_erp (n_stations =50 , as_frame=True, seed=125).resistivity 
    >>> # get the min value of the resistivity 
    >>> resmin_index = np.where ( resistivity==resistivity.min()) 
    42
    >>> erpSmartDetector (['s42'], resistivity )
    'S13'
    >>> # S42 is rejected and selected another zone presumed to be better.
    >>> constraints ={"S00": "Marsh area. ", 
    ...               "S10": " Municipality square, no authorization to make drill",
    ...               "S29": "Heritage site", 
    ...               "S46": "Household waste site",
    ...               "S42": "Household waste site"
    ...               } 
    >>> erpSmartDetector (constraints, resistivity)
    'S16'
    >>> erpSmartDetector (['s12', 's40'], resistivity) 
    'S29'
    >>> # station 42 close s40 is rejected too.
  
    """   
    
    constr_msg=("No suitable location for drilling operations is detected"
                " after applying the constraints.")
    # assert station when given 
    s=None
    if station is not None: 
        if not coerce:
            raise ERPError(
                "Usually the restriction is not applicable when user explicitly"
                " sets the station for the drilling operations. Restriction"
                " is effective for automatic drilling location. To force"
               f" considering the station {station}, set ``coerce=True``.")

        s = re.findall(r'\d+', str(station )) 
        if len(s)==0: 
            raise StationError(f"Wrong station {station}. Station must contain"
                               " the position number. e.g., 'S07'")
        s = int (s[0])
    
    # assert erp 
    if ( 
            hasattr (erp, 'columns')  
            and hasattr(erp, 'resistivity')
        ) : 
        erp = erp.resistivity 
        
    erp = check_y (erp, allow_nan=True, input_name="ERP data ")
    # float copy: restricted areas are flagged with NaN in this array. 
    res_arr = np.array (erp).copy().astype(np.float64) 
    
    if s is not None and s >= len(res_arr): 
        # an explicit station must belong to the line. 
        raise StationError(
            f"Wrong station {station}. Position number {s} is out of the"
            " range of station numbers. By default station numbering starts"
            f" from 'S00'--> 'S{len(res_arr)-1:02}'.")
        
    # assert constraint values 
    if isinstance ( constr , dict): 
        constr = list( constr)
    else: 
        constr= is_iterable(constr, exclude_string=True,
                            transform=True, parse_string=True)
    
    constr = list(constr)
    # check the effectiveness of constraints; ``None`` means that 
    # the constraints are not applicable. 
    cs = _check_constr_eff (constr, s, station, raise_warn=raise_warn)
    # list of stations to remove if out of the range 
    out_cs =list() 

    if cs is not None:
        for ix in cs: 
            if ix >= len(erp): 
                if raise_warn: 
                    warnings.warn(f"Station position {ix} is ignored. Position"
                                  f" number {ix} is out range of station number"
                                  " range. By default station numbering starts"
                                  f" from 'S00'--> 'S{len(erp)-1:02}`."
                                  )
                out_cs.append(ix )
                continue 
            res_arr = _nan_constr(ix, res_arr)
        # keep the valid restricted positions only. 
        cs = [ix for ix in cs if ix not in out_cs]
        
    cs = cs if cs else None 
    
    if np.isnan (res_arr).all(): 
        if raise_warn:
            warnings.warn(constr_msg)
        return 
    
    if coerce and station is not None: 
        # resistivity values framing the station with 3 stations before 
        # and after. Values flagged NaN reveal a nearby restricted area. 
        cz = res_arr [max (s - 3, 0): s + 3 + 1].copy()
        
    else:
        cz , *_, pos= defineConductiveZone(
            res_arr, auto =True)
        
        station = f'S{pos:02}'
        
    if np.isnan (cz).any(): 
        warnings.warn(f"{station!r} seems close to a restricted area."
                      " It is recommended to not take a risk by considering"
                      f" {station} for drilling operations. You may leave"
                      " this station and carry out another ERP line far away"
                      f" this site. Force considering {station} with its "
                      " resulting DC-parameters is your own risk.") 

    if view: 
        if cs is not None: 
            ax = plotAnomaly(erp, station= station, cz = cz, **plot_kws) 
            ax.scatter (cs, erp [cs ], marker="s", s=70, 
                            color = 'red', alpha = .5, 
                        label=f"Restricted station{'s' if len(cs)>1 else ''}")
            ax.legend ()
            plt.show() 
        else: 
            imsg = ( f"{smft([f'S{i:02}' for i in out_cs])} are not valid"
                    " restricted areas. " if len(out_cs)!=0 else ''
                    )
            if raise_warn: 
                warnings.warn(f"{imsg}Visualization cannot be possible with no"
                              " constraints. Use `watex.plotAnomaly()` instead."
                              )

    return (station, cz , cs) if return_cz else station 

    

def _check_constr_eff (constr, six= None, station=None, raise_warn=True): 
    """ Check if the given station is not in the constraint values.
    
    Raise  warning messages otherwise.
    
    :param constr: list of dict conatining the constraint items 
    :param six: index of the station to apply the constraints 
    :param station: name of the station. The station may include the position 
        values.
    :param raise_warn: alert user that the site is not appropriate 
        for drilling.
    :return: cs
       list of constraints position indexes
    """
    def raise_warn_if (l, lt): 
        """ Raise warning if the no position number is found 
        
        :param l: list containing the position number, e.g. e.g: [04]
        :param lt: The total position including the letter. eg. 'S04'
        """ 
        if len(l) ==0: 
            if raise_warn:
                warnings.warn(f"Missing position number of station {lt}."
                              f" Station {lt} is ignored instead.")
            return [None] 
        return [int (l[0])]
  
    # use regex to find the station positions. 
    cs = [raise_warn_if(re.findall('\d+', key), key) for key in constr ] 
    # use itertools to generate single list for all 
    cs=list (itertools.chain (*cs))
    # remove all missing position numbers 
    cs =list(filter (None, cs))
    # check duplicate stations 
    dp = [item for item, count in collections.Counter(cs).items() if count > 1
          ]
    if len(dp)!=0: 
        warnings.warn(f"Duplicated stations {smft(dp)} found in"
                      " the constraint items. Single item is kept"
                      " while others should be discarded.")
    cs = list(set(cs))
    if six is not None:
        # check whether the given station is among the constraint values 
        d = is_in_if ( cs, [six], return_intersect= True) 
        if d is not None: 
            msg = (f"Station {station} is a restricted station. Constraints"
                   " cannot be applied when the station is explicitly given."
                   " By default, the constraints applicability is ignored."
                   f" You may remove the station {station!r} among the "
                   " restricted stations or select another station."
                   )
            if raise_warn: warnings.warn(msg)
            
            cs= None 
        
    return cs 

def _nan_constr (cs_ix , arr , return_indexed_arr =False ):
    """ Use NaN to mask the constraints in the erp. 
    
    :param cs_ix: int, index of the constraint station. 
    :param arr: DC profiling  resistivity array 
    :param return_indexed_arr: 
        If ``True``, returns the resistivity values  of the selected 
        conductive zone from constraint.
        
    :return: arraylike 
      New array of discarded the constraint area. 
      
    :example: 
        
    >>> import numpy as np 
    >>> from watex.utils.coreutils import _nan_constr 
    >>> r = np.linspace (1, 10, 21)
    array([ 1.  ,  1.45,  1.9 ,  2.35,  2.8 ,  3.25,  3.7 ,  4.15,  4.6 ,
        5.05,  5.5 ,  5.95,  6.4 ,  6.85,  7.3 ,  7.75,  8.2 ,  8.65,
        9.1 ,  9.55, 10.  ])
    >>> r = _nan_constr ( 10, r)
    >>> r 
    array([ 1.  ,  1.45,  1.9 ,  2.35,  2.8 ,  3.25,  3.7 ,   nan,   nan,
             nan,   nan,   nan,   nan,   nan,  7.3 ,  7.75,  8.2 ,  8.65,
            9.1 ,  9.55, 10.  ])
    >>> r = _nan_constr (5, r)
    >>> r 
    array([ 1.  ,  1.45,   nan,   nan,   nan,   nan,   nan,   nan,   nan,
             nan,   nan,   nan,   nan,   nan,  7.3 ,  7.75,  8.2 ,  8.65,
            9.1 ,  9.55, 10.  ])
    """
    # note that station must be framed with 3 stations before and after. 
    index_range = np.arange (cs_ix - 3 , cs_ix + 3 +1 ) 

    # if there is a negative index, discarded then 
    index_range= index_range [ index_range >=0 ] 
    # use is inx to find the valuable index 
    mask = _isin( np.arange (len(arr)), index_range, return_mask=True) 
    index_in = np.arange (len(arr))[mask]
    # replace value of index with NaN
    arr[index_in]  = np.nan 
    
    return  index_in if return_indexed_arr else arr 


[docs]
def defineConductiveZone(
    erp:ArrayLike| pd.Series | List[float] ,
    station: Optional [str|int] = None, 
    position: SP = None,  
    auto: bool = False,
    index:str='py', 
    **kws,
) -> Tuple [ArrayLike, Optional[ArrayLike], int, int] :
    """ Define conductive zone as subset of the erp line.
    
    Indeed the conductive zone is a specific zone expected to hold the 
    drilling location `station`. If drilling location is not provided,  
    it would be by default the very low resistivity values found in the 
    `erp` line. 
    
    Parameters 
    -----------
    erp : array_like,
        the array contains the apparent resistivity values. If a frame is 
        given, its ``resistivity`` column is used. 
    station: str or int, 
        is the station position name. 
    position: array_like, optional 
        station position values. Must have the same length as `erp`. 
    auto: bool
        If ``True`` and `station` is not given, the station position should 
        be the position of the lower resistivity value in |ERP| (NaN are 
        ignored). 
    index: str, default='py'
        Indexing convention passed to :func:`_assert_stations`. 
    kws: dict, 
        Additional keywords passed to :func:`_assert_stations`. 
    
    Returns 
    -------- 
        - conductive zone of resistivity values 
        - conductive zone positioning (``None`` if `position` is not given)
        - station position index in the conductive zone
        - station position index in the whole |ERP| line 
        
    Raises 
    -------
    ResistivityError 
        If a frame without ``resistivity`` column is given. 
    StationError 
        If the station is missing while ``auto=False``, if the station 
        position is outside of the line or if `position` and `erp` have 
        different lengths. 
    
    :Example: 
        >>> import numpy as np 
        >>> 
        >>> from watex.utils.coreutils import defineConductiveZone
        >>> test_array = np.random.randn (10)
        >>> selected_cz ,*_ = defineConductiveZone(test_array, 's02') 
    """
    if isinstance(erp, pd.DataFrame): 
        try: erp = erp.resistivity  
        except AttributeError as err: 
            raise ResistivityError (" Resistivity data is missing ") from err
            
    if isinstance(erp, pd.Series):
        erp = erp.values 
    
    erp = check_y(erp, allow_nan= True, input_name ="DC-resistivity ERP data" )  

    # conductive zone positioning
    pcz : Optional [ArrayLike]  = None  

    if station is None: 
        # truthiness (not identity) so that ``np.bool_`` is handled too. 
        if not auto: 
            raise StationError("Missing station. Set ``auto=True`` for a naive"
                              " auto-detection (no-restrictions observed).")
        # first position of the lowest resistivity, NaN ignored. 
        station = int (np.nanargmin (erp))
    elif auto and station:
        warnings.warn ("Naive auto-detection is ignored while the"
                       " station is supplied.")

    station, pos = _assert_stations(station, index=index,  **kws )
    msg=("Station position must not be greater than the number of stations."
     " It seems the dipole length is used for naming the stations."
     " If true, set `dipole` parameter value with the units. For instance"
     " '10m' names the stations as S00-S10-S20... and recompute the position"
     " for consistency to fit the number of stations. Expect {} stations,"
     " got {}."
     )
    if pos >= len(erp): 
        raise StationError(msg.format(len(erp), pos))
    # frame the `sves` (drilling position) within 03 stations left/right
    # and define the conductive zone. `ir` holds up to 3 values before the 
    # station; `il` starts with the station itself. 
    ir = erp[:pos][-3:] ;  il = erp[pos:pos +3 +1 ]
    cz = np.concatenate((ir, il))

    if position is not None: 
        if len(position) != len(erp): 
            raise StationError (
                'Array of position and resistivity must have the same '
                f'length: `{len(position)}` and `{len(erp)}` were given.')
            
        sr = position[:pos][-3:] ;  sl = position[pos:pos +3 +1 ]
        pcz = np.concatenate((sr, sl))
        
    # The station comes right after the values of `ir` in the conductive 
    # zone. Counting them is safe with duplicated or NaN resistivity 
    # values, unlike a search of the station value in `cz`. 
    pix = len(ir)
    return cz , pcz, pix, pos


def _assert_stations(
    station:Any , 
    dipole:Any = None,
    index:str = None,
) -> Tuple[str, int]:
    """ Sanitize a station identifier and return its name and position.
    
    The first run of digits found in `station` is the station number; any 
    prefix (``pk``, ``S``, ...) is ignored. For instance ``S01`` and 
    ``PK01`` both mean the first station. 
    
    :param station: Station name or number.
    :type station: str, int, float 
    
    :param dipole: dipole length. A number is used as is; a string may 
        carry the unit ``m`` or ``km`` (e.g. ``'10m'``, ``'1.5km'``). When 
        given, the station number is read as a distance along the line and 
        the position is recomputed as ``number // dipole``.
    :type dipole: float, str 
    
    :param index: str, default=None,
        Set to a value containing ``py`` (or ``true``) to keep the Python 
        indexing, i.e. the first station is ``S00``. 
    
    :returns: 
        - station name, formatted as ``S<number>`` on two digits at least
        - zero-based position of the station.
        
    :raises StationError: if `station` holds no digit, or if `dipole` 
        cannot be converted to a strictly positive number.
        
    .. note:: 
        
        The default station numbering starts from 1, so ``S01`` sits at 
        position 0. A station number of ``0`` always switches to Python 
        indexing. If `dipole` is given, stations are expected to be 
        named after their offset: for `dipole` equal to ``10m`` the first 
        station is ``S00``, the second ``S10``, the third ``S20`` and 
        so on; the returned name is renumbered from ``S01``. It is 
        recommended to name stations with counting numbers rather than 
        with the dipole position.
            
    :Example: 
        >>> from watex.utils.coreutils import _assert_stations
        >>> _assert_stations('pk01')
        ... ('S01', 0)
        >>> _assert_stations('S1')
        ... ('S01', 0)
        >>> _assert_stations('S1', index ='py')
        ... ('S01', 1) # station here starts from 0 i.e `S00` 
        >>> _assert_stations('S00')
        ... ('S00', 0)
        >>> _assert_stations('S1000',dipole ='1km')
        ... ('S02', 1) # by default it does not keep the Python indexing 
        >>> _assert_stations('S10', dipole ='10m')
        ... ('S02', 1)
        >>> _assert_stations(1000,dipole =1000)
        ... ('S02', 1)
    """
    # keep the raw input untouched for the error message.
    sta = copy.deepcopy(station)
    stnl = P().istation 
    station = _assert_all_types(station, str, int, float)

    # only the first run of digits matters: 'pk01' -> 1, 10.5 -> 10
    digits = re.findall(r'\d+', str(station).strip())
    if not digits: 
        raise StationError (f"Wrong station name {sta!r}. Station must be "
                            f"prefixed by {smft(stnl +['S'], 'or')} e.g. "
                            "'S00' for the first station")
    number = int(digits[0])
    
    # Python indexing is kept on request, and always when `S00` is given.
    flag = str(index).lower()
    keepindex = number == 0 or 'py' in flag or 'true' in flag
    
    # `number` comes from r'\d+' so it is never negative, and 0 forces 
    # `keepindex`: no further range check is needed here.
    ix = number if keepindex else number - 1
    station = "S{:02}".format(number)
    
    # Recompute the station position if the dipole value is given
    if dipole is not None: 
        if isinstance(dipole, str): # e.g. '10m', '1.5km'
            text = dipole.strip().lower()
            # scale kilometres numerically; pasting '000' in place of 
            # 'km' would turn '1.5km' into 1.5 
            if 'km' in text: 
                scale, text = 1e3, text.replace('km', '')
            else: 
                scale, text = 1., text.replace('m', '')
            try : 
                dipole = float(text) * scale
            except ValueError as exc: 
                raise StationError(
                    f'Invalid literal value for dipole: {dipole!r}'
                    ) from exc
        if dipole <= 0: 
            raise StationError(
                f"Dipole length must be greater than zero. Got {dipole!r}")
        # since the renamed from dipole starts at 0 
        # e.g. 0(S1)---10(S2)---20(S3) ---30(S4)etc ..
        ix = int(number // dipole)
        station = "S{:02}".format(ix + 1)
    
    return station, ix 

def _parse_args (
    args:Union[List | str ]
)-> Tuple [ pd.DataFrame, List[str|Any]]: 
    """ Return the data as an array with the apparent resistivity first.
    
    Arguments can be a list of data, a dataframe, a series or a list that 
    contains the path to an existing file. If a file is found in the list, 
    it takes priority and is read through :func:`_assert_file`. 
    
    :param args: list of arrays (possibly holding a file path), 
        dataframe or series. A dataframe passed by the caller is never 
        modified: the function works on a copy. 
    
    :return: 
        - array of values. For a dataframe, the resistivity column is 
          moved to the first position. 
        - column names: a list starting with ``'app.res'`` for a 
          dataframe, the series name for a series, ``None`` otherwise. 
          
    :raises ValueError: if a dataframe holds no resistivity column or 
        more than one. A column is a resistivity column when its 
        lower-cased name contains one of 'res', 'rho', 'app.res', 
        'appres' or 'rhoa'. 
        
    .. note:: If a list of arrays is given, it is converted to a float 
            array and transposed; no column is reordered in that case. 
            
    :Example: 
    >>> import pandas as pd 
    >>> from watex.utils.coreutils import _parse_args
    >>> df = pd.DataFrame ({'pk': [0, 10], 'rho': [1101., 1147.]})
    >>> _parse_args (df)
    ... (array([[1101.,    0.],
                [1147.,   10.]]), ['app.res', 'pk'])
    
    """
    keys = ('res', 'rho', 'app.res', 'appres', 'rhoa')
    
    col = None 
    if isinstance(args, list): 
        args, isfile = _assert_file(args) # file to dataframe 
        if not isfile:                    # list of values 
            args = np.array(args, dtype =np.float64).T
            
    if isinstance(args, pd.DataFrame):
        # first drop all untitled items if data is from xlsx sheets. 
        # `drop` returns a new frame, so the caller's object is left 
        # untouched by the `pop`/`insert` below. `str` guards against 
        # non-string column labels. 
        untitled = [c for c in args.columns if 'untitle' in str(c)]
        args = args.drop(columns=untitled)

        # positions of the resistivity columns; each column is counted 
        # once even if its name matches several keywords (e.g. 'rhoa'). 
        hits = [ii for ii, name in enumerate(args.columns) 
                if any(key in str(name).lower() for key in keys)]
        if not hits: 
            raise ValueError(
                f"Column name `resistivity` not found in {list(args.columns)}"
                " Please provide the resistivity column.")
        if len(hits) > 1: 
            raise ValueError (
                f"Expected 1 but got {len(hits)} resistivity columns "
                f"{tuple([list(args.columns)[i] for i in hits])}.")

        rc = args.pop(args.columns[hits[0]]) 
        args.insert(0, 'app.res', rc)
        col = list(args.columns)  
        args = args.values

    if isinstance(args, pd.Series): 
        col = args.name 
        args = args.values

    return args, col

def _assert_file (
        args: List[str, Any]
)-> Tuple [List [str , pd.DataFrame] | Any , bool]: 
    """ Look for an existing file among `args` and read it if found.
    
    Only the string items of `args` that point to an existing file are 
    considered. When exactly one is found, it is read with 
    :func:`read_from_excelsheets` and the second element of its result 
    replaces `args`. 
    
    :param args: list of objects, possibly holding a file path. 
    :returns: 
        - the data read from the file, or `args` unchanged if no file 
          is found 
        - ``True`` if a file was read, ``False`` otherwise. 
    :raises ValueError: if more than one existing file is found. 
        
    :Example: 
        >>> import numpy as np 
        >>> from watex.utils.coreutils import  _assert_file
        >>> a, b = np.arange (1, 10 , 0.5), np.random.randn(9).reshape(3, 3)
        >>>  # read only the Path-Like object 
        >>> _assert_file([a, 'data/erp/l2_gbalo.xlsx', b])
        ... 
        ['l2_gbalo',
            pk       x          y   rho
         0   0  790752  1092750.0  1101
         1  10  790747  1092758.0  1147
         2  20  790743  1092763.0  1345
         3  30  790738  1092770.0  1369
         4  40  790733  1092776.5  1406
         5  50  790729  1092783.0  1543
         6  60  790724  1092789.5  1480
         7  70  790720  1092796.0  1517
         8  80  790715  1092802.5  1754
         9  90  790711  1092809.0  1591]
    """
    # collect the string items that are paths to existing files
    found = []
    for candidate in args: 
        if isinstance(candidate, str) and os.path.isfile(candidate): 
            found.append(candidate)

    nfiles = len(found)
    if nfiles > 1: 
        raise ValueError (
            f"Expected a single file but got {nfiles}. "
            "Please select the right file expected to contain the data.")
    if nfiles == 0: 
        # nothing to read: hand the arguments back untouched
        return args, False 
    
    _, content = read_from_excelsheets(found[0])
    return content, True 
 


[docs]
def makeCoords(
  reflong: str | Tuple[float], 
  reflat: str | Tuple[float], 
  nsites: int ,  
  *,  
  r: int =45.,
  utm_zone: Optional[str] =None,   
  step: Optional[str|float] ='1km', 
  order: str = '+', 
  todms: bool =False, 
  is_utm: bool  =False,
  raise_warning: bool=True, 
  **kws
)-> Tuple[ArrayLike[DType[float]]]: 
    """ Generate multiple stations coordinates (longitudes, latitudes)
    from a reference station/site.
    
    One degree of latitude equals approximately 364,000 feet (69 miles), 
    one minute equals 6,068 feet (1.15 miles), and one-second equals 101 feet.
    One-degree of longitude equals 288,200 feet (54.6 miles), one minute equals
    4,800 feet (0.91 mile) , and one second equals 80 feet. 
    (1 feet ~=0.3048 meter)
    
    Parameters 
    ----------
    reflong: float or string or list of [start, stop]
        Reference longitude  in degree decimal or in DD:MM:SS for the first 
        site considered as the origin of the landmark. If a list, tuple or 
        array is given, its first item is the start point and its second 
        item the stop point. 
        
    reflat: float or string or list of [start, stop]
        Reference latitude in degree decimal or in DD:MM:SS for the reference  
        site considered as the landmark origin. If value is given in a list, 
        it can contain the start point and the stop point. 
        
    nsites: int or float 
        Number of site to generate the coordinates onto. 
        
    r: float or int 
        The rotate angle in degrees. Rotate the angle features the direction
        of the projection line. Default value is ``45`` degrees. It is 
        only used for a coordinate whose stop point is not supplied. 
        
    step: float or str 
        Offset or the distance of seperation between different sites in meters. 
        If the value is given as string type, except the ``km``, it should be 
        considered as a ``m`` value. Only meters and kilometers are accepables.
        It is only used for a coordinate whose stop point is not supplied. 
        
    order: str 
        Direction of the projection line. By default the projected line is 
        in ascending order i.e. from SW to NE with angle `r` set to ``45``
        degrees. Could be ``-`` (or ``'descending'``, ``'down'``) for 
        descending order, in which case the generated arrays are reversed. 
        Any other value should be in ascending order. 
    
    is_utm: bool, 
        Consider the first two positional arguments as UTM coordinate values 
        and convert the generated points to longitude/latitude. If ``False``, 
        `reflong` and `reflat` are validated with 
        :func:`.gistools.assert_lon_value` and 
        :func:`.gistools.assert_lat_value`. Default is ``False``.
        
    utm_zone: string (##N or ##S)
        utm zone in the form of number and North or South hemisphere, 10S or 03N
        Must be given if `is_utm` is set to ``True``. 
                      
    todms: bool 
        Convert the degree decimal values into the DD:MM:SS. Default is ``False``. 
        
    raise_warning: bool, default=True, 
        Raises warnings if GDAL is not set or the coordinates accurately status.
    
    kws: dict, 
        Additional keywords of :func:`.gistools.project_point_utm2ll`. 
        
    Returns 
    -------
        Tuple of  generated projected coordinates longitudes and latitudes
        either in degree decimals or DD:MM:SS
        
    Raises 
    ------
    TypeError 
        If `is_utm` is ``True`` and `utm_zone` is not supplied. 
    ValueError 
        If a coordinate given as string is neither a number nor a 
        DD:MM:SS value. 
        
    Notes 
    ------
    The distances vary. A degree, minute, or second of latitude remains 
    fairly constant from the equator to the poles; however a degree, minute,
    or second of longitude can vary greatly as one approaches the poles
    and the meridians converge.
        
    References 
    ----------
    https://math.answers.com/Q/How_do_you_convert_degrees_to_meters
    
    Examples 
    --------
    >>> from watex.utils.coreutils import makeCoords 
    >>> rlons, rlats = makeCoords('110:29:09.00', '26:03:05.00', 
    ...                                     nsites = 7, todms=True)
    >>> rlons
    ... array(['110:29:09.00', '110:29:35.77', '110:30:02.54', '110:30:29.30',
           '110:30:56.07', '110:31:22.84', '110:31:49.61'], dtype='<U12')
    >>> rlats 
    ... array(['26:03:05.00', '26:03:38.81', '26:04:12.62', '26:04:46.43',
           '26:05:20.23', '26:05:54.04', '26:06:27.85'], dtype='<U11')
    >>> rlons, rlats = makeCoords ((116.7, 119.90) , (44.2 , 40.95),
                                            nsites = 238, step =20. ,
                                            order = '-', r= 125, 
                                            todms=True)
    >>> rlons 
    ... array(['119:54:00.00', '119:53:11.39', '119:52:22.78', '119:51:34.18',
           '119:50:45.57', '119:49:56.96', '119:49:08.35', '119:48:19.75',
           ...
           '116:46:03.04', '116:45:14.43', '116:44:25.82', '116:43:37.22',
           '116:42:48.61', '116:42:00.00'], dtype='<U12')
    >>> rlats 
    ... array(['40:57:00.00', '40:57:49.37', '40:58:38.73', '40:59:28.10',
           '41:00:17.47', '41:01:06.84', '41:01:56.20', '41:02:45.57',
           ...
       '44:07:53.16', '44:08:42.53', '44:09:31.90', '44:10:21.27',
       '44:11:10.63', '44:12:00.00'], dtype='<U11')
    
    """  
    def assert_ll(coord):
        """ Return `coord` as float, parsing a 'DD:MM:SS' string if needed."""
        try: 
            return float(coord)
        except ValueError: 
            if ':' not in coord: 
                raise ValueError(f'Could not convert value to float: {coord!r}')
            return convert_position_str2float(coord)
    
    xinf, yinf = None, None 
    
    nsites = int(_assert_all_types(nsites,int, float)) 
    # a sequence holds the start point and the stop point; extra items 
    # are ignored. 
    if isinstance (reflong, (list, tuple, np.ndarray)): 
        reflong , xinf, *_ = reflong 
    if isinstance (reflat, (list, tuple, np.ndarray)): 
        reflat , yinf, *_ = reflat 
        
    step=str(step).lower() 
    if step.find('km')>=0: # convert to meter 
        step = float(step.replace('km', '')) *1e3 
    elif step.find('m')>=0: step = float(step.replace('m', '')) 
    step = float(step) # for consistency 
    
    if str(order).lower() in ('descending', 'down', '-'): order = '-'
    else: order ='+'
    # compute length of line using the reflong and reflat
    # the origin of the landmark is x0, y0= reflong, reflat
    x0= assert_ll(reflong) if is_utm else assert_ll(
        assert_lon_value(reflong))
    y0= assert_ll(reflat) if is_utm else assert_ll(
        assert_lat_value(reflat))
    
    # Test against None rather than truthiness: a stop point of 0 
    # (Greenwich meridian, equator) is a valid coordinate and must not be 
    # replaced by the computed one. A supplied stop point goes through the 
    # same conversion as the start point so 'DD:MM:SS' strings work too. 
    # NOTE(review): the longitude offset is divided by 364e3 ft and the 
    # latitude offset by 2882e2 ft, whereas the docstring gives 364,000 ft 
    # for one degree of latitude and 288,200 ft for one degree of 
    # longitude. The factors look swapped; they are kept as is so the 
    # generated coordinates do not change - to be confirmed. 
    # NOTE(review): the line length is `step * nsites` while `linspace` 
    # below spreads `nsites` points over `nsites - 1` intervals - confirm 
    # whether the effective spacing is the intended one. 
    if xinf is None: 
        xinf = x0  + (np.sin(np.deg2rad(r)) * step * nsites
                      ) / (364e3 *.3048) 
    else: 
        xinf = assert_ll(xinf)
    if yinf is None: 
        yinf = y0 + (np.cos(np.deg2rad(r)) * step * nsites
                     ) /(2882e2 *.3048)
    else: 
        yinf = assert_ll(yinf)
    
    reflon_ar = np.linspace(x0 , xinf, nsites ) 
    reflat_ar = np.linspace(y0, yinf, nsites)
    
    if is_utm : 
        if utm_zone is None: 
            raise TypeError("Please provide your UTM zone e.g.'10S' or '03N' !")
        lon = np.zeros_like(reflon_ar) 
        lat = lon.copy() 
        
        # NOTE(review): values generated from `reflat` are passed as 
        # `easting` and those from `reflong` as `northing`, while the 
        # warning below states easting/northing fit longitude/latitude 
        # respectively - confirm the intended pairing. 
        for kk , (lo, la) in enumerate (zip( reflon_ar, reflat_ar)): 
            try : 
                with warnings.catch_warnings(): # ignore multiple warnings 
                    warnings.simplefilter('ignore')
                    lat[kk], lon[kk] = project_point_utm2ll(
                        easting= la, northing=lo, utm_zone=utm_zone, **kws)
            except Exception: 
                # best-effort fallback on the pure Python conversion; 
                # `Exception` lets KeyboardInterrupt/SystemExit through. 
                lat[kk], lon[kk] = utm_to_ll(
                    23, northing=lo, easting=la, zone=utm_zone)
                
        if not HAS_GDAL : 
            if raise_warning:
                warnings.warn("It seems GDAL is not set! will use the equations"
                              " from USGS Bulletin 1532. Be aware, the positionning" 
                              " is less accurate than using GDAL.")
        
        if raise_warning:
            warnings.warn("By default,'easting/northing' are assumed to"
                          " fit the 'longitude/latitude' respectively.") 
        
        reflat_ar, reflon_ar = lat , lon 
    
    if todms:
        reflat_ar = np.array(
            [convert_position_float2str(float(v)) for v in reflat_ar])
        reflon_ar = np.array(
            [convert_position_float2str(float(v)) for v in reflon_ar])
       
    if order == '+': 
        return reflon_ar , reflat_ar 
    return reflon_ar[::-1] , reflat_ar[::-1] 


#XXX OPTIMIZE 

[docs]
def parseDCArgs(fn :str , 
                delimiter:Optional[str]=None,
                 arg='stations'
                 )-> ArrayLike [str]: 
    """ Parse DC `stations` and `search` arguments from file and output to 
    array accordingly.
    
    The `froms` argument is the depth in meters from which one expects to find  
    a fracture zone outside of pollutions. Indeed, the `fromS` parameter is
    used to  speculate about the expected groundwater in the fractured rocks 
    under the average level of water inrush in a specific area. For more details
    refer to :attr:`watex.methods.electrical.VerticalSounding.fromS` 
    documentation. 
    
    :param fn: path-like object, full path to DC station or fromS file. 
        If data is considered as a station file, it must be composed of 
        the station names. Commonly it can be used to specify the selected 
        station of all DC-resistivity line where one expects
        to locate the drilling. 
        Conversely, the fromS file should only hold numbers (integers or 
        decimals such as ``45.5``); any other character is ignored.  
        
    :param arg: str of the attribute of the DC methods. Any value that does 
        not contain ``station`` is considered as ``fromS`` and the file 
        is parsed accordingly. 
        
    :param delimiter: str , delimiter to separate the different stations 
        or 'fromS' value. For instance, use ``delimiter=' '`` when all 
        values are separated with space and arranged in the same line like::
            
            >>> 'S02 S12 S12 S15 S28 S30' #  line of the file.
    
    :return: 
        array: array of station names, or array of float values for 
        ``fromS``. 
        
    :raises FileNotFoundError: if `fn` is not an existing file. 
        
    :note: every station prefix found in the module station property object 
        i.e :class:`watex.property.P.istation` is overwritten 
        to only keep the `S`. For instance 'pk25'-> 'S25'
    
    :Example: 
        >>> from watex.utils.coreutils import parseDCArgs 
        >>> sf='data/sfn.txt' # use delimiter if values are in the same line. 
        >>> sdata= parseDCArgs(sf)
        >>> sdata 
        ...
        >>> # considered that the digits in the file correspond to the depths 
        >>> fdata= parseDCArgs(sf, arg='froms') 
        >>> fdata 
        ...
    """
    if not os.path.isfile (fn): 
        raise FileNotFoundError(f"No file found: {fn!r}")
    
    is_station = 'station' in str(arg).lower().strip() 
    
    with open(fn, 'r', encoding ='utf8') as f : 
        sdata = f.readlines () 
    if delimiter is not None: 
        # split every line and flatten the pieces into a single list 
        sdata = list(itertools.chain.from_iterable(
            line.split(delimiter) for line in sdata))

    if is_station: 
        # replace any known station prefix by 'S' 
        regex = re.compile ('|'.join(P().istation + ['S']), 
                            flags =re.IGNORECASE)
        sdata = [regex.sub('S', o.strip()) for o in sdata]
        # for consistency delete all empty strings in the list 
        sdata = list(filter (None, sdata ))
        return np.array(sdata )
    
    # depth values: keep the fractional part together with the integer 
    # one so that '12.5' is read as 12.5 and not as 12 and 5. 
    regex = re.compile (r'\d+(?:\.\d+)?')
    sdata = [regex.findall(o.strip()) for o in sdata]
    # drop the lines holding no number 
    sdata = list(filter (None, sdata ))
    
    return reshape (np.array(sdata ).astype(float))



[docs]
def read_data (
    f: str|pathlib.PurePath, 
    sanitize: bool= ..., 
    reset_index: bool=..., 
    comments: str="#", 
    delimiter: str=None, 
    columns: List[str]=None,
    npz_objkey: str= None, 
    verbose: bool= ..., 
    **read_kws
 ) -> DataFrame: 
    """ Assert and read specific files and url allowed by the package
    
    Readable files are systematically convert to a data frame.  
    
    Parameters 
    -----------
    f: str, Path-like object 
       File path or Pathlib object. Must contain a valid file name  and 
       should be a readable file or url. A dataframe is also accepted 
       and returned as is (after sanitization if requested). 
        
    sanitize: bool, default=False, 
       Push a minimum sanitization of the data such as: 
           - replace a non-alphabetic column items with a pattern '_' 
           - cast data values to numeric if applicable 
           - drop full NaN columns and rows in the data 
           
    reset_index: bool, default=False, 
      Reset index if full NaN columns are dropped after sanitization. 
      
      .. versionadded:: 0.2.5
          Apply minimum data sanitization after reading data. 
     
    comments: str or sequence of str or None, default='#'
       The characters or list of characters used to indicate the start 
       of a comment. None implies no comments. For backwards compatibility, 
       byte strings will be decoded as 'latin1'. Only used for the 
       '.txt', '.npy' and '.npz' files. 

    delimiter: str, optional
       The character used to separate the values. For backwards compatibility, 
       byte strings will be decoded as 'latin1'. The default is whitespace.
       Only used for the '.txt', '.npy' and '.npz' files. 
       
    columns: str or list of str, optional 
       Column names of the frame built from a '.txt', '.npy' or '.npz' 
       file. A warning is issued if their number does not fit the number 
       of columns of the loaded array. 

    npz_objkey: str, optional 
       Dataset key to indentify array in multiples array storages in '.npz' 
       format.  If key is not set during 'npz' storage, ``arr_0`` should 
       be used. 
      
       .. versionadded:: 0.2.7 
          Capable to read text and numpy formats ('.npy' and '.npz') data. Note
          that when data is stored in compressed ".npz" format, provided the 
          '.npz' object key  as argument of parameter `npz_objkey`. If None, 
          only the first array should be read and ``npz_objkey='arr_0'``. 
          
    verbose: bool, default=0 
       Outputs message for user guide. 
       
    read_kws: dict, 
       Additional keywords arguments passed to pandas readable file keywords. 
        
    Returns 
    -------
    f: :class:`pandas.DataFrame` 
        A dataframe with head contents by default.  
        
    Raises 
    ------
    TypeError 
        If the file extension has no parser in ``Config().parsers``. 
    FileNotFoundError 
        If the parser does not find the file. 
    FileHandlingError 
        If the parser fails for any other reason. 
        
    See Also 
    ---------
    np.loadtxt: 
        load text file.  
    np.load 
       Load uncompressed or compressed numpy `.npy` and `.npz` formats. 
    watex.utils.baseutils.save_or_load: 
        Save or load numpy arrays.
       
    """
    def min_sanitizer ( d, /):
        """ Apply a minimum sanitization to the data `d`."""
        return to_numeric_dtypes(
            d, sanitize_columns= True, 
            drop_nan_columns= True, 
            reset_index=reset_index, 
            verbose = verbose , 
            fill_pattern='_', 
            drop_index = True
            )
    # the `...` defaults stand for False 
    sanitize, reset_index, verbose = ellipsis2false (
        sanitize, reset_index, verbose )
    if ( isinstance ( f, str ) 
            and str(os.path.splitext(f)[1]).lower()in (
                '.txt', '.npy', '.npz')
            ): 
        f = save_or_load(f, task = 'load', comments=comments, 
                         delimiter=delimiter )
        # if extension is .npz
        if isinstance(f, np.lib.npyio.NpzFile):
            npz_objkey = npz_objkey or "arr_0"
            f = f[npz_objkey] 

        if columns is not None: 
            columns = is_iterable(columns, exclude_string= True, 
                                  transform =True, parse_string =True 
                                  )
            # a one-dimensional array yields a single column 
            n_attrs = f.shape[1] if np.ndim(f) > 1 else 1 
            if len( columns )!= n_attrs: 
                warnings.warn(f"Columns expect {n_attrs} attributes."
                              f" Got {len(columns)}")
            
        f = pd.DataFrame(f, columns=columns )
        
    if isinstance (f, pd.DataFrame): 
        return min_sanitizer (f) if sanitize else f 
    
    cpObj= Config().parsers 
    f= _check_readable_file(f)
    _, ex = os.path.splitext(f) 
    # parsers are looked up with the lower-cased extension, the same key 
    # used for the membership test, so that '.CSV' is read like '.csv'. 
    ex = ex.lower() 
    if ex not in tuple (cpObj.keys()):
        raise TypeError(f"Can only parse the {smft(cpObj.keys(), 'or')} files"
                        )
    try : 
        f = cpObj[ex](f, **read_kws)
    except FileNotFoundError as e:
        raise FileNotFoundError (
            f"No such file in directory: {os.path.basename (f)!r}") from e 
    except Exception as e : 
        # `Exception` lets KeyboardInterrupt and SystemExit propagate 
        raise FileHandlingError (
            f"Cannot parse the file : {os.path.basename (f)!r}. "+  str(e)
            ) from e 
    if sanitize: 
        f = min_sanitizer (f)
        
    return f 

    
def _check_readable_file (f): 
    """ Validate that `f` is an existing file or an URL and return it. 
    
    :param f: str or Path-like object. A string is stripped of its 
        surrounding whitespace before any check. 
    :returns: the stripped string, or the Path object unchanged. 
    :raises TypeError: if `f` is neither a string nor a Pathlib object, 
        or if it is neither an existing file nor a string containing 
        'http://' or 'https://'. 
    """
    # check the type first so that the message below can be built safely 
    if not isinstance (f,  (str , pathlib.PurePath)): 
        raise TypeError ("Expects a Path-like object or URL. Please, check"
                         f" your file: {f!r}")
    # strip before looking for the file: a padded path to an existing 
    # file must be accepted 
    if isinstance(f, str): 
        f = f.strip() 
    
    # an URL is let through to force pandas read html etc 
    is_url = isinstance(f, str) and ('http://' in f or 'https://' in f)
    if not (os.path.isfile (f) or is_url): 
        raise TypeError ("Expects a Path-like object or URL. Please, check"
                         f" your file: {os.path.basename(f)!r}")
    return f 

def _validate_ves_data_if(data, index_rhoa , err , **kws): 
    """ Validate VES data and return the resistivity, AB/2 positions, MN 
    if it exists and the coordinates if any. 
    
    :param data: str, path-like object or dataframe 
        litteral path string or PathLib object read with 
        :func:`_is_readable`, or a dataframe used as is. 
    :param index_rhoa: int, 
        Index to retrieve the resistivity data if the number of sounding 
        points is greater than 1. ``None`` stands for 0. An index out of 
        range triggers a warning and falls back to 0. 
    :param err: :class:`~watex.exceptions.VESError`
        Exception raised if the 'AB' column is missing. 
    :param kws: dict, 
        Additional keyword arguments passed to :func:`_is_readable`. 
    :returns: 
        - rhoa: resistivity data 
        - AB : current electodes measurement values 
        - MN: potential electrodes measurement if exists in the data, 
          ``None`` otherwise. 
        - rxy: the columns among 'longitude', 'latitude', 'easting' and 
           'northing' found in the data, ``None`` if there is none. 
    :raises VESError: if the file cannot be read. 
    :raises HeaderError: if the expected VES columns are not recognized. 
    :raises ResistivityError: if the resistivity column is missing. 
    """
    if isinstance(data, (str,  pathlib.PurePath)): 
        try : 
            data = _is_readable(data, **kws)
        except TypeError as typError: 
            raise VESError (str(typError)) from typError 

    data = _assert_all_types(data, pd.DataFrame )
    # sanitize the dataframe 
    pObj = P() 
    ncols = pObj(hl = list(data.columns), kind ='ves')
    if ncols is None:
        raise HeaderError (f"Columns {smft(pObj.icpr)} are missing in "
                           "the given dataset.")
    err_msg = ("VES data must contain 'the resistivity' and the depth"
             " measurement 'AB/2'. A sample of VES data can be found" 
             " in `watex.datasets`. For e.g. 'watex.datasets.load_semien'"
             " fetches a 'semien' locality dataset and its docstring"
             " `~.load_semien.__doc__` can give a furher details about"
             " the VES data arrangement."
             )
    try:
        data.columns = ncols
    except Exception: 
        # best effort: keep the original headers if the sanitized ones 
        # cannot be set; the validation below reports what is missing. 
        pass 

    data = is_valid_dc_data(data, method ="ves", exception =VESError, 
                            extra = err_msg)

    try : 
        rhoa = data.resistivity 
    except Exception as exc: 
        raise ResistivityError(
            "Data validation aborted! Missing resistivity values."
            ) from exc 
    
    # In the case, we got a multiple resistivity values 
    # corresponding to the different sounding values 
    index_rhoa = int (index_rhoa or 0)
    if not _is_arraylike_1d(rhoa): 
        if index_rhoa >= rhoa.shape[1] or index_rhoa < 0: 
            warnings.warn(f"The index {index_rhoa} is out of the range." 
                          f" '{len(rhoa.columns)-1}' is max index for "
                          "selecting the specific resistivity data. "
                          "However, the resistivity data at index 0 is "
                          " kept by default."
                )
            index_rhoa = 0 
        rhoa = rhoa.iloc[:, index_rhoa] 
        
    # MN is optional: bind it in every case so that the return statement 
    # does not fail when the column is absent. 
    MN = data.MN if 'MN' in data.columns else None 
    try: 
        AB = data.AB 
    except Exception as exc: 
        raise err from exc 
        
    ext = [ xy for xy in data.columns if xy in (
        'longitude', 'latitude', 'easting', 'northing')]
    rxy = data [ext ] if ext else None 
    
    return rhoa, AB, MN , rxy


def _is_readable (
        f:str, 
        *, 
        as_frame:bool=False, 
        columns:List[str]=None,
        input_name='f', 
        **kws
 ) -> DataFrame: 
    """ Assert and read specific files and url allowed by the package
    
    Readable files are systematically convert to a pandas frame.  
    
    Parameters 
    -----------
    f: Path-like object or array-like 
        Should be a readable file or url. An object exposing 
        ``__array__`` (array, series, frame) is not read but converted 
        with :func:`.validator.array_to_frame`. 
        
    as_frame: bool, default=False
        Accepted for compatibility; it is not used by the function. 
        
    columns: str or list of str 
        Series name or columns names for pandas.Series and DataFrame. 
        Only used when `f` is array-like. 
        
    input_name : str, default='f'
        The data name passed to :func:`.validator.array_to_frame` when 
        `f` is array-like. 
        
    kws: dict, 
        Pandas readableformats additional keywords arguments. 
        
    Returns
    ---------
    f: pandas dataframe 
         A dataframe with head contents... 
         
    Raises 
    ------
    TypeError 
        If `f` is not a readable path or url, or if its extension has no 
        parser in ``Config().parsers``. 
    FileNotFoundError 
        If the parser does not find the file. 
    FileHandlingError 
        If the parser fails for any other reason. 
    
    """
    if hasattr (f, '__array__' ) : 
        return array_to_frame(
            f, 
            to_frame= True , 
            columns =columns, 
            input_name=input_name , 
            raise_exception= True, 
            force= True, 
            )

    cpObj= Config().parsers 
    
    f= _check_readable_file(f)
    _, ex = os.path.splitext(f) 
    # parsers are looked up with the lower-cased extension, the same key 
    # used for the membership test, so that '.CSV' is read like '.csv'. 
    key = ex.lower() 
    if key not in tuple (cpObj.keys()):
        raise TypeError(f"Can only parse the {smft(cpObj.keys(), 'or')} files"
                        f" not {ex!r}.")
    try : 
        f = cpObj[key](f, **kws)
    except FileNotFoundError as e:
        raise FileNotFoundError (
            f"No such file in directory: {os.path.basename (f)!r}") from e 
    except Exception as e: 
        # `Exception` lets KeyboardInterrupt and SystemExit propagate 
        raise FileHandlingError (
            f" Can not parse the file : {os.path.basename (f)!r}") from e 

    return f