# -*- coding: utf-8 -*-
# License: BSD-3-Clause
# Author: LKouadio <etanoyau@gmail.com>
# Created date: Fri Apr 15 10:46:56 2022
"""
The module encompasses the main functionalities that the classes and methods
need to run successfully. Some helpers are written as shortcuts so that users
can perform specific tasks before feeding the data to the main algorithms.
"""
from __future__ import annotations
import os
import re
import pathlib
import warnings
import copy
import itertools
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from .._docstring import refglossary
from .._typing import (
Any,
List ,
Union,
Tuple,
Dict,
Optional,
NDArray,
DataFrame,
Series,
ArrayLike,
DType,
Sub,
SP
)
from .._watexlog import watexlog
from ..decorators import refAppender, docSanitizer
from ..property import P , Config
from ..exceptions import (
StationError,
HeaderError,
ResistivityError,
ERPError,
VESError,
FileHandlingError
)
from .baseutils import save_or_load
from .funcutils import (
smart_format as smft,
_isin ,
_assert_all_types,
accept_types,
read_from_excelsheets,
to_numeric_dtypes,
reshape,
is_iterable,
is_in_if,
ellipsis2false,
)
from .gistools import (
assert_lat_value,
assert_lon_value,
convert_position_str2float,
convert_position_float2str,
utm_to_ll,
project_point_ll2utm,
project_point_utm2ll,
HAS_GDAL,
)
from .validator import (
_is_arraylike_1d,
_check_consistency_size,
is_valid_dc_data,
array_to_frame,
check_y
)
# Module-level logger, obtained from the package logging helper.
_logger = watexlog.get_watex_logger(__name__)

# Names exported by ``from <this module> import *``.
__all__=[
    "vesSelector",
    "erpSelector",
    "fill_coordinates",
    "plotAnomaly",
    "makeCoords",
    "parseDCArgs",
    "defineConductiveZone",
    "read_data",
    "_is_readable",
    "is_erp_series",
    "is_erp_dataframe"
    ]
[docs]
@refAppender(refglossary.__doc__)
def vesSelector(
    data: str | DataFrame[DType[float | int]] = None,
    *,
    rhoa: ArrayLike | Series | List[float] = None,
    AB: ArrayLike | Series = None,
    MN: ArrayLike | Series | List[float] = None,
    index_rhoa: Optional[int] = None,
    xy_coords: Tuple[float | int] = None,
    is_utm: bool = False,
    utm_zone: str = None,
    epsg: int | str = None,
    **kws
) -> DataFrame:
    """ Validate |VES| data and return a sanitized dataframe.

    The sounding is supplied either through `data` (path-like object or
    dataframe) or through the individual arrays `rhoa`, `AB` and `MN`.

    :param data: Path-like object or sounding dataframe. If given, `rhoa`,
        `AB`, `MN` (and the coordinates, when the validation step returns
        them) are taken from it, so the other parameters can keep their
        ``None`` values.
    :param rhoa: array-like - Apparent resistivities collected during the
        sounding.
    :param AB: array-like - Investigation distance between the current
        electrodes. Note that `AB` is by convention equal to `AB/2`.
        It is taken as half-space of the investigation depth.
    :param MN: array-like - Potential electrodes distances at each
        investigation depth. By convention the values are half-space and
        equal to `MN/2`.
    :param index_rhoa: int - The index used to retrieve the resistivity data
        of a specific sounding point. Sometimes the sounding data gather
        several soundings collected in the same survey area, for instance:

        +------+------+----+----+----+----+----+
        | AB/2 | MN/2 |SE1 | SE2| SE3| ...|SEn |
        +------+------+----+----+----+----+----+

        where `SE` are the electrical sounding values and `n` is the number
        of sounding points. After sanitizing, the header holds
        `resistivity` for each of them:

        +----+----+-------------+-------------+-------------+-----+
        | AB | MN |resistivity  | resistivity | resistivity | ... |
        +----+----+-------------+-------------+-------------+-----+

        `index_rhoa` selects which resistivity column is kept. For
        example ``index_rhoa=1`` gives:

        +------+------+----+--------+-----+----+------------+
        | AB/2 | MN/2 |SE2 |  -->   | AB  | MN |resistivity |
        +------+------+----+--------+-----+----+------------+

        If ``None``, the first sounding curve is selected, i.e.
        `index_rhoa` equals ``0``.
    :param xy_coords: tuple (float, float)
        Coordinates of the sounding point, given as
        ('longitude', 'latitude') or ('easting', 'northing'). For the
        latter, set ``is_utm=True`` so the conversion to
        ('longitude', 'latitude') is triggered. Coordinates found in
        `data` take priority over this argument.

        .. versionadded:: 0.2.1

    :param is_utm: bool, default=False
        Convert `xy_coords` from ('easting', 'northing') to
        ('longitude', 'latitude').
    :param utm_zone: str, default=None
        Needed when `xy_coords` is passed as ('easting', 'northing').
    :param epsg: int, str, optional
        EPSG number defining projection. See
        http://spatialreference.org/ref/ for more info. Overrides
        `utm_zone` if both are provided.
    :param kws: dict - Additional keyword arguments forwarded to the data
        validation step used when `data` is given.

    :return: dataframe - Sanitized |VES| dataframe with `AB`, `MN` and
        `resistivity` as column headers, plus `longitude` / `latitude`
        when usable coordinates are available.

    :raises VESError: if `AB` or `rhoa` is not one-dimensional, if `AB` is
        missing, or if `AB` and `rhoa` sizes differ.
    :raises ResistivityError: if no resistivity values are available.

    :Example:

        >>> from watex.utils.coreutils import vesSelector
        >>> df = vesSelector (data='data/ves/ves_gbalo.csv')
        >>> df.head(3)
        ...    AB   MN  resistivity
            0   1  0.4          943
            1   2  0.4         1179
            2   3  0.4         1103
        >>> df = vesSelector ('data/ves/ves_gbalo.csv', index_rhoa=3 )
        >>> df.head(3)
        ...    AB   MN  resistivity
            0   1  0.4          457
            1   2  0.4          582
            2   3  0.4          558
    """
    # Built once up-front because it is also handed to the data validator.
    missing_ab = VESError(
        "Data validation aborted! Current electrodes values"
        " are missing. Specify the deep measurement AB/2")

    # Only a dimensionality check: the converted array is not kept.
    for label, values in (("AB", AB), ("Resistivity", rhoa)):
        if values is None:
            continue
        candidate = (
            np.array(values) if isinstance(values, (list, tuple)) else values
        )
        if not _is_arraylike_1d(candidate):
            raise VESError(
                f"{label!r} should be a one-dimensional array.")

    if index_rhoa is None:
        index_rhoa = 0
    index_rhoa = int(
        _assert_all_types(
            index_rhoa, int, objname="Resistivity column index")
    )

    # Remember the explicitly passed coordinates: reading `data` rebinds
    # `xy_coords` and may leave it empty.
    user_xy = copy.deepcopy(xy_coords)

    if data is not None:
        rhoa, AB, MN, xy_coords = _validate_ves_data_if(
            data, index_rhoa, missing_ab, **kws)
        if xy_coords is None:
            xy_coords = user_xy

    if rhoa is None:
        raise ResistivityError(
            "Data validation aborted! Missing resistivity values.")
    if AB is None:
        raise missing_ab

    AB, MN, rhoa = np.array(AB), np.array(MN), np.array(rhoa)

    if not _check_consistency_size(AB, rhoa, error='ignore'):
        raise VESError(
            " Deep measurement size `AB` ( current electrodes ) "
            " and the resistiviy values `rhoa` must be consistent."
            f" '{len(AB)}' and '{len(rhoa)}' were given."
        )

    sdata = pd.DataFrame(
        {'AB': AB, 'MN': MN, 'resistivity': rhoa},
        index=range(len(rhoa)),
    )

    # Frame-like coordinates (they expose both `columns` and `__array__`)
    # are appended column-wise as they are.
    frame_like = (
        hasattr(xy_coords, 'columns') and hasattr(xy_coords, '__array__')
    )
    if frame_like:
        sdata = pd.concat([sdata, xy_coords], axis=1)
        xy_coords = None

    if xy_coords is None:
        return sdata

    xy_coords = is_iterable(xy_coords, exclude_string=True, transform=True)

    if len(xy_coords) != 2:
        warnings.warn("Unexpected coordinates xy. xy should be a tuple"
                      f" of (longitude, latitude) values. Got {xy_coords}")
        return sdata

    try:
        xy_coords = _convert_xy_coordinates(
            *xy_coords, is_utm=is_utm, utm_zone=utm_zone, epsg=epsg)
    except Exception as e:
        # Best effort: coordinates are optional, so warn and return the
        # frame without them.
        warnings.warn(str(e) + ". This error occurs probably because"
                      " you passed wrong coordinates xy or the utm_zone"
                      " is not set while using ('easting', 'northing')"
                      " as sounding coordinates. Unable to convert UTM"
                      " coordinates to longitude/latitude with missing"
                      " EPSG or UTM zone number. Please check your"
                      " sounding coordinates.")
        return sdata

    if xy_coords is not None:
        sdata['longitude'] = xy_coords[0]
        sdata['latitude'] = xy_coords[-1]

    return sdata
def _convert_xy_coordinates(*xy, is_utm=False, as_frame=False,
                            utm_zone='49R', epsg=None):
    """ Validate a coordinate pair, converting it from UTM first if needed.

    An isolated part of `vesSelector`; refer to its documentation for the
    meaning of `is_utm`, `utm_zone` and `epsg`.

    :param xy: the two coordinates, ('longitude', 'latitude') or, with
        ``is_utm=True``, ('easting', 'northing').
    :param as_frame: bool - if ``True`` the validated pair is returned as a
        one-row dataframe with `longitude` and `latitude` columns.
    :returns: the validated (longitude, latitude) pair, a one-row
        dataframe when `as_frame` is set, or ``None`` when the
        longitude/latitude validation fails (a warning is emitted).
    """
    if is_utm:
        # The pair is reversed before the projection call and the result
        # is reversed back so that it reads (longitude, latitude).
        # NOTE(review): this assumes `project_point_utm2ll` expects the
        # reversed order here -- confirm against its signature.
        projected = project_point_utm2ll(
            *xy[::-1], utm_zone=utm_zone, epsg=epsg)
        xy = projected[::-1]

    try:
        lonlat = (assert_lon_value(xy[0]), assert_lat_value(xy[1]))
    except (TypeError, ValueError) as exc:
        warnings.warn(str(exc) + ' Please check your sounding xy coordinates')
        return None

    if not as_frame:
        return lonlat

    return pd.DataFrame(
        {'longitude': lonlat[0], 'latitude': lonlat[-1]},
        index=range(1),
    )
[docs]
@docSanitizer()
def fill_coordinates(
    data: DataFrame = None,
    lon: ArrayLike = None,
    lat: ArrayLike = None,
    east: ArrayLike = None,
    north: ArrayLike = None,
    epsg: Optional[int] = None,
    utm_zone: Optional[str] = None,
    datum: str = 'WGS84',
    verbose: int = 0,
) -> Tuple[DataFrame, str]:
    """ Assert and recompute coordinates values based on geographical
    coordinates systems.

    Compute the couples (easting, northing) or (longitude, latitude)
    and set the newly calculated values into the dataframe.

    Parameters
    -----------
    data : dataframe,
        Dataframe containing `longitude`/`latitude` and/or
        `easting`/`northing` columns. Not all of them need to be provided.
        If ``None``, a frame is built from `lon`, `lat`, `east`, `north`.
    lon: array-like float or string (DD:MM:SS.ms)
        Values composing the `longitude` of the points.
    lat: array-like float or string (DD:MM:SS.ms)
        Values composing the `latitude` of the points.
    east : array-like float
        Values composing the easting coordinate in meters.
    north : array-like float
        Values composing the northing coordinate in meters.
    epsg: int
        epsg number defining projection (see
        http://spatialreference.org/ref/ for more info). Overrides
        `utm_zone` if both are provided.
    utm_zone : string
        zone number and 'S' or 'N' e.g. '55S'.
    datum: string
        well known datum ex. WGS84, NAD27, etc.
    verbose: int, default=0
        If non-zero, emit warnings: when the UTM zone is missing while
        computing latitude/longitude from easting/northing, when all
        four coordinates are already valid, and when a conversion fails.

    Returns
    -------
    - `data`: Dataframe with the new coordinates values computed.
    - `utm_zone`: the `utm_zone` argument, returned unchanged.

    Notes
    -----
    A coordinate column is considered valid when it exists and none of
    its values is zero. Conversions are best effort: if the projection
    fails, the frame is returned without the new columns.

    The computed coordinates are always stored as floats, whatever the
    dtype of the columns they are derived from.

    Examples
    ----------
    >>> from watex.utils.coreutils import fill_coordinates
    >>> from watex.utils import read_data
    >>> data = read_data ('data/erp/l2_gbalo.xlsx')
    >>> # rename columns 'x' and 'y' to 'easting' and 'northing' inplace
    >>> data.rename (columns ={"x":'easting', "y":'northing'} , inplace =True )
    >>> # compute latitude/longitude by specifying the utm zone
    >>> data_include, _ = fill_coordinates (data , utm_zone ='49N' )
    >>> # doing the revert action
    >>> datalalon = data_include[['longitude', 'latitude']]
    >>> data_east_north, _ = fill_coordinates (datalalon )
    """
    def _get_coordcomps(str_, df):
        """ Return the column `str_` of `df` and whether it is valid.

        A column holding at least one zero is assumed to be "not provided".
        Returns ``(None, None)`` when the column does not exist.
        """
        if str_ in df.columns:
            return df[str_], np.all(df[str_]) != 0
        return None, None

    def _set_coordinate_values(x, y, *, func):
        """ Apply `func` pairwise on `x` and `y`.

        :param x: iterable values
        :param y: iterable values
        :param func: projection function, either ``project_point_utm2ll``
            or ``project_point_ll2utm``.
        :returns: the two computed float arrays and the remaining items
            returned by the last `func` call (empty list for empty input).
        """
        # Explicit float buffers: `np.zeros_like(x)` would inherit an
        # integer (or string) dtype from the input and truncate results.
        xx = np.zeros(len(x), dtype=float)
        yy = np.zeros(len(x), dtype=float)
        uz = []
        for ii, (la, lo) in enumerate(zip(x, y)):
            e, n, *uz = func(
                la, lo, utm_zone=utm_zone, datum=datum, epsg=epsg
            )
            xx[ii], yy[ii] = e, n
        return xx, yy, uz

    if data is None:
        components = dict(
            longitude=lon,
            latitude=lat,
            easting=east,
            northing=north,
        )
        # pandas needs an explicit index only when every value is a
        # scalar; the historical four-row index is kept for that case.
        # With array inputs the index is derived from the arrays so any
        # length is accepted.
        only_scalars = all(np.ndim(v) == 0 for v in components.values())
        data = pd.DataFrame(
            components, index=range(4) if only_scalars else None)

    data = _assert_all_types(data, pd.DataFrame, objname="Coordinate data")

    lon, lon_isvalid = _get_coordcomps('longitude', data)
    lat, lat_isvalid = _get_coordcomps('latitude', data)
    east, e_isvalid = _get_coordcomps('easting', data)
    north, n_isvalid = _get_coordcomps('northing', data)

    if lon_isvalid and lat_isvalid:
        if e_isvalid and n_isvalid:
            # NOTE(review): nothing is recomputed in this branch although
            # the message says the easting/northing "should be
            # overwritten" -- confirm the intended behavior.
            if verbose:
                warnings.warn(
                    "Data contains valid longitude/latitude and "
                    "easting/northing. The latter should be overwritten.")
        else:
            try:
                east, north, _ = _set_coordinate_values(
                    lat.values, lon.values, func=project_point_ll2utm,
                )
            except Exception as exc:
                # Best effort: keep the frame as it is.
                if verbose:
                    warnings.warn(
                        "Unable to compute easting/northing from"
                        f" longitude/latitude: {exc}")
            else:
                data['easting'] = east
                data['northing'] = north

    elif e_isvalid and n_isvalid:
        if utm_zone is None:
            if verbose > 0:
                warnings.warn(
                    'Should provide the `UTM` for `latitute` and `longitude`'
                    ' calculus. `NoneType` can not be used as UTM zone number.'
                    ' Refer to the documentation.')
        try:
            lat, lon, *_ = _set_coordinate_values(
                east.values, north.values,
                func=project_point_utm2ll,
            )
        except Exception as exc:
            # Best effort: keep the frame as it is.
            if verbose:
                warnings.warn(
                    "Unable to compute longitude/latitude from"
                    f" easting/northing: {exc}")
        else:
            data['longitude'] = lon
            data['latitude'] = lat

    return data, utm_zone
def _assert_data(data: DataFrame):
    """ Assert the data and return the property dataframe.

    For a dataframe, every column whose lowercased header contains one of
    the ``P().isrll`` property names is kept and renamed to that property
    name. Other accepted inputs (list, tuple, ndarray, Series) only go
    through the type check and are returned as they are.

    :param data: list, tuple, ndarray, Series or DataFrame.
    :returns: a new dataframe restricted to the recognized property
        columns, or the validated input when it is not a dataframe.
    :raises ValueError: if no column matches any property name.
    :raises HeaderError: if several columns match the same property.
    """
    data = _assert_all_types(
        data, list, tuple, np.ndarray, pd.Series, pd.DataFrame)

    if not isinstance(data, pd.DataFrame):
        # Nothing to select by header; previously this path crashed on an
        # unbound local variable.
        return data

    properties = P().isrll
    matched, positions = [], []
    for pos, header in enumerate(data.columns):
        lowered = str(header).lower()
        for prop in properties:
            if lowered.find(prop) >= 0:
                matched.append(prop)
                positions.append(pos)
                break

    if len(matched) == 0:
        raise ValueError(f'Expected {smft(P().isrll)} '
                         ' columns, but not found in the given dataframe.'
                         )

    # Properties matched by more than one column.
    dup = {
        prop for prop, count in collections.Counter(matched).items()
        if count > 1
    }
    if len(dup) != 0:
        raise HeaderError(
            f'Duplicate column{"s" if len(dup)>1 else ""}'
            f' {smft(dup)} found. It seems to be {smft(dup)}'
            f'column{"s" if len(dup)>1 else ""}. Please provide'
            ' the right column name in the dataset.'
        )

    # Select by position (the original headers may differ from the
    # property names) and apply the canonical names.
    data_ = data.iloc[:, positions].copy()
    data_.columns = matched
    return data_
[docs]
def is_erp_series(
    data: Series,
    dipolelength: Optional[float] = None
) -> DataFrame:
    """ Validate whether the data series is ERP data.

    The `data` should hold the resistivity values and its name must contain
    one of the resistivity property names (``P().iresistivity``), e.g.
    ``resistivity`` or ``rho``. An error is raised otherwise. The series is
    wrapped into a dataframe and validated with :func:`is_erp_dataframe`.

    Parameters
    -----------
    data : pandas Series object
        Object of resistivity values. It is not modified.
    dipolelength: float
        Distance of dipole during the whole survey line, forwarded to
        :func:`is_erp_dataframe`.

    Returns
    --------
    The dataframe returned by :func:`is_erp_dataframe`.

    Raises
    ------
    ResistivityError
        If the series name is missing, is not a string, or does not match
        any resistivity property name.

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from watex.utils.coreutils import is_erp_series
    >>> data = pd.Series (np.abs (np.random.rand (42)), name ='res')
    >>> data = is_erp_series (data)
    >>> data = pd.Series (np.abs (np.random.rand (42)), name ='NAN')
    >>> data = is_erp_series (data)
    ... ResistivityError: Unable to detect the resistivity column: 'NAN'.
    """
    data = _assert_all_types(data, pd.Series)

    # An unnamed (or non-string named) series cannot match any property:
    # fall through to the documented ResistivityError instead of failing
    # on `.lower()`.
    label = data.name.lower() if isinstance(data.name, str) else ""
    matched = next(
        (p for p in P().iresistivity if label.find(p) >= 0), None)

    if matched is None:
        raise ResistivityError(
            f"Unable to detect the resistivity column: {data.name!r}."
        )

    # The matched property name becomes the column header; the caller's
    # series is left untouched.
    return is_erp_dataframe(
        pd.DataFrame(
            {
                matched: data,
                'NAN': np.zeros_like(data),
            }
        ),
        dipolelength=dipolelength,
    )
[docs]
def is_erp_dataframe(
    data: DataFrame,
    dipolelength: Optional[float] = None,
    force: bool = False,
    verbose=0.
) -> DataFrame:
    """ Check whether the dataframe contains the electrical resistivity
    profiling (ERP) index properties.

    The recognized columns are renamed to their property names and the
    frame is reordered to follow the property order (the keys of
    ``P().idicttags``). Properties missing from `data` are added and filled
    with ``0.``. If the `station` column is missing (i.e. holds only
    zeros), it is generated from the dipole length.

    Parameters
    ----------
    data : Dataframe object
        The column headers should match the ERP properties such as
        ``['station','resistivity', 'longitude','latitude']``
        or ``['station','resistivity', 'easting','northing']``.
    dipolelength: float
        Distance of dipole during the whole survey line. Used to compute
        the station positions when they are not given in `data`. The
        *default* value is ``10`` meters.
    force: bool, default=False,
        If ``True``, only the type of `data` is checked (the DC data
        validation is skipped), so that other data such as vertical
        electrical sounding (VES) can be read as ERP. This can give
        invalid results, especially when parameters computation is needed.
    verbose: int,
        If non-zero, warn when `force` is used.

    Returns
    --------
    A new dataframe with the index properties as columns.

    Raises
    ------
    HeaderError
        If none of the columns matches the property indexes, or if
        duplicated names are found in the data header.
    ERPError
        If the validation of `data` fails or the frame cannot be
        reordered to the property columns.

    Examples
    --------
    >>> import pandas as pd
    >>> from watex.utils.coreutils import is_erp_dataframe
    >>> df = pd.read_csv ('data/erp/testunsafedata.csv')
    >>> df.columns
    ... Index(['x', 'stations', 'resapprho', 'NORTH'], dtype='object')
    >>> df = is_erp_dataframe (df)
    >>> df.columns
    ... Index(['station', 'easting', 'northing', 'resistivity'], dtype='object')
    """
    err_msg = ("ERP data must contain 'the resistivity' and the station"
               " position measurement. A sample of ERP data can be found"
               " in `watex.datasets`. For e.g. 'watex.datasets.load_tankesse'"
               " fetches a 'tankesse' locality dataset and its docstring"
               " `~.load_tankesse.__doc__` can give a furher details about"
               " the ERP data arrangement. {fmsg}"
               )
    force_msg = "" if force else (
        "To force reading unsafety data as ERP, set 'force' to ``True``.")

    if force:
        if verbose:
            warnings.warn("Force considering unsafety data as ERP data might"
                          " lead to breaking code or invalid results during"
                          " ERP parameters computation. Use at your own risk."
                          )
        data = _assert_all_types(
            data, pd.DataFrame,
            objname="ERP 'resistivity' and station measurement data")
    else:
        data = is_valid_dc_data(
            data, exception=ERPError,
            extra=err_msg.format(fmsg=force_msg))

    # Looked up once instead of once per column.
    tags = P().idicttags

    def _is_in_properties(h):
        """ Return the header `h` and its corresponding property key, or
        ``(None, None)`` when `h` matches no property tag. """
        lowered = str(h).lower()
        for key, values in tags.items():
            for v in values:
                if lowered.find(v) >= 0:
                    return h, key
        return None, None

    def _check_correspondence(pl, dl):
        """ Collect the property names found in the duplicated headers. """
        return [l for l in pl for d in dl if d.lower().find(l) >= 0]

    cold, c = [], []
    # create property object
    pObj = P(data.columns)
    for ckey in data.columns:
        h, k = _is_in_properties(ckey)
        if h is not None:
            cold.append(h)
        if k is not None:
            c.append(k)

    if len(cold) == 0:
        raise HeaderError(
            f'Wrong column headers {list(data.columns)}.'
            f' Unable to find the expected {smft(pObj.isrll)}'
            ' column properties.'
        )

    # Headers appearing more than once in the data.
    dup = {
        header for header, count in collections.Counter(cold).items()
        if count > 1
    }
    ress = _check_correspondence(
        pObj() or pObj.idicttags.keys(), dup)

    if len(dup) != 0:
        raise HeaderError(
            f'Duplicate column{"s" if len(dup)>1 else ""}'
            f' {smft(dup)} {"are" if len(dup)>1 else "is"} '
            f'found. It seems correspond to {smft(ress)}. '
            'Please ckeck your data column names. '
        )

    # Keep the recognized columns, rename them to the property names and
    # reorder them; missing properties are filled with 0.
    data_ = data[cold]
    data_.columns = c
    msg = ERPError("Unknown DC-ERP data. ERP data must contain"
                   f" {smft(pObj.idicttags.keys())}")
    try:
        data_ = data_.reindex(
            columns=list(pObj.idicttags.keys()), fill_value=0.)
    except Exception as exc:
        raise msg from exc

    dipolelength = _assert_all_types(
        dipolelength, float, int) if dipolelength is not None else None

    # A station column made only of zeros means the positions are missing.
    # (`np.all(station) == 0` was true as soon as a single station was 0,
    # which overwrote genuine positions starting at 0.)
    if np.all(data_['station'] == 0):
        step = 10. if dipolelength is None else dipolelength
        # Multiply instead of `arange(0, n * step, step)`: float steps can
        # otherwise yield one extra element.
        data_['station'] = np.arange(data_.shape[0]) * step

    return data_
[docs]
def erpSelector(
    f: str | NDArray | Series | DataFrame,
    columns: str | List[str] = ...,
    force: bool = False,
    utm_zone: str = None,
    epsg: int | str = None,
    verbose: int = 0.,
    **kws: Any
) -> DataFrame:
    """ Read and sanitize the data collected from the survey.

    `f` should be an array, a dataframe, a series, or a file arranged in
    ``.csv`` or ``.xlsx`` format. Be sure to provide the header of each
    column in the worksheet. If a file is given, header columns should be
    arranged as ``['station','resistivity' ,'longitude', 'latitude']``.
    Note that coordinates columns (`longitude` and `latitude`) are not
    compulsory.

    Parameters
    ----------
    f: Path-like object, ndarray, Series or Dataframe,
        If a path-like object is given, it is read with `_is_readable`.
        If an ndarray is given, its columns are named from `columns`;
        extra columns (beyond the supplied names) are labelled ``'none'``.
    columns: str or list
        Names of the valuable columns, used to label the columns of an
        ndarray. A string may list several names separated by ``,``,
        ``:`` or ``;``. Each name should contain the prefix or the whole
        name of an item in
        ``['station','resistivity' ,'longitude', 'latitude']``.
    force: bool, default=False,
        Forwarded to :func:`is_erp_dataframe`: if ``True``, data that are
        not safe ERP data (e.g. VES data) are read as ERP data. This can
        give invalid results when parameters computation is needed.
    utm_zone : string, optional
        zone number and 'S' or 'N' e.g. '55S'. If given and the
        longitude/latitude columns are not valid, they are computed from
        the easting/northing coordinates.

        .. versionadded:: 0.2.1

    epsg: int
        epsg number defining projection (see
        http://spatialreference.org/ref/ for more info). Only used together
        with `utm_zone`; forwarded to :func:`fill_coordinates`.
    verbose: int,
        Verbosity level forwarded to :func:`is_erp_dataframe`.
    kws: dict
        Additional keyword arguments forwarded to `_is_readable` when `f`
        is a path. Be sure to provide arguments suited to the file format;
        for instance ``sep=','`` only applies to ``.csv`` files.

    Returns
    -------
    DataFrame with valuable column(s).

    Raises
    ------
    ERPError
        If reading the file raises a ``TypeError``.
    ValueError
        If `f` is none of the supported types.
    ResistivityError
        If the resistivity column contains a zero, which includes the
        case where the column is missing and was filled with zeros.

    Notes
    ------
    If the header is not specified for an ndarray, the default column
    arrangement is used; the second column is therefore considered as the
    ``resistivity`` column.

    Examples
    ---------
    >>> import numpy as np
    >>> from watex.utils.coreutils import erpSelector
    >>> df = erpSelector ('data/erp/testsafedata.csv')
    >>> df.shape
    ... (45, 4)
    >>> list(df.columns)
    ... ['station','resistivity', 'longitude', 'latitude']
    >>> df = erpSelector('data/erp/testunsafedata.xlsx')
    >>> list(df.columns)
    ... ['easting', 'station', 'resistivity', 'northing']
    >>> df = erpSelector(np.random.randn(7, 7))
    >>> df.shape
    ... (7, 4)
    >>> list(df.columns)
    ... ['station', 'resistivity', 'longitude', 'latitude']
    """
    if columns is ...:
        columns = None

    if isinstance(columns, str):
        # Always end up with a list of names, even for a single name, so
        # that later slicing never cuts a string character-wise.
        parts = columns.replace(':', ',').replace(';', ',').split(',')
        columns = [part.strip() for part in parts if part.strip()]

    if isinstance(f, (str, pathlib.PurePath)):
        try:
            f = _is_readable(f, **kws)
        except TypeError as typError:
            raise ERPError(str(typError)) from typError

    if isinstance(f, np.ndarray):
        name = copy.deepcopy(columns)
        columns = list(P().isrll) if columns is None else list(columns)
        colnum = 1 if f.ndim == 1 else f.shape[1]

        if colnum == 1:
            if isinstance(name, list) and len(name) == 1:
                name = name[0]
            # ravel: a (n, 1) array must be flattened to build a Series.
            f = is_erp_series(
                pd.Series(f.ravel(), name=name or columns[1]))

        elif colnum == 2:
            f = pd.DataFrame(f, columns=columns[:2])

        elif colnum == 3:
            warnings.warn("One missing column `longitude|latitude` value."
                          "If the `longitude` and `latitude` data are"
                          f" not available. Use {smft(P().isrll[:2])} "
                          "columns instead.", UserWarning)
            columns = name or columns[:colnum]
            f = pd.DataFrame(f[:, :len(columns)], columns=columns)

        elif colnum == 4:
            f = pd.DataFrame(f, columns=columns)

        elif colnum > 4:
            # Label the columns beyond the supplied names as 'none'.
            padding = ['none'] * (colnum - len(columns))
            f = pd.DataFrame(f, columns=columns + padding)

    if isinstance(f, pd.DataFrame):
        f = is_erp_dataframe(f, force=force, verbose=verbose)
    elif isinstance(f, pd.Series):
        f = is_erp_series(f)
    else:
        amsg = smft(accept_types(
            pd.Series, pd.DataFrame, np.ndarray) + ['*.xls', '*.csv'])
        raise ValueError(f" Unsupports data. Expects only {amsg}."
                         )

    # True as soon as one resistivity value is zero; a missing
    # resistivity column has been zero-filled by `is_erp_dataframe`.
    if not np.all(f.resistivity):
        raise ResistivityError('Resistivity values need to be supply.')

    if utm_zone is not None:
        has_utm = 'easting' in f.columns and 'northing' in f.columns
        has_ll = 'longitude' in f.columns and 'latitude' in f.columns
        if has_utm and has_ll:
            # Same validity rule as `fill_coordinates`: a column holding
            # a zero is considered as not provided.
            ll_missing = (
                not np.all(f['longitude']) and not np.all(f['latitude'])
            )
            if ll_missing:
                f, _ = fill_coordinates(f, utm_zone=utm_zone, epsg=epsg)

    return f
def _fetch_prefix_index (
arr:NDArray [DType[float]] = None,
col: List[str] = None,
df : DataFrame = None,
prefixs: List [str ] =None
) -> Tuple [int | int]:
""" Retrieve index at specific column.
Use the given station positions collected on the field to
compute the dipole length during the whole survey.
:param arr: array. Ndarray of data where one colum must the
positions values.
:param col: list. The list should be considered as the head of array. Each
position in the list sould fit the column data in the array. It raises
an error if the number of item in the list is different to the size
of array in axis=1.
:param df: dataframe. When supply, the `arr` and `col` is not
compulsory.
:param prefixs: list. Contains specific column prefixs to
fetch the corresponding data. For instance::
- Station prefix : ['pk','sta','pos']
- Easting prefix : ['east', 'x', 'long']
- Northing prefix: ['north', 'y', 'lat']
:returns:
- index of the position columns in the data
- station position array-like.
:Example:
>>> from numpy as np
>>> from watex.utils.coreutils import _assert_positions
>>> array1 = np.c_[np.arange(0, 70, 10), np.random.randn (7,3)]
>>> col = ['pk', 'x', 'y', 'rho']
>>> index, = _fetch_prefix_index (array1 , col = ['pk', 'x', 'y', 'rho'],
... prefixs = EASTPREFIX)
... 1
>>> index, _fetch_prefix_index (array1 , col = ['pk', 'x', 'y', 'rho'],
... prefixs = NOTHPREFIX )
... 2
"""
if prefixs is None:
raise ValueError('Please specify the list of items to compose the '
'prefix to fetch the columns data. For instance'
f' `station prefix` can be `{P().istation}`.')
if arr is None and df is None :
raise TypeError ( 'Expected and array or a dataframe not'
' a Nonetype object.'
)
elif df is None and col is None:
raise StationError( 'Column list is missing.'
' Could not detect the position index.')
if isinstance( df, pd.DataFrame):
# collect the resistivity from the index
# if a dataFrame is given
arr, col = df.values, df.columns
if arr.ndim ==1 :
# Here return 0 as colIndex
return 0, arr
if isinstance(col, str): col =[col]
if len(col) != arr.shape[1]:
raise ValueError (
f'Column should match the array shape in axis =1 <{arr.shape[1]}>.'
f' But {"was" if len(col)==1 else "were"} given')
# convert item in column in lowercase
comsg = col.copy()
col = list(map(lambda x: x.lower(), col))
colIndex = [col.index (item) for item in col
for pp in prefixs if item.find(pp) >=0]
if len(colIndex) is None or len(colIndex) ==0:
raise ValueError (f'Unable to detect the position in `{smft(comsg)}`'
' columns. Columns must contain at least'
f' `{smft(prefixs)}`.')
return colIndex[0], arr
def _assert_station_positions(
    arr: SP = None,
    prefixs: List[str] = ...,
    **kws
) -> Tuple[NDArray, int]:
    """ Assert positions and compute dipole length.

    Use the given station positions collected on the field to
    detect the dipole length during the whole survey.

    :param arr: array. Ndarray of data where one column must hold the
        positions values. A one-dimensional array is taken as the
        positions themselves.
    :param prefixs: list. Contains all the station column name prefixes
        used to fetch the corresponding data. ``None`` or ``...`` selects
        the default station prefixes ``P().istation``.
    :param kws: additional keyword arguments (`col`, `df`) forwarded to
        `_fetch_prefix_index`.

    :returns:
        - positions: new positions starting from 0 and spaced by the
          dipole length
        - dipolelength: recomputed dipole value, truncated to an integer

    :raises StationError: if the first position is not the smallest one or
        the last position is not the largest one.

    .. note:: the dipole length is ``|min - max| / (n - 1)``, so at least
       two positions are required.

    :Example:
        >>> import numpy as np
        >>> from watex.utils.coreutils import _assert_station_positions
        >>> array1 = np.c_[np.arange(0, 70, 10), np.random.randn (7,3)]
        >>> col = ['pk', 'x', 'y', 'rho']
        >>> _assert_station_positions(array1, col=col)
        ... (array([ 0, 10, 20, 30, 40, 50, 60]), 10)
        >>> array1 = np.c_[np.arange(30, 240, 30), np.random.randn (7,3)]
        >>> _assert_station_positions(array1, col=col)
        ... (array([  0,  30,  60,  90, 120, 150, 180]), 30)
    """
    # `prefixs is (None or ...)` only ever compared against `...`;
    # both sentinels must select the default prefixes.
    if prefixs is None or prefixs is ...:
        prefixs = P().istation

    colIndex, arr = _fetch_prefix_index(arr=arr, prefixs=prefixs, **kws)
    # `_fetch_prefix_index` returns one-dimensional arrays unchanged.
    positions = arr if arr.ndim == 1 else arr[:, colIndex]

    # The positions must be numbered from the first to the last station:
    # the minimum has to come first and the maximum last.
    fsta = np.argmin(positions)
    lsta = np.argmax(positions)
    if int(fsta) != 0 or int(lsta) != len(positions) - 1:
        raise StationError(
            'Wrong numbering! Please number the position from first station '
            'to the last station. Check your array positionning numbers.')

    dipoleLength = int(
        np.abs(positions.min() - positions.max()) / (len(positions) - 1)
    )
    # renamed positions
    positions = np.arange(0, len(positions) * dipoleLength, dipoleLength)

    return positions, dipoleLength
[docs]
@refAppender(refglossary.__doc__)
def plotAnomaly(
    erp: ArrayLike | List[float],
    cz: Optional[Sub[ArrayLike], List[float]] = None,
    station: Optional[str] = None,
    fig_size: Tuple[int, int] = (10, 4),
    fig_dpi: int = 300,
    savefig: str | None = None,
    show_fig_title: bool = True,
    style: str = 'seaborn',
    fig_title_kws: Dict[str, str | Any] = ...,
    czkws: Dict[str, str | Any] = ...,
    legkws: Dict[Any, str | Any] = ...,
    how: Optional[str] = 'py',
    **kws,
):
    """ Plot the whole |ERP| line and selected conductive zone.

    Conductive zone can be supplied manually as a subset of the `erp` or by
    specifying the station expected for drilling location. For instance
    ``S07`` for the seventh station. Furthermore, for automatic detection,
    one should set the `station` argument to ``auto``. However, it is
    recommended to provide the `cz` or the `station` to have full control.
    The conductive zone is overlaid on the whole |ERP| survey. User can
    customize the `cz` plot by filling with `Matplotlib pyplot`_ additional
    keyword arguments through the argument `czkws`.

    Parameters
    -----------
    erp: array_like 1d
        the |ERP| survey line. The line is an array of resistivity values.
        Note that if a dataframe is passed, be sure that the frame matches
        the DC resistivity data (ERP), otherwise an error occurs. At least,
        the frame columns includes the resistivity and stations.

    cz: array_like 1d
        the selected conductive zone. If ``None``, only the `erp` should be
        displayed. Note that `cz` is a subset of `erp` array. When
        `station` is given, `cz` is recomputed from the station with
        :func:`defineConductiveZone` and the value passed here is replaced.

    station: str, optional
        The station location given as string (e.g. ``station="S10"``)
        or as a station number (indexing; e.g ``station=10``). If value is
        set to ``"auto"``, the station is found automatically and `cz` is
        fetched as well. An empty string is treated as ``None``.

    fig_size: tuple, default =(10, 4)
        Tuple value of figure size. Refer to the web resources
        `Matplotlib figure`_.

    fig_dpi: int , default=300,
        figure resolution "dot per inch" used when the figure is saved.
        Refer to `Matplotlib figure`_.

    savefig: str, optional,
        path where to save the figure. When given, the figure is saved and
        closed instead of being shown. Refer to `Matplotlib figure`_.

    show_fig_title: bool, default =True
        display the title of the figure.

    fig_title_kws: dict,
        Keywords arguments of figure suptitle. Refer to
        `Matplotlib figsuptitle`_. When given, it is passed as is to
        ``fig.suptitle`` and therefore must hold the title text ``t``.

    style: str - the style for customizing visualization. For instance to
        get the first seven available styles in pyplot, one can run
        the script below::

            plt.style.available[:7]

        If the name is unknown to the installed Matplotlib but exists with
        the ``seaborn-v0_8`` prefix (recent Matplotlib renamed the bundled
        seaborn sheets), the renamed sheet is used instead.
        Further details can be found in Webresources below or click on
        `GeekforGeeks`_.

    how: str, default='py'
        By default (``how='py'``), the station is named following the
        Python indexing. Station is counting from station 00(S00). Any
        other values will start the station naming from 1.

    czkws: dict,
        keywords `Matplotlib pyplot`_ additional arguments to customize
        the `cz` plot. A missing or ``None`` colour falls back to the
        default conductive zone colour.

    legkws: dict,
        Additional keywords Matplotlib legend arguments.

    kws: dict,
        additional keywords argument to customize the `erp` plot. The keys
        read are ``color``, ``ls``, ``lw`` and ``label`` for the line, and
        ``xlabel``, ``ylabel`` and ``rotate`` (tick label rotation) for
        the axes; other keys are ignored.

    Return
    ---------
    ax: Matplotlib.pyplot.Axis
        Axis

    Examples
    ---------
    >>> import numpy as np
    >>> from watex.utils import plotAnomaly, defineConductiveZone
    >>> test_array = np.abs (np.random.randn (10)) *1e2
    >>> selected_cz ,*_ = defineConductiveZone(test_array, 7)
    >>> plotAnomaly(test_array, selected_cz )
    >>> plotAnomaly(test_array, selected_cz , station= 5)
    >>> plotAnomaly(test_array, station= 's02')
    >>> plotAnomaly(test_array)

    Note
    -----
    :func:`plotAnomaly` does not imply the use of constraints. The conductive
    detection can only be used if and only if there is not constraints
    applicable to the survey site, otherwise use :func:`erpSmartDetector`
    by triggering the `view` parameter to ``True``.

    In addition, if `cz` is given, there is no need to provide the
    `station`; it can keep its default value ``None``.

    See Also
    ---------
    watex.erpSmartDetector:
        Detection conductive zone applying the constraint. Set the
        ``view=True`` for constraints visualization.

    References
    -----------
    See Matplotlib Axes: https://matplotlib.org/stable/api/_as_gen/matplotlib.axes.Axes.tick_params.html
    GeekforGeeks: https://www.geeksforgeeks.org/style-plots-using-matplotlib/#:~:text=Matplotlib%20is%20the%20most%20popular,without%20using%20any%20other%20GUIs.
    """

    def format_ticks(value, tick_number):
        """ Format tick parameter with 'FuncFormatter(func)'
        rather than using::

            axi.xaxis.set_major_locator (plt.MaxNLocator(3))
            ax.xaxis.set_major_formatter (plt.FuncFormatter(format_thicks))
        """
        # ~7% of the stations are labelled. The floor division yields 0
        # for exactly 14 stations, hence the lower bound of 1 to keep the
        # modulo defined.
        nskip = max(1, len(erp) * 7 // 100)
        if value % nskip == 0:
            return 'S{:02}'.format(
                int(value) + 1 if str(how).lower() != 'py' else int(value))
        # unlabelled tick
        return ''

    if isinstance(erp, pd.DataFrame):
        erp = is_valid_dc_data(erp).resistivity

    erp = _assert_all_types(erp, tuple, list, np.ndarray, pd.Series)
    if cz is not None:
        cz = _assert_all_types(cz, tuple, list, np.ndarray, pd.Series)
        cz = np.array(cz)

    erp = np.array(erp)

    try:
        plt.style.use(style)
    except OSError:
        # Recent Matplotlib releases dropped the bare "seaborn*" sheets
        # and kept them under the "seaborn-v0_8*" names; retry with the
        # renamed sheet and re-raise for any other unknown style.
        renamed = str(style).replace('seaborn', 'seaborn-v0_8', 1)
        if renamed == style or renamed not in plt.style.available:
            raise
        plt.style.use(renamed)

    # Axis decorations travel in ``kws`` but are not line properties: read
    # them BEFORE ``kws`` is rebuilt, otherwise they are lost.
    xlabel = kws.pop('xlabel', None)
    ylabel = kws.pop('ylabel', None)
    rotate = kws.pop('rotate', None)

    kws = dict(
        color=P().frcolortags.get('fr1') if kws.get(
            'color') is None else kws.get('color'),
        linestyle='-' if kws.get('ls') is None else kws.get('ls'),
        linewidth=2. if kws.get('lw') is None else kws.get('lw'),
        label='Electrical resistivity profiling' if kws.get(
            'label') is None else kws.get('label')
    )

    # ``x is (None or ...)`` only tests for ``...``: check both sentinels.
    if czkws is None or czkws is ...:
        czkws = dict(color=P().frcolortags.get('fr3'),
                     linestyle='-',
                     linewidth=3,
                     label='Conductive zone'
                     )
    else:
        # copy so that the caller's dictionary is never modified
        czkws = dict(czkws)

    if czkws.get('color') is None:
        czkws['color'] = P().frcolortags.get('fr3')

    fig, ax = plt.subplots(1, 1, figsize=fig_size)

    leg = []

    zl, = ax.plot(np.arange(len(erp)), erp, **kws)
    leg.append(zl)

    if station == '':
        station = None  # for consistency

    if station is not None:
        auto = False
        if isinstance(station, str):
            if station.lower() == 'auto':
                auto = True
                station = None  # reset station
        cz, _, _, ix = defineConductiveZone(
            erp,
            station=station,
            auto=auto,
            index=how,
        )
        station = "S{:02}".format(ix if str(how).lower() == 'py' else ix + 1)

    if cz is not None:
        if not _isin(erp, cz):
            raise ValueError('Expected a conductive zone to be a subset of '
                             ' the resistivity profiling line.')
        # keep only the values of `erp` that belong to `cz`: everything
        # else is masked so that only the conductive zone is drawn.
        in_cz = np.isin(erp, cz)
        sample_masked = np.ma.array(erp, mask=~in_cz)

        czl, = ax.plot(
            np.arange(len(erp)), sample_masked, 'o',
            **czkws)
        leg.append(czl)

    ax.tick_params(labelrotation=0. if rotate is None else rotate)
    ax.set_xticks(range(len(erp)))

    if len(erp) >= 14:
        # too many stations to label each of them
        ax.xaxis.set_major_formatter(plt.FuncFormatter(format_ticks))
    else:
        ax.set_xticklabels(
            ['S{:02}'.format(int(i) + 1 if str(how).lower() != 'py' else int(i))
             for i in range(len(erp))],
            rotation=0. if rotate is None else rotate)

    if legkws is None or legkws is ...:
        legkws = dict()

    ax.set_xlabel('Stations' if xlabel is None else xlabel)
    # "\u2126" is the ohm sign (the literal was stored as mojibake)
    ax.set_ylabel('Resistivity (\u2126.m)' if ylabel is None else ylabel)

    ax.legend(handles=leg, **legkws)

    if show_fig_title:
        title = 'Plot ERP: SVES = {0}'.format(
            station if station is not None else '')
        if fig_title_kws is None or fig_title_kws is ...:
            fig_title_kws = dict(
                t=title if station is not None else title.replace(
                    ': SVES =', ''),
                style='italic',
                bbox=dict(boxstyle='round', facecolor='lightgrey'))

        plt.tight_layout()
        fig.suptitle(**fig_title_kws)

    if savefig is not None:
        plt.savefig(savefig, dpi=fig_dpi)
        plt.close()
    else:
        plt.show()

    return ax
[docs]
def erpSmartDetector(
    constr: list | dict,
    erp: ArrayLike,
    station: str = None,
    coerce: bool = False,
    return_cz: bool = False,
    view: bool = False,
    raise_warn: bool = True,
    **plot_kws
):
    """
    Automatically detect the drilling location by involving the
    constraints observed in the survey area.

    Consider the constraints on the survey area and detect the suitable
    drilling location. Commonly the `station` is not needed when using
    the constraints since the station indicates that the user is aware
    about the reason to select this station. However in the case
    doubts raise, user can set the parameter `coerce` to ``True``.

    Parameters
    -----------
    constr: list, dict
        List of restricted station. The constraint or restricted stations are
        the station where to ignore when selecting the best drilling location.
        Indeed, this is useful since in :term:`DWSC`, not all the stations are
        presumed to be suitable to propose the drilling in technical view.
        For instance, if some stations are close to the household waste site,
        the stations must be listed and ignored.
        If the `constr` is passed in a dictionary, it might contain the
        key for the restricted stations and the value for the reason why the
        station is restricted. For instance::

            constr = {"s02": "station close to the household waste",
                      "S25": "station is located in a marsh area."
                      }

    erp: array-like 1d
        DC profiling :term:`ERP` resistivity values

    station: str, optional
        The station of the presumed location for drilling operations. Commonly
        the station is not needed when using the constraints. If the station
        is given whereas ``coerce=False`` an error is raised to warn the user.
        To force considering the station in the auto-detection, ``coerce``
        must be set to ``True``.

    coerce: bool, default=False,
        Allow the station to be considered in the auto-detection.

    return_cz: bool, default=False,
        If ``True``, also return the conductive zone and the positions of
        the restricted stations.

    raise_warn: bool, default=True,
        warn the user whether a suitable location is found or not. Returns
        ``None`` otherwise.

    view: bool, default=False,
        Plot the conductive zone and restricted stations.

    plot_kws: dict,
        Additional plotting keywords arguments passed to
        :func:`plotAnomaly`.

    Return
    -------
    (station |None) or (station, cz, cs):
        station for the drilling operations detected automatically (or the
        `station` passed by the user when ``coerce=True``).
        If every station is discarded by the constraints, returns ``None``.
        If `return_cz` is ``True``, station and the resistivity values of
        the conductive zone are returned as well as the restricted station
        position numbers (``None`` when no constraint is applied).

    Raises
    ------
    ERPError
        If `station` is given while ``coerce=False``.
    StationError
        If `station` does not contain a position number.

    See Also
    ----------
    watex.plotAnomaly: Plot DC profiling :term:`ERP` and conductive zone.

    Examples
    --------
    >>> import numpy as np
    >>> from watex.datasets import make_erp
    >>> from watex.utils.coreutils import erpSmartDetector
    >>> resistivity = make_erp (n_stations =50 , as_frame=True, seed=125).resistivity
    >>> # get the min value of the resistivity
    >>> resmin_index = np.where ( resistivity==resistivity.min())
    42
    >>> erpSmartDetector (['s42'], resistivity )
    'S13'
    >>> # S42 is rejected and selected another zone presumed to be better.
    >>> constraints ={"S00": "Marsh area. ",
    ...               "S10": " Municipality square, no authorization to make drill",
    ...               "S29": "Heritage site",
    ...               "S46": "Household waste site",
    ...               "S42": "Household waste site"
    ...               }
    >>> erpSmartDetector (constraints, resistivity)
    'S16'
    >>> erpSmartDetector (['s12', 's40'], resistivity)
    'S29'
    >>> # station 42 close s40 is rejected too.
    """
    constr_msg = ("No suitable location for drilling operations is detected"
                  " after applying the constraints.")
    # assert station when given
    s = None
    if station is not None:
        if not coerce:
            raise ERPError(
                "Usually the restriction is not applicable when user explicitly"
                " sets the station for the drilling operations. Restriction"
                " is effective for automatic drilling location. To force"
                f" considering the station {station}, set ``coerce=True``.")
        found = re.findall(r'\d+', str(station))
        if len(found) == 0:
            raise StationError(f"Wrong station {station}. Station must contain"
                               " the position number. e.g., 'S07'")
        s = int(found[0])

    # assert erp
    if hasattr(erp, 'columns') and hasattr(erp, 'resistivity'):
        erp = erp.resistivity

    erp = check_y(erp, allow_nan=True, input_name="ERP data ")
    # independent float copy: restricted areas are overwritten with NaN
    res_arr = np.array(erp, dtype=np.float64)

    # assert constraint values: only the station names (dict keys) matter
    if isinstance(constr, dict):
        constr = list(constr)
    else:
        constr = is_iterable(constr, exclude_string=True,
                             transform=True, parse_string=True)
    constr = list(constr)

    # check the effectiveness of constraints; ``None`` means that the
    # constraints are not applicable
    cs = _check_constr_eff(constr, s, station)

    # restricted positions falling outside the survey line
    out_cs = []
    if cs is not None:
        for ix in cs:
            if ix >= len(erp):
                if raise_warn:
                    warnings.warn(f"Station position {ix} is ignored. Position"
                                  f" number {ix} is out range of station number"
                                  " range. By default station numbering starts"
                                  f" from 'S00'--> 'S{len(erp)-1:02}`."
                                  )
                out_cs.append(ix)
                continue
            res_arr = _nan_constr(ix, res_arr)

        # keep the valid constraints only
        cs = [ix for ix in cs if ix not in out_cs]
        if len(cs) == 0:
            cs = None

    if np.isnan(res_arr).all():
        # every station lies in a restricted area
        if raise_warn:
            warnings.warn(constr_msg)
        return

    if coerce and station is not None:
        # resistivity values framing the forced station (3 stations on
        # each side) taken from the constrained line, so that a NaN
        # reveals a restricted neighbourhood
        cz = res_arr[max(s - 3, 0): s + 3 + 1].copy()
    else:
        cz, *_, pos = defineConductiveZone(res_arr, auto=True)
        station = f'S{pos:02}'

    if np.isnan(cz).any():
        warnings.warn(f"{station!r} seems close to a restricted area."
                      " It is recommended to not take a risk by considering"
                      f" {station} for drilling operations. You may leave"
                      " this station and carry out another ERP line far away"
                      f" this site. Force considering {station} with its "
                      " resulting DC-parameters is your own risk.")
    if view:
        if cs is not None:
            ax = plotAnomaly(erp, station=station, cz=cz, **plot_kws)
            ax.scatter(cs, erp[cs], marker="s", s=70,
                       color='red', alpha=.5,
                       label=f"Restricted station{'s' if len(cs)>1 else ''}")
            ax.legend()
            plt.show()
        else:
            imsg = (f"{smft([f'S{i:02}' for i in out_cs])} are not valid"
                    " restricted areas. " if len(out_cs) != 0 else ''
                    )
            if raise_warn:
                warnings.warn(f"{imsg}Visualization cannot be possible with no"
                              " constraints. Use `watex.plotAnomaly()` instead."
                              )

    return (station, cz, cs) if return_cz else station
def _check_constr_eff(constr, six=None, station=None, raise_warn=True):
    """ Check if the given station is not in the constraint values.

    Raise warning messages otherwise.

    :param constr: list (or dict) containing the constraint items, i.e.
        the names of the restricted stations such as ``'S04'``.
    :param six: index of the station to apply the constraints
    :param station: name of the station. The station may include the position
        values. It is only used to build the warning message.
    :param raise_warn: alert user that the site is not appropriate
        for drilling or that a constraint has no position number.

    :return: cs
        list of the unique constraint position indexes (in order of first
        appearance), or ``None`` if `six` is itself a restricted position.

    """
    # use regex to find the station positions.
    cs = []
    for key in constr:
        digits = re.findall(r'\d+', str(key))
        if len(digits) == 0:
            # no position number, e.g. 'S' alone: the item is skipped
            if raise_warn:
                warnings.warn(f"Missing position number of station {key}."
                              f" Station {key} is ignored instead.")
            continue
        # NOTE: position 0 ('S00') is a valid station and must be kept;
        # filtering on truthiness would silently drop it.
        cs.append(int(digits[0]))

    # check duplicate stations
    dp = [item for item, count in collections.Counter(cs).items()
          if count > 1]
    if len(dp) != 0:
        warnings.warn(f"Duplicated stations {smft(dp)} found in"
                      " the constraint items. Single item is kept"
                      " while others should be discarded.")
    # single occurrence of each position, first appearance order
    cs = list(dict.fromkeys(cs))

    # check whether the given station is among the constraint values
    if six is not None and six in cs:
        msg = (f"Station {station} is a restricted station. Constraints"
               " cannot be applied when the station is explicitly given."
               " By default, the constraints applicability is ignored."
               f" You may remove the station {station!r} among the "
               " restricted stations or select another station."
               )
        if raise_warn:
            warnings.warn(msg)
        cs = None

    return cs
def _nan_constr(cs_ix, arr, return_indexed_arr=False):
    """ Use NaN to mask the constraints in the erp.

    The restricted station is framed with 3 stations before and 3 stations
    after it; the frame is clipped to the bounds of `arr`.

    :param cs_ix: int, index of the constraint station.
    :param arr: DC profiling resistivity array. It must be a float array
        since NaN is written into it; it is modified IN PLACE when
        `return_indexed_arr` is ``False``.
    :param return_indexed_arr:
        If ``True``, returns the resistivity values of the selected
        conductive zone from constraint (a copy of the framed values) and
        leaves `arr` untouched.

    :return: arraylike
        `arr` itself with the constraint area replaced by NaN, or the
        values of the framed area if `return_indexed_arr` is ``True``.

    :example:
        >>> import numpy as np
        >>> from watex.utils.coreutils import _nan_constr
        >>> r = np.linspace (1, 10, 21)
        >>> r
        array([ 1.  ,  1.45,  1.9 ,  2.35,  2.8 ,  3.25,  3.7 ,  4.15,  4.6 ,
                5.05,  5.5 ,  5.95,  6.4 ,  6.85,  7.3 ,  7.75,  8.2 ,  8.65,
                9.1 ,  9.55, 10.  ])
        >>> _nan_constr (10, r, return_indexed_arr=True)
        array([4.15, 4.6 , 5.05, 5.5 , 5.95, 6.4 , 6.85])
        >>> r = _nan_constr ( 10, r)
        >>> r
        array([ 1.  ,  1.45,  1.9 ,  2.35,  2.8 ,  3.25,  3.7 ,   nan,   nan,
                 nan,   nan,   nan,   nan,   nan,  7.3 ,  7.75,  8.2 ,  8.65,
                9.1 ,  9.55, 10.  ])
        >>> r = _nan_constr (5, r)
        >>> r
        array([ 1.  ,  1.45,   nan,   nan,   nan,   nan,   nan,   nan,   nan,
                 nan,   nan,   nan,   nan,   nan,  7.3 ,  7.75,  8.2 ,  8.65,
                9.1 ,  9.55, 10.  ])

    """
    # note that station must be framed with 3 stations before and after;
    # indexes outside [0, len(arr)) are discarded.
    start = max(cs_ix - 3, 0)
    stop = min(cs_ix + 3 + 1, len(arr))
    # empty when the frame lies completely outside the array
    index_in = np.arange(start, stop)

    if return_indexed_arr:
        # fancy indexing returns a copy: `arr` is not altered
        return arr[index_in]

    # replace value of index with NaN
    arr[index_in] = np.nan
    return arr
[docs]
def defineConductiveZone(
    erp: ArrayLike | pd.Series | List[float],
    station: Optional[str | int] = None,
    position: SP = None,
    auto: bool = False,
    index: str = 'py',
    **kws,
) -> Tuple[ArrayLike, Optional[ArrayLike], int, int]:
    """ Define conductive zone as subset of the erp line.

    Indeed the conductive zone is a specific zone expected to hold the
    drilling location `station`. If drilling location is not provided,
    it would be by default the very low resistivity values found in the
    `erp` line. The zone frames the station with (at most) 3 stations on
    the left and 3 stations on the right.

    Parameters
    -----------
    erp : array_like,
        the array contains the apparent resistivity values. A dataframe
        must expose a ``resistivity`` column.
    station: str or int,
        is the station position name, e.g. ``'S07'``.
    position: array_like, optional
        station position values. Must have the same length as `erp`.
    auto: bool
        If ``True`` and no `station` is given, the station position is the
        position of the lowest resistivity value in |ERP| (NaN ignored).
        It is ignored, with a warning, when `station` is supplied.
    index: str, default='py'
        Station numbering convention passed to :func:`_assert_stations`
        when `station` is supplied: ``'py'`` means that the stations are
        counted from 0 (``S00``), any other value counts them from 1.
    kws: dict,
        Additional keyword arguments of :func:`_assert_stations`
        (e.g. `dipole`), used when `station` is supplied.

    Returns
    --------
    - conductive zone of resistivity values
    - conductive zone positioning (``None`` if `position` is not given)
    - station position index in the conductive zone
    - station position index in the whole |ERP| line

    Raises
    ------
    StationError
        If neither `station` nor ``auto=True`` is given, if the station
        position exceeds the number of stations or if `position` and `erp`
        have different lengths.
    ResistivityError
        If a dataframe without ``resistivity`` column is passed.
    ValueError
        With ``auto=True``, if `erp` contains NaN only.

    :Example:
        >>> import numpy as np
        >>> from watex.utils.coreutils import defineConductiveZone
        >>> erp = np.array ([5., 4., 3., 1., 2., 6., 7., 8., 9.])
        >>> cz, pcz, pix, pos = defineConductiveZone(erp, auto=True)
        >>> cz
        array([5., 4., 3., 1., 2., 6., 7.])
        >>> pix, pos
        (3, 3)
        >>> cz, *_ = defineConductiveZone(erp, 's02')

    """
    if isinstance(erp, pd.DataFrame):
        try:
            erp = erp.resistivity
        except AttributeError:
            raise ResistivityError(" Resistivity data is missing ")

    if isinstance(erp, pd.Series):
        erp = erp.values

    erp = check_y(erp, allow_nan=True, input_name="DC-resistivity ERP data")

    # conductive zone positioning
    pcz: Optional[ArrayLike] = None

    if station is None:
        if not auto:
            raise StationError("Missing station. Set ``auto=True`` for a naive"
                               " auto-detection (no-restrictions observed).")
        # First occurrence of the lowest resistivity. This is already a
        # 0-based position in `erp`: it must NOT be re-interpreted through
        # the station naming convention (`index`, `dipole`).
        pos = int(np.nanargmin(erp))
    else:
        if auto:
            warnings.warn("Naive auto-detection is ignored while the"
                          " station is supplied.")
        _, pos = _assert_stations(station, index=index, **kws)

    msg = ("Station position must not be greater than the number of stations."
           " It seems the dipole length is used for naming the stations."
           " If true, set `dipole` parameter value with the units. For instance"
           " '10m' names the stations as S00-S10-S20... and recompute the position"
           " for consistency to fit the number of stations. Expect {} stations,"
           " got {}."
           )
    if pos >= len(erp):
        raise StationError(msg.format(len(erp), pos))

    # frame the `sves` (drilling position) within 03 stations left/right
    # and define the conductive zone
    start = max(pos - 3, 0)
    zone = slice(start, pos + 3 + 1)
    cz = np.array(erp[zone])  # copy, not a view of `erp`

    if position is not None:
        if len(position) != len(erp):
            raise StationError(
                'Array of position and resistivity line must have the same '
                f'length: `{len(position)}` and `{len(erp)}` were given.')
        pcz = np.array(position[zone])

    # Position of the station inside the conductive zone: the number of
    # stations kept on its left. (Looking the value up in `cz` would pick
    # the wrong item with duplicated values and fail with NaN.)
    pix = pos - start

    return cz, pcz, int(pix), pos
def _assert_stations(
    station: Any,
    dipole: Any = None,
    index: str = None,
) -> Tuple[str, int]:
    """ Sanitize stations and returns station name and index.

    ``pk`` and ``S`` can be used as prefix to define the station `s`. For
    instance ``S01`` and ``PK01`` means the first station. Only the first
    group of digits found in `station` is considered.

    :param station: Station name
    :type station: str, int, float

    :param dipole: dipole_length in meters. A string may carry the unit
        ``m`` or ``km`` (e.g. ``'10m'``, ``'1.5km'``).
    :type dipole: float, str

    :param index: str, default=None,
        Stands for keeping the Python indexing. If set to
        ``py`` (or ``True``), the station numbering starts at ``S00``.

    :returns:
        - station name
        - index of the station.

    :raises StationError: if `station` holds no number or if `dipole` is
        not a strictly positive length.

    .. note::

        The default station numbering is from 1. So if ``S01`` is given, and
        the argument `index` is still on its default value i.e. ``None``,
        the station index is ``0``. The station ``S00`` always refers to
        the index ``0`` whatever `index`. Moreover, if `dipole`
        value is given, the station should be named according to the
        value of the dipole. For instance for `dipole` equals to ``10m``,
        the first station should be ``S00``, the second ``S10`` ,
        the third ``S20`` and so on. However, it is recommended to name the
        station using counting numbers rather than using the dipole
        position.

    :Example:
        >>> from watex.utils.coreutils import _assert_stations
        >>> _assert_stations('pk01')
        ... ('S01', 0)
        >>> _assert_stations('S1')
        ... ('S01', 0)
        >>> _assert_stations('S1', index ='py')
        ... ('S01', 1) # station here starts from 0 i.e `S00`
        >>> _assert_stations('S00')
        ... ('S00', 0)
        >>> _assert_stations('S1000',dipole ='1km')
        ... ('S02', 1) # by default it does not keep the Python indexing
        >>> _assert_stations('S10', dipole ='10m')
        ... ('S02', 1)
        >>> _assert_stations(1000,dipole =1000)
        ... ('S02', 1)
    """
    # in the case s is string: eg. "00", "pk01", "S001"
    sta = station  # kept for the error message
    station = _assert_all_types(station, str, int, float)

    digits = re.findall(r'\d+', str(station).strip())
    if len(digits) == 0:
        # the station prefixes are only needed for the error message
        raise StationError(f"Wrong station name {sta!r}. Station must be "
                           f"prefixed by {smft(P().istation +['S'], 'or')} e.g. "
                           "'S00' for the first station")
    number = int(digits[0])  # non-negative: the sign is never captured

    flag = str(index).lower()
    # keep Python indexing for naming stations; `S00` can only be the
    # first station of a 0-based numbering.
    keepindex = 'py' in flag or 'true' in flag or number == 0

    station = "S{:02}".format(number)
    # 1-based names are shifted to get a 0-based index
    ix = number if keepindex else number - 1

    # Recompute the station position if the dipole value is given
    if dipole is not None:
        if isinstance(dipole, str):  # '10m'
            text = dipole.strip().lower()
            factor = 1.
            if text.endswith('km'):
                # scale the number instead of appending zeros so that
                # decimal values ('1.5km') are converted correctly
                text, factor = text[:-2], 1e3
            elif text.endswith('m'):
                text = text[:-1]
            try:
                length = float(text) * factor
            except ValueError:
                raise StationError(
                    f'Invalid literal value for dipole: {dipole!r}')
        else:
            try:
                length = float(dipole)
            except (TypeError, ValueError):
                raise StationError(
                    f'Invalid literal value for dipole: {dipole!r}')
        if not length > 0:
            raise StationError(
                f'Dipole length must be greater than 0, got {dipole!r}')
        # since the renamed from dipole starts at 0
        # e.g. 0(S1)---10(S2)---20(S3) ---30(S4)etc ..
        ix = int(number // length)
        station = "S{:02}".format(ix + 1)

    return station, ix
def _parse_args(
    args: Union[List | str]
) -> Tuple[NDArray, Optional[List[str] | Any]]:
    """ `Parse_args` function returns array of rho and coordinates
    values (X, Y).

    Arguments can be a list of data, a dataframe or a Path like object. If
    a Path-like object is set, it should be the priority of reading.

    :param args: arguments. A list (of arrays or holding the path to a
        workbook), a dataframe or a series.

    :return:
        - ndarray or array-like arranged with apparent resistivity at the
          first index
        - the column names: a list for a dataframe (resistivity column
          renamed ``'app.res'`` and moved first), the series name for a
          series, ``None`` otherwise.

    :raises ValueError: if a dataframe holds no resistivity column or more
        than one.

    .. note:: If a list of arrays is given or numpy.ndarray is given,
        we assume that the columns at the first index fits the
        apparent resistivity values. A dataframe passed by the caller is
        never modified.

    :Example:
        >>> import pandas as pd
        >>> from watex.utils.coreutils import _parse_args
        >>> df = pd.DataFrame ({'pk': [0, 10], 'rhoa': [1101., 1147.]})
        >>> _parse_args (df)
        ... (array([[1101.,    0.],
                    [1147.,   10.]]), ['app.res', 'pk'])
        >>> _parse_args (['data/erp/l2_gbalo.xlsx']) # read the workbook

    """
    keys = ('res', 'rho', 'app.res', 'appres', 'rhoa')

    col = None
    if isinstance(args, list):
        args, isfile = _assert_file(args)  # file to dataframe
        if not isfile:  # list of values
            args = np.array(args, dtype=np.float64).T

    if isinstance(args, pd.DataFrame):
        # first drop all untitled items if data is from xlsx sheets.
        # ``drop`` returns a new frame so the caller's data stays intact.
        frame = args.drop(
            columns=[c for c in args.columns
                     if 'untitle' in str(c).lower()])

        # get the index of items `resistivity`; a column is listed once
        # even if its name matches several keys
        ixs = [ii for ii, name in enumerate(frame.columns)
               if any(item in str(name).lower() for item in keys)]
        if len(ixs) == 0:
            raise ValueError(
                f"Column name `resistivity` not found in {list(frame.columns)}"
                " Please provide the resistivity column.")
        elif len(ixs) > 1:
            raise ValueError(
                f"Expected 1 but got {len(ixs)} resistivity columns "
                f"{tuple(frame.columns[i] for i in ixs)}.")

        # move the resistivity at the first position
        rc = frame.pop(frame.columns[ixs[0]])
        frame.insert(0, 'app.res', rc)
        col = list(frame.columns)
        args = frame.values

    if isinstance(args, pd.Series):
        col = args.name
        args = args.values

    return args, col
def _assert_file(
    args: List[str, Any]
) -> Tuple[List[str, pd.DataFrame] | Any, bool]:
    """ Check whether the data is gathered into an Excel sheet workbook file.

    If the workbook is detected, will read the data and grab all into a
    dataframe.

    :param args: arguments into a list. Items that are paths (string or
        ``os.PathLike`` objects such as ``pathlib.Path``) pointing to an
        existing file are considered as the data file; the other items
        are ignored when a file is found.

    :returns:
        - the data read from the workbook if a file is found, otherwise
          `args` unchanged
        - whether a file was found (and read).

    :raises ValueError: if more than one existing file is passed.

    :Example:
        >>> import numpy as np
        >>> from watex.utils.coreutils import _assert_file
        >>> a, b = np.arange (1, 10 , 0.5), np.random.randn(9).reshape(3, 3)
        >>> # collection of 03 objects but read only the Path-Like object
        >>> _assert_file([a, 'data/erp/l2_gbalo.xlsx', b])
        ...
        (   pk       x          y   rho
         0   0  790752  1092750.0  1101
         1  10  790747  1092758.0  1147
         2  20  790743  1092763.0  1345
         3  30  790738  1092770.0  1369
         4  40  790733  1092776.5  1406
         5  50  790729  1092783.0  1543
         6  60  790724  1092789.5  1480
         7  70  790720  1092796.0  1517
         8  80  790715  1092802.5  1754
         9  90  790711  1092809.0  1591, True)

    """
    # path-like items are normalized to plain strings for the reader
    files = [os.fspath(item) for item in args
             if isinstance(item, (str, os.PathLike))
             and os.path.isfile(item)]

    if len(files) > 1:
        raise ValueError(
            f"Expected a single file but got {len(files)}. "
            "Please select the right file expected to contain the data.")

    if len(files) == 1:
        # the first returned item (presumably the workbook name, see
        # `read_from_excelsheets`) is not needed here
        _, args = read_from_excelsheets(files[0])
        return args, True

    return args, False
[docs]
def makeCoords(
reflong: str | Tuple[float],
reflat: str | Tuple[float],
nsites: int ,
*,
r: int =45.,
utm_zone: Optional[str] =None,
step: Optional[str|float] ='1km',
order: str = '+',
todms: bool =False,
is_utm: bool =False,
raise_warning: bool=True,
**kws
)-> Tuple[ArrayLike[DType[float]]]:
""" Generate multiple stations coordinates (longitudes, latitudes)
from a reference station/site.
One degree of latitude equals approximately 364,000 feet (69 miles),
one minute equals 6,068 feet (1.15 miles), and one-second equals 101 feet.
One-degree of longitude equals 288,200 feet (54.6 miles), one minute equals
4,800 feet (0.91 mile) , and one second equals 80 feet. Illustration showing
longitude convergence. (1 feet ~=0.3048 meter)
Parameters
----------
reflong: float or string or list of [start, stop]
Reference longitude in degree decimal or in DD:MM:SS for the first
site considered as the origin of the landmark.
reflat: float or string or list of [start, stop]
Reference latitude in degree decimal or in DD:MM:SS for the reference
site considered as the landmark origin. If value is given in a list,
it can contain the start point and the stop point.
nsites: int or float
Number of site to generate the coordinates onto.
r: float or int
The rotate angle in degrees. Rotate the angle features the direction
of the projection line. Default value is ``45`` degrees.
step: float or str
Offset or the distance of seperation between different sites in meters.
If the value is given as string type, except the ``km``, it should be
considered as a ``m`` value. Only meters and kilometers are accepables.
order: str
Direction of the projection line. By default the projected line is
in ascending order i.e. from SW to NE with angle `r` set to ``45``
degrees. Could be ``-`` for descending order. Any other value should
be in ascending order.
is_utm: bool,
Consider the first two positional arguments as UTM coordinate values.
This is an alternative way to assume `reflong` and `reflat` are UTM
coordinates 'easting'and 'northing` by default. If `utm2deg` is ``False``,
any value greater than 180 degrees for longitude and 90 degrees for
latitude will raise an error. Default is ``False``.
utm_zone: string (##N or ##S)
utm zone in the form of number and North or South hemisphere, 10S or 03N
Must be given if `utm2deg` is set to ``True``.
todms: bool
Convert the degree decimal values into the DD:MM:SS. Default is ``False``.
raise_warning: bool, default=True,
Raises warnings if GDAL is not set or the coordinates accurately status.
kws: dict,
Additional keywords of :func:`.gistools.project_point_utm2ll`.
Returns
-------
Tuple of generated projected coordinates longitudes and latitudes
either in degree decimals or DD:MM:SS
Notes
------
The distances vary. A degree, minute, or second of latitude remains
fairly constant from the equator to the poles; however a degree, minute,
or second of longitude can vary greatly as one approaches the poles
and the meridians converge.
References
----------
https://math.answers.com/Q/How_do_you_convert_degrees_to_meters
Examples
--------
>>> from watex.utils.coreutils import makeCoords
>>> rlons, rlats = makeCoords('110:29:09.00', '26:03:05.00',
... nsites = 7, todms=True)
>>> rlons
... array(['110:29:09.00', '110:29:35.77', '110:30:02.54', '110:30:29.30',
'110:30:56.07', '110:31:22.84', '110:31:49.61'], dtype='<U12')
>>> rlats
... array(['26:03:05.00', '26:03:38.81', '26:04:12.62', '26:04:46.43',
'26:05:20.23', '26:05:54.04', '26:06:27.85'], dtype='<U11')
>>> rlons, rlats = makeCoords ((116.7, 119.90) , (44.2 , 40.95),
nsites = 238, step =20. ,
order = '-', r= 125)
>>> rlons
... array(['119:54:00.00', '119:53:11.39', '119:52:22.78', '119:51:34.18',
'119:50:45.57', '119:49:56.96', '119:49:08.35', '119:48:19.75',
...
'116:46:03.04', '116:45:14.43', '116:44:25.82', '116:43:37.22',
'116:42:48.61', '116:42:00.00'], dtype='<U12')
>>> rlats
... array(['40:57:00.00', '40:57:49.37', '40:58:38.73', '40:59:28.10',
'41:00:17.47', '41:01:06.84', '41:01:56.20', '41:02:45.57',
...
'44:07:53.16', '44:08:42.53', '44:09:31.90', '44:10:21.27',
'44:11:10.63', '44:12:00.00'], dtype='<U11')
"""
def assert_ll(coord):
""" Assert coordinate when the type of the value is string."""
try: coord= float(coord)
except ValueError:
if ':' not in coord:
raise ValueError(f'Could not convert value to float: {coord!r}')
else :
coord = convert_position_str2float(coord)
return coord
xinf, yinf = None, None
nsites = int(_assert_all_types(nsites,int, float))
if isinstance (reflong, (list, tuple, np.ndarray)):
reflong , xinf, *_ = reflong
if isinstance (reflat, (list, tuple, np.ndarray)):
reflat , yinf, *_ = reflat
step=str(step).lower()
if step.find('km')>=0: # convert to meter
step = float(step.replace('km', '')) *1e3
elif step.find('m')>=0: step = float(step.replace('m', ''))
step = float(step) # for consistency
if str(order).lower() in ('descending', 'down', '-'): order = '-'
else: order ='+'
# compute length of line using the reflong and reflat
# the origin of the landmark is x0, y0= reflong, reflat
x0= assert_ll(reflong) if is_utm else assert_ll(
assert_lon_value(reflong))
y0= assert_ll(reflat) if is_utm else assert_ll(
assert_lat_value(reflat))
xinf = xinf or x0 + (np.sin(np.deg2rad(r)) * step * nsites
) / (364e3 *.3048)
yinf = yinf or y0 + (np.cos(np.deg2rad(r)) * step * nsites
) /(2882e2 *.3048)
reflon_ar = np.linspace(x0 , xinf, nsites )
reflat_ar = np.linspace(y0, yinf, nsites)
#--------------------------------------------------------------------------
# r0 = np.sqrt(((x0-xinf)*364e3 *.3048)**2 + ((y0 -yinf)*2882e2 *.3048)**2)
# print('recover distance = ', r0/nsites )
#--------------------------------------------------------------------------
if is_utm :
if utm_zone is None:
raise TypeError("Please provide your UTM zone e.g.'10S' or '03N' !")
lon = np.zeros_like(reflon_ar)
lat = lon.copy()
for kk , (lo, la) in enumerate (zip( reflon_ar, reflat_ar)):
try :
with warnings.catch_warnings(): # ignore multiple warnings
warnings.simplefilter('ignore')
lat[kk], lon[kk] = project_point_utm2ll(
easting= la, northing=lo, utm_zone=utm_zone, **kws)
except :
lat[kk], lon[kk] = utm_to_ll(
23, northing=lo, easting=la, zone=utm_zone)
if not HAS_GDAL :
if raise_warning:
warnings.warn("It seems GDAL is not set! will use the equations"
" from USGS Bulletin 1532. Be aware, the positionning"
" is less accurate than using GDAL.")
if raise_warning:
warnings.warn("By default,'easting/northing' are assumed to"
" fit the 'longitude/latitude' respectively.")
reflat_ar, reflon_ar = lat , lon
if todms:
reflat_ar = np.array(list(
map(lambda l: convert_position_float2str(float(l)), reflat_ar)))
reflon_ar = np.array(list(
map(lambda l: convert_position_float2str(float(l)), reflon_ar)))
return (reflon_ar , reflat_ar ) if order =='+' else (
reflon_ar[::-1] , reflat_ar[::-1] )
#XXX OPTIMIZE
[docs]
def parseDCArgs(fn :str ,
                delimiter:Optional[str]=None,
                arg='stations'
                )-> ArrayLike [str]:
    """ Parse DC `stations` or `fromS` arguments from a file and output
    them as an array.

    The `fromS` argument is the depth in meters from which one expects to
    find a fracture zone outside of pollutions. Indeed, the `fromS`
    parameter is used to speculate about the expected groundwater in the
    fractured rocks under the average level of water inrush in a specific
    area. For more details refer to
    :attr:`watex.methods.electrical.VerticalSounding.fromS` documentation.

    :param fn: path-like object, full path to the DC station or fromS file.
        If the data is considered as a station file, it must be composed of
        the station names. Commonly it can be used to specify the selected
        station of every DC-resistivity line where one expects to locate
        the drilling.
        Conversely, the fromS file should hold numbers only: any letter
        found in the file is ignored and only the numeric values (integer
        or decimal) are kept.

    :param delimiter: str, delimiter used to separate the different
        stations or 'fromS' values. For instance, use ``delimiter=' '``
        when all values are separated with a space and arranged in the
        same line like::

            >>> 'S02 S12 S12 S15 S28 S30' # line of the file.

        When ``None`` (default), each line of the file is one item.

    :param arg: str, name of the attribute of the DC methods. Any value
        containing ``station`` (case-insensitive) selects the station mode;
        any other value is considered as ``fromS`` and the file is parsed
        accordingly.

    :return:
        array: array of station names in station mode, otherwise the
        numeric values cast to float and passed through
        :func:`~watex.utils.funcutils.reshape`.

    :raises FileNotFoundError: if `fn` is not an existing file.

    :note: In station mode, every prefix listed in
        :class:`watex.property.P.istation` (plus ``'S'``) is replaced,
        case-insensitively, by ``'S'``. For instance 'pk25'-> 'S25'.

    :Example:
        >>> from watex.utils.coreutils import parseDCArgs
        >>> sf='data/sfn.txt' # use delimiter if values are in the same line.
        >>> sdata= parseDCArgs(sf)
        >>> sdata
        ...
        >>> # considered that the digits in the file correspond to the depths
        >>> fdata= parseDCArgs(sf, arg='froms')
        >>> fdata
        ...
    """
    if not os.path.isfile(fn):
        # name the offending path so the caller can see what was looked up
        raise FileNotFoundError(f"No file found: {fn!r}")

    # decide the parsing mode once; anything that is not 'station*' is fromS
    is_station = 'station' in str(arg).lower().strip()

    with open(fn, 'r', encoding='utf8') as f:
        sdata = f.readlines()

    if delimiter is not None:
        # split every line on the delimiter and flatten into a single list
        sdata = list(itertools.chain.from_iterable(
            line.split(delimiter) for line in sdata))

    if is_station:
        # normalise any known station prefix to 'S'
        regex = re.compile(
            rf"{'|'.join(P().istation + ['S'])}", flags=re.IGNORECASE)
        sdata = [regex.sub('S', item.strip()) for item in sdata]
    else:
        # keep decimal depths in one piece: a bare r'\d+' would split
        # '12.5' into '12' and '5'
        regex = re.compile(r'\d+(?:\.\d+)?')
        sdata = [regex.findall(item.strip()) for item in sdata]

    # for consistency, drop empty strings (or empty matches) from the list
    sdata = list(filter(None, sdata))

    if is_station:
        return np.array(sdata)
    return reshape(np.array(sdata).astype(float))
[docs]
def read_data (
    f: str|pathlib.PurePath,
    sanitize: bool= ...,
    reset_index: bool=...,
    comments: str="#",
    delimiter: str=None,
    columns: List[str]=None,
    npz_objkey: str= None,
    verbose: bool= ...,
    **read_kws
    ) -> DataFrame:
    """ Assert and read specific files and url allowed by the package.

    Readable files are systematically converted to a data frame.

    Parameters
    -----------
    f: str, Path-like object or DataFrame
        File path or Pathlib object. Must contain a valid file name and
        should be a readable file or url. A dataframe is returned as is
        (after the optional sanitization).

    sanitize: bool, default=False,
        Push a minimum sanitization of the data such as:
            - replace a non-alphabetic column items with a pattern '_'
            - cast data values to numeric if applicable
            - drop full NaN columns and rows in the data

        .. versionadded:: 0.2.5
           Apply minimum data sanitization after reading data.

    reset_index: bool, default=False,
        Reset index if full NaN columns are dropped after sanitization.

    comments: str or sequence of str or None, default='#'
        The characters or list of characters used to indicate the start
        of a comment. None implies no comments. For backwards compatibility,
        byte strings will be decoded as 'latin1'. Only used for the
        '.txt', '.npy' and '.npz' files.

    delimiter: str, optional
        The character used to separate the values. For backwards
        compatibility, byte strings will be decoded as 'latin1'. The
        default is whitespace. Only used for the '.txt', '.npy' and '.npz'
        files.

    columns: list of str, optional
        Column names of the frame built from a '.txt', '.npy' or '.npz'
        file. A warning is issued when the number of names does not match
        the number of columns of the loaded array.

    npz_objkey: str, optional
        Dataset key to identify the array in multiple array storages in
        '.npz' format. If key is not set during 'npz' storage, ``arr_0``
        should be used.

        .. versionadded:: 0.2.7
           Capable to read text and numpy formats ('.npy' and '.npz') data.
           Note that when data is stored in compressed ".npz" format,
           provide the '.npz' object key as argument of parameter
           `npz_objkey`. If None, only the first array should be read and
           ``npz_objkey='arr_0'``.

    verbose: bool, default=False
        Outputs message for user guide.

    read_kws: dict,
        Additional keywords arguments passed to pandas readable file
        keywords.

    Returns
    -------
    f: :class:`pandas.DataFrame`
        A dataframe with head contents by default.

    Raises
    ------
    TypeError
        If the file extension is not among the parsers registered in
        ``Config().parsers``.
    FileNotFoundError
        If the parser cannot find the file.
    FileHandlingError
        If the parser fails for any other reason.

    See Also
    ---------
    np.loadtxt:
        load text file.
    np.load
        Load uncompressed or compressed numpy `.npy` and `.npz` formats.
    watex.utils.baseutils.save_or_load:
        Save or load numpy arrays.

    """
    def min_sanitizer ( d, /):
        """ Apply a minimum sanitization to the data `d`."""
        return to_numeric_dtypes(
            d, sanitize_columns= True,
            drop_nan_columns= True,
            reset_index=reset_index,
            verbose = verbose ,
            fill_pattern='_',
            drop_index = True
            )

    # the `...` placeholders used as defaults are resolved here
    sanitize, reset_index, verbose = ellipsis2false (
        sanitize, reset_index, verbose )

    if ( isinstance ( f, str )
            and str(os.path.splitext(f)[1]).lower() in (
                '.txt', '.npy', '.npz')
            ):
        f = save_or_load(f, task = 'load', comments=comments,
                         delimiter=delimiter )
        # if extension is .npz, pick the requested (or the first) array
        if isinstance(f, np.lib.npyio.NpzFile):
            npz_objkey = npz_objkey or "arr_0"
            f = f[npz_objkey]

        if columns is not None:
            columns = is_iterable(columns, exclude_string= True,
                                  transform =True, parse_string =True
                                  )
            # a 1-D array has no second axis: it yields a single column
            n_expected = f.shape[1] if np.ndim(f) > 1 else 1
            if len( columns ) != n_expected:
                warnings.warn(f"Columns expect {n_expected} attributes."
                              f" Got {len(columns)}")

        f = pd.DataFrame(f, columns=columns )

    if isinstance (f, pd.DataFrame):
        return min_sanitizer (f) if sanitize else f

    cpObj= Config().parsers
    f= _check_readable_file(f)
    _, ex = os.path.splitext(f)
    # membership is tested on the lower-cased extension, so the parser must
    # be fetched with the same key ('.CSV' would otherwise raise KeyError)
    parser_key = ex.lower()
    if parser_key not in tuple (cpObj.keys()):
        raise TypeError(f"Can only parse the {smft(cpObj.keys(), 'or')} files"
                        )
    try :
        f = cpObj[parser_key](f, **read_kws)
    except FileNotFoundError:
        raise FileNotFoundError (
            f"No such file in directory: {os.path.basename (f)!r}")
    except Exception as e :
        # `Exception` rather than `BaseException`: never turn a
        # KeyboardInterrupt/SystemExit into a file-handling error
        raise FileHandlingError (
            f"Cannot parse the file : {os.path.basename (f)!r}. "+ str(e)
            ) from e

    if sanitize:
        f = min_sanitizer (f)

    return f
def _check_readable_file (f):
""" Return file name from path objects """
msg =(f"Expects a Path-like object or URL. Please, check your"
f" file: {os.path.basename(f)!r}")
if not os.path.isfile (f): # force pandas read html etc
if not ('http://' in f or 'https://' in f ):
raise TypeError (msg)
elif not isinstance (f, (str , pathlib.PurePath)):
raise TypeError (msg)
if isinstance(f, str): f =f.strip() # for consistency
return f
def _validate_ves_data_if(data, index_rhoa , err , **kws):
    """ Validate VES data if data is given as a Path-like object and
    returns AB/2 position, MN if exists and resistivity data.

    :param data: str, path-like object or DataFrame
        literal path string or PathLib object; it is read with
        :func:`_is_readable`. Anything else must already be a dataframe.
    :param index_rhoa: int,
        Index to retrieve the resistivity data if the number of sounding
        points is greater than 1. ``None`` means index 0. An out-of-range
        index triggers a warning and falls back to index 0.
    :param err: :class:`~watex.exceptions.VESError`
        VESError raised when the 'AB' column is missing.
    :param kws: dict,
        Additional keyword arguments passed to :func:`_is_readable`.

    :returns:
        - rhoa: resistivity data
        - AB : current electrodes measurement values
        - MN: potential electrodes measurement if exists in the data file,
          ``None`` otherwise.
        - rxy: the coordinates columns ('longitude', 'latitude', 'easting',
          'northing') found in the data, i.e. the xy of the place where the
          sounding is taken; ``None`` if there is none.

    :raises VESError: if the file cannot be read as a supported format.
    :raises HeaderError: if the expected VES columns are missing.
    :raises ResistivityError: if the resistivity values are missing.
    """
    if isinstance(data, (str, pathlib.PurePath)):
        try :
            data = _is_readable(data, **kws)
        except TypeError as typError:
            raise VESError (str(typError)) from typError

    data = _assert_all_types(data, pd.DataFrame )

    # sanitize the dataframe
    pObj = P()
    ncols = pObj(hl = list(data.columns), kind ='ves')
    if ncols is None:
        raise HeaderError (f"Columns {smft(pObj.icpr)} are missing in "
                           "the given dataset.")

    err_msg = ("VES data must contain 'the resistivity' and the depth"
               " measurement 'AB/2'. A sample of VES data can be found"
               " in `watex.datasets`. For e.g. 'watex.datasets.load_semien'"
               " fetches a 'semien' locality dataset and its docstring"
               " `~.load_semien.__doc__` can give a furher details about"
               " the VES data arrangement."
               )
    try:
        data.columns = ncols
    except Exception:
        # best effort: keep the original headers when the sanitized ones
        # cannot be applied; the validation below reports real problems
        pass

    data = is_valid_dc_data(data, method ="ves", exception =VESError,
                            extra = err_msg)
    try :
        rhoa = data.resistivity
    except AttributeError:
        raise ResistivityError(
            "Data validation aborted! Missing resistivity values.")

    # In the case, we got a multiple resistivity values
    # corresponding to the different sounding values
    index_rhoa = int (index_rhoa or 0) # for consistency
    has_many_soundings = not _is_arraylike_1d(rhoa)
    if has_many_soundings:
        if index_rhoa >= rhoa.shape[1] or index_rhoa < 0:
            warnings.warn(f"The index {index_rhoa} is out of the range."
                          f" '{len(rhoa.columns)-1}' is max index for "
                          "selecting the specific resistivity data. "
                          "However, the resistivity data at index 0 is "
                          " kept by default."
                          )
            index_rhoa = 0
        rhoa = rhoa.iloc[:, index_rhoa]

    # MN is optional: default to None so the return below never hits an
    # unbound local when the column is absent
    MN = data.MN if 'MN' in data.columns else None

    try:
        AB = data.AB
    except AttributeError:
        raise err

    ext = [ xy for xy in data.columns if xy in (
        'longitude', 'latitude', 'easting', 'northing')]
    rxy = data [ext ] if ext else None

    return rhoa, AB, MN , rxy
def _is_readable (
        f:str,
        *,
        as_frame:bool=False,
        columns:List[str]=None,
        input_name='f',
        **kws
    ) -> DataFrame:
    """ Assert and read specific files and url allowed by the package.

    Readable files are systematically converted to a pandas frame.

    Parameters
    -----------
    f: Path-like object or array-like
        Should be a readable file or url. Any object exposing
        ``__array__`` is not read from disk but converted to a frame
        with :func:`~watex.utils.validator.array_to_frame`.
    as_frame: bool, default=False
        Accepted for API compatibility; the function body does not use it.
    columns: str or list of str
        Series name or columns names for pandas.Series and DataFrame.
        Only used when `f` is array-like.
    input_name : str, default='f'
        The data name used to construct the error message and, when
        `columns` is not supplied, the generated column names.
        Only used when `f` is array-like.
    kws: dict,
        Pandas readable formats additional keywords arguments.

    Returns
    ---------
    f: pandas dataframe
         A dataframe with head contents...

    Raises
    ------
    TypeError
        If the file extension is not among the parsers registered in
        ``Config().parsers``.
    FileNotFoundError
        If the parser cannot find the file.
    FileHandlingError
        If the parser fails for any other reason.

    """
    if hasattr (f, '__array__' ) :
        # arrays (and frames) are converted directly, nothing to read
        return array_to_frame(
            f,
            to_frame= True ,
            columns =columns,
            input_name=input_name ,
            raise_exception= True,
            force= True,
            )

    cpObj= Config().parsers
    f= _check_readable_file(f)
    _, ex = os.path.splitext(f)
    # membership is tested on the lower-cased extension, so the parser must
    # be fetched with the same key ('.CSV' would otherwise raise KeyError)
    parser_key = ex.lower()
    if parser_key not in tuple (cpObj.keys()):
        raise TypeError(f"Can only parse the {smft(cpObj.keys(), 'or')} files"
                        f" not {ex!r}.")
    try :
        f = cpObj[parser_key](f, **kws)
    except FileNotFoundError:
        raise FileNotFoundError (
            f"No such file in directory: {os.path.basename (f)!r}")
    except Exception as exc:
        # chain the cause so the real parsing failure stays visible, and
        # let KeyboardInterrupt/SystemExit propagate untouched
        raise FileHandlingError (
            f" Can not parse the file : {os.path.basename (f)!r}") from exc

    return f