# Source code for cf_pandas.utils

"""
Utilities for cf-pandas.
"""

from collections import ChainMap
from typing import Any, Iterable, Optional, Union

import numpy as np
import pandas as pd
import regex
from pandas import Series

from .options import OPTIONS


def always_iterable(obj: Any, allowed=(tuple, list, set, dict)) -> Iterable:
    """Return `obj` as-is if it is an allowed container, else wrap it in a list.

    This is from cf-xarray.

    Parameters
    ----------
    obj : Any
        Object to make iterable.
    allowed : tuple of types, optional
        Types that are returned unchanged. Anything that is not an instance of
        one of these types is wrapped in a one-element list; with the default
        value this includes strings.

    Returns
    -------
    Iterable
        `obj` itself when it is an instance of `allowed`, otherwise ``[obj]``.
    """
    if isinstance(obj, allowed):
        return obj
    return [obj]


def astype(value, type_):
    """Return `value` as type `type_`.

    Particularly made to work correctly for returning string, `PosixPath`, or
    `Timestamp` as list.

    Parameters
    ----------
    value : Any
        Value to convert.
    type_ : type
        Target type. It is called with `value` as its single argument unless
        one of the special cases below applies.

    Returns
    -------
    Any
        `value` itself if it already is an instance of `type_`; ``[value]`` if
        `type_` is `list` and `value` is a string, a path, or a
        `pandas.Timestamp`; otherwise ``type_(value)``.
    """
    if isinstance(value, type_):
        return value

    import pathlib

    # These are treated as single items: `list("abc")` would split a string
    # into characters, and paths/timestamps are wrapped the same way.
    single_item_types = (str, pathlib.PurePath, pd.Timestamp)
    if type_ is list and isinstance(value, single_item_types):
        return [value]
    return type_(value)


def set_up_criteria(criteria: Optional[Union[dict, Iterable]] = None) -> ChainMap:
    """Build the criteria ChainMap from the input or from the package options.

    Parameters
    ----------
    criteria : dict or iterable of dict, optional
        Criteria to use to map from variable to attributes describing the
        variable. A tuple, list, or set is unpacked into the ChainMap; any other
        object (e.g. a single dict) is used as the only mapping. If not given,
        the custom criteria stored in ``OPTIONS["custom_criteria"]`` are used.

    Returns
    -------
    ChainMap
        Criteria, with the individual mappings chained in the order given.

    Raises
    ------
    ValueError
        If `criteria` is None and no custom criteria have been set in the options.
    """

    if criteria is None:
        if not OPTIONS["custom_criteria"]:
            raise ValueError(
                "criteria needs to be defined either using set_options or directly input."
            )
        criteria_it = OPTIONS["custom_criteria"]
    else:
        # dict is deliberately left out of `allowed` so that a single criteria
        # dict is wrapped in a list instead of being unpacked into its keys.
        criteria_it = always_iterable(criteria, allowed=(tuple, list, set))

    # # Add in coordinate_criteria to be able to identify coordinates too
    # criteria_it[0].update(coordinate_criteria)

    return ChainMap(*criteria_it)


def _value_matches(value: str, pattern: str, split: bool) -> bool:
    """Return True if `pattern` matches `value`, or any whitespace-separated part of it when `split` is True."""
    candidates = value.split() if split else [value]
    return any(regex.match(pattern, candidate) for candidate in candidates)


def match_criteria_key(
    available_values: list,
    keys_to_match: Union[str, list],
    criteria: Optional[dict] = None,
    split: bool = False,
) -> list:
    """Use criteria to choose match to key from available available_values.

    Parameters
    ----------
    available_values: list
        Strings to search for matches, e.g. column headers.
    keys_to_match : str, list
        Key(s) from criteria to match with available_values. A key that is not
        in `criteria` is still returned if it is itself one of `available_values`.
    criteria : dict, optional
        Criteria to use to map from variable to attributes describing the variable. If user has defined custom_criteria, this will be used by default.
    split : bool, optional
        If split is True, split the available_values by white space before performing matches. This is helpful e.g. when columns headers have the form "standard_name (units)" and you want to match standard_name.

    Returns
    -------
    list
        Values from available_values that match keys_to_match, according to
        criteria, without duplicates and in the order they were first matched.

    Raises
    ------
    ValueError
        Raised by `set_up_criteria` if `criteria` is None and no custom criteria
        have been set in the options.

    Notes
    -----
    This uses logic from `cf-xarray`.
    """

    custom_criteria = set_up_criteria(criteria)
    keys_to_match = astype(keys_to_match, list)

    results = []
    for key in keys_to_match:
        if key in custom_criteria:
            # The criterion names (the attribute types) are not used here; every
            # pattern listed under the key is tried against available_values.
            for patterns in custom_criteria[key].values():
                results.extend(
                    value
                    for value in available_values
                    if _value_matches(value, patterns, split)
                )
        # catch scenario that user input valid reader variable names
        elif key in available_values:
            results.append(key)

    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # result is deterministic (a set would give an arbitrary order).
    return list(dict.fromkeys(results))


def standard_names(timeout: float = 30.0) -> list:
    """Returns list of CF standard_names.

    The names are read from version 79 of the CF standard name table, which is
    downloaded on every call.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout the
        request could block indefinitely.

    Returns
    -------
    list
        All CF standard_names

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or the server answers with an HTTP
        error status.
    """

    import requests
    from bs4 import BeautifulSoup

    url = "https://cfconventions.org/Data/cf-standard-names/79/src/cf-standard-name-table.xml"
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error rather than parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, features="xml")

    return [entry.get("id") for entry in soup.find_all("entry")]


def _is_datetime_like(da: Series) -> bool:
    """Return True if `da` has a datetime or timedelta dtype.

    The decision is based on the dtype only, not on the stored values. The
    pandas dtype checks are used instead of `np.issubdtype` because the latter
    raises a TypeError for pandas extension dtypes such as timezone-aware
    datetimes or categoricals.

    Parameters
    ----------
    da : Series
        Series to check.

    Returns
    -------
    bool
        True for datetime64 (timezone-naive or timezone-aware) and timedelta64
        dtypes, False otherwise.
    """
    dtype = da.dtype
    return bool(
        pd.api.types.is_datetime64_any_dtype(dtype)
        or pd.api.types.is_timedelta64_dtype(dtype)
    )