Source code for cf_pandas.utils

"""
Utilities for cf-pandas.
"""

from collections import ChainMap
from typing import Any, Iterable, Optional, Union

import numpy as np
import pandas as pd
import regex
from pandas import Series

from .options import OPTIONS


def always_iterable(obj: Any, allowed=(tuple, list, set, dict)) -> Iterable:
    """Wrap `obj` in a list unless it is already an allowed container.

    This is from cf-xarray.

    Parameters
    ----------
    obj : Any
        Object to hand back in iterable form.
    allowed : tuple of type, optional
        Container types that are passed through untouched. Anything that is
        not an instance of one of these (including a plain string) is wrapped.

    Returns
    -------
    Iterable
        `obj` itself when it is an instance of one of `allowed`, otherwise
        the one-element list ``[obj]``.
    """
    if isinstance(obj, allowed):
        return obj
    return [obj]
def astype(value, type_):
    """Return `value` as type `type_`.

    A string, `PurePath` (e.g. `PosixPath`), or `Timestamp` requested as a
    list is wrapped in a one-element list rather than handed to ``list()``.

    Parameters
    ----------
    value
        Object to convert.
    type_ : type
        Target type; also used as the converter via ``type_(value)``.

    Returns
    -------
    object
        `value` unchanged when it is already an instance of `type_`,
        ``[value]`` for the string/path/timestamp-to-list case, otherwise
        ``type_(value)``.
    """
    if isinstance(value, type_):
        return value

    import pathlib

    # These must stay whole when asked for as a list.
    scalar_like = (str, pathlib.PurePath, pd.Timestamp)
    if type_ == list and isinstance(value, scalar_like):
        return [value]
    return type_(value)
def set_up_criteria(criteria: Union[dict, Iterable] = None) -> ChainMap:
    """Build the criteria lookup from the input or from the package options.

    Parameters
    ----------
    criteria : dict, optional
        Criteria to use to map from variable to attributes describing the
        variable. A tuple, list or set is unpacked into the result; any other
        object (e.g. a single dict) is used as the only layer. When omitted,
        the ``custom_criteria`` entry of the package options is used.

    Returns
    -------
    ChainMap
        Criteria

    Raises
    ------
    ValueError
        If `criteria` is None and the ``custom_criteria`` option is empty.
    """
    if criteria is not None:
        # dict is deliberately left out of `allowed` so a lone dict is wrapped
        # in a list instead of being unpacked into its keys.
        layers = always_iterable(criteria, allowed=(tuple, list, set))
        return ChainMap(*layers)

    if not OPTIONS["custom_criteria"]:
        raise ValueError(
            "criteria needs to be defined either using set_options or directly input."
        )

    # NOTE: merging coordinate_criteria into the first layer is currently
    # disabled, so only the user-defined criteria are chained.
    return ChainMap(*OPTIONS["custom_criteria"])
def match_criteria_key(
    available_values: list,
    keys_to_match: Union[str, list],
    criteria: Optional[dict] = None,
    split: bool = False,
) -> list:
    """Use criteria to choose match to key from available_values.

    Parameters
    ----------
    available_values : list
        Strings (e.g. column names) to search for matches.
    keys_to_match : str, list
        Key(s) from criteria to match with available_values. A key that is
        not present in the criteria is returned unchanged when it is itself
        one of `available_values`.
    criteria : dict, optional
        Criteria to use to map from variable to attributes describing the
        variable. If user has defined custom_criteria, this will be used by
        default.
    split : bool, optional
        If split is True, split the available_values by white space before
        performing matches. This is helpful e.g. when columns headers have
        the form "standard_name (units)" and you want to match standard_name.

    Returns
    -------
    list
        Values from available_values that match keys_to_match, according to
        criteria. Each value appears once, in the order it was first matched.

    Raises
    ------
    ValueError
        Propagated from `set_up_criteria` when no criteria are available.

    Notes
    -----
    This uses logic from `cf-xarray`.
    """
    custom_criteria = set_up_criteria(criteria)
    keys_to_match = astype(keys_to_match, list)

    # A dict serves as an insertion-ordered set so the returned order is
    # reproducible from run to run (plain set order is not).
    matches: dict = {}
    for key in keys_to_match:
        if key not in custom_criteria:
            # catch scenario that user input valid reader variable names
            if key in available_values:
                matches[key] = None
            continue

        # The inner keys name the attribute type; they are not used here.
        # Every pattern under `key` is tried against every available value.
        for patterns in custom_criteria[key].values():
            for value in available_values:
                # With `split`, a value matches if any whitespace-separated
                # part of it matches; otherwise the whole value is tested.
                parts = value.split() if split else [value]
                if any(regex.match(patterns, part) for part in parts):
                    matches[value] = None

    return list(matches)
def standard_names() -> list:
    """Returns list of CF standard_names.

    Downloads the CF standard name table (version 79) and collects the
    ``id`` attribute of every ``entry`` element in it.

    Returns
    -------
    list
        All CF standard_names

    Raises
    ------
    requests.exceptions.RequestException
        If the table cannot be downloaded: connection failure, timeout, or
        an HTTP error status.
    """
    import requests
    from bs4 import BeautifulSoup

    url = "https://cfconventions.org/Data/cf-standard-names/79/src/cf-standard-name-table.xml"

    # Without a timeout requests may block forever on an unresponsive server.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page into a
    # misleading (typically empty) list.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, features="xml")
    return [entry.get("id") for entry in soup.find_all("entry")]
def _is_datetime_like(da: Series) -> bool: if np.issubdtype(da.dtype, np.datetime64) or np.issubdtype( da.dtype, np.timedelta64 ): return True return False