"""Class for writing regular expressions."""
from typing import List, Optional, Sequence, Type, Union
from .utils import astype
class Reg(object):
    r"""Class to write a regular expression.

    Rules are collected through the constructor and/or through the methods of
    the same names; `pattern` then assembles them into a single regular
    expression string.

    Parameters
    ----------
    exclude: str, list, optional
        String(s) that must not appear anywhere in a match (logical "or").
    exclude_start: str, list, optional
        String(s) that a match must not start with (logical "or").
    exclude_end: str, list, optional
        String(s) that a match must not end with (logical "or").
    include: str, list, optional
        String(s) that must all appear in a match (logical "and").
    include_or: str, list, optional
        String(s) of which at least one must appear in a match (logical "or").
    include_exact: str, optional
        String a match must equal. Cannot be combined with any other option.
    include_start: str, optional
        String a match must start with.
    include_end: str, optional
        String a match must end with.
    ignore_case: bool
        If True (default), the pattern is prefixed with the ``(?i)`` flag.

    Raises
    ------
    ValueError
        If `include_exact` is combined with any other include/exclude option.
    TypeError
        If `include_exact`, `include_start` or `include_end` is not a str.

    Notes
    -----
    * Input strings should never be empty; a bare empty string is treated as if the option had not been given.
    * Need escape characters on any special characters, and then convert to raw, e.g., r"\[celsius\]" for "[celsius]".
    * The `exclude` options are logical "or".
    * The `include` option is logical "and", `include_or` is logical "or", and the other `include_` options allow for only one selection. If you want to use more than one `include_start` for example, you should make an additional regular expression.
    """

    def __init__(
        self,
        exclude: Optional[Union[List[str], str]] = None,
        exclude_start: Optional[Union[List[str], str]] = None,
        exclude_end: Optional[Union[List[str], str]] = None,
        include: Optional[Union[List[str], str]] = None,
        include_or: Optional[Union[List[str], str]] = None,
        include_exact: Optional[str] = None,
        include_start: Optional[str] = None,
        include_end: Optional[str] = None,
        ignore_case: bool = True,
    ):
        # Multi-valued options are always stored as lists.
        self._exclude = self._as_list(exclude)
        self._exclude_start = self._as_list(exclude_start)
        self._exclude_end = self._as_list(exclude_end)
        self._include = self._as_list(include)
        self._include_or = self._as_list(include_or)

        # Single-valued options are stored as str, with "" meaning "unset".
        self._include_exact = self._as_str(include_exact)
        self._include_start = self._as_str(include_start)
        self._include_end = self._as_str(include_end)

        self.ignore_case = ignore_case
        self.check()

    @staticmethod
    def _as_list(value) -> list:
        """Return `value` as a new list; None and "" become an empty list."""
        if value is None or value == "":
            return []
        # Copy so that later in-place extension never touches a caller-owned
        # list (presumably `astype` can hand back its argument unchanged).
        return list(astype(value, list))

    @staticmethod
    def _as_str(value):
        """Return "" for None or "", otherwise `value` unchanged."""
        return "" if value is None or value == "" else value

    def check(self):
        """Check to make sure selected options are compatible.

        Raises
        ------
        TypeError
            If `include_exact`, `include_end` or `include_start` is not a str.
        ValueError
            If `include_exact` is set together with any other option.
        """
        # Validate types first so a non-str input is reported with a clear
        # message instead of failing inside `len()` below.
        for name in ("include_exact", "include_end", "include_start"):
            if not isinstance(getattr(self, f"_{name}"), str):
                raise TypeError(f"`{name}` should be a str.")

        others = [
            self._exclude,
            self._exclude_start,
            self._exclude_end,
            self._include,
            self._include_or,
            self._include_end,
            self._include_start,
        ]
        if len(self._include_exact) > 0 and any(len(attr) > 0 for attr in others):
            raise ValueError(
                "If `include_exact` is used, do not input any other options."
            )

    def exclude(self, string: Union[str, list]):
        """Exclude string from anywhere in matches.

        Parameters
        ----------
        string: str, list
            Matches with regular expression `pattern` will not contain string(s).

        Notes
        -----
        As a list of strings, this acts as a logical "or" for the exclusions.
        """
        self._exclude += self._as_list(string)
        self.check()

    def exclude_start(self, string: Union[str, list]):
        """Exclude string from start of matches.

        Parameters
        ----------
        string: str, list
            Matches with regular expression `pattern` will not start with string(s).

        Notes
        -----
        As a list of strings, this acts as a logical "or" for the exclusions.
        """
        self._exclude_start += self._as_list(string)
        self.check()

    def exclude_end(self, string: Union[str, list]):
        """Exclude string from end of matches.

        Parameters
        ----------
        string: str, list
            Matches with regular expression `pattern` will not end with string(s).

        Notes
        -----
        As a list of strings, this acts as a logical "or" for the exclusions.
        """
        self._exclude_end += self._as_list(string)
        self.check()

    def include_exact(self, string: str):
        """String must match exactly.

        Parameters
        ----------
        string: str
            A match with regular expression `pattern` will be exactly string.

        Raises
        ------
        ValueError
            If an exact string has already been set, or if other options are
            already in use.
        """
        if len(self._include_exact) > 0:
            raise ValueError("`include_exact` already contains a string.")
        if string != "":
            self._include_exact = string
        self.check()

    def include(self, string: Union[str, list]):
        """String must be present anywhere in matches, logical "and".

        Parameters
        ----------
        string: str, list
            Matches with regular expression `pattern` will contain all string(s).

        Notes
        -----
        A list of strings will be treated as a logical "and".
        """
        self._include += self._as_list(string)
        self.check()

    def include_or(self, string: Union[str, list]):
        """String must be present anywhere in matches, logical "or".

        Parameters
        ----------
        string: str, list
            Matches with regular expression `pattern` will contain at least one of string(s).

        Notes
        -----
        A list of strings will be treated as a logical "or".
        """
        # Must extend the "or" list, not the "and" list used by `include`.
        self._include_or += self._as_list(string)
        self.check()

    def include_end(self, string: str):
        """String must be present at the end of matches.

        Parameters
        ----------
        string: str
            Matches with regular expression `pattern` will end with string.

        Raises
        ------
        ValueError
            If an end string has already been set.
        """
        if len(self._include_end) > 0:
            raise ValueError("`include_end` already contains a string.")
        if string != "":
            self._include_end = string
        self.check()

    def include_start(self, string: str):
        """String must be present at the start of matches.

        Parameters
        ----------
        string: str
            Matches with regular expression `pattern` will start with string.

        Raises
        ------
        ValueError
            If a start string has already been set.
        """
        if len(self._include_start) > 0:
            raise ValueError("`include_start` already contains a string.")
        if string != "":
            self._include_start = string
        self.check()

    def pattern(self) -> str:
        """Generate regular expression pattern from user rules.

        The result is also stored on the instance as `_pattern`.

        Returns
        -------
        str
            Regular expression accounting for all input selections.
        """
        # Exact matches are their own expression, not combined with any others.
        # NOTE(review): the `(?i)` prefix is not applied here, so `ignore_case`
        # has no effect on exact patterns -- confirm this is intended.
        if len(self._include_exact) > 0:
            self._pattern = f"{self._include_exact}$"
            return self._pattern

        # The order of these pieces is critical to get expressions correct.
        pieces = []
        if self.ignore_case:
            pieces.append("(?i)")

        if len(self._exclude) > 0:  # this should be first
            pieces.append(f"^(?!.*({'|'.join(self._exclude)}))")

        if len(self._exclude_start) > 0:
            pieces.append(f"^(?!({'|'.join(self._exclude_start)}))")

        if len(self._exclude_end) > 0:
            pieces.append(f"(?!.*({'|'.join(self._exclude_end)})$)")

        if len(self._include_start) > 0:
            pieces.append(f"^{self._include_start}.*")

        # this is logical "or"
        if len(self._include_or) > 0:
            pieces.append(f".*({'|'.join(self._include_or)}).*")

        # this is logical "and": one lookahead per required string
        if len(self._include) > 0:
            pieces.extend(f"(?=.*{string})" for string in self._include)

        if len(self._include_end) > 0:
            pieces.append(f".*{self._include_end}$")

        self._pattern = "".join(pieces)
        return self._pattern
def joinpat(regs: Sequence["Reg"]) -> str:
    """Join patterns from Reg objects.

    Parameters
    ----------
    regs: Sequence
        Reg objects from which `.pattern()` will be used.

    Returns
    -------
    str
        Regular expression patterns from regs joined together with "|"

    Notes
    -----
    When more than one pattern is joined, a leading global ``(?i)`` flag on a
    pattern is rewritten as the scoped group ``(?i:...)``. A global flag that
    is not at the very start of the expression is rejected by `re` in
    Python 3.11 and later, and scoping it also keeps case-insensitivity
    limited to the alternative that asked for it. A single pattern is
    returned unchanged.
    """
    global_flag = "(?i)"
    patterns = [reg.pattern() for reg in regs]

    # Nothing to combine: a lone leading global flag is still valid.
    if len(patterns) < 2:
        return "|".join(patterns)

    scoped = [
        f"(?i:{pat[len(global_flag):]})" if pat.startswith(global_flag) else pat
        for pat in patterns
    ]
    return "|".join(scoped)