# Source code for talkgooder

# SPDX-License-Identifier: MIT

import re
from typing import Dict, List, Pattern, Tuple

"""Utilities to smooth out language rules.

``talkgooder`` attempts to smooth out grammar, punctuation, and
number-related corner cases when formatting text for human consumption.
It is intended for applications where you know there's a noun and are
trying to generate text, but you don't know much about it.
"""


def _get_plural_data(
    addl_same: List[str] = None,
    addl_special_s: List[str] = None,
    addl_irregular: Dict[str, str] = None,
) -> Tuple[List[str], List[str], Dict[str, str], Pattern[str]]:
    """Build the en-US lookup tables used by the pluralization rules.

    Args:
        addl_same: Extra nouns whose plural equals the singular; appended
            after the built-in list. ``None`` means no extras.
        addl_special_s: Extra nouns that simply take ``s``; appended after
            the built-in list. ``None`` means no extras.
        addl_irregular: Extra singular -> plural pairs; merged over the
            built-in table, so a caller entry wins on a duplicate key.
            ``None`` means no extras.

    Returns:
        A 4-tuple ``(same, special_s, irregular, ies_pattern)`` where
        ``ies_pattern`` is a compiled, case-insensitive regex matching a run
        of consonants followed by a final ``y``.
    """
    extra_same = [] if addl_same is None else addl_same
    extra_special_s = [] if addl_special_s is None else addl_special_s
    extra_irregular = {} if addl_irregular is None else addl_irregular

    # Nouns returned unchanged when pluralized.
    # NOTE(review): "goose" is listed here although its usual plural is
    # "geese" -- confirm whether this entry is intentional.
    invariant_nouns = [
        "aircraft",
        "buffalo",
        "deer",
        "fish",
        "goose",
        "hovercraft",
        "moose",
        "salmon",
        "sheep",
        "shrimp",
        "spacecraft",
        "trout",
        "watercraft",
    ]

    # Nouns that would otherwise trip a suffix rule but just take "s".
    plain_s_nouns = [
        "cello",
        "hello",
        "photo",
        "piano",
        "proof",
        "roof",
        "spoof",
        "zero",
    ]

    # Plurals that follow no rule at all.
    irregular_nouns = {
        "child": "children",
        "criterion": "criteria",
        "die": "dice",
        "louse": "lice",
        "man": "men",
        "mouse": "mice",
        "ox": "oxen",
        "person": "people",
        "phenomenon": "phenomena",
        "tooth": "teeth",
        "woman": "women",
    }
    # Caller-supplied pairs are applied last so they override built-ins.
    irregular_nouns.update(extra_irregular.items())

    # One or more consonants immediately before a trailing "y".
    consonant_then_y = re.compile(r"[b-df-hj-np-tv-z]+y$", re.IGNORECASE)

    return (
        invariant_nouns + extra_same,
        plain_s_nouns + extra_special_s,
        irregular_nouns,
        consonant_then_y,
    )


def _get_plural_suffixes(text: str, caps_mode: int) -> Dict[str, str]:
    """Return the plural endings in the letter case chosen by ``caps_mode``.

    Args:
        text: The noun being pluralized; only its casing is inspected.
        caps_mode: ``2`` forces lower-case endings, ``1`` forces upper-case
            endings, and any other value uses upper case only when ``text``
            is entirely upper case.

    Returns:
        A dict keyed by the lower-case ending (``"i"``, ``"a"``, ``"ices"``,
        ``"es"``, ``"ies"``, ``"ves"``, ``"s"``) whose values are the same
        endings in the selected case.
    """
    endings = ("i", "a", "ices", "es", "ies", "ves", "s")

    # Mode 2 always wins; otherwise an all-caps word or mode 1 means upper.
    use_upper = caps_mode != 2 and (text.isupper() or caps_mode == 1)

    return {ending: (ending.upper() if use_upper else ending) for ending in endings}


def _check_same_singular_plural(text: str, en_us_same: List[str]) -> str | None:
    """Check if word has same singular and plural form."""
    if text.lower() in en_us_same:
        return text
    return None


def _check_irregular_plurals(text: str, en_us_irregular: Dict[str, str]) -> str | None:
    """Check and apply irregular plural rules."""
    for item in en_us_irregular.keys():
        if text.lower().endswith(item.lower()):
            if text.isupper():
                return en_us_irregular[item].upper()
            else:
                return en_us_irregular[item]
    return None


def _apply_suffix_rules(
    text: str,
    suffixes: Dict[str, str],
    en_us_special_s: List[str],
    en_us_ies_pattern: Pattern[str],
) -> str:
    """Pluralize ``text`` using the regular en-US suffix rules.

    The rules are tried in order and the first that applies wins.

    Args:
        text: The noun to pluralize.
        suffixes: Endings to append, keyed by their lower-case form
            (``"i"``, ``"a"``, ``"ices"``, ``"es"``, ``"ies"``, ``"ves"``,
            ``"s"``); the values carry the desired letter case.
        en_us_special_s: Nouns that always just take ``s``. Compared
            case-insensitively.
        en_us_ies_pattern: Compiled regex that matches a consonant run
            followed by a final ``y``.

    Returns:
        The pluralized noun.
    """
    text_lower = text.lower()

    # Certain words always end with s for Reasons. Lower-case the list
    # entries too so caller-supplied words containing capitals still match.
    if text_lower in {word.lower() for word in en_us_special_s}:
        return f"{text}{suffixes['s']}"

    if text_lower.endswith("us"):
        # Words that end in "us" change to "i" when plural
        return f"{text[:-2]}{suffixes['i']}"

    if text_lower.endswith("um"):
        # Words that end in "um" change to "a" when plural
        return f"{text[:-2]}{suffixes['a']}"

    if text_lower.endswith(("ix", "ex")):
        # Words that end in "ix" or "ex" change to "ices" when plural
        return f"{text[:-2]}{suffixes['ices']}"

    if text_lower.endswith(("o", "s", "x", "z", "ch", "sh", "is")):
        # Words ending in these letters/combinations take "es"
        return f"{text}{suffixes['es']}"

    if text_lower.endswith("ff"):
        # Double-f words just take "s" (cliff -> cliffs); running them
        # through the "ves" rule would produce "clifves".
        return f"{text}{suffixes['s']}"

    if text_lower.endswith("fe"):
        # Drop both letters of "fe" before adding "ves" (knife -> knives)
        return f"{text[:-2]}{suffixes['ves']}"

    if text_lower.endswith("f"):
        # Drop the single "f" before adding "ves" (wolf -> wolves)
        return f"{text[:-1]}{suffixes['ves']}"

    if en_us_ies_pattern.findall(text):
        # Words ending in consonant then "y" end in "ies" when plural
        return f"{text[:-1]}{suffixes['ies']}"

    # Remaining words end in "s" when plural
    return f"{text}{suffixes['s']}"


def _apply_plural_rules(
    text: str,
    suffixes: Dict[str, str],
    en_us_same: List[str],
    en_us_special_s: List[str],
    en_us_irregular: Dict[str, str],
    en_us_ies_pattern: Pattern[str],
) -> str:
    """Pluralize ``text`` by trying each rule family in priority order.

    The same-singular/plural list is consulted first, then the irregular
    table; the generic suffix rules are the fallback.

    Args:
        text: The noun to pluralize.
        suffixes: Plural endings in the desired letter case.
        en_us_same: Nouns whose plural equals the singular.
        en_us_special_s: Nouns that always just take ``s``.
        en_us_irregular: Mapping of singular noun to irregular plural.
        en_us_ies_pattern: Compiled regex for a consonant followed by ``y``.

    Returns:
        The plural form chosen by the first rule family that applies.
    """
    # Unchanged-in-plural nouns take precedence over everything else.
    if (unchanged := _check_same_singular_plural(text, en_us_same)) is not None:
        return unchanged

    # Next, nouns with a table-driven irregular plural.
    if (irregular := _check_irregular_plurals(text, en_us_irregular)) is not None:
        return irregular

    # Everything else goes through the regular suffix rules.
    return _apply_suffix_rules(text, suffixes, en_us_special_s, en_us_ies_pattern)


def plural(
    text: str,
    number: int | float,
    language: str = "en-US",
    addl_same: List[str] = None,
    addl_special_s: List[str] = None,
    addl_irregular: Dict[str, str] = None,
    caps_mode: int = 0,
) -> str:
    """Determine the plural of a noun depending upon quantity.

    Given a quantity of nouns, return the most likely plural form. Language is
    complicated and pluralization rules are not always consistent, so this
    function supports user-supplied rules to accommodate exceptions specific
    to the situation.

    **Supported locales:**

    * ``en-US``: American English

    Args:
        text (str):
            The noun to convert.
        number (int or float):
            The quantity of nouns.
        language (str):
            Which language rules to apply, specified by locale (default: ``en-US``).
        addl_same (list):
            Additional words where the singular and plural are the same.
        addl_special_s (list):
            Additional words that always end in s for odd reasons
            (e.g., ``["piano","hello",...]``).
        addl_irregular (dict):
            Additional pairs of irregular plural nouns
            (e.g., ``{"mouse": "mice", "person": "people", ...}``).
        caps_mode (int):
            * ``0``: Attempt to infer whether suffix is lower or upper case (default).
            * ``1``: Force suffix to be upper case.
            * ``2``: Force suffix to be lower case.

    Returns:
        String:
            The plural of the provided noun. ``text`` is returned unchanged
            when ``number`` is the integer ``1``.

    Raises:
        TypeError: Text must be a string.
        TypeError: Number must be an int or a float.
        ValueError: Language must be a supported locale.
    """
    # Thanks to Grammarly for publishing a guideline that helped inspire these rules:
    # https://www.grammarly.com/blog/irregular-plural-nouns/

    # Make sure something reasonable was supplied
    if not isinstance(text, str):
        raise TypeError("Text must be a string")

    if not isinstance(number, (int, float)):
        raise TypeError("Number must be an int or a float")

    if language.lower() != "en-us":
        raise ValueError("Language must be a supported locale.")

    # If the number is an integer that is exactly 1, nothing to do
    # (a float 1.0 is still pluralized).
    if isinstance(number, int) and number == 1:
        return text

    # _get_plural_data treats None as "no additions", so the optional
    # arguments are passed straight through.
    (
        en_us_same,
        en_us_special_s,
        en_us_irregular,
        en_us_ies_pattern,
    ) = _get_plural_data(addl_same, addl_special_s, addl_irregular)

    # Endings in the letter case implied by text / caps_mode
    suffixes = _get_plural_suffixes(text, caps_mode)

    return _apply_plural_rules(
        text,
        suffixes,
        en_us_same,
        en_us_special_s,
        en_us_irregular,
        en_us_ies_pattern,
    )


def possessive(
    text: str,
    language: str = "en-US",
    caps_mode: int = 0,
) -> str:
    """Convert a noun to its possessive, because apostrophes can be hard.

    **Supported locales:**

    * ``en-US``: American English

    Args:
        text (str):
            A noun to be made possessive.
        language (str):
            Which language rules to apply (default ``en-US``).
        caps_mode (int):
            * ``0``: Attempt to infer whether suffix is lower or upper case (default).
            * ``1``: Force suffix to be upper case.
            * ``2``: Force suffix to be lower case.

    Returns:
        String:
            The possessive of the provided string.

    Raises:
        TypeError: Text must be a string.
        ValueError: Language must be a supported locale.
    """
    if not isinstance(text, str):
        raise TypeError("Text must be a string")

    if language.lower() != "en-us":
        raise ValueError("Language must be a supported locale.")

    # When a noun ends in "s", just add an apostrophe. Accept an upper-case
    # "S" as well so an all-caps noun such as "BOSS" does not become "BOSS'S".
    if text.endswith(("s", "S")):
        return f"{text}'"

    # caps_mode 2 forces lower case; otherwise an all-caps noun or
    # caps_mode 1 selects the upper-case suffix.
    if caps_mode != 2 and (text.isupper() or caps_mode == 1):
        return f"{text}'S"

    return f"{text}'s"


def num2word(
    number: int,
    language: str = "en-US",
) -> str:
    """Determine if an integer should be expanded to a word (per the APA style manual).

    The APA style manual specifies integers between 1 and 9 should be written
    out as a word. Everything else should be represented as digits.

    **Supported locales:**

    * ``en-US``: American English

    Args:
        number (int):
            An integer.
        language (str):
            Which language rules to apply (default ``en-US``).

    Returns:
        String:
            The word or string-formatted number, as appropriate.

    Raises:
        TypeError: Number must be an int.
        ValueError: Language must be a supported locale.
    """
    # Make sure something reasonable was supplied
    if not isinstance(number, int):
        raise TypeError("Number must be an int.")

    # Validate the locale before any early return so an unsupported language
    # is rejected for every number, not only for 1-9.
    if language.lower() != "en-us":
        raise ValueError("Language must be a supported locale.")

    # Per APA style guide, only 1-9 should be expanded
    if number < 1 or number > 9:
        return str(number)

    numbers = (
        "one",
        "two",
        "three",
        "four",
        "five",
        "six",
        "seven",
        "eight",
        "nine",
    )
    return numbers[number - 1]


[docs] def isAre( number: int | float, language: str = "en-US", ) -> str: """Given a quantity, determine if article should be ``is`` or ``are``. Given a quantity of nouns or noun-equivalents, determine whether the article should be ``is`` or ``are``. For example, "there is one cat," and "there are two cats." **Supported locales:** * ``en-US``: American English Args: number (int | float): Quantity of items. language (str): Which language rules to apply, specified by locale (default ``en-US``). Returns: String: ``is`` or ``are``, as appropriate. Raises: TypeError: number must be an int or float. ValueError: language must be a supported locale. """ if not isinstance(number, (int, float)): raise TypeError("Number must be an int or a float.") if language.lower() == "en-us": # Anything other than integer 1 (even 1.0) uses "are" if number == 1 and isinstance(number, int): return "is" else: return "are" else: raise ValueError("Language must be a supported locale.")
[docs] def wasWere( number: int | float, language: str = "en-US", ) -> str: """Given a quantity, determine if article should be ``was`` or ``were``. Given a quantity of nouns or noun-equivalents, determine whether the article should be ``was`` or ``were``. For example, "there was one cat," and "there were two cats." **Supported locales:** * ``en-US``: American English Args: number (int | float): Quantity of items. language (str): Which language rules to apply, specified by locale (default ``en-US``). Returns: String: ``was`` or ``were``, as appropriate. Raises: TypeError: number must be an int or float. ValueError: language must be a supported locale. """ if not isinstance(number, (int, float)): raise TypeError("Number must be an int or a float.") if language.lower() == "en-us": # Anything other than integer 1 (even 1.0) uses "were" if number == 1 and isinstance(number, int): return "was" else: return "were" else: raise ValueError("Language must be a supported locale.")
[docs] def aAn( noun: str | int | float, language: str = "en-US", ) -> str: """Given a noun or noun-equivalent, determine whether the article is ``a`` or ``an``. Nouns and noun-equivalents with a soft vowel beginning generally use ``an``, and everything else uses ``a``. **Supported locales:** * ``en-US``: American English Args: noun (str | int | float): A noun or noun-equivalent, as a word or a number. language (str): Which language rules to apply, specified by locale (default ``en-US``). Returns: String: ``a`` or ``an``, as appropriate. Raises: TypeError: Noun must be a string, int, or float. ValueError: Language must be a supported locale. """ if not isinstance(noun, (str, int, float)): raise TypeError("Noun must be a string, int, or float.") if language.lower() == "en-us": # Vowels, numbers that start with 8, and 18 use the "an" article if ( str(noun) .lower() .startswith( ( "a", "e", "i", "o", "u", "8", "18.", ) ) or str(noun) == "18" ): return "an" else: return "a" else: raise ValueError("Language must be a supported locale.")