# Source code for zcloud.rapidfuzz

import re
import string
import unicodedata
from rapidfuzz import fuzz, process
from typing import List, Dict, Callable, Optional

# --- Text Normalization Helpers ---


[docs]
def remove_accents(text: str) -> str:
    """Remove accents and diacritics from text.

    The text is decomposed with NFKD so each accented character becomes a
    base character followed by combining marks. The combining marks are
    dropped and the remainder is recomposed with NFC.

    Characters that have no ASCII equivalent (for example Cyrillic, CJK or
    ``ß``) are preserved rather than silently deleted, which an ASCII
    encode/decode round trip would do.

    Args:
        text: The string to strip diacritics from.

    Returns:
        ``text`` with combining marks removed. Compatibility characters are
        still folded by the NFKD step (e.g. the ``ﬁ`` ligature becomes
        ``fi``).
    """
    decomposed = unicodedata.normalize('NFKD', text)
    without_marks = ''.join(
        ch for ch in decomposed if not unicodedata.combining(ch)
    )
    # NFKD also splits characters into non-combining parts (e.g. Hangul
    # syllables into jamo); recompose so those come back unchanged.
    return unicodedata.normalize('NFC', without_marks)



[docs]
def strip_punctuation(text: str) -> str:
    """Return ``text`` with every ``string.punctuation`` character deleted.

    Matching characters are removed outright, not replaced by a space, so
    the characters on either side of a punctuation mark become adjacent.
    Characters outside ``string.punctuation`` are left untouched.
    """
    unwanted = frozenset(string.punctuation)
    kept = (ch for ch in text if ch not in unwanted)
    return ''.join(kept)



[docs]
def collapse_spaces(text: str) -> str:
    """Squeeze each run of whitespace to one space and trim both ends."""
    # Splitting on arbitrary whitespace discards leading/trailing runs and
    # yields the non-blank pieces, which are then rejoined with one space.
    pieces = text.split()
    return ' '.join(pieces)



[docs]
def replace_synonyms(text: str, synonyms: Dict[str, str]) -> str:
    """Substitute each key of ``synonyms`` in ``text`` with its value.

    Replacements are plain, case-sensitive substring replacements applied
    one after another in the mapping's iteration order, so the output of an
    earlier replacement can be rewritten by a later one.

    Args:
        text: The string to rewrite.
        synonyms: Mapping from the substring to find to its replacement.

    Returns:
        The rewritten string.
    """
    result = text
    for needle in synonyms:
        result = result.replace(needle, synonyms[needle])
    return result



[docs]
def normalize(text: str, synonyms: Optional[Dict[str, str]] = None) -> str:
    """Run the full normalization pipeline over ``text``.

    Steps, in order: synonym replacement (only when a non-empty ``synonyms``
    mapping is given), accent removal, punctuation stripping, whitespace
    collapsing, and finally case folding.

    Args:
        text: The raw string to normalize.
        synonyms: Optional mapping passed to ``replace_synonyms``. It is
            applied to the raw text, before any other step.

    Returns:
        The normalized, case-folded string.
    """
    cleaned = replace_synonyms(text, synonyms) if synonyms else text
    for step in (remove_accents, strip_punctuation, collapse_spaces):
        cleaned = step(cleaned)
    return cleaned.casefold()


# --- Scoring Utilities ---


[docs]
def combined_score(s1: str, s2: str) -> float:
    """Return the mean of four RapidFuzz similarity scores for two strings.

    The scorers averaged are ``fuzz.ratio``, ``fuzz.partial_ratio``,
    ``fuzz.token_sort_ratio`` and ``fuzz.token_set_ratio``, each called with
    ``(s1, s2)``.
    """
    scorers = (
        fuzz.ratio,
        fuzz.partial_ratio,
        fuzz.token_sort_ratio,
        fuzz.token_set_ratio,
    )
    total = sum(score_fn(s1, s2) for score_fn in scorers)
    return total / len(scorers)



[docs]
def fuzzy_match(query: str, choices: List[str],
                synonyms: Optional[Dict[str, str]] = None,
                threshold: int = 70,
                scorer: Callable = combined_score,
                limit: int = 5) -> List:
    """Rank ``choices`` by fuzzy similarity to ``query`` after normalization.

    The query and every choice are passed through ``normalize`` (with the
    same ``synonyms``) before scoring. The returned entries carry the
    original, un-normalized choice strings.

    Args:
        query: The string to search for.
        choices: Candidate strings to compare against.
        synonyms: Optional mapping forwarded to ``normalize``.
        threshold: Minimum score (inclusive) a choice needs to be kept.
        scorer: Callable invoked as ``scorer(normalized_query,
            normalized_choice)``; its result is compared with ``threshold``.
        limit: Maximum number of results, applied as a slice bound.

    Returns:
        A list of ``(choice, score)`` tuples sorted by descending score and
        truncated to ``limit`` entries. Choices with equal scores keep their
        original relative order.
    """
    needle = normalize(query, synonyms)
    # Normalize every choice up front, before any scoring takes place.
    normalized = [normalize(candidate, synonyms) for candidate in choices]

    kept = []
    for candidate, candidate_norm in zip(choices, normalized):
        value = scorer(needle, candidate_norm)
        if value >= threshold:
            kept.append((candidate, value))

    # list.sort is stable, so ties keep the order they had in ``choices``.
    kept.sort(key=lambda pair: pair[1], reverse=True)
    return kept[:limit]