import re
import string
import unicodedata
from rapidfuzz import fuzz, process
from typing import List, Dict, Callable, Optional
# --- Text Normalization Helpers ---
[docs]
def remove_accents(text: str) -> str:
    """Remove accents and diacritics from text, keeping the base characters.

    The text is decomposed with NFKD, every combining mark is dropped, and the
    result is recomposed with NFC. Characters that have no ASCII equivalent
    (letters without a decomposition, or any non-Latin script) are preserved
    rather than silently deleted, which is what an ASCII encode/decode round
    trip would do.

    Args:
        text: The string to process.

    Returns:
        The text without combining marks. NFKD also folds compatibility
        characters (ligatures, full-width forms) into their plain equivalents.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    base_chars = ''.join(
        ch for ch in decomposed if not unicodedata.combining(ch)
    )
    # NFKD also splits sequences that are not accents (e.g. Hangul syllables
    # into jamo); NFC puts those back together now that the marks are gone.
    return unicodedata.normalize('NFC', base_chars)
[docs]
def strip_punctuation(text: str) -> str:
    """Return ``text`` with every ``string.punctuation`` character deleted.

    Only the ASCII punctuation set is affected; all other characters,
    including whitespace, pass through unchanged.
    """
    unwanted = set(string.punctuation)
    return ''.join(ch for ch in text if ch not in unwanted)
[docs]
def collapse_spaces(text: str) -> str:
    """Squeeze each run of whitespace into one space and trim both ends."""
    whitespace_run = r'\s+'
    squeezed = re.sub(whitespace_run, ' ', text)
    return squeezed.strip()
[docs]
def replace_synonyms(text: str, synonyms: Dict[str, str]) -> str:
    """Replace known abbreviations or synonyms with their canonical form.

    All keys are substituted in a single left-to-right pass, so the output of
    one replacement is never rewritten by another key. Matching is
    case-sensitive. A key only matches as a whole token: where a key starts or
    ends with a word character, the neighbouring character in ``text`` must
    not be a word character (so ``"st"`` does not match inside ``"station"``).
    Keys that start or end with punctuation, such as ``"&"``, still match
    anywhere. When several keys could match at the same position, the longest
    one wins.

    Args:
        text: The string to process.
        synonyms: Mapping from the text to find to its replacement. Empty
            keys are ignored.

    Returns:
        ``text`` with every matching key replaced by its mapped value.
    """
    # An empty key would match between every pair of characters, so skip it.
    keys = [key for key in synonyms if key]
    if not keys:
        return text

    def bounded(key: str) -> str:
        # Guard only the edges that are word characters; a lookaround next to
        # punctuation would stop keys like "&" from matching inside "A&B".
        lead = r'(?<!\w)' if key[0].isalnum() or key[0] == '_' else ''
        trail = r'(?!\w)' if key[-1].isalnum() or key[-1] == '_' else ''
        return lead + re.escape(key) + trail

    # Longest first: regex alternation takes the first alternative that fits.
    keys.sort(key=len, reverse=True)
    pattern = re.compile('|'.join(bounded(key) for key in keys))
    # A callable replacement keeps backslashes in the values literal.
    return pattern.sub(lambda match: synonyms[match.group(0)], text)
[docs]
def normalize(text: str, synonyms: Optional[Dict[str, str]] = None) -> str:
    """Run the full normalization pipeline over ``text``.

    Steps, in order: synonym replacement (only when ``synonyms`` is truthy),
    accent removal, punctuation stripping, whitespace collapsing, and finally
    case folding.
    """
    cleaned = replace_synonyms(text, synonyms) if synonyms else text
    for step in (remove_accents, strip_punctuation, collapse_spaces):
        cleaned = step(cleaned)
    return cleaned.casefold()
# --- Scoring Utilities ---
[docs]
def combined_score(s1: str, s2: str) -> float:
    """Return the mean of four RapidFuzz scores for ``s1`` versus ``s2``.

    The scorers averaged are ``ratio``, ``partial_ratio``,
    ``token_sort_ratio`` and ``token_set_ratio``, each weighted equally.
    """
    scorers = (
        fuzz.ratio,
        fuzz.partial_ratio,
        fuzz.token_sort_ratio,
        fuzz.token_set_ratio,
    )
    total = sum(score_fn(s1, s2) for score_fn in scorers)
    return total / len(scorers)
[docs]
def fuzzy_match(query: str, choices: List[str],
                synonyms: Optional[Dict[str, str]] = None,
                threshold: int = 70,
                scorer: Callable = combined_score,
                limit: int = 5) -> List:
    """Rank ``choices`` by fuzzy similarity to ``query``.

    The query and every choice are passed through ``normalize`` (with the
    same ``synonyms``) before being compared with ``scorer``.

    Returns:
        Up to ``limit`` ``(original_choice, score)`` tuples whose score is at
        least ``threshold``, ordered from highest to lowest score.
    """
    target = normalize(query, synonyms)
    cleaned_choices = [normalize(candidate, synonyms) for candidate in choices]

    kept = []
    for candidate, cleaned in zip(choices, cleaned_choices):
        value = scorer(target, cleaned)
        if value >= threshold:
            kept.append((candidate, value))

    # Stable sort: equally scored choices keep their input order.
    kept.sort(key=lambda pair: pair[1], reverse=True)
    return kept[:limit]