# Source code for zcloud.rapidfuzz

import re
import string
import unicodedata
from rapidfuzz import fuzz, process
from typing import List, Dict, Callable, Optional

# --- Text Normalization Helpers ---

def remove_accents(text: str) -> str:
    """Remove accents and diacritics from text.

    The text is decomposed with NFKD so that each accented character becomes
    a base character followed by combining marks. The combining marks are
    dropped and what remains is recomposed with NFC.

    Characters that have no ASCII base form (for example CJK ideographs or
    "ø") are kept. An ASCII encode/ignore round-trip would delete them
    outright, turning non-Latin input into an empty string.

    Args:
        text: The string to strip of diacritics.

    Returns:
        ``text`` with all combining marks removed.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    # unicodedata.combining() is non-zero exactly for combining marks.
    base_chars = ''.join(
        ch for ch in decomposed if not unicodedata.combining(ch)
    )
    # Recompose sequences that NFKD split but that are not diacritics
    # (e.g. Hangul syllables broken into jamo).
    return unicodedata.normalize('NFC', base_chars)
def strip_punctuation(text: str) -> str:
    """Delete every character listed in ``string.punctuation`` from text.

    Only the ASCII punctuation set is removed; any other character is left
    untouched.

    Args:
        text: The string to clean.

    Returns:
        ``text`` without ASCII punctuation characters.
    """
    # Mapping an ordinal to None tells str.translate to delete it.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)
def collapse_spaces(text: str) -> str:
    """Squeeze each run of whitespace into one space and trim the ends.

    Args:
        text: The string to tidy.

    Returns:
        ``text`` with every whitespace run replaced by a single space and
        with no leading or trailing whitespace.
    """
    whitespace_run = re.compile(r'\s+')
    squeezed = whitespace_run.sub(' ', text)
    return squeezed.strip()
def replace_synonyms(text: str, synonyms: Dict[str, str]) -> str:
    """Replace known abbreviations or synonyms.

    Every key of ``synonyms`` found in ``text`` is replaced by its value in
    a single left-to-right pass. Matching is case-sensitive.

    Compared with chained ``str.replace`` calls this avoids three problems:

    * A key that starts or ends with a word character only matches when it
      is not glued to another word character on that side, so ``"st"`` is
      replaced in ``"main st"`` but not inside ``"best"``. Keys made of
      symbols (``"&"``) still match anywhere.
    * Replacement text is never re-scanned, so one mapping cannot be
      rewritten by another and the result does not depend on dict order.
    * Empty keys are ignored instead of injecting the value between every
      character.

    Args:
        text: The string to rewrite.
        synonyms: Mapping of source text to its replacement.

    Returns:
        ``text`` with all matching keys replaced.
    """
    # Longest keys first so "ste" wins over "st" where both could match.
    keys = sorted((key for key in synonyms if key), key=len, reverse=True)
    if not keys:
        return text

    def bounded(key: str) -> str:
        # Guard only the sides of the key that are word characters.
        lead = r'(?<!\w)' if re.match(r'\w', key[0]) else ''
        trail = r'(?!\w)' if re.match(r'\w', key[-1]) else ''
        return lead + re.escape(key) + trail

    pattern = re.compile('|'.join(bounded(key) for key in keys))
    # A callable replacement keeps backslashes in values literal, matching
    # the semantics of str.replace.
    return pattern.sub(lambda match: synonyms[match.group(0)], text)
def normalize(text: str, synonyms: Optional[Dict[str, str]] = None) -> str:
    """Run the full normalization pipeline over text.

    Steps, in order: synonym replacement (only when ``synonyms`` is
    non-empty), accent removal, punctuation stripping, whitespace collapsing,
    and finally case folding.

    Args:
        text: The raw string to normalize.
        synonyms: Optional mapping passed to ``replace_synonyms``.

    Returns:
        The normalized, case-folded string.
    """
    cleaned = text if not synonyms else replace_synonyms(text, synonyms)
    for step in (remove_accents, strip_punctuation, collapse_spaces):
        cleaned = step(cleaned)
    return cleaned.casefold()
# --- Scoring Utilities ---
def combined_score(s1: str, s2: str) -> float:
    """Return the mean of four RapidFuzz similarity scores for two strings.

    The scorers averaged are ``fuzz.ratio``, ``fuzz.partial_ratio``,
    ``fuzz.token_sort_ratio`` and ``fuzz.token_set_ratio``.

    Args:
        s1: First string to compare.
        s2: Second string to compare.

    Returns:
        The arithmetic mean of the four scores.
    """
    scorers = (
        fuzz.ratio,
        fuzz.partial_ratio,
        fuzz.token_sort_ratio,
        fuzz.token_set_ratio,
    )
    total = sum(score(s1, s2) for score in scorers)
    return total / len(scorers)
def fuzzy_match(
    query: str,
    choices: List[str],
    synonyms: Optional[Dict[str, str]] = None,
    threshold: int = 70,
    scorer: Callable = combined_score,
    limit: int = 5,
) -> List:
    """Fuzzy-match a query against candidate strings after normalizing both.

    The query and every choice are passed through ``normalize`` (with the
    same ``synonyms``) before scoring. The returned tuples carry the
    original, un-normalized choice.

    Args:
        query: The string to look for.
        choices: Candidate strings to compare against.
        synonyms: Optional mapping forwarded to ``normalize``.
        threshold: Minimum score (inclusive) a choice needs to be returned.
        scorer: Callable taking the normalized query and a normalized choice
            and returning a score.
        limit: Maximum number of results; applied as a slice bound.

    Returns:
        A list of ``(choice, score)`` tuples sorted by descending score, cut
        to ``limit`` entries. Ties keep the order they had in ``choices``.
    """
    needle = normalize(query, synonyms)
    normalized = [normalize(candidate, synonyms) for candidate in choices]

    hits = []
    for candidate, cleaned in zip(choices, normalized):
        value = scorer(needle, cleaned)
        if value >= threshold:
            hits.append((candidate, value))

    # list.sort is stable, so equal scores keep their input order.
    hits.sort(key=lambda pair: pair[1], reverse=True)
    return hits[:limit]