| Situation | Integration steps |
|-----------|-------------------|
| A web‑app (Flask/Django/FastAPI) | Import `moderate` in the request‑handling view, call it on incoming POST data, and reject or auto‑sanitize the content before persisting it. |
| A Discord / Telegram bot | Run `moderate(message.content)` for each incoming message; if `is_clean` is `False`, delete the message or warn the user. |
| A mobile game chat | Call the function on each chat packet server‑side; only forward the sanitized version to other players. |
| A classroom‑learning platform | Use `return_cleaned=True` to automatically hide profanity while still storing the original for moderation logs. |


def load_wordlists(json_path: str) -> None:
    """
    Replace the active word list with the one stored in a JSON file.

    The file must contain an object with the keys ``"en"`` and ``"pt"``,
    each holding a list of words or phrases, e.g.
    ``{"en": ["..."], "pt": ["..."]}``.  Both lists are merged and
    lower-cased into the module-level ``OFFENSIVE_WORDS`` set, and the
    module-level ``_WORD_PATTERN`` regex is rebuilt from that set.

    Parameters
    ----------
    json_path : str
        Path to a UTF-8 encoded JSON file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file is not valid JSON.
    KeyError
        If the ``"en"`` or ``"pt"`` key is missing.
    """
    import json

    global OFFENSIVE_WORDS, _WORD_PATTERN

    with open(json_path, "r", encoding="utf-8") as fp:
        data = json.load(fp)          # {"en": [...], "pt": [...]}

    # Build everything in locals first so that a malformed file raises
    # before either global has been touched.
    # Blank entries are dropped: an empty alternative would match the empty
    # string at every word boundary and flag all text as offensive.
    words = {w.lower() for w in data["en"] + data["pt"] if w.strip()}

    if words:
        # Alternation is tried left to right, so longer entries go first to
        # keep a short word from pre-empting a phrase it is a prefix of.
        # The secondary alphabetical key makes the pattern deterministic.
        ordered = sorted(words, key=lambda w: (-len(w), w))
        source = r"\b(" + "|".join(map(re.escape, ordered)) + r")\b"
    else:
        # An empty list means "nothing is offensive": (?!) never matches.
        source = r"(?!)"

    pattern = re.compile(source, flags=re.IGNORECASE | re.UNICODE)

    OFFENSIVE_WORDS = words
    _WORD_PATTERN = pattern

"""
content_moderator.py
--------------------
A tiny, dependency‑free content‑moderation utility.
Features
--------
* Detects profanity, slurs and adult‑content keywords (both EN & PT).
* Returns:
    - a boolean flag (is_clean)
    - a list of detected offensive tokens
    - a "sanitized" version of the original text (optional)
* Configurable: you can extend the word‑lists, adjust sensitivity,
  or replace the simple regex engine with a custom matcher.
Usage example
-------------
>>> from content_moderator import moderate
>>> text = "Esse cara é um babalu e fala merda pra criança."
>>> clean, hits, safe = moderate(text, return_cleaned=True)
>>> print(clean)   # False
>>> print(hits)    # ['babalu', 'merda']
>>> print(safe)    # "Esse cara é um ****** e fala ***** pra criança."
"""
import re
from typing import List, Tuple, Optional
# ----------------------------------------------------------------------
# 1️⃣  WORD LISTS
# ----------------------------------------------------------------------
# The lists below are deliberately short; you can expand them as needed.
# They contain the most common profanity/sexually‑explicit terms in English
# and Portuguese.  All entries are lower‑cased for case‑insensitive matching.
# English entries (single words only).
EN_PROFANITY = {
    "fuck", "shit", "bitch", "cunt", "asshole", "dick", "pussy",
    "cum", "blowjob", "nigger", "fag", "slut", "whore",
}
# Portuguese entries; multi-word phrases such as "filho da puta" are allowed.
PT_PROFANITY = {
    "porra", "caralho", "buceta", "pinto", "cu", "filho da puta",
    "merda", "baba", "babalu", "bengala", "idiota", "piranha",
    "viado", "sacanagem",
}
# Combine them for a single lookup set.  Both languages must be included,
# otherwise English text is never flagged.
OFFENSIVE_WORDS = EN_PROFANITY | PT_PROFANITY
# ----------------------------------------------------------------------
# 2️⃣  REGEX PRE‑COMPILATION
# ----------------------------------------------------------------------
# Build a regex that matches any whole word from the set.
#   \b   – word boundary
#   re.IGNORECASE – case‑insensitive
#   re.UNICODE – proper handling of accented characters
# Regex alternation is tried left to right, so the entries are ordered
# longest-first: a multi-word phrase must be tried before any shorter entry
# that is a prefix of it.  Sorting (with an alphabetical tie-break) also makes
# the compiled pattern independent of set iteration order.
_WORD_PATTERN = re.compile(
    r"\b("
    + "|".join(
        map(re.escape, sorted(OFFENSIVE_WORDS, key=lambda w: (-len(w), w)))
    )
    + r")\b",
    flags=re.IGNORECASE | re.UNICODE,
)
# ----------------------------------------------------------------------
# 3️⃣  CORE FUNCTION
# ----------------------------------------------------------------------
def moderate(
    text: str,
    *,
    return_hits: bool = True,
    return_cleaned: bool = False,
    mask_char: str = "*"
) -> Tuple[bool, Optional[List[str]], Optional[str]]:
    """
    Scan *text* for offensive words.

    Parameters
    ----------
    text : str
        The raw user-generated content.
    return_hits : bool, optional
        If True, the function also returns a list with the matched words.
    return_cleaned : bool, optional
        If True, returns a sanitized version where each detected token
        is replaced by ``mask_char`` repeated to the same length.
    mask_char : str, optional
        The character used for masking (default "*").

    Returns
    -------
    is_clean : bool
        ``True`` if no offensive token was found.
    hits : list[str] | None
        List of detected words (lower-cased), in order of appearance and
        with repeats kept. ``None`` when *return_hits=False*.
    cleaned_text : str | None
        Masked version of the original text. ``None`` when
        *return_cleaned=False*.

    Example
    -------
    >>> moderate("Oi, seu babalu!", return_cleaned=True)
    (False, ['babalu'], 'Oi, seu ******!')
    """
    # 1️⃣ Find all matches (case-insensitive).  The module-level pattern is
    # looked up on every call, so a pattern rebound later is picked up.
    matches = _WORD_PATTERN.findall(text)

    # Normalise to lower-case for the output list
    hits = [m.lower() for m in matches] if return_hits else None

    # 2️⃣ Build a sanitized version if requested
    cleaned = None
    if return_cleaned:
        def _mask(match: re.Match) -> str:
            # Same length as the matched token, so the text layout is kept.
            return mask_char * len(match.group(0))

        cleaned = _WORD_PATTERN.sub(_mask, text)

    # 3️⃣ Determine overall cleanliness
    is_clean = not matches
    return is_clean, hits, cleaned
# ----------------------------------------------------------------------
# 4️⃣  QUICK DEMO (runs when executed directly)
# ----------------------------------------------------------------------
if __name__ == "__main__":
    # Sample inputs: Portuguese and English hits, plus two sentences that
    # contain no listed word.
    demo_sentences = [
        "Esse cara é um babalu e fala merda pra criança.",
        "What a beautiful day! No bad words here.",
        "You are such a bitch!",
        "Fala, bambambú! (just a nonsense word)",
    ]
    # The loop must stay inside the guard: ``demo_sentences`` only exists
    # when the file is run as a script, so a top-level loop would raise
    # NameError on import.
    for s in demo_sentences:
        clean, hits, safe = moderate(s, return_cleaned=True)
        print("\nOriginal :", s)
        print("Is clean? :", clean)
        if hits:
            print("Detected :", ", ".join(hits))
        if safe:
            print("Sanitized :", safe)