Brasileirinhas Kid Bengala E Musa Babalu Full -

| Situation | Integration steps |
|-----------|-------------------|
| A web‑app (Flask/Django/FastAPI) | Import `moderate` in the request‑handling view, call it on incoming POST data, and reject or auto‑sanitize the content before persisting it. |
| A Discord / Telegram bot | Run `moderate(message.content)` for each incoming message; if `is_clean` is `False`, delete the message or warn the user. |
| A mobile game chat | Call the function on each chat packet server‑side; only forward the sanitized version to other players. |
| A classroom‑learning platform | Use `return_cleaned=True` to automatically hide profanity while still storing the original for moderation logs. |

def load_wordlists(json_path: str) -> None:
    """Replace the active word list and matcher with words from a JSON file.

    The file must contain a JSON object with the keys ``"en"`` and ``"pt"``,
    each holding a list of strings.  Both lists are merged, stripped of
    surrounding whitespace, lower-cased and de-duplicated; blank entries are
    dropped.  On success the module-level ``OFFENSIVE_WORDS`` set and the
    compiled ``_WORD_PATTERN`` are rebound together, so they always describe
    the same vocabulary.

    Parameters
    ----------
    json_path : str
        Path to a UTF-8 encoded JSON file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file is not valid JSON.
    KeyError
        If the ``"en"`` or ``"pt"`` key is missing.
    """
    import json

    global OFFENSIVE_WORDS, _WORD_PATTERN

    with open(json_path, "r", encoding="utf-8") as fp:
        data = json.load(fp)  # {"en": [...], "pt": [...]}

    # Blank entries must be skipped: an empty alternative would let the
    # pattern match the empty string at every word boundary.
    words = {w.strip().lower() for w in data["en"] + data["pt"] if w.strip()}

    if words:
        # Longest alternatives first, so a multi-word phrase is preferred over
        # a shorter entry it starts with; the tie-break keeps the pattern
        # identical from run to run.
        ordered = sorted(words, key=lambda w: (-len(w), w))
        pattern = re.compile(
            r"\b(" + "|".join(map(re.escape, ordered)) + r")\b",
            flags=re.IGNORECASE | re.UNICODE,
        )
    else:
        # An empty vocabulary must match nothing; r"\b()\b" would instead
        # match at every word boundary and flag all text as offensive.
        pattern = re.compile(r"(?!)")

    # Rebind only after everything above succeeded.
    OFFENSIVE_WORDS = words
    _WORD_PATTERN = pattern

"""
content_moderator.py
--------------------
A tiny, dependency‑free content‑moderation utility.
Features
--------
* Detects profanity, slurs and adult‑content keywords (both EN & PT).
* Returns:
    - a boolean flag (is_clean)
    - a list of detected offensive tokens
    - a "sanitized" version of the original text (optional)
* Configurable: you can extend the word‑lists, adjust sensitivity,
  or replace the simple regex engine with a custom matcher.
Usage example
-------------
>>> from content_moderator import moderate
>>> text = "Esse cara é um babalu e fala merda pra criança."
>>> clean, hits, safe = moderate(text, return_cleaned=True)
>>> print(clean)   # False
>>> print(hits)    # ['babalu', 'merda']
>>> print(safe)    # "Esse cara é um ****** e fala ***** pra criança."
"""
import re
from typing import List, Tuple, Optional
# ----------------------------------------------------------------------
# 1️⃣  WORD LISTS
# ----------------------------------------------------------------------
# The lists below are deliberately short; you can expand them as needed.
# They contain common profanity/sexually-explicit terms in English and
# Portuguese.  Entries are stored lower-cased; matching is case-insensitive
# because the pattern below is compiled with re.IGNORECASE.
EN_PROFANITY = {
    "fuck", "shit", "bitch", "cunt", "asshole", "dick", "pussy",
    "cum", "blowjob", "nigger", "fag", "slut", "whore",
}
PT_PROFANITY = {
    "porra", "caralho", "buceta", "pinto", "cu", "filho da puta",
    "merda", "baba", "babalu", "bengala", "idiota", "piranha",
    "viado", "sacanagem",
}
# Combine both languages into a single lookup set.
OFFENSIVE_WORDS = EN_PROFANITY | PT_PROFANITY
# ----------------------------------------------------------------------
# 2️⃣  REGEX PRE-COMPILATION
# ----------------------------------------------------------------------
# Build a regex that matches any whole word (or phrase) from the set.
#   \b            – word boundary, so "cum" does not fire inside "document"
#   re.IGNORECASE – case-insensitive
#   re.UNICODE    – Unicode-aware \b/\w (already the default for str
#                   patterns in Python 3; kept for explicitness)
# Alternatives are sorted longest-first so that a multi-word phrase wins over
# a shorter entry it starts with, and so the pattern does not depend on set
# iteration order.
_WORD_PATTERN = re.compile(
    r"\b("
    + "|".join(
        map(re.escape, sorted(OFFENSIVE_WORDS, key=lambda w: (-len(w), w)))
    )
    + r")\b",
    flags=re.IGNORECASE | re.UNICODE,
)
# ----------------------------------------------------------------------
# 3️⃣  CORE FUNCTION
# ----------------------------------------------------------------------
def moderate(
    text: str,
    *,
    return_hits: bool = True,
    return_cleaned: bool = False,
    mask_char: str = "*",
) -> Tuple[bool, Optional[List[str]], Optional[str]]:
    """
    Scan *text* for offensive words.

    Matching is delegated to the module-level ``_WORD_PATTERN``.

    Parameters
    ----------
    text : str
        The raw user-generated content.
    return_hits : bool, optional
        If True, the function also returns a list with the matched words.
    return_cleaned : bool, optional
        If True, returns a sanitized version where each detected token
        is replaced by ``mask_char`` repeated once per character of the
        token.
    mask_char : str, optional
        The string used for masking (default "*").  A multi-character
        value is repeated as a whole, so the masked text becomes longer
        than the original.

    Returns
    -------
    is_clean : bool
        ``True`` if no offensive token was found.
    hits : list[str] | None
        Detected words, lower-cased, in order of appearance (duplicates
        kept). ``None`` when *return_hits=False*.
    cleaned_text : str | None
        Masked version of the original text. ``None`` when
        *return_cleaned=False*.

    Example
    -------
    >>> moderate("Oi, seu babalu!", return_cleaned=True)
    (False, ['babalu'], 'Oi, seu ******!')
    """
    # 1️⃣ Collect every match.  finditer + group(0) yields the full matched
    # text regardless of how many capture groups the active pattern has
    # (findall would return tuples for a multi-group pattern).
    found = [m.group(0) for m in _WORD_PATTERN.finditer(text)]

    # Normalise to lower-case for the output list.
    hits = [word.lower() for word in found] if return_hits else None

    # 2️⃣ Build a sanitized version if requested.
    cleaned = None
    if return_cleaned:
        cleaned = _WORD_PATTERN.sub(
            lambda m: mask_char * len(m.group(0)), text
        )

    # 3️⃣ The text is clean exactly when nothing matched.
    return not found, hits, cleaned
# ----------------------------------------------------------------------
# 4️⃣  QUICK DEMO (runs when executed directly)
# ----------------------------------------------------------------------
if __name__ == "__main__":
    # Sample inputs: two sentences with listed words, and two without.
    demo_sentences = [
        "Esse cara é um babalu e fala merda pra criança.",
        "What a beautiful day! No bad words here.",
        "You are such a bitch!",
        "Fala, bambambú! (just a nonsense word)",
    ]
    # The loop must live inside the guard: at module level it would run on
    # import and fail because demo_sentences is only defined when the file is
    # executed directly.
    for s in demo_sentences:
        clean, hits, safe = moderate(s, return_cleaned=True)
        print("\nOriginal :", s)
        print("Is clean? :", clean)
        if hits:
            print("Detected :", ", ".join(hits))
        if safe:
            print("Sanitized :", safe)

def load_wordlists(json_path: str) -> None:
    """Replace the active word list and matcher with words from a JSON file.

    The file must contain a JSON object with the keys ``"en"`` and ``"pt"``,
    each holding a list of strings.  Both lists are merged, stripped of
    surrounding whitespace, lower-cased and de-duplicated; blank entries are
    dropped.  On success the module-level ``OFFENSIVE_WORDS`` set and the
    compiled ``_WORD_PATTERN`` are rebound together.

    Parameters
    ----------
    json_path : str
        Path to a UTF-8 encoded JSON file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file is not valid JSON.
    KeyError
        If the ``"en"`` or ``"pt"`` key is missing.
    """
    import json

    global OFFENSIVE_WORDS, _WORD_PATTERN

    with open(json_path, "r", encoding="utf-8") as fp:
        data = json.load(fp)  # {"en": [...], "pt": [...]}

    # Blank entries must be skipped: an empty alternative would let the
    # pattern match the empty string at every word boundary.
    words = {w.strip().lower() for w in data["en"] + data["pt"] if w.strip()}

    if words:
        # Longest alternatives first; the tie-break keeps the pattern
        # identical from run to run.
        ordered = sorted(words, key=lambda w: (-len(w), w))
        pattern = re.compile(
            r"\b(" + "|".join(map(re.escape, ordered)) + r")\b",
            flags=re.IGNORECASE | re.UNICODE,
        )
    else:
        # An empty vocabulary must match nothing at all.
        pattern = re.compile(r"(?!)")

    # Rebind only after everything above succeeded.
    OFFENSIVE_WORDS = words
    _WORD_PATTERN = pattern

""" content_moderator.py -------------------- A tiny, dependency‑free content‑moderation utility. Features -------- * Detects profanity, slurs and adult‑content keywords (both EN & PT). * Returns: - a boolean flag (is_clean) - a list of detected offensive tokens - a "sanitized" version of the original text (optional) * Configurable: you can extend the word‑lists, adjust sensitivity, or replace the simple regex engine with a custom matcher. Usage example ------------- >>> from content_moderator import moderate >>> text = "Esse cara é um babalu e fala merda pra criança." >>> clean, hits, safe = moderate(text, return_cleaned=True) >>> print(clean) # False >>> print(hits) # ['babalu', 'merda'] >>> print(safe) # "Esse cara é um ***** e fala ***** pra criança." """ import re from typing import List, Tuple, Optional # ---------------------------------------------------------------------- # 1️⃣ WORD LISTS # ---------------------------------------------------------------------- # The lists below are deliberately short; you can expand them as needed. # They contain the most common profanity/sexually‑explicit terms in English # and Portuguese. All entries are lower‑cased for case‑insensitive matching. EN_PROFANITY = "fuck", "shit", "bitch", "cunt", "asshole", "dick", "pussy", "cum", "blowjob", "nigger", "fag", "slut", "whore" PT_PROFANITY = "porra", "caralho", "buceta", "pinto", "cu", "filho da puta", "merda", "baba", "babalu", "bengala", "idiota", "piranha", "viado", "sacanagem" # Combine them for a single lookup set. OFFENSIVE_WORDS = PT_PROFANITY # ---------------------------------------------------------------------- # 2️⃣ REGEX PRE‑COMPILATION # ---------------------------------------------------------------------- # Build a regex that matches any whole word from the set. 
# \b – word boundary # re.IGNORECASE – case‑insensitive # re.UNICODE – proper handling of accented characters _WORD_PATTERN = re.compile( r"\b(" + "|".join(map(re.escape, OFFENSIVE_WORDS)) + r")\b", flags=re.IGNORECASE | re.UNICODE, ) # ---------------------------------------------------------------------- # 3️⃣ CORE FUNCTION # ---------------------------------------------------------------------- def moderate( text: str, *, return_hits: bool = True, return_cleaned: bool = False, mask_char: str = "*" ) -> Tuple[bool, Optional[List[str]], Optional[str]]: """ Scan *text* for offensive words. Parameters ---------- text : str The raw user‑generated content. return_hits : bool, optional If True, the function also returns a list with the matched words. return_cleaned : bool, optional If True, returns a sanitized version where each detected token is replaced by ``mask_char`` repeated to the same length. mask_char : str, optional The character used for masking (default "*"). Returns ------- is_clean : bool ``True`` if no offensive token was found. hits : list[str] | None List of detected words (lower‑cased). ``None`` when *return_hits=False*. cleaned_text : str | None Masked version of the original text. ``None`` when *return_cleaned=False*. 
Example ------- >>> moderate("Oi, seu babalu!", return_cleaned=True) (False, ['babalu'], 'Oi, seu ******!') """ # 1️⃣ Find all matches (case‑insensitive) matches = _WORD_PATTERN.findall(text) # Normalise to lower‑case for the output list hits = [m.lower() for m in matches] if return_hits else None # 2️⃣ Build a sanitized version if requested cleaned = None if return_cleaned: def _mask(match: re.Match) -> str: return mask_char * len(match.group(0)) cleaned = _WORD_PATTERN.sub(_mask, text) # 3️⃣ Determine overall cleanliness is_clean = len(matches) == 0 return is_clean, hits, cleaned # ---------------------------------------------------------------------- # 4️⃣ QUICK DEMO (runs when executed directly) # ---------------------------------------------------------------------- if __name__ == "__main__": demo_sentences = [ "Esse cara é um babalu e fala merda pra criança.", "What a beautiful day! No bad words here.", "You are such a bitch!", "Fala, bambambú! (just a nonsense word)", ] for s in demo_sentences: clean, hits, safe = moderate(s, return_cleaned=True) print("\nOriginal :", s) print("Is clean? :", clean) if hits: print("Detected :", ", ".join(hits)) if safe: print("Sanitized :", safe)