| Situation | Integration steps |
|-----------|-------------------|
| A web‑app (Flask/Django/FastAPI) | Import `moderate` in the request‑handling view, call it on incoming POST data, and reject or auto‑sanitize the content before persisting it. |
| A Discord / Telegram bot | Run `moderate(message.content)` on each incoming message; if `is_clean` is `False`, delete the message or warn the user. |
| A mobile game chat | Call `moderate` on each chat packet server‑side with `return_cleaned=True`; forward only the sanitized version to other players. |
| A classroom‑learning platform | Use `return_cleaned=True` to automatically hide profanity while still storing the original text for moderation logs. |
def load_wordlists(json_path: str) -> None:
    """Replace the active word list with the one stored in a JSON file.

    The file must hold a JSON object with an ``"en"`` and a ``"pt"`` key,
    each mapping to a list of strings, e.g. ``{"en": [...], "pt": [...]}``.
    Every entry is stripped of surrounding whitespace and lower-cased;
    blank entries are dropped.

    On success the module globals ``OFFENSIVE_WORDS`` (a ``set`` of words)
    and ``_WORD_PATTERN`` (the compiled whole-word regex) are rebound.
    If anything fails, both globals keep their previous values.

    Parameters
    ----------
    json_path : str
        Path of the UTF-8 encoded JSON file to read.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file is not valid JSON.
    KeyError
        If the ``"en"`` or ``"pt"`` key is missing.
    """
    import json  # local import: only needed when a custom list is loaded

    global OFFENSIVE_WORDS, _WORD_PATTERN

    with open(json_path, "r", encoding="utf-8") as fp:
        data = json.load(fp)  # {"en": [...], "pt": [...]}

    words = {w.strip().lower() for w in data["en"] + data["pt"]}
    # A blank entry would compile to an alternative that matches the empty
    # string at every word boundary, flagging all text as offensive.
    words.discard("")

    if words:
        # Longest first, then alphabetical: set iteration order is not stable
        # between runs, so sorting keeps the compiled pattern reproducible.
        ordered = sorted(words, key=lambda w: (-len(w), w))
        source = r"\b(" + "|".join(map(re.escape, ordered)) + r")\b"
    else:
        # An empty alternation would match everywhere; "(?!)" never matches.
        source = r"(?!)"
    pattern = re.compile(source, flags=re.IGNORECASE | re.UNICODE)

    # Rebind both globals together, only after the pattern compiled cleanly.
    OFFENSIVE_WORDS = words
    _WORD_PATTERN = pattern
"""
content_moderator.py
--------------------
A tiny, dependency‑free content‑moderation utility.
Features
--------
* Detects profanity, slurs and adult‑content keywords (both EN & PT).
* Returns:
- a boolean flag (is_clean)
- a list of detected offensive tokens
- a "sanitized" version of the original text (optional)
* Configurable: you can extend the word‑lists, adjust sensitivity,
or replace the simple regex engine with a custom matcher.
Usage example
-------------
>>> from content_moderator import moderate
>>> text = "Esse cara é um babalu e fala merda pra criança."
>>> clean, hits, safe = moderate(text, return_cleaned=True)
>>> print(clean) # False
>>> print(hits) # ['babalu', 'merda']
>>> print(safe) # "Esse cara é um ****** e fala ***** pra criança."
"""
import re
from typing import List, Tuple, Optional
# ----------------------------------------------------------------------
# 1️⃣ WORD LISTS
# ----------------------------------------------------------------------
# The lists below are deliberately short; you can expand them as needed.
# They contain the most common profanity/sexually‑explicit terms in English
# and Portuguese. All entries are lower‑cased for case‑insensitive matching.
EN_PROFANITY = {
    "fuck", "shit", "bitch", "cunt", "asshole", "dick", "pussy",
    "cum", "blowjob", "nigger", "fag", "slut", "whore",
}
PT_PROFANITY = {
    "porra", "caralho", "buceta", "pinto", "cu", "filho da puta",
    "merda", "baba", "babalu", "bengala", "idiota", "piranha",
    "viado", "sacanagem",
}
# Combine both languages into a single lookup set.
OFFENSIVE_WORDS = EN_PROFANITY | PT_PROFANITY
# ----------------------------------------------------------------------
# 2️⃣ REGEX PRE‑COMPILATION
# ----------------------------------------------------------------------
# Build a regex that matches any whole word (or phrase) from the set.
# \b            – word boundary, so "cu" does not match inside "circus"
# re.IGNORECASE – case‑insensitive
# re.UNICODE    – proper handling of accented characters
# The alternatives are sorted (longest first, then alphabetically) because
# set iteration order is not stable between runs; sorting keeps the
# compiled pattern reproducible.
_WORD_PATTERN = re.compile(
    r"\b("
    + "|".join(map(re.escape, sorted(OFFENSIVE_WORDS, key=lambda w: (-len(w), w))))
    + r")\b",
    flags=re.IGNORECASE | re.UNICODE,
)
# ----------------------------------------------------------------------
# 3️⃣ CORE FUNCTION
# ----------------------------------------------------------------------
def moderate(
    text: str,
    *,
    return_hits: bool = True,
    return_cleaned: bool = False,
    mask_char: str = "*",
) -> Tuple[bool, Optional[List[str]], Optional[str]]:
    """
    Scan *text* for offensive words.

    Parameters
    ----------
    text : str
        The raw user‑generated content.
    return_hits : bool, optional
        If True, the function also returns a list with the matched words.
    return_cleaned : bool, optional
        If True, returns a sanitized version where each detected token
        is replaced by a mask of exactly the same length.
    mask_char : str, optional
        The character used for masking (default "*"). If a longer string
        is given, it is repeated and truncated so the mask still has the
        same length as the token it replaces.

    Returns
    -------
    is_clean : bool
        ``True`` if no offensive token was found.
    hits : list[str] | None
        List of detected words (lower‑cased), one entry per occurrence in
        order of appearance. ``None`` when *return_hits=False*.
    cleaned_text : str | None
        Masked version of the original text. ``None`` when
        *return_cleaned=False*.

    Example
    -------
    >>> moderate("Oi, seu babalu!", return_cleaned=True)
    (False, ['babalu'], 'Oi, seu ******!')
    """
    # 1️⃣ Find all matches (case‑insensitive). group(0) is the whole match,
    # so this works whatever capture groups the active pattern defines.
    matches = [m.group(0) for m in _WORD_PATTERN.finditer(text)]

    # Normalise to lower‑case for the output list
    hits = [m.lower() for m in matches] if return_hits else None

    # 2️⃣ Build a sanitized version if requested
    cleaned = None
    if return_cleaned:
        def _mask(match: "re.Match") -> str:
            width = len(match.group(0))
            # Truncate so a multi‑character mask_char keeps the text length.
            return (mask_char * width)[:width]

        cleaned = _WORD_PATTERN.sub(_mask, text)

    # 3️⃣ Determine overall cleanliness
    is_clean = not matches
    return is_clean, hits, cleaned
# ----------------------------------------------------------------------
# 4️⃣ QUICK DEMO (runs when executed directly)
# ----------------------------------------------------------------------
if __name__ == "__main__":
    # Sample inputs: a Portuguese sentence, a harmless English sentence,
    # an English insult, and a made-up word.
    demo_sentences = [
        "Esse cara é um babalu e fala merda pra criança.",
        "What a beautiful day! No bad words here.",
        "You are such a bitch!",
        "Fala, bambambú! (just a nonsense word)",
    ]
    for sentence in demo_sentences:
        verdict, found, masked = moderate(sentence, return_cleaned=True)
        # Collect the label/value pairs first, then print them in order.
        report = [("\nOriginal :", sentence), ("Is clean? :", verdict)]
        if found:
            report.append(("Detected :", ", ".join(found)))
        if masked:
            report.append(("Sanitized :", masked))
        for label, value in report:
            print(label, value)