Intermediate

Text Moderation

Build the text moderation module with toxicity detection using OpenAI Moderation API, PII filtering with regex patterns, and custom banned word/pattern rules.

OpenAI Moderation API

The OpenAI Moderation API is free and detects: hate, harassment, self-harm, sexual content, violence, and more. We will use it as our primary text analyzer.

# app/moderation/text.py
import re
import logging
from dataclasses import dataclass
from openai import OpenAI
from app.config import get_settings

logger = logging.getLogger(__name__)
settings = get_settings()


@dataclass
class TextModerationResult:
    """Aggregated outcome of moderating one piece of text.

    Combines the OpenAI Moderation API verdict with local PII detection
    and custom banned-word rules, plus a recommended routing decision.
    """

    flagged: bool  # True if the OpenAI Moderation API flagged the text
    categories: dict[str, bool]  # per-category boolean flags from the API
    scores: dict[str, float]  # per-category confidence scores from the API
    pii_found: list[dict]  # detected PII: {"type": ..., "value": masked prefix}
    custom_matches: list[str]  # banned words found in the text
    max_score: float  # highest category score (0.0 when no scores returned)
    recommended_action: str  # "approve", "review", "reject"

class TextModerator:
    """Moderate text via the OpenAI Moderation API plus local rules.

    Local rules cover regex-based PII detection and a configurable
    banned-word list; all signals are merged into one
    TextModerationResult with a recommended action.
    """

    def __init__(self):
        self.client = OpenAI(api_key=settings.openai_api_key)
        # Raw regex strings keyed by PII type.  The `re` module caches
        # compiled patterns, so repeated findall calls stay cheap.
        self.pii_patterns = {
            "email": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
            # TODO: extend to "(555) 123-4567" / spaced formats if needed.
            "phone": r"\d{3}[-.]?\d{3}[-.]?\d{4}",
            "ssn": r"\d{3}-\d{2}-\d{4}",
            "credit_card": r"\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}",
        }
        self.banned_words: list[str] = []  # Load from config

    def moderate(self, text: str) -> "TextModerationResult":
        """Run all checks on ``text`` and return a combined result.

        Propagates any exception raised by the OpenAI client; callers
        should treat an error as "unable to moderate", never as "safe".
        """
        # OpenAI moderation — primary toxicity signal.
        response = self.client.moderations.create(input=text)
        result = response.results[0]

        # The SDK returns pydantic-style objects; keep public attrs only.
        categories = {k: v for k, v in result.categories.__dict__.items()
                      if not k.startswith("_")}
        scores = {k: v for k, v in result.category_scores.__dict__.items()
                  if not k.startswith("_")}
        max_score = max(scores.values()) if scores else 0.0

        # Local signals: PII and organization-specific banned words.
        pii_found = self._detect_pii(text)
        custom_matches = self._check_custom_rules(text)

        # Two-threshold routing: hard reject above the upper threshold or
        # on an explicit API flag; anything merely suspicious (medium
        # score, PII, or a custom match) goes to human review.
        if max_score >= settings.moderation_threshold or result.flagged:
            action = "reject"
        elif max_score >= settings.review_threshold or pii_found or custom_matches:
            action = "review"
        else:
            action = "approve"

        return TextModerationResult(
            flagged=result.flagged, categories=categories,
            scores=scores, pii_found=pii_found,
            custom_matches=custom_matches,
            max_score=max_score, recommended_action=action,
        )

    def _detect_pii(self, text: str) -> list[dict]:
        """Return masked matches for every configured PII pattern."""
        found = []
        for pii_type, pattern in self.pii_patterns.items():
            for match in re.findall(pattern, text):
                # Store only a short prefix so results never carry raw PII.
                found.append({"type": pii_type, "value": match[:4] + "***"})
        return found

    def _check_custom_rules(self, text: str) -> list[str]:
        """Return banned words that appear in ``text`` as whole words.

        Uses case-insensitive word-boundary matching instead of plain
        substring search, which avoids the classic false positive of a
        banned word hiding inside a longer innocent word
        (the "Scunthorpe problem").
        """
        return [
            word for word in self.banned_words
            if re.search(rf"\b{re.escape(word)}\b", text, re.IGNORECASE)
        ]

Testing Text Moderation

# Smoke-test the moderator against all three routing outcomes.
# NOTE: every mod.moderate() call hits the OpenAI Moderation API, so this
# requires a valid API key in settings and network access.
from app.moderation.text import TextModerator

mod = TextModerator()

# Test safe content — low scores, no PII, no custom matches.
result = mod.moderate("Hello, this is a normal message about the weather.")
print(f"Action: {result.recommended_action}")  # approve
print(f"Max score: {result.max_score:.4f}")

# Test with PII — detected locally by regex, routed to human review.
result = mod.moderate("My email is test@example.com and SSN is 123-45-6789")
print(f"Action: {result.recommended_action}")  # review
print(f"PII found: {result.pii_found}")

# Test toxic content — flagged/high-scored by the API, auto-rejected.
result = mod.moderate("I hate you and want to hurt you badly")
print(f"Action: {result.recommended_action}")  # reject
print(f"Categories: {[k for k, v in result.categories.items() if v]}")
💡
PII redaction: For production, redact detected PII before storing content. Run moderation on the original text first, then apply re.sub(pattern, "[REDACTED]", text) before persisting — the moderation decision is made on the unredacted input, while the stored copy contains no sensitive data.

Key Takeaways

  • OpenAI Moderation API is free and covers hate, harassment, self-harm, sexual, and violence categories.
  • PII detection uses regex patterns for emails, phones, SSNs, and credit card numbers.
  • Two-threshold system: high scores auto-reject, medium scores go to human review, low scores auto-approve.
  • Custom banned word lists add organization-specific content rules.