Intermediate

Image Moderation

Implement image moderation with NSFW and violence screening via a vision-capable OpenAI model, plus OCR-based text extraction to catch policy violations embedded in images.

Image Analysis Pipeline

Images can contain policy violations in three ways: visual content (NSFW, violence), embedded text (slurs in memes), and context (misleading imagery). We handle all three.

# app/moderation/image.py
import base64
import json
import logging
import mimetypes
from dataclasses import dataclass

import pytesseract
from openai import OpenAI
from PIL import Image

from app.config import get_settings

# Module logger; handlers and levels are configured by the application.
logger = logging.getLogger(__name__)
# Application settings (OpenAI API key plus moderation/review thresholds).
settings = get_settings()


@dataclass
class ImageModerationResult:
    """Outcome of moderating a single image."""
    # True when the highest combined score crossed the moderation threshold.
    flagged: bool
    # Per-category scores from the vision analysis (0.0-1.0 each).
    categories: dict[str, float]
    # Raw text extracted from the image by OCR ("" when none found or OCR failed).
    ocr_text: str
    # Summary of moderating the OCR'd text, or None when no text was extracted.
    text_moderation: dict | None
    # One of "reject", "review", or "approve".
    recommended_action: str


class ImageModerator:
    """Moderates images by combining vision-model analysis with OCR.

    Pipeline:
      1. OCR (Tesseract) extracts any text embedded in the image.
      2. A vision-capable OpenAI model scores the image per category.
      3. Extracted text, if any, is run through the text moderation pipeline.

    The worst score across all signals drives the recommended action.
    """

    def __init__(self):
        self.client = OpenAI(api_key=settings.openai_api_key)

    def moderate(self, image_path: str) -> "ImageModerationResult":
        """Run the full image moderation pipeline on a local image file.

        Returns an ImageModerationResult whose recommended_action is one of
        "reject", "review", or "approve".
        """
        # Step 1: Extract text from image via OCR (catches memes/screenshots).
        ocr_text = self._extract_text(image_path)

        # Step 2: Score the visual content with the vision model.
        vision_result = self._analyze_with_vision(image_path)

        # Step 3: If OCR found text, moderate it too.
        text_mod = None
        if ocr_text.strip():
            # NOTE(review): local import — presumably avoids a circular import
            # with the text moderation module; confirm before hoisting.
            from app.moderation.text import TextModerator
            text_mod_result = TextModerator().moderate(ocr_text)
            text_mod = {
                "flagged": text_mod_result.flagged,
                "max_score": text_mod_result.max_score,
                "action": text_mod_result.recommended_action,
            }

        # Combine: overall score is the worst offender across vision and text.
        # Filter to numeric values so a malformed model reply cannot crash max().
        numeric_scores = [
            v for v in vision_result["scores"].values()
            if isinstance(v, (int, float))
        ]
        max_score = max(numeric_scores, default=0)
        if text_mod and text_mod["max_score"] > max_score:
            max_score = text_mod["max_score"]

        flagged = max_score >= settings.moderation_threshold
        if flagged:
            action = "reject"
        elif max_score >= settings.review_threshold:
            action = "review"
        else:
            action = "approve"

        return ImageModerationResult(
            flagged=flagged,
            categories=vision_result["scores"],
            ocr_text=ocr_text,
            text_moderation=text_mod,
            recommended_action=action,
        )

    def _extract_text(self, image_path: str) -> str:
        """Return text embedded in the image, or "" if OCR fails."""
        try:
            img = Image.open(image_path)
            return pytesseract.image_to_string(img)
        except Exception as e:
            # OCR is best-effort: a failure here must not block the vision
            # analysis, so log and fall back to empty text.
            logger.warning("OCR failed: %s", e)
            return ""

    def _analyze_with_vision(self, image_path: str) -> dict:
        """Score the image per moderation category via the vision model.

        Returns {"scores": {category: score, ...}}. On an unparseable model
        reply the scores dict is {"parse_error": 0.5} so the image lands in
        the review band instead of being silently approved.
        """
        with open(image_path, "rb") as f:
            b64 = base64.b64encode(f.read()).decode()

        # Use the file's real MIME type in the data URL instead of assuming PNG.
        mime = mimetypes.guess_type(image_path)[0] or "image/png"

        response = self.client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text":
                        "Analyze this image for content moderation. "
                        "Rate each category 0.0-1.0:\n"
                        "- nsfw\n- violence\n- hate_symbols\n- drugs\n- self_harm\n"
                        'Reply as JSON only: {"nsfw": 0.1, "violence": 0.0, ...}'},
                    {"type": "image_url", "image_url": {
                        "url": f"data:{mime};base64,{b64}", "detail": "low"
                    }}
                ]
            }],
            max_tokens=200, temperature=0.0,
        )
        return {"scores": self._parse_scores(response.choices[0].message.content)}

    @staticmethod
    def _parse_scores(content) -> dict:
        """Parse the model's JSON reply, tolerating markdown code fences.

        Returns {"parse_error": 0.5} when the reply is missing or is not a
        JSON object, routing the image to human review.
        """
        if not content:
            return {"parse_error": 0.5}
        text = content.strip()
        if text.startswith("```"):
            # Strip a ```json ... ``` fence the model sometimes wraps around output.
            text = text.strip("`")
            if text.startswith("json"):
                text = text[len("json"):]
        try:
            scores = json.loads(text)
        except json.JSONDecodeError:
            return {"parse_error": 0.5}
        return scores if isinstance(scores, dict) else {"parse_error": 0.5}

Testing Image Moderation

from app.moderation.image import ImageModerator

# Smoke-test the pipeline against a local sample image and inspect the verdict.
moderator = ImageModerator()
verdict = moderator.moderate("test-images/sample.png")

print(f"Flagged: {verdict.flagged}")
print(f"Categories: {verdict.categories}")
print(f"OCR text: {verdict.ocr_text[:100]}")
print(f"Action: {verdict.recommended_action}")
💡
Cost control: Use "detail": "low" for initial screening ($0.003/image). Only upgrade to "detail": "high" for borderline cases that need closer inspection.

Key Takeaways

  • Image moderation combines visual analysis (a vision-capable OpenAI model such as gpt-4o-mini) with OCR text extraction (Tesseract).
  • Memes and screenshots can contain text-based violations that pure image classifiers miss.
  • Category-specific scores enable granular policy decisions rather than binary flag/no-flag.
  • Low-detail vision mode provides cost-effective initial screening at $0.003 per image.