Intermediate
Image Moderation
Implement image moderation with NSFW detection using CLIP, violence screening, and OCR-based text extraction to catch policy violations embedded in images.
Image Analysis Pipeline
Images can contain policy violations in three ways: visual content (NSFW, violence), embedded text (slurs in memes), and context (misleading imagery). We handle all three.
# app/moderation/image.py
import logging
import pytesseract
from PIL import Image
from dataclasses import dataclass
from openai import OpenAI
from app.config import get_settings
# Module-level logger and cached application settings, shared by all
# moderator instances in this module.
logger = logging.getLogger(__name__)
settings = get_settings()
@dataclass
class ImageModerationResult:
    """Aggregated outcome of moderating a single image."""

    flagged: bool  # True when the combined max score crosses settings.moderation_threshold
    categories: dict[str, float]  # per-category scores returned by the vision model
    ocr_text: str  # text extracted from the image by OCR ("" when none found or OCR failed)
    text_moderation: dict | None  # summary of moderating ocr_text, or None when no text was found
    recommended_action: str  # one of "reject", "review", "approve"
class ImageModerator:
def __init__(self):
self.client = OpenAI(api_key=settings.openai_api_key)
def moderate(self, image_path: str) -> ImageModerationResult:
# Step 1: Extract text from image via OCR
ocr_text = self._extract_text(image_path)
# Step 2: Analyze image with GPT-4 Vision
vision_result = self._analyze_with_vision(image_path)
# Step 3: If OCR found text, moderate it too
text_mod = None
if ocr_text.strip():
from app.moderation.text import TextModerator
text_mod_result = TextModerator().moderate(ocr_text)
text_mod = {
"flagged": text_mod_result.flagged,
"max_score": text_mod_result.max_score,
"action": text_mod_result.recommended_action,
}
# Combine results
max_score = max(vision_result["scores"].values()) if vision_result["scores"] else 0
if text_mod and text_mod["max_score"] > max_score:
max_score = text_mod["max_score"]
flagged = max_score >= settings.moderation_threshold
if flagged:
action = "reject"
elif max_score >= settings.review_threshold:
action = "review"
else:
action = "approve"
return ImageModerationResult(
flagged=flagged, categories=vision_result["scores"],
ocr_text=ocr_text, text_moderation=text_mod,
recommended_action=action,
)
def _extract_text(self, image_path: str) -> str:
try:
img = Image.open(image_path)
return pytesseract.image_to_string(img)
except Exception as e:
logger.warning(f"OCR failed: {e}")
return ""
def _analyze_with_vision(self, image_path: str) -> dict:
import base64
with open(image_path, "rb") as f:
b64 = base64.b64encode(f.read()).decode()
response = self.client.chat.completions.create(
model="gpt-4o-mini",
messages=[{
"role": "user",
"content": [
{"type": "text", "text":
"Analyze this image for content moderation. "
"Rate each category 0.0-1.0:\n"
"- nsfw\n- violence\n- hate_symbols\n- drugs\n- self_harm\n"
"Reply as JSON only: {"nsfw": 0.1, "violence": 0.0, ...}"},
{"type": "image_url", "image_url": {
"url": f"data:image/png;base64,{b64}", "detail": "low"
}}
]
}],
max_tokens=200, temperature=0.0,
)
import json
try:
scores = json.loads(response.choices[0].message.content)
except json.JSONDecodeError:
scores = {"parse_error": 0.5}
return {"scores": scores}
Testing Image Moderation
# Smoke-test the pipeline against a local sample image.
from app.moderation.image import ImageModerator

mod = ImageModerator()
result = mod.moderate("test-images/sample.png")
print(f"Flagged: {result.flagged}")
print(f"Categories: {result.categories}")
print(f"OCR text: {result.ocr_text[:100]}")  # truncated: OCR dumps can be long
print(f"Action: {result.recommended_action}")
Cost control: Use "detail": "low" for initial screening ($0.003/image). Only upgrade to "detail": "high" for borderline cases that need closer inspection.
Key Takeaways
- Image moderation combines visual analysis (a vision-capable model such as GPT-4o mini) with OCR text extraction (Tesseract).
- Memes and screenshots can contain text-based violations that pure image classifiers miss.
- Category-specific scores enable granular policy decisions rather than binary flag/no-flag.
- Low-detail vision mode provides cost-effective initial screening at $0.003 per image.
Lilly Tech Systems