bluesky-collector/src/analyzer/classifier.py
"""OpenAI-powered toxicity classifier.
Sends batches of posts to GPT-4.1-nano for multi-category toxicity scoring.
Returns a list of dicts of category → score (0.01.0).
"""
from __future__ import annotations
import asyncio
import json
import logging
import random
from dataclasses import dataclass
from openai import AsyncOpenAI, RateLimitError, APITimeoutError, APIConnectionError
logger = logging.getLogger("analyzer.classifier")
# The 12 toxicity categories we score
CATEGORIES = [
    "toxic",
    "threat",
    "hate_speech",
    "racism",
    "antisemitism",
    "islamophobia",
    "sexism",
    "homophobia",
    "insult",
    "dehumanization",
    "extremism",
    "ableism",
]
# System prompt for batch classification — multiple numbered posts per call
SYSTEM_PROMPT = """\
You are a toxicity classifier for Dutch and English social media posts about politics.
You will receive one or more numbered posts. Score EACH post on every category from 0.0 (none) to 1.0 (extreme).
Respond ONLY with a JSON object mapping post numbers (as strings) to their scores. No other text.
Categories:
- toxic: rude, disrespectful, or aggressive language
- threat: threats of violence, harm, intimidation, or calls to action against a person
- hate_speech: targeting people based on any protected characteristic (race, religion, gender, sexual orientation, disability, nationality)
- racism: specifically targeting race or ethnicity (e.g. anti-Black, anti-Asian, anti-Moroccan sentiment, "Zwarte Piet" debates when derogatory)
- antisemitism: targeting Jewish people, Holocaust denial or minimization, Jewish conspiracy theories, coded language like "globalists", "Rothschilds", triple parentheses
- islamophobia: anti-Muslim hate, mosque opposition framed as hate, "Islam is not a religion" rhetoric, "takeover/omvolking" narratives, halal/hijab targeting
- sexism: gender-based discrimination, harassment, misogyny, or misandry
- homophobia: targeting sexual orientation or gender identity, anti-LGBTQ+ rhetoric
- insult: personal attacks, name-calling, belittling
- dehumanization: comparing people to animals, vermin, disease, parasites, or other dehumanizing language
- extremism: far-right or far-left extremist rhetoric, Nazi symbolism or glorification, white supremacist language, Great Replacement theory ("omvolkingstheorie"), calls for political violence, fascist/authoritarian glorification
- ableism: targeting people with disabilities, using mental health conditions as insults (e.g. "gestoord", "autist" as slur, "mongool")
Important context:
- Many posts are in Dutch. Handle Dutch slang, insults, and coded political language.
- Dutch-specific coded terms: "gelukszoekers", "kutmarokkanen", "omvolking", "landverrader", "volksverrader", "linkse ratten", "wappie", "tokkie" — score appropriately based on context.
- Political disagreement and criticism are NOT toxic — only score actual hostility, hate, or threats.
- Satire and parody accounts may use irony — consider context but still score the literal content.
- A score of 0.0 means the category is completely absent. A score of 1.0 means extreme/explicit.
- Most posts will score 0.0 on most categories. Only flag genuine toxicity.
Example for 2 posts:
{"1":{"toxic":0.0,"threat":0.0,"hate_speech":0.0,"racism":0.0,"antisemitism":0.0,"islamophobia":0.0,"sexism":0.0,"homophobia":0.0,"insult":0.0,"dehumanization":0.0,"extremism":0.0,"ableism":0.0},"2":{"toxic":0.3,"threat":0.0,"hate_speech":0.0,"racism":0.0,"antisemitism":0.0,"islamophobia":0.0,"sexism":0.0,"homophobia":0.0,"insult":0.2,"dehumanization":0.0,"extremism":0.0,"ableism":0.0}}"""
@dataclass
class ToxicityScores:
    """Classification result for a single post."""

    toxic: float = 0.0
    threat: float = 0.0
    hate_speech: float = 0.0
    racism: float = 0.0
    antisemitism: float = 0.0
    islamophobia: float = 0.0
    sexism: float = 0.0
    homophobia: float = 0.0
    insult: float = 0.0
    dehumanization: float = 0.0
    extremism: float = 0.0
    ableism: float = 0.0

    # Approximate token counts for cost tracking
    input_tokens: int = 0
    output_tokens: int = 0

    @property
    def overall(self) -> float:
        """Overall toxicity = max of all categories."""
        return max(
            self.toxic,
            self.threat,
            self.hate_speech,
            self.racism,
            self.antisemitism,
            self.islamophobia,
            self.sexism,
            self.homophobia,
            self.insult,
            self.dehumanization,
            self.extremism,
            self.ableism,
        )

    def is_flagged(self, threshold: float = 0.5) -> bool:
        """True if any category reaches the given threshold."""
        return self.overall >= threshold

    def to_dict(self) -> dict:
        """Serialize all category scores plus the derived overall score."""
        return {
            "toxic": self.toxic,
            "threat": self.threat,
            "hate_speech": self.hate_speech,
            "racism": self.racism,
            "antisemitism": self.antisemitism,
            "islamophobia": self.islamophobia,
            "sexism": self.sexism,
            "homophobia": self.homophobia,
            "insult": self.insult,
            "dehumanization": self.dehumanization,
            "extremism": self.extremism,
            "ableism": self.ableism,
            "overall": self.overall,
        }

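# Illustrative example (made-up values, not from the source): ToxicityScores(insult=0.2,
# racism=0.6) has overall == 0.6 (the per-category maximum), so is_flagged() is True at
# the default 0.5 threshold while is_flagged(threshold=0.7) is False.
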
def parse_scores(raw: str | dict) -> ToxicityScores:
    """Parse the JSON scores for a single post into ToxicityScores."""
    try:
        data = json.loads(raw) if isinstance(raw, str) else raw
    except json.JSONDecodeError:
        logger.warning("Failed to parse JSON response: %s", str(raw)[:200])
        return ToxicityScores()

    def clamp(val) -> float:
        """Coerce a value to a float clamped to [0.0, 1.0]; invalid values become 0.0."""
        try:
            f = float(val)
            return max(0.0, min(1.0, f))
        except (TypeError, ValueError):
            return 0.0

    return ToxicityScores(
        toxic=clamp(data.get("toxic")),
        threat=clamp(data.get("threat")),
        hate_speech=clamp(data.get("hate_speech")),
        racism=clamp(data.get("racism")),
        antisemitism=clamp(data.get("antisemitism")),
        islamophobia=clamp(data.get("islamophobia")),
        sexism=clamp(data.get("sexism")),
        homophobia=clamp(data.get("homophobia")),
        insult=clamp(data.get("insult")),
        dehumanization=clamp(data.get("dehumanization")),
        extremism=clamp(data.get("extremism")),
        ableism=clamp(data.get("ableism")),
    )

def parse_batch_response(raw: str, batch_size: int) -> list[ToxicityScores]:
    """Parse a batched JSON response into a list of ToxicityScores.

    Expected format: {"1": {...scores...}, "2": {...scores...}, ...}
    Returns a list of ToxicityScores in the same order as the input batch.
    """
    try:
        data = json.loads(raw)
    except json.JSONDecodeError:
        logger.warning("Failed to parse batch JSON: %s", raw[:300])
        return [ToxicityScores() for _ in range(batch_size)]

    results = []
    for i in range(1, batch_size + 1):
        key = str(i)
        if key in data and isinstance(data[key], dict):
            results.append(parse_scores(data[key]))
        else:
            logger.warning("Missing scores for post %d in batch response", i)
            results.append(ToxicityScores())
    return results

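# Illustrative example (made-up scores, not from the source): a well-formed two-post
# response such as '{"1": {"toxic": 0.0}, "2": {"toxic": 0.7, "insult": 0.4}}' parses to
# [ToxicityScores(), ToxicityScores(toxic=0.7, insult=0.4)]. Categories the model omits
# default to 0.0, and a missing post number falls back to an all-zero result.
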
class ToxicityClassifier:
    """Async OpenAI-based toxicity classifier with batch support."""

    def __init__(self, api_key: str, model: str = "gpt-4.1-nano"):
        self.client = AsyncOpenAI(api_key=api_key)
        self.model = model

    async def classify_batch(
        self, texts: list[str], max_retries: int = 5
    ) -> list[ToxicityScores]:
        """Classify multiple posts in a single API call.

        Args:
            texts: List of post texts to classify (1 to batch_size items).
            max_retries: Number of retries on rate limit / transient errors.

        Returns:
            List of ToxicityScores, one per input text, in the same order.
        """
        if not texts:
            return []
        batch_size = len(texts)

        # Build the numbered user message
        parts = []
        for i, text in enumerate(texts, 1):
            # Truncate very long posts
            t = text.strip() if text else ""
            if len(t) > 2000:
                t = t[:2000]
            if not t:
                t = "(empty)"
            parts.append(f"[{i}] {t}")
        user_message = "\n\n".join(parts)

        # Scale max_tokens by batch size.
        # Each post's JSON scores ≈ 60 tokens compact, but the model often
        # outputs formatted JSON (whitespace/newlines) which can double that.
        # Use a generous budget to avoid truncation.
        max_tokens = max(300, batch_size * 200)

        last_err = None
        for attempt in range(max_retries):
            try:
                response = await self.client.chat.completions.create(
                    model=self.model,
                    temperature=0,
                    max_tokens=max_tokens,
                    response_format={"type": "json_object"},
                    messages=[
                        {"role": "system", "content": SYSTEM_PROMPT},
                        {"role": "user", "content": user_message},
                    ],
                )
                content = response.choices[0].message.content or "{}"
                results = parse_batch_response(content, batch_size)
                # Distribute token usage evenly across the batch for cost tracking
                if response.usage:
                    per_post_input = response.usage.prompt_tokens // batch_size
                    per_post_output = response.usage.completion_tokens // batch_size
                    for scores in results:
                        scores.input_tokens = per_post_input
                        scores.output_tokens = per_post_output
                return results
            except RateLimitError as e:
                last_err = e
                # Exponential backoff with jitter, capped at 30 seconds
                wait = min(2 ** attempt + random.uniform(0.5, 1.5), 30)
                logger.debug(
                    "Rate limited (attempt %d/%d), waiting %.1fs",
                    attempt + 1, max_retries, wait,
                )
                await asyncio.sleep(wait)
            except (APITimeoutError, APIConnectionError) as e:
                last_err = e
                wait = 2 ** attempt + random.uniform(0, 1)
                logger.debug(
                    "Transient error (attempt %d/%d), retrying in %.1fs: %s",
                    attempt + 1, max_retries, wait, e,
                )
                await asyncio.sleep(wait)
            except Exception:
                logger.exception(
                    "Batch classification API call failed (%d posts)", batch_size
                )
                raise

        # All retries exhausted
        logger.error("Retries exhausted for batch of %d posts", batch_size)
        raise last_err

    async def classify(self, text: str, max_retries: int = 5) -> ToxicityScores:
        """Classify a single post (convenience wrapper around classify_batch)."""
        results = await self.classify_batch([text], max_retries=max_retries)
        return results[0]

    async def close(self):
        """Release the underlying HTTP client."""
        await self.client.close()
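
# Minimal usage sketch, not part of the module above: it assumes an OPENAI_API_KEY
# environment variable and uses an illustrative sample post; adapt as needed.
if __name__ == "__main__":
    import os

    async def _demo() -> None:
        classifier = ToxicityClassifier(api_key=os.environ["OPENAI_API_KEY"])
        try:
            # Classify one post and print the per-category scores plus "overall"
            scores = await classifier.classify("Wat een onzin, maar netjes verwoord.")
            print(json.dumps(scores.to_dict(), indent=2))
        finally:
            await classifier.close()

    asyncio.run(_demo())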