288 lines
11 KiB
Python
288 lines
11 KiB
Python
|
|
"""OpenAI-powered toxicity classifier.
|
|||
|
|
|
|||
|
|
Sends batches of posts to GPT-4.1-nano for multi-category toxicity scoring.
|
|||
|
|
Returns a list of dicts of category → score (0.0–1.0).
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
from __future__ import annotations
|
|||
|
|
|
|||
|
|
import asyncio
|
|||
|
|
import json
|
|||
|
|
import logging
|
|||
|
|
import random
|
|||
|
|
from dataclasses import dataclass
|
|||
|
|
|
|||
|
|
from openai import AsyncOpenAI, RateLimitError, APITimeoutError, APIConnectionError
|
|||
|
|
|
|||
|
|
# Module logger for the classification pipeline.
logger = logging.getLogger("analyzer.classifier")

# The 12 toxicity categories we score (order matters for documentation only;
# lookups are by name).
CATEGORIES = (
    "toxic threat hate_speech racism antisemitism islamophobia "
    "sexism homophobia insult dehumanization extremism ableism"
).split()
|
|||
|
|
|
|||
|
|
# System prompt for batch classification — multiple numbered posts per call.
# The model must reply with ONE compact JSON object keyed by post number
# ("1", "2", ...); parse_batch_response() depends on exactly that shape.
# NOTE: this string is sent verbatim to the API — do not reformat it.
SYSTEM_PROMPT = """\
You are a toxicity classifier for Dutch and English social media posts about politics.

You will receive one or more numbered posts. Score EACH post on every category from 0.0 (none) to 1.0 (extreme).
Respond ONLY with a JSON object mapping post numbers (as strings) to their scores. No other text.

Categories:
- toxic: rude, disrespectful, or aggressive language
- threat: threats of violence, harm, intimidation, or calls to action against a person
- hate_speech: targeting people based on any protected characteristic (race, religion, gender, sexual orientation, disability, nationality)
- racism: specifically targeting race or ethnicity (e.g. anti-Black, anti-Asian, anti-Moroccan sentiment, "Zwarte Piet" debates when derogatory)
- antisemitism: targeting Jewish people, Holocaust denial or minimization, Jewish conspiracy theories, coded language like "globalists", "Rothschilds", triple parentheses
- islamophobia: anti-Muslim hate, mosque opposition framed as hate, "Islam is not a religion" rhetoric, "takeover/omvolking" narratives, halal/hijab targeting
- sexism: gender-based discrimination, harassment, misogyny, or misandry
- homophobia: targeting sexual orientation or gender identity, anti-LGBTQ+ rhetoric
- insult: personal attacks, name-calling, belittling
- dehumanization: comparing people to animals, vermin, disease, parasites, or other dehumanizing language
- extremism: far-right or far-left extremist rhetoric, Nazi symbolism or glorification, white supremacist language, Great Replacement theory ("omvolkingstheorie"), calls for political violence, fascist/authoritarian glorification
- ableism: targeting people with disabilities, using mental health conditions as insults (e.g. "gestoord", "autist" as slur, "mongool")

Important context:
- Many posts are in Dutch. Handle Dutch slang, insults, and coded political language.
- Dutch-specific coded terms: "gelukszoekers", "kutmarokkanen", "omvolking", "landverrader", "volksverrader", "linkse ratten", "wappie", "tokkie" — score appropriately based on context.
- Political disagreement and criticism are NOT toxic — only score actual hostility, hate, or threats.
- Satire and parody accounts may use irony — consider context but still score the literal content.
- A score of 0.0 means the category is completely absent. A score of 1.0 means extreme/explicit.
- Most posts will score 0.0 on most categories. Only flag genuine toxicity.

Example for 2 posts:
{"1":{"toxic":0.0,"threat":0.0,"hate_speech":0.0,"racism":0.0,"antisemitism":0.0,"islamophobia":0.0,"sexism":0.0,"homophobia":0.0,"insult":0.0,"dehumanization":0.0,"extremism":0.0,"ableism":0.0},"2":{"toxic":0.3,"threat":0.0,"hate_speech":0.0,"racism":0.0,"antisemitism":0.0,"islamophobia":0.0,"sexism":0.0,"homophobia":0.0,"insult":0.2,"dehumanization":0.0,"extremism":0.0,"ableism":0.0}}"""
|
|||
|
|
|
|||
|
|
|
|||
|
|
@dataclass
class ToxicityScores:
    """Classification result for a single post.

    Each category attribute holds a score in [0.0, 1.0]; 0.0 means the
    category is absent.  ``input_tokens`` / ``output_tokens`` carry the
    approximate API cost attributed to this post and are NOT scores.
    """

    toxic: float = 0.0
    threat: float = 0.0
    hate_speech: float = 0.0
    racism: float = 0.0
    antisemitism: float = 0.0
    islamophobia: float = 0.0
    sexism: float = 0.0
    homophobia: float = 0.0
    insult: float = 0.0
    dehumanization: float = 0.0
    extremism: float = 0.0
    ableism: float = 0.0

    # Approximate token counts for cost tracking.
    # Declared after the score fields so __init__'s positional order is
    # unchanged from the original definition.
    input_tokens: int = 0
    output_tokens: int = 0

    # Single source of truth for the score-field names so that overall and
    # to_dict cannot drift out of sync with the field list above.
    # Deliberately unannotated: dataclass ignores it, so it is NOT a field.
    _SCORE_FIELDS = (
        "toxic", "threat", "hate_speech", "racism", "antisemitism",
        "islamophobia", "sexism", "homophobia", "insult",
        "dehumanization", "extremism", "ableism",
    )

    @property
    def overall(self) -> float:
        """Overall toxicity = max of all category scores."""
        return max(getattr(self, name) for name in self._SCORE_FIELDS)

    def is_flagged(self, threshold: float = 0.5) -> bool:
        """Return True if any category score reaches *threshold*."""
        return self.overall >= threshold

    def to_dict(self) -> dict:
        """Return the category scores plus the derived "overall" score.

        Token counts are intentionally excluded, matching the original
        serialization shape.
        """
        result = {name: getattr(self, name) for name in self._SCORE_FIELDS}
        result["overall"] = self.overall
        return result
|
|||
|
|
|
|||
|
|
|
|||
|
|
def parse_scores(raw: "str | dict") -> ToxicityScores:
    """Parse the JSON scores for a single post into ToxicityScores.

    Accepts either a JSON string or an already-decoded dict (as handed over
    by parse_batch_response).  Malformed or unexpectedly-shaped input yields
    all-zero scores rather than raising, so one bad model response cannot
    abort a whole batch.
    """
    try:
        data = json.loads(raw) if isinstance(raw, str) else raw
    except json.JSONDecodeError:
        logger.warning("Failed to parse JSON response: %s", str(raw)[:200])
        return ToxicityScores()

    # Fix: well-formed JSON that is not an object (list, number, null, ...)
    # used to slip past the decode guard and crash on data.get() below.
    if not isinstance(data, dict):
        logger.warning("Unexpected JSON shape in response: %s", str(raw)[:200])
        return ToxicityScores()

    def clamp(val) -> float:
        """Coerce to float and clamp into [0.0, 1.0]; anything bogus -> 0.0."""
        try:
            return max(0.0, min(1.0, float(val)))
        except (TypeError, ValueError):
            return 0.0

    # Build keyword args from the known category names; unknown keys in the
    # response are ignored, missing keys default to 0.0 via clamp(None).
    names = (
        "toxic", "threat", "hate_speech", "racism", "antisemitism",
        "islamophobia", "sexism", "homophobia", "insult",
        "dehumanization", "extremism", "ableism",
    )
    return ToxicityScores(**{name: clamp(data.get(name)) for name in names})
|
|||
|
|
|
|||
|
|
|
|||
|
|
def parse_batch_response(raw: str, batch_size: int) -> list[ToxicityScores]:
    """Parse a batched JSON response into a list of ToxicityScores.

    Expected format: {"1": {...scores...}, "2": {...scores...}, ...}
    Returns a list of ToxicityScores in the same order as the input batch;
    every slot is filled (missing/bad entries become all-zero scores) so the
    result always has exactly *batch_size* items.
    """
    try:
        data = json.loads(raw)
    except json.JSONDecodeError:
        logger.warning("Failed to parse batch JSON: %s", raw[:300])
        return [ToxicityScores() for _ in range(batch_size)]

    # Fix: a non-object top level (e.g. a JSON list containing "1") used to
    # pass the `key in data` membership test and then crash on data[key].
    if not isinstance(data, dict):
        logger.warning("Batch JSON is not an object: %s", raw[:300])
        return [ToxicityScores() for _ in range(batch_size)]

    results = []
    for i in range(1, batch_size + 1):
        entry = data.get(str(i))
        if isinstance(entry, dict):
            results.append(parse_scores(entry))
        else:
            logger.warning("Missing scores for post %d in batch response", i)
            results.append(ToxicityScores())

    return results
|
|||
|
|
|
|||
|
|
|
|||
|
|
class ToxicityClassifier:
    """Async OpenAI-based toxicity classifier with batch support."""

    # Posts longer than this are truncated before being sent to the model.
    MAX_POST_CHARS = 2000

    def __init__(self, api_key: str, model: str = "gpt-4.1-nano"):
        self.client = AsyncOpenAI(api_key=api_key)
        self.model = model

    @staticmethod
    def _build_user_message(texts: list[str]) -> str:
        """Format posts as a numbered list the prompt expects: "[1] ...", "[2] ..."."""
        parts = []
        for i, text in enumerate(texts, 1):
            t = text.strip() if text else ""
            # Truncate very long posts to bound prompt size and cost.
            if len(t) > ToxicityClassifier.MAX_POST_CHARS:
                t = t[:ToxicityClassifier.MAX_POST_CHARS]
            if not t:
                t = "(empty)"
            parts.append(f"[{i}] {t}")
        return "\n\n".join(parts)

    async def classify_batch(
        self, texts: list[str], max_retries: int = 5
    ) -> list[ToxicityScores]:
        """Classify multiple posts in a single API call.

        Args:
            texts: List of post texts to classify (1–batch_size items).
            max_retries: Number of attempts allowed for rate-limit and
                transient (timeout/connection) errors.

        Returns:
            List of ToxicityScores, one per input text, in the same order.

        Raises:
            RateLimitError / APITimeoutError / APIConnectionError: the last
                such error, once all retries are exhausted.
            RuntimeError: if called with max_retries <= 0 (no attempt made).
            Exception: any other API error is re-raised immediately.
        """
        if not texts:
            return []

        batch_size = len(texts)
        user_message = self._build_user_message(texts)

        # Scale max_tokens by batch size.  Each post's JSON scores are ~60
        # tokens when compact, but the model often emits formatted JSON
        # (whitespace/newlines) which can double that, so budget generously
        # to avoid truncation.
        max_tokens = max(300, batch_size * 200)

        last_err = None
        for attempt in range(max_retries):
            # Keep the try body minimal: only the API call can raise the
            # errors we retry on.
            try:
                response = await self.client.chat.completions.create(
                    model=self.model,
                    temperature=0,
                    max_tokens=max_tokens,
                    response_format={"type": "json_object"},
                    messages=[
                        {"role": "system", "content": SYSTEM_PROMPT},
                        {"role": "user", "content": user_message},
                    ],
                )
            except RateLimitError as e:
                last_err = e
                # Exponential backoff with jitter, capped at 30s.
                wait = min(2 ** attempt + random.uniform(0.5, 1.5), 30)
                logger.debug(
                    "Rate limited (attempt %d/%d), waiting %.1fs",
                    attempt + 1, max_retries, wait,
                )
                await asyncio.sleep(wait)
                continue
            except (APITimeoutError, APIConnectionError) as e:
                last_err = e
                wait = 2 ** attempt + random.uniform(0, 1)
                logger.debug(
                    "Transient error (attempt %d/%d), retrying in %.1fs: %s",
                    attempt + 1, max_retries, wait, e,
                )
                await asyncio.sleep(wait)
                continue
            except Exception:
                # Anything else (auth, bad request, ...) is not retryable.
                logger.exception(
                    "Batch classification API call failed (%d posts)", batch_size
                )
                raise

            content = response.choices[0].message.content or "{}"
            results = parse_batch_response(content, batch_size)

            # Distribute token usage evenly across posts for cost tracking
            # (integer division — the remainder tokens are dropped).
            if response.usage:
                per_post_input = response.usage.prompt_tokens // batch_size
                per_post_output = response.usage.completion_tokens // batch_size
                for scores in results:
                    scores.input_tokens = per_post_input
                    scores.output_tokens = per_post_output

            return results

        # All retries exhausted.  (Fix: the old message blamed rate limits
        # even when the final error was a timeout/connection failure, and
        # `raise last_err` raised None when max_retries <= 0.)
        logger.error(
            "Retries exhausted for batch of %d posts: %s", batch_size, last_err
        )
        if last_err is not None:
            raise last_err
        raise RuntimeError("classify_batch called with max_retries <= 0")

    async def classify(self, text: str, max_retries: int = 5) -> ToxicityScores:
        """Classify a single post (convenience wrapper around classify_batch)."""
        results = await self.classify_batch([text], max_retries=max_retries)
        return results[0]

    async def close(self):
        """Close the underlying AsyncOpenAI HTTP client."""
        await self.client.close()
|