mastodon-collector/scripts/02-toxicity.sql
Pieter 27582c7b77 Add toxicity analysis system for Mastodon statuses
Implements comprehensive toxicity analysis following the Bluesky collector architecture:

- Analyzer module with async batch processing using GPT-4o-mini
- Database schema for toxicity scores and analysis run tracking
- 12 toxicity categories (toxic, threat, hate_speech, racism, antisemitism, islamophobia, sexism, homophobia, insult, dehumanization, extremism, ableism)
- Web interface routes for analysis dashboard and flagged content review
- Manual review API endpoint for human validation
- Analysis helper functions for database queries
- Dutch language support with coded political term recognition

Usage:
  docker exec mastodon-collector-collector-1 python -m app.analyzer

See TOXICITY_ANALYSIS.md for full documentation.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2026-03-30 14:43:35 +02:00

44 lines
2 KiB
SQL

-- Toxicity Analysis Schema
-- Stores per-status toxicity scores from LLM classification.
-- Toxicity scores for statuses.
-- One row per status; removing the status cascades and deletes its score row.
CREATE TABLE IF NOT EXISTS toxicity_scores (
    -- PK doubles as the FK, guaranteeing at most one score row per status.
    status_id BIGINT PRIMARY KEY REFERENCES statuses(id) ON DELETE CASCADE,
    -- Aggregate toxicity score from the classifier (no default: a row
    -- without an overall score is meaningless).
    overall REAL NOT NULL,
    -- Per-category scores; DEFAULT 0 presumably means "category not
    -- detected" — confirm against the analyzer's output contract.
    toxic REAL NOT NULL DEFAULT 0,
    threat REAL NOT NULL DEFAULT 0,
    hate_speech REAL NOT NULL DEFAULT 0,
    racism REAL NOT NULL DEFAULT 0,
    antisemitism REAL NOT NULL DEFAULT 0,
    islamophobia REAL NOT NULL DEFAULT 0,
    sexism REAL NOT NULL DEFAULT 0,
    homophobia REAL NOT NULL DEFAULT 0,
    insult REAL NOT NULL DEFAULT 0,
    dehumanization REAL NOT NULL DEFAULT 0,
    extremism REAL NOT NULL DEFAULT 0,
    ableism REAL NOT NULL DEFAULT 0,
    -- Set when the classifier marks the status for human review.
    flagged BOOLEAN NOT NULL DEFAULT false,
    -- LLM model that produced the scores.
    model TEXT NOT NULL DEFAULT 'gpt-4o-mini',
    scored_at TIMESTAMPTZ NOT NULL DEFAULT now(),
    -- Human-validation workflow state.
    human_reviewed BOOLEAN NOT NULL DEFAULT false,
    review_status TEXT, -- 'correct', 'incorrect', 'unsure'
    reviewed_at TIMESTAMPTZ,
    -- Enforce the documented review states; NULL (not yet reviewed)
    -- passes the CHECK because IN yields UNKNOWN, which CHECK accepts.
    CONSTRAINT toxicity_scores_review_status_check
        CHECK (review_status IN ('correct', 'incorrect', 'unsure'))
);
-- Partial index: only flagged rows are indexed, keeping the index small
-- for the (presumably common) "list flagged content" query.
CREATE INDEX IF NOT EXISTS idx_tox_flagged ON toxicity_scores (flagged) WHERE flagged = true;
-- DESC orderings support "most toxic first" / "most recent first" scans.
CREATE INDEX IF NOT EXISTS idx_tox_overall ON toxicity_scores (overall DESC);
CREATE INDEX IF NOT EXISTS idx_tox_scored ON toxicity_scores (scored_at DESC);
-- Composite index for the human-review dashboard filters.
CREATE INDEX IF NOT EXISTS idx_tox_reviewed ON toxicity_scores (human_reviewed, review_status);
-- Analysis run audit trail: one row per invocation of the analyzer,
-- recording throughput, errors, and estimated API cost.
CREATE TABLE IF NOT EXISTS analysis_runs (
    id BIGINT GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
    started_at TIMESTAMPTZ NOT NULL DEFAULT now(),
    -- NULL while the run is still in progress.
    finished_at TIMESTAMPTZ,
    -- Lifecycle state of the run.
    status TEXT NOT NULL DEFAULT 'running', -- running | completed | failed | partial
    statuses_scored INTEGER NOT NULL DEFAULT 0,
    errors INTEGER NOT NULL DEFAULT 0,
    -- No default here (unlike toxicity_scores.model): the caller must
    -- record which model this run actually used.
    model TEXT NOT NULL,
    -- Estimated spend in US dollars; micro-dollar precision.
    cost_usd NUMERIC(10,6) DEFAULT 0,
    duration_secs NUMERIC,
    -- Enforce the documented lifecycle states so bad writers fail loudly.
    CONSTRAINT analysis_runs_status_check
        CHECK (status IN ('running', 'completed', 'failed', 'partial'))
);