mastodon-collector/app/analysis_helpers.py

"""Database query helpers for toxicity analysis views."""

from sqlalchemy import func, desc, and_, or_, cast, Float
from sqlalchemy.orm import Session
from app.db import Status, MonitoredAccount
from datetime import datetime, timedelta

# Toxicity score categories, in display order.
# NOTE: order is significant -- get_category_averages() pairs these names with
# its query's result columns by position.
TOXICITY_CATEGORIES = [
    "toxic",
    "threat",
    "hate_speech",
    "racism",
    "antisemitism",
    "islamophobia",
    "sexism",
    "homophobia",
    "insult",
    "dehumanization",
    "extremism",
    "ableism",
]


def get_analysis_stats(session: Session) -> dict:
    """Return headline numbers for the toxicity analysis overview.

    Args:
        session: SQLAlchemy session used to run both queries.

    Returns:
        A dict with four keys:
          - ``total_statuses``: count of ``Status`` rows.
          - ``total_scored_statuses``: count of ``toxicity_scores`` rows.
          - ``flagged_statuses``: ``toxicity_scores`` rows with ``flagged = true``.
          - ``avg_toxicity_statuses``: mean of ``toxicity_scores.overall`` as a
            float; ``0.0`` when the average is NULL or zero.
    """
    from sqlalchemy import text

    status_count = session.query(func.count(Status.id)).scalar() or 0

    # A single pass over toxicity_scores produces all three aggregates.
    aggregates = session.execute(text("""
        SELECT COUNT(*) as total_scored,
               COUNT(*) FILTER (WHERE flagged = true) as flagged,
               AVG(overall) as avg_toxicity
        FROM toxicity_scores
    """)).fetchone()

    if aggregates:
        scored_count = aggregates[0]
        flagged_count = aggregates[1]
        mean_overall = float(aggregates[2]) if aggregates[2] else 0.0
    else:
        # No row at all: fall back to zeros for every aggregate.
        scored_count = 0
        flagged_count = 0
        mean_overall = 0.0

    return {
        "total_statuses": status_count,
        "total_scored_statuses": scored_count,
        "flagged_statuses": flagged_count,
        "avg_toxicity_statuses": mean_overall,
    }


def get_toxicity_trend(session: Session, weeks: int = 12) -> list[dict]:
    """Return weekly toxicity aggregates for the most recent ``weeks`` weeks.

    Statuses are joined to their toxicity scores, bucketed with
    ``DATE_TRUNC('week', s.created_at)`` and returned newest week first. The
    cutoff is ``NOW()`` minus the requested number of weeks and is not aligned
    to a week boundary, so the oldest bucket may cover only part of a week.

    Args:
        session: SQLAlchemy session used to run the query.
        weeks: How many weeks to look back. Anything ``int()`` accepts is
            allowed (e.g. ``"12"``).

    Returns:
        A list of dicts with keys ``week`` (start of the week bucket),
        ``avg_toxicity`` (float, ``0.0`` when NULL) and ``flagged_statuses``
        (number of flagged rows in that week).

    Raises:
        ValueError: If ``weeks`` is not convertible to an integer.
    """
    # Coerce up front so malformed input is rejected before any SQL is issued.
    weeks = int(weeks)

    from sqlalchemy import text

    # The week count is bound as a real integer parameter through PostgreSQL's
    # make_interval(). Putting the placeholder inside a quoted literal
    # (INTERVAL ':weeks weeks') relies on the driver interpolating parameters
    # into the SQL text and breaks with server-side parameter binding.
    result = session.execute(text("""
        SELECT
            DATE_TRUNC('week', s.created_at) as week,
            AVG(ts.overall) as avg_toxicity,
            COUNT(*) FILTER (WHERE ts.flagged = true) as flagged_count
        FROM statuses s
        JOIN toxicity_scores ts ON ts.status_id = s.id
        WHERE s.created_at >= NOW() - make_interval(weeks => :weeks)
        GROUP BY week
        ORDER BY week DESC
    """), {"weeks": weeks})

    return [
        {
            "week": r[0],
            "avg_toxicity": float(r[1]) if r[1] is not None else 0.0,
            "flagged_statuses": r[2],
        }
        for r in result
    ]


def get_category_averages(session: Session) -> dict:
    """Return the mean score of each toxicity category over all scored rows.

    Args:
        session: SQLAlchemy session used to run the query.

    Returns:
        A dict keyed by the names in ``TOXICITY_CATEGORIES``; each value is the
        column average as a float, or ``0.0`` when the average is NULL or zero.
    """
    from sqlalchemy import text

    # The SELECT list is in the same order as TOXICITY_CATEGORIES; values are
    # matched to category names by position below.
    averages_sql = text("""
        SELECT
            AVG(toxic) as toxic,
            AVG(threat) as threat,
            AVG(hate_speech) as hate_speech,
            AVG(racism) as racism,
            AVG(antisemitism) as antisemitism,
            AVG(islamophobia) as islamophobia,
            AVG(sexism) as sexism,
            AVG(homophobia) as homophobia,
            AVG(insult) as insult,
            AVG(dehumanization) as dehumanization,
            AVG(extremism) as extremism,
            AVG(ableism) as ableism
        FROM toxicity_scores
    """)
    row = session.execute(averages_sql).fetchone()

    if not row:
        return dict.fromkeys(TOXICITY_CATEGORIES, 0.0)

    averages = {}
    for position, category in enumerate(TOXICITY_CATEGORIES):
        value = row[position]
        averages[category] = float(value) if value else 0.0
    return averages


def get_recent_analysis_runs(session: Session, limit: int = 5) -> list[dict]:
    """Return the most recently started analysis runs, newest first.

    Args:
        session: SQLAlchemy session used to run the query.
        limit: Maximum number of runs to return.

    Returns:
        One dict per ``analysis_runs`` row with the keys ``id``,
        ``started_at``, ``finished_at``, ``status``, ``statuses_scored``,
        ``errors``, ``cost_usd`` and ``duration_secs``.
    """
    from sqlalchemy import text

    # Output keys, listed in the same order as the SELECT columns.
    field_names = (
        "id",
        "started_at",
        "finished_at",
        "status",
        "statuses_scored",
        "errors",
        "cost_usd",
        "duration_secs",
    )

    rows = session.execute(text("""
        SELECT id, started_at, finished_at, status, statuses_scored, errors, cost_usd, duration_secs
        FROM analysis_runs
        ORDER BY started_at DESC
        LIMIT :limit
    """), {"limit": limit})

    runs = []
    for row in rows:
        runs.append({name: row[index] for index, name in enumerate(field_names)})
    return runs


def get_flagged_content(
    session: Session,
    category: str = None,
    account_id: int = None,
    threshold: float = 0.5,
    review_status: str = None,
    date_from: str = None,
    date_to: str = None,
    sort: str = "overall",
    direction: str = "desc",
    limit: int = 50,
    offset: int = 0,
) -> tuple[list[dict], int]:
    """Return one page of flagged statuses and the total number of matches.

    Only rows with ``toxicity_scores.flagged = true`` are considered; every
    optional filter narrows that set further.

    Args:
        session: SQLAlchemy session used to run the queries.
        category: Score column to filter on (``"overall"`` or one of the twelve
            category names). When given, only rows whose score in that column
            is ``>= threshold`` are returned.
        account_id: Restrict to statuses whose ``account_db_id`` equals this.
        threshold: Minimum score for ``category``. Ignored when ``category``
            is not given.
        review_status: ``"unreviewed"`` selects rows with
            ``human_reviewed = false``; any other value is matched against
            ``toxicity_scores.review_status``.
        date_from: Inclusive lower bound on ``statuses.created_at``.
        date_to: Inclusive upper bound on ``statuses.created_at``.
        sort: Column to order by: ``"overall"``, ``"created_at"`` or a category
            name. Unknown values fall back to ``"overall"``.
        direction: ``"desc"`` (case-insensitive) for descending; anything else
            sorts ascending.
        limit: Page size.
        offset: Number of rows to skip.

    Returns:
        ``(items, total)`` where ``items`` is a list of dicts describing each
        status (content, author, scores, review state) and ``total`` is the
        number of rows matching the filters irrespective of paging.

    Raises:
        ValueError: If ``category`` is not a known score column.
    """
    # Per-category score columns of toxicity_scores, in SELECT order.
    score_columns = (
        "toxic", "threat", "hate_speech", "racism",
        "antisemitism", "islamophobia", "sexism", "homophobia",
        "insult", "dehumanization", "extremism", "ableism",
    )

    # Column names cannot be sent as bind parameters, so any identifier that is
    # spliced into the SQL text must come from a fixed whitelist. Reject
    # anything else before a query is built (prevents SQL injection).
    if category and category not in ("overall", *score_columns):
        raise ValueError(f"Unknown toxicity category: {category!r}")

    from sqlalchemy import text

    # Build WHERE clauses; all values go through bind parameters.
    where_clauses = ["ts.flagged = true"]
    params = {"limit": limit, "offset": offset}

    if category:
        where_clauses.append(f"ts.{category} >= :threshold")
        params["threshold"] = threshold

    if account_id:
        where_clauses.append("s.account_db_id = :account_id")
        params["account_id"] = account_id

    if review_status:
        if review_status == "unreviewed":
            where_clauses.append("ts.human_reviewed = false")
        else:
            where_clauses.append("ts.review_status = :review_status")
            params["review_status"] = review_status

    if date_from:
        where_clauses.append("s.created_at >= :date_from")
        params["date_from"] = date_from

    if date_to:
        where_clauses.append("s.created_at <= :date_to")
        params["date_to"] = date_to

    where_sql = " AND ".join(where_clauses)

    # Map each accepted sort key to a fully qualified column. "created_at" is
    # the status timestamp (s.created_at), the same column that is selected
    # and filtered on; score columns live on toxicity_scores.
    sort_columns = {"overall": "ts.overall", "created_at": "s.created_at"}
    sort_columns.update((name, f"ts.{name}") for name in score_columns)
    order_column = sort_columns["overall"]
    if isinstance(sort, str) and sort in sort_columns:
        order_column = sort_columns[sort]

    order_direction = "DESC" if str(direction or "").lower() == "desc" else "ASC"

    # Get total count.
    # NOTE(review): unlike the item query this does not join monitored_accounts,
    # so the total could exceed the listed rows if a status has no matching
    # account -- confirm account_db_id is a non-null foreign key.
    count_query = f"""
        SELECT COUNT(*)
        FROM statuses s
        JOIN toxicity_scores ts ON ts.status_id = s.id
        WHERE {where_sql}
    """
    total = session.execute(text(count_query), params).scalar() or 0

    # Get items with details. s.id is the final tie-breaker so that
    # LIMIT/OFFSET paging is deterministic when earlier sort keys are equal.
    query = f"""
        SELECT
            s.id,
            s.status_id,
            s.content,
            s.text_content,
            s.created_at,
            s.url,
            s.status_type,
            ma.username,
            ma.instance,
            ts.overall,
            ts.toxic, ts.threat, ts.hate_speech, ts.racism,
            ts.antisemitism, ts.islamophobia, ts.sexism, ts.homophobia,
            ts.insult, ts.dehumanization, ts.extremism, ts.ableism,
            ts.human_reviewed,
            ts.review_status,
            ts.reviewed_at
        FROM statuses s
        JOIN toxicity_scores ts ON ts.status_id = s.id
        JOIN monitored_accounts ma ON ma.id = s.account_db_id
        WHERE {where_sql}
        ORDER BY {order_column} {order_direction}, s.created_at DESC, s.id DESC
        LIMIT :limit OFFSET :offset
    """

    result = session.execute(text(query), params)

    items = []
    for r in result:
        row = tuple(r)

        # Columns 10..21 are the category scores, in score_columns order.
        scores = dict(zip(score_columns, row[10:22]))

        # Highest-scoring category; NULL scores count as 0 so a partially
        # scored row cannot make max() compare None with a number.
        if any(scores.values()):
            top_category = max(scores, key=lambda name: scores[name] or 0)
        else:
            top_category = None

        items.append({
            "id": row[0],
            "status_id": row[1],
            "content": row[2],
            "text_content": row[3],
            "created_at": row[4],
            "url": row[5],
            "status_type": row[6],
            "author_username": row[7],
            "author_instance": row[8],
            "author_handle": f"@{row[7]}@{row[8]}",
            "overall": float(row[9]) if row[9] is not None else 0.0,
            "top_category": top_category,
            "scores": scores,
            "human_reviewed": row[22],
            "review_status": row[23],
            "reviewed_at": row[24],
        })

    return items, total


def get_accounts_for_select(session: Session) -> list[dict]:
    """Return every monitored account as an option for a dropdown.

    Args:
        session: SQLAlchemy session used to run the query.

    Returns:
        One dict per account with ``id`` and a ``handle`` of the form
        ``@username@instance``.
    """
    rows = session.query(
        MonitoredAccount.id,
        MonitoredAccount.username,
        MonitoredAccount.instance,
    ).all()

    return [
        {"id": account_id, "handle": f"@{username}@{instance}"}
        for account_id, username, instance in rows
    ]
Add toxicity analysis system for Mastodon statuses

Implements comprehensive toxicity analysis following the Bluesky collector architecture:
- Analyzer module with async batch processing using GPT-4o-mini
- Database schema for toxicity scores and analysis run tracking
- 12 toxicity categories (toxic, threat, hate_speech, racism, antisemitism, islamophobia, sexism, homophobia, insult, dehumanization, extremism, ableism)
- Web interface routes for analysis dashboard and flagged content review
- Manual review API endpoint for human validation
- Analysis helper functions for database queries
- Dutch language support with coded political term recognition

Usage: docker exec mastodon-collector-collector-1 python -m app.analyzer

See TOXICITY_ANALYSIS.md for full documentation.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

2026-03-30 14:43:35 +02:00			`"""Database query helpers for toxicity analysis views."""`

			`from sqlalchemy import func, desc, and_, or_, cast, Float`
			`from sqlalchemy.orm import Session`
			`from app.db import Status, MonitoredAccount`
			`from datetime import datetime, timedelta`

			`# Toxicity categories for display`
			`TOXICITY_CATEGORIES = [`
			`"toxic", "threat", "hate_speech", "racism",`
			`"antisemitism", "islamophobia", "sexism", "homophobia",`
			`"insult", "dehumanization", "extremism", "ableism"`
			`]`


			`def get_analysis_stats(session: Session) -> dict:`
			`"""Get overall toxicity analysis statistics."""`
			`from sqlalchemy import text`

			`# Total statuses and scored statuses`
			`total_statuses = session.query(func.count(Status.id)).scalar() or 0`

			`scored = session.execute(text("""`
			`SELECT COUNT(*) as total_scored,`
			`COUNT(*) FILTER (WHERE flagged = true) as flagged,`
			`AVG(overall) as avg_toxicity`
			`FROM toxicity_scores`
			`""")).fetchone()`

			`return {`
			`"total_statuses": total_statuses,`
			`"total_scored_statuses": scored[0] if scored else 0,`
			`"flagged_statuses": scored[1] if scored else 0,`
			`"avg_toxicity_statuses": float(scored[2]) if scored and scored[2] else 0.0,`
			`}`


			`def get_toxicity_trend(session: Session, weeks: int = 12) -> list[dict]:`
			`"""Get toxicity trend over time (weekly aggregates)."""`
			`from sqlalchemy import text`

			`result = session.execute(text("""`
			`SELECT`
			`DATE_TRUNC('week', s.created_at) as week,`
			`AVG(ts.overall) as avg_toxicity,`
			`COUNT(*) FILTER (WHERE ts.flagged = true) as flagged_count`
			`FROM statuses s`
			`JOIN toxicity_scores ts ON ts.status_id = s.id`
			`WHERE s.created_at >= NOW() - INTERVAL ':weeks weeks'`
			`GROUP BY week`
			`ORDER BY week DESC`
			`"""), {"weeks": weeks})`

			`return [{"week": r[0], "avg_toxicity": float(r[1]) if r[1] else 0.0, "flagged_statuses": r[2]} for r in result]`


			`def get_category_averages(session: Session) -> dict:`
			`"""Get average toxicity score for each category."""`
			`from sqlalchemy import text`

			`result = session.execute(text(f"""`
			`SELECT`
			`AVG(toxic) as toxic,`
			`AVG(threat) as threat,`
			`AVG(hate_speech) as hate_speech,`
			`AVG(racism) as racism,`
			`AVG(antisemitism) as antisemitism,`
			`AVG(islamophobia) as islamophobia,`
			`AVG(sexism) as sexism,`
			`AVG(homophobia) as homophobia,`
			`AVG(insult) as insult,`
			`AVG(dehumanization) as dehumanization,`
			`AVG(extremism) as extremism,`
			`AVG(ableism) as ableism`
			`FROM toxicity_scores`
			`""")).fetchone()`

			`if not result:`
			`return {cat: 0.0 for cat in TOXICITY_CATEGORIES}`

			`return {cat: float(result[i]) if result[i] else 0.0 for i, cat in enumerate(TOXICITY_CATEGORIES)}`


			`def get_recent_analysis_runs(session: Session, limit: int = 5) -> list[dict]:`
			`"""Get recent analysis runs."""`
			`from sqlalchemy import text`

			`result = session.execute(text("""`
			`SELECT id, started_at, finished_at, status, statuses_scored, errors, cost_usd, duration_secs`
			`FROM analysis_runs`
			`ORDER BY started_at DESC`
			`LIMIT :limit`
			`"""), {"limit": limit})`

			`return [`
			`{`
			`"id": r[0],`
			`"started_at": r[1],`
			`"finished_at": r[2],`
			`"status": r[3],`
			`"statuses_scored": r[4],`
			`"errors": r[5],`
			`"cost_usd": r[6],`
			`"duration_secs": r[7]`
			`}`
			`for r in result`
			`]`


			`def get_flagged_content(`
			`session: Session,`
			`category: str = None,`
			`account_id: int = None,`
			`threshold: float = 0.5,`
			`review_status: str = None,`
			`date_from: str = None,`
			`date_to: str = None,`
			`sort: str = "overall",`
			`direction: str = "desc",`
			`limit: int = 50,`
			`offset: int = 0,`
			`) -> tuple[list[dict], int]:`
			`"""Get flagged content with filters."""`
			`from sqlalchemy import text`

			`# Build WHERE clauses`
			`where_clauses = ["ts.flagged = true"]`
			`params = {"threshold": threshold, "limit": limit, "offset": offset}`

			`if category:`
			`where_clauses.append(f"ts.{category} >= :threshold")`
			`params["category_threshold"] = threshold`

			`if account_id:`
			`where_clauses.append("s.account_db_id = :account_id")`
			`params["account_id"] = account_id`

			`if review_status:`
			`if review_status == "unreviewed":`
			`where_clauses.append("ts.human_reviewed = false")`
			`else:`
			`where_clauses.append("ts.review_status = :review_status")`
			`params["review_status"] = review_status`

			`if date_from:`
			`where_clauses.append("s.created_at >= :date_from")`
			`params["date_from"] = date_from`

			`if date_to:`
			`where_clauses.append("s.created_at <= :date_to")`
			`params["date_to"] = date_to`

			`where_sql = " AND ".join(where_clauses)`

			`# Valid sort columns`
			`valid_sorts = ["overall", "created_at", "toxic", "threat", "hate_speech", "racism",`
			`"antisemitism", "islamophobia", "sexism", "homophobia", "insult",`
			`"dehumanization", "extremism", "ableism"]`
			`if sort not in valid_sorts:`
			`sort = "overall"`

			`direction = "DESC" if direction == "desc" else "ASC"`

			`# Get total count`
			`count_query = f"""`
			`SELECT COUNT(*)`
			`FROM statuses s`
			`JOIN toxicity_scores ts ON ts.status_id = s.id`
			`WHERE {where_sql}`
			`"""`
			`total = session.execute(text(count_query), params).scalar() or 0`

			`# Get items with details`
			`query = f"""`
			`SELECT`
			`s.id,`
			`s.status_id,`
			`s.content,`
			`s.text_content,`
			`s.created_at,`
			`s.url,`
			`s.status_type,`
			`ma.username,`
			`ma.instance,`
			`ts.overall,`
			`ts.toxic, ts.threat, ts.hate_speech, ts.racism,`
			`ts.antisemitism, ts.islamophobia, ts.sexism, ts.homophobia,`
			`ts.insult, ts.dehumanization, ts.extremism, ts.ableism,`
			`ts.human_reviewed,`
			`ts.review_status,`
			`ts.reviewed_at`
			`FROM statuses s`
			`JOIN toxicity_scores ts ON ts.status_id = s.id`
			`JOIN monitored_accounts ma ON ma.id = s.account_db_id`
			`WHERE {where_sql}`
			`ORDER BY ts.{sort} {direction}, s.created_at DESC`
			`LIMIT :limit OFFSET :offset`
			`"""`

			`result = session.execute(text(query), params)`

			`items = []`
			`for r in result:`
			`# Find top category`
			`scores = {`
			`"toxic": r[10], "threat": r[11], "hate_speech": r[12], "racism": r[13],`
			`"antisemitism": r[14], "islamophobia": r[15], "sexism": r[16], "homophobia": r[17],`
			`"insult": r[18], "dehumanization": r[19], "extremism": r[20], "ableism": r[21]`
			`}`
			`top_category = max(scores, key=scores.get) if any(scores.values()) else None`

			`items.append({`
			`"id": r[0],`
			`"status_id": r[1],`
			`"content": r[2],`
			`"text_content": r[3],`
			`"created_at": r[4],`
			`"url": r[5],`
			`"status_type": r[6],`
			`"author_username": r[7],`
			`"author_instance": r[8],`
			`"author_handle": f"@{r[7]}@{r[8]}",`
			`"overall": float(r[9]),`
			`"top_category": top_category,`
			`"scores": scores,`
			`"human_reviewed": r[22],`
			`"review_status": r[23],`
			`"reviewed_at": r[24],`
			`})`

			`return items, total`


			`def get_accounts_for_select(session: Session) -> list[dict]:`
			`"""Get all monitored accounts for dropdowns."""`
			`accounts = session.query(MonitoredAccount.id, MonitoredAccount.username, MonitoredAccount.instance).all()`
			`return [{"id": a[0], "handle": f"@{a[1]}@{a[2]}"} for a in accounts]`