"""Database query helpers for toxicity analysis views."""
|
||
|
|
|
||
|
|
from sqlalchemy import func, desc, and_, or_, cast, Float
|
||
|
|
from sqlalchemy.orm import Session
|
||
|
|
from app.db import Status, MonitoredAccount
|
||
|
|
from datetime import datetime, timedelta
|
||
|
|
|
||
|
|
# Toxicity categories for display.
# The order matters: helpers in this module pair these names positionally
# with the per-category score columns they select.
TOXICITY_CATEGORIES = [
    "toxic",
    "threat",
    "hate_speech",
    "racism",
    "antisemitism",
    "islamophobia",
    "sexism",
    "homophobia",
    "insult",
    "dehumanization",
    "extremism",
    "ableism",
]
|
||
|
|
|
||
|
|
|
||
|
|
def get_analysis_stats(session: Session) -> dict:
    """Summarise overall toxicity analysis numbers.

    Runs two queries: a count of all ``Status`` rows, and one aggregate over
    ``toxicity_scores`` (row count, flagged row count, mean ``overall``).

    Returns:
        A dict with the keys ``total_statuses``, ``total_scored_statuses``,
        ``flagged_statuses`` and ``avg_toxicity_statuses``. The average is
        reported as ``0.0`` when the database returns NULL (or zero).
    """
    from sqlalchemy import text

    aggregate_sql = text("""
        SELECT COUNT(*) as total_scored,
        COUNT(*) FILTER (WHERE flagged = true) as flagged,
        AVG(overall) as avg_toxicity
        FROM toxicity_scores
    """)

    status_count = session.query(func.count(Status.id)).scalar() or 0
    row = session.execute(aggregate_sql).fetchone()

    if row:
        scored_count, flagged_count, mean_overall = row[0], row[1], row[2]
    else:
        # No aggregate row at all: report zeros across the board.
        scored_count, flagged_count, mean_overall = 0, 0, None

    return {
        "total_statuses": status_count,
        "total_scored_statuses": scored_count,
        "flagged_statuses": flagged_count,
        "avg_toxicity_statuses": float(mean_overall) if mean_overall else 0.0,
    }
|
||
|
|
|
||
|
|
|
||
|
|
def get_toxicity_trend(session: Session, weeks: int = 12) -> list[dict]:
    """Get toxicity trend over time (weekly aggregates).

    Args:
        session: SQLAlchemy session used to run the query.
        weeks: Size of the look-back window in weeks, counted back from the
            database's ``NOW()``. Coerced with ``int()`` before binding.

    Returns:
        One dict per week that has scored statuses, newest week first, with
        the keys ``week`` (the ``DATE_TRUNC('week', ...)`` value),
        ``avg_toxicity`` (float; ``0.0`` when the average is NULL or zero)
        and ``flagged_statuses`` (count of rows with ``flagged = true``).

    Raises:
        TypeError, ValueError: If ``weeks`` cannot be converted to an int.
    """
    from sqlalchemy import text

    # The window is built with make_interval() instead of
    # INTERVAL ':weeks weeks'. A placeholder inside a quoted SQL literal is
    # not a real bind parameter: it only works when the driver pastes values
    # into the statement text client-side, and fails with server-side binds.
    trend_sql = text("""
        SELECT
            DATE_TRUNC('week', s.created_at) as week,
            AVG(ts.overall) as avg_toxicity,
            COUNT(*) FILTER (WHERE ts.flagged = true) as flagged_count
        FROM statuses s
        JOIN toxicity_scores ts ON ts.status_id = s.id
        WHERE s.created_at >= NOW() - make_interval(weeks => :weeks)
        GROUP BY week
        ORDER BY week DESC
    """)

    rows = session.execute(trend_sql, {"weeks": int(weeks)})

    return [
        {
            "week": week,
            "avg_toxicity": float(avg_toxicity) if avg_toxicity else 0.0,
            "flagged_statuses": flagged_count,
        }
        for week, avg_toxicity, flagged_count in rows
    ]
|
||
|
|
|
||
|
|
|
||
|
|
def get_category_averages(session: Session) -> dict:
    """Get average toxicity score for each category.

    Args:
        session: SQLAlchemy session used to run the query.

    Returns:
        A dict keyed by every name in ``TOXICITY_CATEGORIES`` whose value is
        the ``AVG`` of the matching ``toxicity_scores`` column as a float.
        A NULL (or zero) average is reported as ``0.0``.
    """
    from sqlalchemy import text

    # Plain string on purpose: nothing is interpolated here, and an f-string
    # would turn any literal brace added later into a formatting expression.
    # The SELECT list must stay in the same order as TOXICITY_CATEGORIES,
    # because the row is paired with those names positionally below.
    row = session.execute(text("""
        SELECT
            AVG(toxic) as toxic,
            AVG(threat) as threat,
            AVG(hate_speech) as hate_speech,
            AVG(racism) as racism,
            AVG(antisemitism) as antisemitism,
            AVG(islamophobia) as islamophobia,
            AVG(sexism) as sexism,
            AVG(homophobia) as homophobia,
            AVG(insult) as insult,
            AVG(dehumanization) as dehumanization,
            AVG(extremism) as extremism,
            AVG(ableism) as ableism
        FROM toxicity_scores
    """)).fetchone()

    if not row:
        return {cat: 0.0 for cat in TOXICITY_CATEGORIES}

    return {
        cat: float(average) if average else 0.0
        for cat, average in zip(TOXICITY_CATEGORIES, row)
    }
|
||
|
|
|
||
|
|
|
||
|
|
def get_recent_analysis_runs(session: Session, limit: int = 5) -> list[dict]:
    """Return the most recently started analysis runs.

    Reads up to ``limit`` rows from ``analysis_runs`` ordered by
    ``started_at`` descending, and returns each one as a dict keyed by the
    selected column names.
    """
    from sqlalchemy import text

    # Same order as the SELECT list below; each row is zipped against it.
    columns = (
        "id",
        "started_at",
        "finished_at",
        "status",
        "statuses_scored",
        "errors",
        "cost_usd",
        "duration_secs",
    )

    runs_sql = text("""
        SELECT id, started_at, finished_at, status, statuses_scored, errors, cost_usd, duration_secs
        FROM analysis_runs
        ORDER BY started_at DESC
        LIMIT :limit
    """)

    runs = []
    for row in session.execute(runs_sql, {"limit": limit}):
        runs.append(dict(zip(columns, row)))
    return runs
|
||
|
|
|
||
|
|
|
||
|
|
def get_flagged_content(
    session: Session,
    category: str = None,
    account_id: int = None,
    threshold: float = 0.5,
    review_status: str = None,
    date_from: str = None,
    date_to: str = None,
    sort: str = "overall",
    direction: str = "desc",
    limit: int = 50,
    offset: int = 0,
) -> tuple[list[dict], int]:
    """Get flagged content with filters.

    Only rows with ``toxicity_scores.flagged = true`` are considered. Every
    filter argument is optional and skipped when falsy.

    Args:
        session: SQLAlchemy session used to run the queries.
        category: Name from ``TOXICITY_CATEGORIES``; keeps rows whose score in
            that category is at least ``threshold``.
        account_id: Keeps rows whose ``statuses.account_db_id`` matches.
        threshold: Minimum category score. Only used together with
            ``category``.
        review_status: ``"unreviewed"`` keeps rows with
            ``human_reviewed = false``; any other value is matched against
            ``toxicity_scores.review_status``.
        date_from: Inclusive lower bound on ``statuses.created_at``.
        date_to: Inclusive upper bound on ``statuses.created_at``.
        sort: ``"overall"``, ``"created_at"`` or a category name. Anything
            else falls back to ``"overall"``.
        direction: ``"desc"`` for descending; any other value sorts ascending.
        limit: Page size.
        offset: Number of rows to skip.

    Returns:
        ``(items, total)``: the requested page as a list of dicts, and the
        number of rows matching the filters regardless of paging.

    Raises:
        ValueError: If ``category`` is not in ``TOXICITY_CATEGORIES``.
    """
    from sqlalchemy import text

    where_sql, filter_params = _build_flagged_filters(
        category, account_id, threshold, review_status, date_from, date_to
    )

    # ORDER BY pieces are spliced into the SQL text, so both are restricted to
    # known-safe values instead of being passed through.
    if sort not in ("overall", "created_at", *TOXICITY_CATEGORIES):
        sort = "overall"
    direction = "DESC" if direction == "desc" else "ASC"
    # NOTE(review): sort="created_at" orders by ts.created_at while the date
    # filters use s.created_at. Confirm toxicity_scores has that column.

    # The count and the page share one FROM clause so the total always
    # describes the same row set the page is cut from.
    from_sql = """
        FROM statuses s
        JOIN toxicity_scores ts ON ts.status_id = s.id
        JOIN monitored_accounts ma ON ma.id = s.account_db_id
    """

    count_sql = f"""
        SELECT COUNT(*)
        {from_sql}
        WHERE {where_sql}
    """
    total = session.execute(text(count_sql), filter_params).scalar() or 0

    items_sql = f"""
        SELECT
            s.id,
            s.status_id,
            s.content,
            s.text_content,
            s.created_at,
            s.url,
            s.status_type,
            ma.username,
            ma.instance,
            ts.overall,
            ts.toxic, ts.threat, ts.hate_speech, ts.racism,
            ts.antisemitism, ts.islamophobia, ts.sexism, ts.homophobia,
            ts.insult, ts.dehumanization, ts.extremism, ts.ableism,
            ts.human_reviewed,
            ts.review_status,
            ts.reviewed_at
        {from_sql}
        WHERE {where_sql}
        ORDER BY ts.{sort} {direction}, s.created_at DESC
        LIMIT :limit OFFSET :offset
    """
    page_params = {**filter_params, "limit": limit, "offset": offset}
    rows = session.execute(text(items_sql), page_params)

    return [_flagged_row_to_item(row) for row in rows], total


def _build_flagged_filters(
    category, account_id, threshold, review_status, date_from, date_to
):
    """Build the WHERE clause and bind parameters for flagged-content queries."""
    clauses = ["ts.flagged = true"]
    params = {}

    if category:
        # A column name cannot be sent as a bind parameter, so it is spliced
        # into the SQL text. Only whitelisted names may get that far.
        if category not in TOXICITY_CATEGORIES:
            raise ValueError(f"Unknown toxicity category: {category!r}")
        clauses.append(f"ts.{category} >= :threshold")
        params["threshold"] = threshold

    if account_id:
        clauses.append("s.account_db_id = :account_id")
        params["account_id"] = account_id

    if review_status == "unreviewed":
        clauses.append("ts.human_reviewed = false")
    elif review_status:
        clauses.append("ts.review_status = :review_status")
        params["review_status"] = review_status

    if date_from:
        clauses.append("s.created_at >= :date_from")
        params["date_from"] = date_from

    if date_to:
        clauses.append("s.created_at <= :date_to")
        params["date_to"] = date_to

    return " AND ".join(clauses), params


def _flagged_row_to_item(row):
    """Convert one flagged-content result row into the dict returned to callers."""
    values = tuple(row)

    # Columns 10-21 are the per-category scores, selected in the same order as
    # TOXICITY_CATEGORIES.
    scores = dict(zip(TOXICITY_CATEGORIES, values[10:22]))

    # NULL and zero scores are dropped before picking the maximum: comparing
    # None with a number raises TypeError, and neither can be the top score.
    nonzero = {name: score for name, score in scores.items() if score}
    top_category = max(nonzero, key=nonzero.get) if nonzero else None

    username, instance, overall = values[7], values[8], values[9]

    return {
        "id": values[0],
        "status_id": values[1],
        "content": values[2],
        "text_content": values[3],
        "created_at": values[4],
        "url": values[5],
        "status_type": values[6],
        "author_username": username,
        "author_instance": instance,
        "author_handle": f"@{username}@{instance}",
        # A NULL overall score is reported as 0.0 instead of crashing float().
        "overall": float(overall) if overall is not None else 0.0,
        "top_category": top_category,
        "scores": scores,
        "human_reviewed": values[22],
        "review_status": values[23],
        "reviewed_at": values[24],
    }
|
||
|
|
|
||
|
|
|
||
|
|
def get_accounts_for_select(session: Session) -> list[dict]:
    """List every monitored account as an ``{"id", "handle"}`` dict for dropdowns.

    The handle is rendered as ``@username@instance``.
    """
    rows = session.query(
        MonitoredAccount.id,
        MonitoredAccount.username,
        MonitoredAccount.instance,
    ).all()

    options = []
    for account_id, username, instance in rows:
        options.append({"id": account_id, "handle": f"@{username}@{instance}"})
    return options
|