mastodon-collector/app/web.py

521 lines
17 KiB
Python
Raw Normal View History

"""Flask web application for managing monitored accounts and viewing collected data."""
import os
import logging
from datetime import datetime, timezone
from flask import Flask, render_template, request, redirect, url_for, flash, jsonify
from sqlalchemy import func, desc
from app.db import (
init_db,
get_session,
MonitoredAccount,
Status,
Mention,
CollectionLog,
)
from app.mastodon_api import lookup_account, MastodonAPIError
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = Flask(__name__)

# Flask signs the session cookie (which also carries flash messages) with this
# key. Fall back to the development default when the variable is unset or
# empty, but say so loudly: a publicly known key lets anyone forge sessions.
app.secret_key = os.environ.get("FLASK_SECRET_KEY") or "dev-secret-key"
if app.secret_key == "dev-secret-key":
    logger.warning(
        "FLASK_SECRET_KEY is not set; using the insecure development default. "
        "Set FLASK_SECRET_KEY before exposing this application."
    )

# Initialize database on startup
with app.app_context():
    init_db()
@app.route("/")
def index():
    """Dashboard overview.

    Renders ``index.html`` with the overall status total, one total for each
    of the status types "post", "reply", "mention" and "reblog", a
    per-account summary (status count plus that account's most recent
    collection log) and the 20 most recent collection logs overall.
    """
    session = get_session()
    try:
        accounts = (
            session.query(MonitoredAccount)
            .order_by(MonitoredAccount.instance, MonitoredAccount.username)
            .all()
        )

        total_statuses = session.query(func.count(Status.id)).scalar() or 0

        # One grouped query instead of a separate COUNT per status type.
        type_rows = (
            session.query(Status.status_type, func.count(Status.id))
            .group_by(Status.status_type)
            .all()
        )
        counts_by_type = {status_type: n for status_type, n in type_rows}

        # One grouped query instead of a separate COUNT per monitored account.
        account_rows = (
            session.query(Status.account_db_id, func.count(Status.id))
            .group_by(Status.account_db_id)
            .all()
        )
        counts_by_account = {account_db_id: n for account_db_id, n in account_rows}

        # Per-account stats
        account_stats = []
        for acct in accounts:
            last_log = (
                session.query(CollectionLog)
                .filter_by(account_db_id=acct.id)
                .order_by(desc(CollectionLog.started_at))
                .first()
            )
            account_stats.append({
                "account": acct,
                # Accounts without any collected status have no group row.
                "status_count": counts_by_account.get(acct.id, 0),
                "last_log": last_log,
            })

        # Recent collection logs
        recent_logs = (
            session.query(CollectionLog)
            .order_by(desc(CollectionLog.started_at))
            .limit(20)
            .all()
        )

        return render_template(
            "index.html",
            account_stats=account_stats,
            total_statuses=total_statuses,
            total_posts=counts_by_type.get("post", 0),
            total_replies=counts_by_type.get("reply", 0),
            total_mentions=counts_by_type.get("mention", 0),
            total_reblogs=counts_by_type.get("reblog", 0),
            recent_logs=recent_logs,
        )
    finally:
        session.close()
@app.route("/accounts")
def accounts_list():
    """Render ``accounts.html`` with every monitored account.

    Accounts are ordered by instance first and by username second.
    """
    db = get_session()
    try:
        ordering = (MonitoredAccount.instance, MonitoredAccount.username)
        monitored = db.query(MonitoredAccount).order_by(*ordering).all()
        return render_template("accounts.html", accounts=monitored)
    finally:
        db.close()
@app.route("/accounts/add", methods=["POST"])
def accounts_add():
    """Register a handle from the ``handle`` form field for monitoring.

    The handle must have the shape ``user@instance`` (leading ``@`` signs are
    ignored). An account that is already known is re-activated if it was
    inactive; otherwise a new row is created, enriched with profile data when
    the remote lookup succeeds. Every path redirects to the account list.
    """
    raw_handle = request.form.get("handle", "").strip().lstrip("@")
    username, separator, instance = raw_handle.partition("@")

    # A missing "@" or an empty part on either side makes the handle unusable.
    if not (separator and username and instance):
        flash("Invalid handle format. Use @user@instance.social", "error")
        return redirect(url_for("accounts_list"))

    db = get_session()
    try:
        known = (
            db.query(MonitoredAccount)
            .filter_by(username=username, instance=instance)
            .first()
        )
        if known:
            if known.is_active:
                flash(f"{known.handle} is already being monitored", "info")
            else:
                known.is_active = True
                db.commit()
                flash(f"Re-activated {known.handle}", "success")
            return redirect(url_for("accounts_list"))

        # Start from the minimal record; profile fields are added only when
        # the remote instance could be queried.
        fields = {"username": username, "instance": instance, "is_active": True}
        try:
            profile = lookup_account(instance, username)
        except MastodonAPIError as e:
            logger.warning("Could not resolve account @%s@%s: %s — adding anyway", username, instance, e)
        else:
            fields.update(
                account_id=profile["id"],
                display_name=profile.get("display_name", ""),
                avatar_url=profile.get("avatar", ""),
                note=profile.get("note", ""),
            )

        acct = MonitoredAccount(**fields)
        db.add(acct)
        db.commit()
        flash(f"Added {acct.handle} to monitoring list", "success")
        return redirect(url_for("accounts_list"))
    finally:
        db.close()
@app.route("/accounts/<int:account_id>/toggle", methods=["POST"])
def accounts_toggle(account_id):
    """Toggle an account's active status.

    Flips ``is_active`` on the account with primary key ``account_id`` and
    flashes the resulting state. An unknown id flashes an error instead of
    failing silently. Always redirects to the account list.
    """
    session = get_session()
    try:
        acct = session.query(MonitoredAccount).get(account_id)
        if acct is None:
            flash("Account not found", "error")
            return redirect(url_for("accounts_list"))

        acct.is_active = not acct.is_active
        session.commit()
        state = "activated" if acct.is_active else "paused"
        flash(f"{state.capitalize()} monitoring for {acct.handle}", "success")
        return redirect(url_for("accounts_list"))
    finally:
        session.close()
@app.route("/accounts/<int:account_id>/delete", methods=["POST"])
def accounts_delete(account_id):
    """Delete an account and all its collected data.

    Removes the account's statuses and collection logs, then the account
    itself, in one transaction. An unknown id flashes an error instead of
    failing silently. Always redirects to the account list.
    """
    session = get_session()
    try:
        acct = session.query(MonitoredAccount).get(account_id)
        if acct is None:
            flash("Account not found", "error")
            return redirect(url_for("accounts_list"))

        # Capture the handle now; the instance is unusable after deletion.
        handle = acct.handle

        # NOTE(review): bulk Query.delete() emits a single DELETE and does not
        # run ORM-level relationship cascades. Rows that depend on statuses
        # (mentions, media, tags) are only removed if the foreign keys are
        # declared with ON DELETE CASCADE in the schema — confirm in app.db.
        session.query(Status).filter_by(account_db_id=acct.id).delete()
        session.query(CollectionLog).filter_by(account_db_id=acct.id).delete()
        session.delete(acct)
        session.commit()
        flash(f"Deleted {handle} and all collected data", "success")
        return redirect(url_for("accounts_list"))
    finally:
        session.close()
@app.route("/statuses")
def statuses_list():
    """Browse collected statuses with filters.

    Query parameters:
        page: 1-based page number; values below 1 are treated as 1.
        per_page: page size, clamped to the range 1..500 (default 50).
        account_id: restrict to one monitored account (database id).
        type: restrict to one status type.
        q: case-insensitive substring match on the status text.

    Renders ``statuses.html`` with the requested page, newest first.
    """
    # Same ceiling the JSON endpoint /api/statuses applies to its page size.
    max_per_page = 500

    session = get_session()
    try:
        # Clamp both values: a page below 1 would yield a negative OFFSET and
        # a per_page of 0 would divide by zero when computing total_pages.
        page = max(1, request.args.get("page", 1, type=int))
        per_page = min(max(1, request.args.get("per_page", 50, type=int)), max_per_page)
        account_id = request.args.get("account_id", type=int)
        status_type = request.args.get("type", "")
        search = request.args.get("q", "").strip()

        query = session.query(Status).join(MonitoredAccount)
        if account_id:
            query = query.filter(Status.account_db_id == account_id)
        if status_type:
            query = query.filter(Status.status_type == status_type)
        if search:
            query = query.filter(Status.text_content.ilike(f"%{search}%"))

        total = query.count()
        statuses = (
            query.order_by(desc(Status.created_at))
            .offset((page - 1) * per_page)
            .limit(per_page)
            .all()
        )
        accounts = session.query(MonitoredAccount).order_by(MonitoredAccount.username).all()

        # Ceiling division; an empty result still counts as one (empty) page.
        total_pages = max(1, (total + per_page - 1) // per_page)

        return render_template(
            "statuses.html",
            statuses=statuses,
            accounts=accounts,
            page=page,
            per_page=per_page,
            total=total,
            total_pages=total_pages,
            current_account_id=account_id,
            current_type=status_type,
            search=search,
        )
    finally:
        session.close()
@app.route("/statuses/<int:status_db_id>")
def status_detail(status_db_id):
    """Show the detail page for the status with database id ``status_db_id``.

    When no such status exists, an error is flashed and the visitor is sent
    back to the status list.
    """
    db = get_session()
    try:
        found = db.query(Status).get(status_db_id)
        if found:
            return render_template("status_detail.html", status=found)
        flash("Status not found", "error")
        return redirect(url_for("statuses_list"))
    finally:
        db.close()
@app.route("/api/stats")
def api_stats():
    """Return collection statistics as JSON.

    The payload contains ``total_statuses``, a ``by_type`` mapping with one
    count per status type, and an ``accounts`` list with handle, status count
    and last collection time for every active account.
    """
    db = get_session()

    def count_statuses(criterion):
        # COUNT of statuses matching a single filter criterion; never None.
        return db.query(func.count(Status.id)).filter(criterion).scalar() or 0

    try:
        payload = {
            "total_statuses": db.query(func.count(Status.id)).scalar() or 0,
            "by_type": {
                kind: count_statuses(Status.status_type == kind)
                for kind in ("post", "reply", "mention", "reblog")
            },
            "accounts": [],
        }

        active_accounts = db.query(MonitoredAccount).filter_by(is_active=True).all()
        for monitored in active_accounts:
            collected = count_statuses(Status.account_db_id == monitored.id)
            if monitored.last_collected_at:
                last_seen = monitored.last_collected_at.isoformat()
            else:
                last_seen = None
            payload["accounts"].append({
                "handle": monitored.handle,
                "status_count": collected,
                "last_collected": last_seen,
            })

        return jsonify(payload)
    finally:
        db.close()
@app.route("/api/statuses")
def api_statuses():
    """JSON API endpoint for statuses (for your analysis pipeline).

    Query parameters:
        page: 1-based page number; values below 1 are treated as 1.
        per_page: page size, clamped to the range 1..500 (default 100).
        account_id: restrict to one monitored account (database id).
        type: restrict to one status type.
        since: ISO 8601 datetime; only statuses created at or after it are
            returned. A trailing "Z" is accepted as UTC.

    Returns a JSON object with ``total``, ``page``, ``per_page`` and the
    ``statuses`` of the requested page, newest first. Responds with HTTP 400
    when ``since`` is not a valid ISO 8601 datetime.
    """
    # Clamp pagination: a page below 1 would produce a negative OFFSET and a
    # non-positive per_page an invalid LIMIT.
    page = max(1, request.args.get("page", 1, type=int))
    per_page = max(1, min(request.args.get("per_page", 100, type=int), 500))
    account_id = request.args.get("account_id", type=int)
    status_type = request.args.get("type", "")
    since_raw = request.args.get("since", "").strip()

    # Parse up front so bad input becomes a 400 rather than a database error.
    since = None
    if since_raw:
        # datetime.fromisoformat() only accepts a "Z" suffix from Python 3.11.
        if since_raw[-1] in "Zz":
            since_raw = since_raw[:-1] + "+00:00"
        try:
            since = datetime.fromisoformat(since_raw)
        except ValueError:
            return jsonify({"error": "Invalid 'since' value; expected an ISO 8601 datetime"}), 400
        # NOTE(review): a value without UTC offset is compared as given;
        # confirm how Status.created_at stores time zones.

    session = get_session()
    try:
        query = session.query(Status)
        if account_id:
            query = query.filter(Status.account_db_id == account_id)
        if status_type:
            query = query.filter(Status.status_type == status_type)
        if since is not None:
            query = query.filter(Status.created_at >= since)

        total = query.count()
        statuses = (
            query.order_by(desc(Status.created_at))
            .offset((page - 1) * per_page)
            .limit(per_page)
            .all()
        )

        return jsonify({
            "total": total,
            "page": page,
            "per_page": per_page,
            "statuses": [
                {
                    "id": s.id,
                    "status_id": s.status_id,
                    "account": s.account.handle,
                    "url": s.url,
                    "content": s.content,
                    "text_content": s.text_content,
                    "visibility": s.visibility,
                    "created_at": s.created_at.isoformat() if s.created_at else None,
                    "language": s.language,
                    "status_type": s.status_type,
                    "in_reply_to_id": s.in_reply_to_id,
                    "replies_count": s.replies_count,
                    "reblogs_count": s.reblogs_count,
                    "favourites_count": s.favourites_count,
                    "mentions": [
                        {"acct": m.mentioned_acct, "url": m.mentioned_url}
                        for m in s.mentions
                    ],
                    "tags": [t.name for t in s.tags],
                }
                for s in statuses
            ],
        })
    finally:
        session.close()
@app.route("/export")
def export_csv():
    """Download collected statuses as a CSV attachment.

    The optional ``account_id`` and ``type`` query parameters narrow the
    export. Rows are ordered newest first; mentions and tags are flattened
    into "; "-separated strings.
    """
    import csv
    from io import StringIO
    from flask import Response

    header = [
        "id", "account", "status_type", "created_at", "url",
        "text_content", "language", "visibility", "in_reply_to_id",
        "replies_count", "reblogs_count", "favourites_count",
        "mentions", "tags", "sensitive", "spoiler_text",
    ]

    db = get_session()
    try:
        wanted_account = request.args.get("account_id", type=int)
        wanted_type = request.args.get("type", "")

        selection = db.query(Status).join(MonitoredAccount)
        if wanted_account:
            selection = selection.filter(Status.account_db_id == wanted_account)
        if wanted_type:
            selection = selection.filter(Status.status_type == wanted_type)
        records = selection.order_by(desc(Status.created_at)).all()

        buffer = StringIO()
        sheet = csv.writer(buffer)
        sheet.writerow(header)
        for item in records:
            mentioned = "; ".join(m.mentioned_acct for m in item.mentions)
            tagged = "; ".join(t.name for t in item.tags)
            created = item.created_at.isoformat() if item.created_at else ""
            # The "id" column carries status_id, not the database row id.
            sheet.writerow([
                item.status_id,
                item.account.handle,
                item.status_type,
                created,
                item.url,
                item.text_content,
                item.language,
                item.visibility,
                item.in_reply_to_id,
                item.replies_count,
                item.reblogs_count,
                item.favourites_count,
                mentioned,
                tagged,
                item.sensitive,
                item.spoiler_text,
            ])

        return Response(
            buffer.getvalue(),
            mimetype="text/csv",
            headers={"Content-Disposition": "attachment; filename=mastodon_statuses.csv"},
        )
    finally:
        db.close()
@app.route("/analysis")
def analysis_dashboard():
    """Toxicity analysis dashboard.

    Collects overall statistics, a 12-week toxicity trend, per-category
    averages and the 5 most recent analysis runs from ``app.analysis_helpers``
    and renders ``analysis.html``. Trend and category data are handed to the
    template as JSON strings for charting.
    """
    from app.analysis_helpers import (
        get_analysis_stats,
        get_toxicity_trend,
        get_category_averages,
        get_recent_analysis_runs,
        TOXICITY_CATEGORIES,
    )
    import json

    session = get_session()
    try:
        stats = get_analysis_stats(session)
        trend = get_toxicity_trend(session, weeks=12)
        categories = get_category_averages(session)
        runs = get_recent_analysis_runs(session, limit=5)

        # Prepare chart data.
        # NOTE(review): presumably these values come from SQL aggregates, which
        # yield NULL/None when there is nothing to aggregate; None is charted
        # as zero here instead of raising TypeError — confirm against helpers.
        trend_json = json.dumps([
            {
                "week": r["week"].strftime("%Y-%m-%d") if r["week"] else "",
                "avg_toxicity": round(float(r["avg_toxicity"] or 0), 4),
                "flagged_statuses": int(r["flagged_statuses"] or 0),
            }
            for r in trend
        ])
        categories_json = json.dumps(
            {k: round(float(v or 0), 4) for k, v in categories.items()}
        )

        return render_template(
            "analysis.html",
            stats=stats,
            trend_json=trend_json,
            categories_json=categories_json,
            categories=TOXICITY_CATEGORIES,
            runs=runs,
        )
    finally:
        session.close()
@app.route("/analysis/flagged")
def analysis_flagged():
    """Render the paginated list of flagged content.

    Filter, sort and paging options are read from the query string and passed
    to ``get_flagged_content``; the same values are echoed back to
    ``flagged.html`` so the form keeps its state. Pages hold 50 items.
    """
    from app.analysis_helpers import (
        get_flagged_content,
        get_accounts_for_select,
        TOXICITY_CATEGORIES,
    )

    page_size = 50
    args = request.args

    db = get_session()
    try:
        # Empty strings (and a zero account id) collapse to None, i.e. "no filter".
        filters = {
            "category": args.get("category") or None,
            "account_id": args.get("account_id", type=int) or None,
            "threshold": args.get("threshold", 0.5, type=float),
            "review_status": args.get("review_status") or None,
            "date_from": args.get("date_from") or None,
            "date_to": args.get("date_to") or None,
            "sort": args.get("sort", "overall"),
            "direction": args.get("dir", "desc"),
        }
        current_page = max(1, args.get("page", 1, type=int))

        items, total = get_flagged_content(
            db,
            limit=page_size,
            offset=(current_page - 1) * page_size,
            **filters,
        )

        # Ceiling division, with a floor of one page for empty results.
        page_count = max(1, (total + page_size - 1) // page_size)
        selectable_accounts = get_accounts_for_select(db)

        return render_template(
            "flagged.html",
            items=items,
            total=total,
            page=current_page,
            total_pages=page_count,
            accounts=selectable_accounts,
            categories=TOXICITY_CATEGORIES,
            category=filters["category"] or "",
            account_id=filters["account_id"] or "",
            threshold=filters["threshold"],
            review_status=filters["review_status"] or "",
            date_from=filters["date_from"] or "",
            date_to=filters["date_to"] or "",
            sort=filters["sort"],
            direction=filters["direction"],
        )
    finally:
        db.close()
@app.route("/api/review/submit", methods=["POST"])
def api_review_submit():
    """Submit a human review for a flagged status.

    Expects a JSON object with:
        status_id: identifier matched against ``toxicity_scores.status_id``.
        review_status: one of "correct", "incorrect" or "unsure".

    Responses:
        200: the review was stored.
        400: the body is not a JSON object, a field is missing, or
            ``review_status`` is not an accepted value.
        404: no toxicity score exists for ``status_id``.
        500: the database update failed (details are logged, not returned).
    """
    from sqlalchemy import text

    # silent=True turns a missing or malformed JSON body into None so that
    # every client error is answered with this endpoint's own JSON 400.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    status_id = data.get("status_id")
    review_status = data.get("review_status")
    if not all([status_id, review_status]):
        return jsonify({"error": "Missing required fields"}), 400
    if review_status not in ("correct", "incorrect", "unsure"):
        return jsonify({"error": "Invalid review_status"}), 400

    session = get_session()
    try:
        result = session.execute(text("""
            UPDATE toxicity_scores
            SET human_reviewed = true,
                review_status = :review_status,
                reviewed_at = NOW()
            WHERE status_id = :status_id
        """), {"review_status": review_status, "status_id": status_id})

        # No matching row means nothing was reviewed; do not report success.
        if result.rowcount == 0:
            session.rollback()
            return jsonify({"error": "No toxicity score found for status_id"}), 404

        session.commit()
        return jsonify({"success": True, "message": "Review submitted"}), 200
    except Exception:
        session.rollback()
        # Keep the traceback in the log; do not leak internals to the client.
        logger.exception("Failed to submit review for status_id=%r", status_id)
        return jsonify({"error": "Failed to submit review"}), 500
    finally:
        session.close()
if __name__ == "__main__":
    # The server listens on all interfaces. Flask's debug mode enables the
    # interactive Werkzeug debugger, which lets anyone who can reach the port
    # execute arbitrary Python, so it must be an explicit opt-in:
    # set FLASK_DEBUG=1 for local development.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(host="0.0.0.0", port=5000, debug=debug_enabled)