commit 72dbf0d2b66194a5913255bd190e02f5a45182a8 Author: Pieter Date: Mon Feb 9 08:05:54 2026 +0100 Initial commit: Mastodon collector application Add Flask-based application for collecting and archiving Mastodon posts from configured accounts. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..24a9226 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,6 @@ +__pycache__ +*.pyc +.env +.git +.gitignore +README.md diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..1904194 --- /dev/null +++ b/.env.example @@ -0,0 +1,8 @@ +# PostgreSQL Configuration +POSTGRES_PASSWORD=your_secure_password_here + +# Flask Configuration +FLASK_SECRET_KEY=your_secure_secret_key_here + +# Polling Configuration +POLL_INTERVAL_SECONDS=14400 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..595ac22 --- /dev/null +++ b/.gitignore @@ -0,0 +1,115 @@ +# Environment variables and secrets +.env +.env.local +.env.*.local +*.secret +secrets/ +credentials/ + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# Virtual environments +venv/ +env/ +ENV/ +env.bak/ +venv.bak/ +.venv/ + +# IDEs +.vscode/ +.idea/ +*.swp +*.swo +*~ +.DS_Store + +# Database files +*.sqlite +*.sqlite3 +*.db +*.db-journal +*.db-shm +*.db-wal +postgres_data/ +pgdata/ + +# Logs +*.log +logs/ +*.log.* + +# Docker volumes and local data +docker-compose.override.yml +.docker/ +volumes/ + +# Certificates and keys +*.pem +*.key +*.crt +*.cer +*.p12 +*.pfx + +# Backup files +*.bak +*.backup +*.tmp +*~ + +# OS generated files +.DS_Store +.DS_Store? 
+._* +.Spotlight-V100 +.Trashes +ehthumbs.db +Thumbs.db + +# Testing +.pytest_cache/ +.coverage +htmlcov/ +.tox/ +.hypothesis/ + +# Jupyter Notebook +.ipynb_checkpoints + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# pyenv +.python-version + +# pipenv +Pipfile.lock + +# Poetry +poetry.lock diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..b1b4a17 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,20 @@ +FROM python:3.12-slim + +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + libpq-dev gcc \ + && rm -rf /var/lib/apt/lists/* + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . + +# Create empty accounts file if it doesn't exist +RUN touch /app/accounts.txt + +EXPOSE 5000 + +CMD ["python", "-m", "app.collector"] diff --git a/README.md b/README.md new file mode 100644 index 0000000..8b8a56c --- /dev/null +++ b/README.md @@ -0,0 +1,91 @@ +# Mastodon Collector + +Collects posts, replies, and mentions from a list of Mastodon accounts and stores them in PostgreSQL. Includes a web UI for account management and data browsing, plus JSON/CSV APIs for your analysis pipeline. + +## Quick Start + +```bash +# 1. Add accounts to monitor +echo "@user@mastodon.social" >> accounts.txt + +# 2. Start everything +docker compose up -d + +# 3. Open the dashboard +open http://localhost:8585 +``` + +## Architecture + +| Service | Description | Port | +|---------------|------------------------------------------------|-------| +| **db** | PostgreSQL 16 | 5432 | +| **web** | Flask dashboard (Gunicorn) | 8585 | +| **collector** | Background service, polls every 4 hours | — | + +## Adding Accounts + +Two methods: + +1. **Text file** — edit `accounts.txt`, one handle per line (`@user@instance.social`). Picked up on next collection cycle. +2. **Web UI** — go to http://localhost:8585/accounts and use the form. 
+ +## Configuration + +Edit `.env` to customize: + +``` +POSTGRES_PASSWORD=collector_secret # Change for production +FLASK_SECRET_KEY=change-me-in-production +POLL_INTERVAL_SECONDS=14400 # Default: 4 hours (14400s) +``` + +## API Endpoints + +For plugging into your analysis pipeline: + +| Endpoint | Description | +|-----------------------|--------------------------------------| +| `GET /api/stats` | Overview stats (counts by type) | +| `GET /api/statuses` | Paginated statuses as JSON | +| `GET /export` | Download all statuses as CSV | + +### `/api/statuses` parameters + +- `page` — page number (default: 1) +- `per_page` — results per page (default: 100, max: 500) +- `account_id` — filter by internal account ID +- `type` — filter by status type: `post`, `reply`, `mention`, `reblog` +- `since` — ISO datetime, only return statuses after this time + +## Database Schema + +Main tables: + +- `monitored_accounts` — accounts being tracked +- `statuses` — collected posts with plain text + HTML content +- `mentions` — who was @-mentioned in each status +- `media_attachments` — images/videos attached to statuses +- `tags` — hashtags used +- `collection_logs` — audit trail of each collection run + +Each status stores `raw_json` with the full Mastodon API response for future analysis needs. 
+ +## Moving to a Server + +```bash +# Copy the project +scp -r mastodon-collector/ user@server:~/ + +# On the server +cd mastodon-collector +# Edit .env with production secrets +docker compose up -d +``` + +## Stopping + +```bash +docker compose down # Stop services, keep data +docker compose down -v # Stop services AND delete database +``` diff --git a/accounts.txt b/accounts.txt new file mode 100644 index 0000000..68d8695 --- /dev/null +++ b/accounts.txt @@ -0,0 +1,100 @@ +# Tweede Kamer +@HenriBontenbalCDA@mastodon.social +@laurensdassen@mastodon.nl +@barbarakathmann@mstdn.social +@JesseKlaver@mstdn.social +@JanPaternotte@mastodon.online +@katipiri@respublicae.eu +@Annemarijke@mastodon.social +@LisaWesterveld@mastodon.social + +# Eerste Kamer +@daanroovers@mastodon.social + +# Partijen landelijk +@BIJ1@social.bij1.org +@D66@mastodon.social +@PartijvoordeDieren@mastodon.social +@Piratenpartij@mastodon.social +@voltnederland@mastodon.nl + +# Kabinet +@MinisterBZK@social.overheid.nl +@staatssecretarisbzk@social.overheid.nl + +# Raadsleden & wethouders +@elisabethijmker@amsterdam.nl +@onno@waag.social +@erikwesselius@mastodon.social +@paullieverse@mastodon.nl +@ErikJonker@mastodon.social +@tvanelferen@mastodon.social +@joepbc@mastodon.social +@lynchantropen@mastodon.social +@MirjamHubert@mastodon.nl + +# Partijen gemeentelijk +@PvdDAlkmaar@mastodon.social +@ArnhemNijmegenBIJ1@social.bij1.org +@GroenLinks026@mastodon.nl +@pga_asten@mastodon.nl +@groenlinks020@mastodon.social +@pvddamsterdam@mastodon.social +@d66debilt@mastodon.social +@D66Bunnik@mastodon.online +@ppdelft@mastodon.pirateparty.be +@PvdDDelft@mastodon.social +@PvdDDenBosch@mastodon.nl +@DenHaagBIJ1@social.bij1.org +@voltenschede@tukkers.online +@PRO_Heeze_Leende@mastodon.nl +@groenlinkshengelo@mastodon.social +@Groenlinkspvdahouten@mastodon.social +@volthouten@mastodon.social +@GroenLinksPvdAKampen@mastodon.nl +@D66Nijmegen@mastodon.nl +@glnijmegen@mastodon.nl +@PvdDNijmegen@mastodon.nl 
+@glmeppel@mastodon.nl +@D66Ooststellingwerf@mastodon.social +@PvdAGLDRV@mastodon.nl +@UtrechtBIJ1@social.bij1.org +@d66vught@mastodon.nl +@ProgressiefWoerden@mastodon.nl + +# Gemeentelijk overig +@WheelieNick@mastodon.nl +@hotelbreakfast@mastodon.social +@johannesbeers@mastodon.social +@TiciaVerveer@mastodon.social +@RZondervan@mastodon.green + +# Partijen provinciaal +@D66Brabant@mastodon.nl +@Statenfractie_PvdD_Drenthe@mastodon.social +@PvdDStatenfractie_Fryslan@mastodon.social +@GroenLinksPU@mastodon.nl + +# Statenleden +@SiskaPeeks@mstdn.social +@Marjolein@mastodon.social + +# Europees Parlement +@bartgroothuis@mastodon.online +@kimvsparrentak@eupolicy.social + +# Waterschappen leden +@veerleslegers@mastodon.social +@PensioNien@todon.nl +@fabianzoon@mastodon.nl +@win_scheijde@mastodon.social +@Matthijs85@mastodon.social + +# Waterschappen partijen +@PvdDHHNK@mastodon.nl + +# Overige politici +@alexandravanhuffelen@mastodon.social + +# Oud-Tweede Kamerleden +@mariekekoekkoek@mastodon.nl diff --git a/app/__init__.py b/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/__main__.py b/app/__main__.py new file mode 100644 index 0000000..616c627 --- /dev/null +++ b/app/__main__.py @@ -0,0 +1,4 @@ +"""Allow running the collector with `python -m app`.""" +from app.collector import main + +main() diff --git a/app/collector.py b/app/collector.py new file mode 100644 index 0000000..0c163b2 --- /dev/null +++ b/app/collector.py @@ -0,0 +1,306 @@ +""" +Collector service — periodically polls Mastodon for new statuses from monitored accounts. +Runs as a standalone process via `python -m app.collector`. 
+""" + +import logging +import os +import sys +import time +from datetime import datetime, timezone +from pathlib import Path + +from apscheduler.schedulers.blocking import BlockingScheduler + +from app.db import ( + init_db, + get_session, + MonitoredAccount, + Status, + Mention, + MediaAttachment, + Tag, + CollectionLog, +) +from app.mastodon_api import ( + lookup_account, + get_account_statuses, + parse_status, + MastodonAPIError, + RateLimitError, +) + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", + handlers=[logging.StreamHandler(sys.stdout)], +) +logger = logging.getLogger("collector") + +POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL_SECONDS", 14400)) +ACCOUNTS_FILE = os.environ.get("ACCOUNTS_FILE", "/app/accounts.txt") + + +def load_accounts_from_file(filepath: str) -> list[tuple[str, str]]: + """Parse accounts.txt and return list of (username, instance) tuples.""" + accounts = [] + path = Path(filepath) + if not path.exists(): + logger.warning("Accounts file not found: %s", filepath) + return accounts + + for line in path.read_text().splitlines(): + line = line.strip() + if not line or line.startswith("#"): + continue + # Expected format: @user@instance.social or user@instance.social + line = line.lstrip("@") + if "@" not in line: + logger.warning("Skipping malformed account line: %s", line) + continue + parts = line.split("@", 1) + if len(parts) == 2 and parts[0] and parts[1]: + accounts.append((parts[0], parts[1])) + else: + logger.warning("Skipping malformed account line: %s", line) + return accounts + + +def sync_monitored_accounts(session) -> list[MonitoredAccount]: + """ + Sync accounts from the file + database. + Accounts added via web UI are already in the DB. + Accounts in the file get added if missing. + Returns all active monitored accounts. 
+ """ + file_accounts = load_accounts_from_file(ACCOUNTS_FILE) + + for username, instance in file_accounts: + existing = ( + session.query(MonitoredAccount) + .filter_by(username=username, instance=instance) + .first() + ) + if not existing: + logger.info("Adding account from file: @%s@%s", username, instance) + acct = MonitoredAccount(username=username, instance=instance, is_active=True) + session.add(acct) + + session.commit() + return session.query(MonitoredAccount).filter_by(is_active=True).all() + + +def resolve_account(session, account: MonitoredAccount) -> bool: + """Look up the Mastodon account ID if we don't have it yet.""" + if account.account_id: + return True + + try: + data = lookup_account(account.instance, account.username) + account.account_id = data["id"] + account.display_name = data.get("display_name", "") + account.avatar_url = data.get("avatar", "") + account.note = data.get("note", "") + session.commit() + logger.info("Resolved %s → account_id=%s", account.handle, account.account_id) + return True + except MastodonAPIError as e: + logger.error("Failed to resolve %s: %s", account.handle, e) + return False + + +def store_status(session, account: MonitoredAccount, parsed: dict) -> bool: + """Store a parsed status in the database. 
def store_status(session, account: MonitoredAccount, parsed: dict) -> bool:
    """Persist one parsed status for *account*.

    Returns True when a new row was inserted, False when the status was
    already present — in which case only the interaction counters are
    refreshed, since those drift over time.
    """
    duplicate = (
        session.query(Status)
        .filter_by(status_id=parsed["status_id"], account_db_id=account.id)
        .first()
    )
    if duplicate is not None:
        duplicate.replies_count = parsed["replies_count"]
        duplicate.reblogs_count = parsed["reblogs_count"]
        duplicate.favourites_count = parsed["favourites_count"]
        return False

    # Column names line up one-to-one with the keys parse_status() emits.
    column_keys = (
        "status_id", "uri", "url", "content", "text_content", "visibility",
        "created_at", "language", "sensitive", "spoiler_text",
        "in_reply_to_id", "in_reply_to_account_id", "conversation_id",
        "replies_count", "reblogs_count", "favourites_count",
        "status_type", "raw_json",
    )
    status = Status(account_db_id=account.id, **{k: parsed[k] for k in column_keys})
    session.add(status)
    session.flush()  # populate status.id for the child rows below

    # Child rows: mention/media/tag dicts already use the model field names.
    for m in parsed["mentions"]:
        session.add(Mention(status_db_id=status.id, **m))
    for ma in parsed["media_attachments"]:
        session.add(MediaAttachment(status_db_id=status.id, **ma))
    for t in parsed["tags"]:
        session.add(Tag(status_db_id=status.id, **t))

    return True
def collect_account(session, account: MonitoredAccount) -> int:
    """Collect new statuses for a single account.

    Writes a CollectionLog row describing the run and returns the number of
    newly stored statuses (0 on any failure).  On rate limiting the thread
    sleeps for the advertised retry window before giving up on this account.
    """
    log = CollectionLog(account_db_id=account.id, status="running")
    session.add(log)
    session.commit()

    try:
        if not resolve_account(session, account):
            log.status = "error"
            log.error = "Could not resolve account ID"
            log.finished_at = datetime.now(timezone.utc)
            session.commit()
            return 0

        logger.info("Collecting statuses for %s (since_id=%s)", account.handle, account.last_status_id)

        raw_statuses = get_account_statuses(
            instance=account.instance,
            account_id=account.account_id,
            since_id=account.last_status_id,
        )

        new_count = 0
        newest_id = account.last_status_id

        for raw in raw_statuses:
            parsed = parse_status(raw, account.account_id)
            if store_status(session, account, parsed):
                new_count += 1

            # Track the newest status ID.  Mastodon IDs are numeric strings,
            # so compare them numerically — plain string comparison would
            # rank "99" above "100" and corrupt the since_id cursor.
            sid = parsed["status_id"]
            if newest_id is None:
                newest_id = sid
            else:
                try:
                    is_newer = int(sid) > int(newest_id)
                except ValueError:
                    # Non-numeric IDs (non-mainline servers): fall back to
                    # the original lexicographic behaviour.
                    is_newer = sid > newest_id
                if is_newer:
                    newest_id = sid

        if newest_id:
            account.last_status_id = newest_id
            account.last_collected_at = datetime.now(timezone.utc)

        log.statuses_collected = new_count
        log.status = "success"
        log.finished_at = datetime.now(timezone.utc)
        session.commit()

        logger.info("Collected %d new statuses for %s (total fetched: %d)",
                    new_count, account.handle, len(raw_statuses))
        return new_count

    except RateLimitError as e:
        log.status = "error"
        log.error = f"Rate limited: {e}"
        log.finished_at = datetime.now(timezone.utc)
        session.commit()
        logger.warning("Rate limited while collecting %s: %s", account.handle, e)
        time.sleep(e.retry_after)
        return 0

    except MastodonAPIError as e:
        log.status = "error"
        log.error = str(e)
        log.finished_at = datetime.now(timezone.utc)
        session.commit()
        logger.error("API error collecting %s: %s", account.handle, e)
        return 0

    except Exception as e:
        log.status = "error"
        log.error = str(e)
        log.finished_at = datetime.now(timezone.utc)
        session.commit()
        logger.exception("Unexpected error collecting %s", account.handle)
        return 0
0 + + +def run_collection_cycle(): + """Run one full collection cycle across all monitored accounts.""" + logger.info("=== Starting collection cycle ===") + session = get_session() + + try: + accounts = sync_monitored_accounts(session) + logger.info("Monitoring %d active accounts", len(accounts)) + + total_new = 0 + for account in accounts: + new = collect_account(session, account) + total_new += new + time.sleep(1) # Brief pause between accounts to be polite + + logger.info("=== Collection cycle complete: %d new statuses across %d accounts ===", + total_new, len(accounts)) + + except Exception: + logger.exception("Fatal error in collection cycle") + finally: + session.close() + + +def main(): + """Entry point: initialize DB and start the scheduler.""" + logger.info("Mastodon Collector starting up...") + logger.info("Poll interval: %d seconds (%d hours)", POLL_INTERVAL, POLL_INTERVAL // 3600) + + init_db() + logger.info("Database initialized") + + # Run one collection immediately on startup + run_collection_cycle() + + # Schedule recurring collection + scheduler = BlockingScheduler() + scheduler.add_job(run_collection_cycle, "interval", seconds=POLL_INTERVAL) + logger.info("Scheduler started — next run in %d seconds", POLL_INTERVAL) + + try: + scheduler.start() + except (KeyboardInterrupt, SystemExit): + logger.info("Collector shutting down") + scheduler.shutdown() + + +if __name__ == "__main__": + main() diff --git a/app/db.py b/app/db.py new file mode 100644 index 0000000..f2cd94d --- /dev/null +++ b/app/db.py @@ -0,0 +1,186 @@ +"""Database models and session management.""" + +import os +from datetime import datetime, timezone + +from sqlalchemy import ( + create_engine, + Column, + Integer, + BigInteger, + String, + Text, + Boolean, + DateTime, + ForeignKey, + Index, + UniqueConstraint, + JSON, +) +from sqlalchemy.orm import declarative_base, sessionmaker, relationship + +DATABASE_URL = os.environ.get( + "DATABASE_URL", 
"postgresql://collector:collector_secret@localhost:5432/mastodon_collector" +) + +engine = create_engine(DATABASE_URL, pool_pre_ping=True, pool_size=5, max_overflow=10) +SessionLocal = sessionmaker(bind=engine) +Base = declarative_base() + + +class MonitoredAccount(Base): + """An account we are monitoring.""" + + __tablename__ = "monitored_accounts" + + id = Column(Integer, primary_key=True, autoincrement=True) + username = Column(String(255), nullable=False) # e.g. "user" + instance = Column(String(255), nullable=False) # e.g. "mastodon.social" + account_id = Column(String(64), nullable=True) # Mastodon numeric account ID on that instance + display_name = Column(String(512), nullable=True) + avatar_url = Column(Text, nullable=True) + is_active = Column(Boolean, default=True, nullable=False) + last_collected_at = Column(DateTime(timezone=True), nullable=True) + last_status_id = Column(String(64), nullable=True) # For pagination: newest status ID we've seen + created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)) + note = Column(Text, nullable=True) # Bio / description + + statuses = relationship("Status", back_populates="account", lazy="dynamic") + + __table_args__ = ( + UniqueConstraint("username", "instance", name="uq_account_handle"), + ) + + @property + def handle(self): + return f"@{self.username}@{self.instance}" + + def __repr__(self): + return f"" + + +class Status(Base): + """A single post / toot collected from Mastodon.""" + + __tablename__ = "statuses" + + id = Column(Integer, primary_key=True, autoincrement=True) + status_id = Column(String(64), nullable=False) # Mastodon status ID + account_db_id = Column(Integer, ForeignKey("monitored_accounts.id"), nullable=False) + uri = Column(Text, nullable=False) # Canonical ActivityPub URI + url = Column(Text, nullable=True) # Human-readable URL + content = Column(Text, nullable=False) # HTML content + text_content = Column(Text, nullable=True) # Stripped plain-text content + 
visibility = Column(String(32), nullable=True) # public, unlisted, private, direct + created_at = Column(DateTime(timezone=True), nullable=False) + collected_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)) + language = Column(String(16), nullable=True) + sensitive = Column(Boolean, default=False) + spoiler_text = Column(Text, nullable=True) + + # Reply / conversation tracking + in_reply_to_id = Column(String(64), nullable=True) # Status ID being replied to + in_reply_to_account_id = Column(String(64), nullable=True) + conversation_id = Column(String(64), nullable=True) + + # Interaction counts + replies_count = Column(Integer, default=0) + reblogs_count = Column(Integer, default=0) + favourites_count = Column(Integer, default=0) + + # Classification for your analysis pipeline + status_type = Column(String(32), nullable=False, default="post") # post, reply, mention, reblog + + # Store the full JSON for future reference + raw_json = Column(JSON, nullable=True) + + # Relationships + account = relationship("MonitoredAccount", back_populates="statuses") + mentions = relationship("Mention", back_populates="status", cascade="all, delete-orphan") + media_attachments = relationship("MediaAttachment", back_populates="status", cascade="all, delete-orphan") + tags = relationship("Tag", back_populates="status", cascade="all, delete-orphan") + + __table_args__ = ( + UniqueConstraint("status_id", "account_db_id", name="uq_status_per_account"), + Index("ix_status_created", "created_at"), + Index("ix_status_type", "status_type"), + Index("ix_status_account", "account_db_id"), + Index("ix_status_conversation", "conversation_id"), + ) + + def __repr__(self): + return f"" + + +class Mention(Base): + """A mention within a status (who was @-mentioned).""" + + __tablename__ = "mentions" + + id = Column(Integer, primary_key=True, autoincrement=True) + status_db_id = Column(Integer, ForeignKey("statuses.id", ondelete="CASCADE"), nullable=False) + 
mentioned_account_id = Column(String(64), nullable=True) + mentioned_username = Column(String(255), nullable=False) + mentioned_acct = Column(String(512), nullable=False) # full user@instance + mentioned_url = Column(Text, nullable=True) + + status = relationship("Status", back_populates="mentions") + + +class MediaAttachment(Base): + """Media attached to a status.""" + + __tablename__ = "media_attachments" + + id = Column(Integer, primary_key=True, autoincrement=True) + status_db_id = Column(Integer, ForeignKey("statuses.id", ondelete="CASCADE"), nullable=False) + media_id = Column(String(64), nullable=True) + media_type = Column(String(32), nullable=True) # image, video, gifv, audio + url = Column(Text, nullable=True) + preview_url = Column(Text, nullable=True) + description = Column(Text, nullable=True) # alt text + + status = relationship("Status", back_populates="media_attachments") + + +class Tag(Base): + """A hashtag used in a status.""" + + __tablename__ = "tags" + + id = Column(Integer, primary_key=True, autoincrement=True) + status_db_id = Column(Integer, ForeignKey("statuses.id", ondelete="CASCADE"), nullable=False) + name = Column(String(255), nullable=False) + url = Column(Text, nullable=True) + + status = relationship("Status", back_populates="tags") + + __table_args__ = ( + Index("ix_tag_name", "name"), + ) + + +class CollectionLog(Base): + """Log of each collection run for monitoring.""" + + __tablename__ = "collection_logs" + + id = Column(Integer, primary_key=True, autoincrement=True) + account_db_id = Column(Integer, ForeignKey("monitored_accounts.id"), nullable=True) + started_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)) + finished_at = Column(DateTime(timezone=True), nullable=True) + statuses_collected = Column(Integer, default=0) + error = Column(Text, nullable=True) + status = Column(String(32), default="running") # running, success, error + + account = relationship("MonitoredAccount") + + +def init_db(): + 
"""Create all tables.""" + Base.metadata.create_all(engine) + + +def get_session(): + """Get a new database session.""" + return SessionLocal() diff --git a/app/mastodon_api.py b/app/mastodon_api.py new file mode 100644 index 0000000..9fd7026 --- /dev/null +++ b/app/mastodon_api.py @@ -0,0 +1,226 @@ +"""Mastodon public API client — no authentication required.""" + +import logging +import re +import time +from html import unescape +from typing import Optional +from urllib.parse import urljoin + +import requests + +logger = logging.getLogger(__name__) + +# Respect rate limits: Mastodon returns 300 requests per 5 min by default +DEFAULT_TIMEOUT = 30 +MAX_RETRIES = 3 +RETRY_BACKOFF = 5 # seconds + + +class MastodonAPIError(Exception): + pass + + +class RateLimitError(MastodonAPIError): + def __init__(self, retry_after: float = 60): + self.retry_after = retry_after + super().__init__(f"Rate limited, retry after {retry_after}s") + + +def _strip_html(html: str) -> str: + """Strip HTML tags and decode entities to get plain text.""" + # Replace
and

with newlines + text = re.sub(r"", "\n", html) + text = re.sub(r"

", "\n", text) + # Remove all remaining tags + text = re.sub(r"<[^>]+>", "", text) + return unescape(text).strip() + + +def _api_get(instance: str, path: str, params: Optional[dict] = None) -> requests.Response: + """Make a GET request to a Mastodon instance's public API.""" + url = f"https://{instance}{path}" + headers = {"Accept": "application/json", "User-Agent": "MastodonCollector/1.0"} + + for attempt in range(MAX_RETRIES): + try: + resp = requests.get(url, params=params, headers=headers, timeout=DEFAULT_TIMEOUT) + + if resp.status_code == 429: + retry_after = float(resp.headers.get("X-RateLimit-Reset", 60)) + # If it's an ISO timestamp, calculate delta + if retry_after > 1_000_000: + retry_after = 60 + logger.warning("Rate limited by %s, waiting %.0fs", instance, retry_after) + raise RateLimitError(retry_after) + + if resp.status_code == 404: + raise MastodonAPIError(f"Not found: {url}") + + resp.raise_for_status() + return resp + + except RateLimitError: + raise + except requests.RequestException as e: + if attempt < MAX_RETRIES - 1: + wait = RETRY_BACKOFF * (attempt + 1) + logger.warning("Request to %s failed (attempt %d/%d): %s — retrying in %ds", + url, attempt + 1, MAX_RETRIES, e, wait) + time.sleep(wait) + else: + raise MastodonAPIError(f"Failed after {MAX_RETRIES} attempts: {e}") from e + + raise MastodonAPIError("Unexpected retry exhaustion") + + +def lookup_account(instance: str, username: str) -> dict: + """Look up an account on an instance by username. 
def lookup_account(instance: str, username: str) -> dict:
    """Look up an account on an instance by username; return its JSON.

    Tries the v1 lookup endpoint first, then falls back to v2 search.
    Raises MastodonAPIError when the account cannot be found.  A
    RateLimitError is re-raised immediately: the previous blanket
    ``except MastodonAPIError`` also caught it (it is a subclass) and
    fired a second request against an already rate-limited instance.
    """
    try:
        return _api_get(instance, "/api/v1/accounts/lookup", {"acct": username}).json()
    except RateLimitError:
        raise  # don't hammer a rate-limited instance with the fallback
    except MastodonAPIError:
        pass

    # Fallback: search for the account
    resp = _api_get(
        instance,
        "/api/v2/search",
        {"q": f"@{username}@{instance}", "type": "accounts", "limit": 1},
    )
    accounts = resp.json().get("accounts", [])
    if not accounts:
        raise MastodonAPIError(f"Account @{username} not found on {instance}")
    return accounts[0]


def get_account_statuses(
    instance: str,
    account_id: str,
    since_id: Optional[str] = None,
    limit: int = 40,
    exclude_reblogs: bool = False,
) -> list[dict]:
    """Fetch statuses for an account, following pagination.

    ``since_id`` stays in the query for every page as the floor of the walk,
    while ``max_id`` (taken from the Link header) pages backwards through
    older statuses.  Returns the statuses oldest-first so callers can
    process them chronologically.
    """
    all_statuses: list[dict] = []
    params: dict = {"limit": min(limit, 40)}  # API caps the page size at 40
    if since_id:
        params["since_id"] = since_id
    if exclude_reblogs:
        params["exclude_reblogs"] = "true"

    path = f"/api/v1/accounts/{account_id}/statuses"
    max_pages = 25  # safety limit against runaway pagination

    for _ in range(max_pages):
        resp = _api_get(instance, path, params)
        statuses = resp.json()
        if not statuses:
            break

        all_statuses.extend(statuses)

        # The Link header advertises the next (older) page via max_id.
        next_match = re.search(r'<([^>]+)>;\s*rel="next"', resp.headers.get("Link", ""))
        if not next_match:
            break
        max_id_match = re.search(r"max_id=(\d+)", next_match.group(1))
        if not max_id_match:
            break
        params["max_id"] = max_id_match.group(1)
        time.sleep(0.5)  # Be polite between pages

    # Oldest first for chronological processing downstream.
    all_statuses.reverse()
    return all_statuses
get_status_context(instance: str, status_id: str) -> dict: + """Get the context (ancestors + descendants) of a status. Useful for threading.""" + resp = _api_get(instance, f"/api/v1/statuses/{status_id}/context") + return resp.json() + + +def classify_status(status: dict, monitored_account_id: str) -> str: + """ + Classify a status as: post, reply, mention, or reblog. + - reblog: the status is a boost of another status + - reply: the status is in reply to another status + - mention: the status mentions other accounts (but is not a reply) + - post: a standalone original post + """ + if status.get("reblog"): + return "reblog" + if status.get("in_reply_to_id"): + return "reply" + mentions = status.get("mentions", []) + if mentions: + # Only classify as "mention" if it mentions someone other than self + other_mentions = [m for m in mentions if m.get("id") != monitored_account_id] + if other_mentions: + return "mention" + return "post" + + +def parse_status(status: dict, monitored_account_id: str) -> dict: + """Parse a raw Mastodon status JSON into a flat dict for storage.""" + # If it's a reblog, we store the original content but flag it + actual = status.get("reblog") or status + content_html = actual.get("content", "") + + return { + "status_id": status["id"], + "uri": status.get("uri", ""), + "url": status.get("url") or actual.get("url", ""), + "content": content_html, + "text_content": _strip_html(content_html), + "visibility": status.get("visibility", "public"), + "created_at": status.get("created_at"), + "language": status.get("language") or actual.get("language"), + "sensitive": status.get("sensitive", False), + "spoiler_text": status.get("spoiler_text", ""), + "in_reply_to_id": status.get("in_reply_to_id"), + "in_reply_to_account_id": status.get("in_reply_to_account_id"), + "conversation_id": status.get("conversation", {}).get("id") if isinstance(status.get("conversation"), dict) else None, + "replies_count": status.get("replies_count", 0), + "reblogs_count": 
status.get("reblogs_count", 0), + "favourites_count": status.get("favourites_count", 0), + "status_type": classify_status(status, monitored_account_id), + "mentions": [ + { + "mentioned_account_id": m.get("id"), + "mentioned_username": m.get("username", ""), + "mentioned_acct": m.get("acct", ""), + "mentioned_url": m.get("url", ""), + } + for m in (actual.get("mentions") or []) + ], + "media_attachments": [ + { + "media_id": ma.get("id"), + "media_type": ma.get("type"), + "url": ma.get("url"), + "preview_url": ma.get("preview_url"), + "description": ma.get("description"), + } + for ma in (actual.get("media_attachments") or []) + ], + "tags": [ + {"name": t.get("name", ""), "url": t.get("url", "")} + for t in (actual.get("tags") or []) + ], + "raw_json": status, + } diff --git a/app/templates/accounts.html b/app/templates/accounts.html new file mode 100644 index 0000000..720621b --- /dev/null +++ b/app/templates/accounts.html @@ -0,0 +1,77 @@ +{% extends "base.html" %} +{% block title %}Accounts — Mastodon Collector{% endblock %} + +{% block content %} +
+

Monitored Accounts

+
+ +
+

Add Account

+
+ + +
+

+ You can also add accounts by editing accounts.txt — the collector picks them up automatically. +

+
+ +
+ + + + + + + + + + + + + {% for acct in accounts %} + + + + + + + + + {% endfor %} + {% if not accounts %} + + + + {% endif %} + +
HandleDisplay NameAccount IDStatusLast CollectedActions
+ + {{ acct.handle }} + + {{ acct.display_name or '—' }}{{ acct.account_id or 'unresolved' }} + {% if acct.is_active %} + Active + {% else %} + Paused + {% endif %} + + {{ acct.last_collected_at.strftime('%Y-%m-%d %H:%M') if acct.last_collected_at else 'Never' }} + +
+
+ +
+
+ +
+
+
+ No accounts yet. Add one above or edit accounts.txt. +
+
+{% endblock %} diff --git a/app/templates/base.html b/app/templates/base.html new file mode 100644 index 0000000..d370f5a --- /dev/null +++ b/app/templates/base.html @@ -0,0 +1,262 @@ + + + + + + {% block title %}Mastodon Collector{% endblock %} + + + + + +
+
+ {% with messages = get_flashed_messages(with_categories=true) %} + {% for category, message in messages %} +
{{ message }}
+ {% endfor %} + {% endwith %} + + {% block content %}{% endblock %} +
+
+ + diff --git a/app/templates/index.html b/app/templates/index.html new file mode 100644 index 0000000..e2e04d0 --- /dev/null +++ b/app/templates/index.html @@ -0,0 +1,123 @@ +{% extends "base.html" %} +{% block title %}Dashboard — Mastodon Collector{% endblock %} + +{% block content %} +

Dashboard

+ +
+
+
{{ total_statuses }}
+
Total Statuses
+
+
+
{{ total_posts }}
+
Posts
+
+
+
{{ total_replies }}
+
Replies
+
+
+
{{ total_mentions }}
+
Mentions
+
+
+
{{ total_reblogs }}
+
Reblogs
+
+
+
{{ account_stats|length }}
+
Monitored Accounts
+
+
+ +
+

Monitored Accounts

+ + + + + + + + + + + + {% for item in account_stats %} + + + + + + + + {% endfor %} + {% if not account_stats %} + + + + {% endif %} + +
AccountInstanceStatusCollectedLast Run
+ + {{ item.account.handle }} + + {% if item.account.display_name %} + — {{ item.account.display_name }} + {% endif %} + {{ item.account.instance }} + {% if item.account.is_active %} + Active + {% else %} + Paused + {% endif %} + {{ item.status_count }} + {% if item.account.last_collected_at %} + {{ item.account.last_collected_at.strftime('%Y-%m-%d %H:%M') }} + {% if item.last_log %} + {{ item.last_log.status }} + {% endif %} + {% else %} + Never + {% endif %} +
+ No accounts being monitored yet. + Add some accounts. +
+
+ +{% if recent_logs %} +
+

Recent Collection Runs

+ + + + + + + + + + + + {% for log in recent_logs %} + + + + + + + + {% endfor %} + +
TimeAccountStatusCollectedError
{{ log.started_at.strftime('%Y-%m-%d %H:%M:%S') if log.started_at }} + {% if log.account %} + {{ log.account.handle }} + {% else %} + — + {% endif %} + {{ log.status }}{{ log.statuses_collected }}{{ log.error or '—' }}
+
+{% endif %} +{% endblock %} diff --git a/app/templates/status_detail.html b/app/templates/status_detail.html new file mode 100644 index 0000000..f60972f --- /dev/null +++ b/app/templates/status_detail.html @@ -0,0 +1,140 @@ +{% extends "base.html" %} +{% block title %}Status Detail — Mastodon Collector{% endblock %} + +{% block content %} +
+ ← Back +

Status Detail

+
+ +
+
+
+ {{ status.status_type }} + + {{ status.created_at.strftime('%Y-%m-%d %H:%M:%S UTC') if status.created_at }} + +
+
+ {% if status.url %} + View on Mastodon ↗ + {% endif %} +
+
+ + + + + + + + + + + + + + + {% if status.in_reply_to_id %} + + + + + {% endif %} + {% if status.conversation_id %} + + + + + {% endif %} + + + + + {% if status.sensitive %} + + + + + {% endif %} + + + + + + + + +
Account{{ status.account.handle }}{% if status.account.display_name %} — {{ status.account.display_name }}{% endif %}
Visibility{{ status.visibility }}
Language{{ status.language or 'Unknown' }}
In Reply ToStatus {{ status.in_reply_to_id }}{% if status.in_reply_to_account_id %} (account {{ status.in_reply_to_account_id }}){% endif %}
Conversation{{ status.conversation_id }}
Interactions↩ {{ status.replies_count }} replies   ⟳ {{ status.reblogs_count }} reblogs   ★ {{ status.favourites_count }} favourites
SensitiveYes{% if status.spoiler_text %} — {{ status.spoiler_text }}{% endif %}
Mastodon Status ID{{ status.status_id }}
URI{{ status.uri }}
+
+ +
+

Content (HTML)

+
+ {{ status.content | safe }} +
+
+ +
+

Content (Plain Text)

+
{{ status.text_content }}
+
+ +{% if status.mentions %} +
+

Mentions ({{ status.mentions|length }})

+ + + + + + {% for m in status.mentions %} + + + + + {% endfor %} + +
AccountURL
@{{ m.mentioned_acct }}{{ m.mentioned_url }}
+
+{% endif %} + +{% if status.media_attachments %} +
+

Media Attachments ({{ status.media_attachments|length }})

+ + + + + + {% for ma in status.media_attachments %} + + + + + + {% endfor %} + +
TypeDescriptionURL
{{ ma.media_type }}{{ ma.description or '—' }}View ↗
+
+{% endif %} + +{% if status.tags %} +
+

Tags

+
+ {% for t in status.tags %} + #{{ t.name }} + {% endfor %} +
+
+{% endif %} + +
+

Raw JSON

+
+ Click to expand +
{{ status.raw_json | tojson(indent=2) }}
+
+
+{% endblock %} diff --git a/app/templates/statuses.html b/app/templates/statuses.html new file mode 100644 index 0000000..d7a64af --- /dev/null +++ b/app/templates/statuses.html @@ -0,0 +1,112 @@ +{% extends "base.html" %} +{% block title %}Statuses — Mastodon Collector{% endblock %} + +{% block content %} +
+

Collected Statuses ({{ total }})

+ Export CSV +
+ +
+
+ + + + + + + {% if current_account_id or current_type or search %} + Clear + {% endif %} +
+
+ +
+ + + + + + + + + + + + + {% for s in statuses %} + + + + + + + + + {% endfor %} + {% if not statuses %} + + + + {% endif %} + +
DateAccountTypeContentInteractions
+ {{ s.created_at.strftime('%Y-%m-%d %H:%M') if s.created_at }} + {{ s.account.handle }} + {{ s.status_type }} + +
+ {{ s.text_content[:200] }}{% if s.text_content and s.text_content|length > 200 %}...{% endif %} +
+ {% if s.tags %} +
+ {% for t in s.tags %} + #{{ t.name }} + {% endfor %} +
+ {% endif %} +
+ ↩ {{ s.replies_count }}   ⟳ {{ s.reblogs_count }}   ★ {{ s.favourites_count }} + + View +
+ No statuses found. The collector runs every {{ (config.get('POLL_INTERVAL_SECONDS', 14400)|int // 3600) }} hours, or you can wait for the first collection cycle. +
+
@app.route("/")
def index():
    """Dashboard overview.

    Renders aggregate status counts, per-account collection stats and the
    20 most recent collection-log entries.
    """
    session = get_session()
    try:
        accounts = (
            session.query(MonitoredAccount)
            .order_by(MonitoredAccount.instance, MonitoredAccount.username)
            .all()
        )
        total_statuses = session.query(func.count(Status.id)).scalar() or 0
        # One aggregate per classification; keys match the template variables.
        type_totals = {
            stype: session.query(func.count(Status.id))
            .filter(Status.status_type == stype)
            .scalar()
            or 0
            for stype in ("post", "reply", "mention", "reblog")
        }

        # Per-account stats.  NOTE(review): two queries per account (N+1);
        # fine for a handful of accounts, revisit if the list grows large.
        account_stats = []
        for acct in accounts:
            count = (
                session.query(func.count(Status.id))
                .filter(Status.account_db_id == acct.id)
                .scalar()
                or 0
            )
            last_log = (
                session.query(CollectionLog)
                .filter_by(account_db_id=acct.id)
                .order_by(desc(CollectionLog.started_at))
                .first()
            )
            account_stats.append({
                "account": acct,
                "status_count": count,
                "last_log": last_log,
            })

        # Recent collection logs
        recent_logs = (
            session.query(CollectionLog)
            .order_by(desc(CollectionLog.started_at))
            .limit(20)
            .all()
        )

        return render_template(
            "index.html",
            account_stats=account_stats,
            total_statuses=total_statuses,
            total_posts=type_totals["post"],
            total_replies=type_totals["reply"],
            total_mentions=type_totals["mention"],
            total_reblogs=type_totals["reblog"],
            recent_logs=recent_logs,
        )
    finally:
        session.close()


@app.route("/accounts")
def accounts_list():
    """List all monitored accounts."""
    session = get_session()
    try:
        accounts = (
            session.query(MonitoredAccount)
            .order_by(MonitoredAccount.instance, MonitoredAccount.username)
            .all()
        )
        return render_template("accounts.html", accounts=accounts)
    finally:
        session.close()


@app.route("/accounts/add", methods=["POST"])
def accounts_add():
    """Add a new account to monitor.

    Accepts a form field ``handle`` in ``@user@instance`` form.  Re-activates
    a previously paused account instead of duplicating it.  If the account
    cannot be resolved against its instance right now, it is still added
    (unresolved) so the collector can retry later.
    """
    handle = request.form.get("handle", "").strip().lstrip("@")
    if "@" not in handle:
        flash("Invalid handle format. Use @user@instance.social", "error")
        return redirect(url_for("accounts_list"))

    username, instance = handle.split("@", 1)
    if not username or not instance:
        flash("Invalid handle format. Use @user@instance.social", "error")
        return redirect(url_for("accounts_list"))

    session = get_session()
    try:
        existing = (
            session.query(MonitoredAccount)
            .filter_by(username=username, instance=instance)
            .first()
        )
        if existing:
            if not existing.is_active:
                existing.is_active = True
                session.commit()
                flash(f"Re-activated {existing.handle}", "success")
            else:
                flash(f"{existing.handle} is already being monitored", "info")
            return redirect(url_for("accounts_list"))

        # Try to resolve the account first so we can store its Mastodon id
        # and profile metadata.
        try:
            data = lookup_account(instance, username)
            acct = MonitoredAccount(
                username=username,
                instance=instance,
                account_id=data["id"],
                display_name=data.get("display_name", ""),
                avatar_url=data.get("avatar", ""),
                note=data.get("note", ""),
                is_active=True,
            )
        except MastodonAPIError as e:
            # Add it unresolved; the collector can attempt resolution later.
            logger.warning("Could not resolve account @%s@%s: %s — adding anyway", username, instance, e)
            acct = MonitoredAccount(
                username=username,
                instance=instance,
                is_active=True,
            )

        session.add(acct)
        session.commit()
        flash(f"Added {acct.handle} to monitoring list", "success")
        return redirect(url_for("accounts_list"))

    finally:
        session.close()


@app.route("/accounts/<int:account_id>/toggle", methods=["POST"])
def accounts_toggle(account_id):
    """Toggle an account's active (monitored) flag."""
    session = get_session()
    try:
        # Session.get replaces the legacy Query.get, which is deprecated
        # in SQLAlchemy 2.0 (requirements pin sqlalchemy==2.0.36).
        acct = session.get(MonitoredAccount, account_id)
        if acct:
            acct.is_active = not acct.is_active
            session.commit()
            state = "activated" if acct.is_active else "paused"
            flash(f"{state.capitalize()} monitoring for {acct.handle}", "success")
        return redirect(url_for("accounts_list"))
    finally:
        session.close()


@app.route("/accounts/<int:account_id>/delete", methods=["POST"])
def accounts_delete(account_id):
    """Delete an account and all its collected data."""
    session = get_session()
    try:
        # Session.get replaces the legacy Query.get (deprecated, SQLAlchemy 2.0).
        acct = session.get(MonitoredAccount, account_id)
        if acct:
            handle = acct.handle
            # Delete associated statuses (cascades to mentions, media, tags)
            session.query(Status).filter_by(account_db_id=acct.id).delete()
            session.query(CollectionLog).filter_by(account_db_id=acct.id).delete()
            session.delete(acct)
            session.commit()
            flash(f"Deleted {handle} and all collected data", "success")
        return redirect(url_for("accounts_list"))
    finally:
        session.close()
@app.route("/statuses")
def statuses_list():
    """Browse collected statuses with account/type/text filters.

    Query parameters:
        page, per_page — pagination; both clamped to >= 1 and per_page
                         capped at 500 (matching the JSON API) so malformed
                         values cannot produce negative offsets or
                         unbounded result sets
        account_id     — restrict to one monitored account
        type           — status classification (post/reply/mention/reblog)
        q              — case-insensitive substring match on the plain text
    """
    session = get_session()
    try:
        # Clamp pagination: page=0 or a negative per_page would previously
        # yield a negative OFFSET, and per_page was unbounded.
        page = max(request.args.get("page", 1, type=int), 1)
        per_page = min(max(request.args.get("per_page", 50, type=int), 1), 500)
        account_id = request.args.get("account_id", type=int)
        status_type = request.args.get("type", "")
        search = request.args.get("q", "").strip()

        query = session.query(Status).join(MonitoredAccount)

        if account_id:
            query = query.filter(Status.account_db_id == account_id)
        if status_type:
            query = query.filter(Status.status_type == status_type)
        if search:
            query = query.filter(Status.text_content.ilike(f"%{search}%"))

        total = query.count()
        statuses = (
            query.order_by(desc(Status.created_at))
            .offset((page - 1) * per_page)
            .limit(per_page)
            .all()
        )

        accounts = session.query(MonitoredAccount).order_by(MonitoredAccount.username).all()
        total_pages = max(1, (total + per_page - 1) // per_page)

        return render_template(
            "statuses.html",
            statuses=statuses,
            accounts=accounts,
            page=page,
            per_page=per_page,
            total=total,
            total_pages=total_pages,
            current_account_id=account_id,
            current_type=status_type,
            search=search,
        )
    finally:
        session.close()


@app.route("/statuses/<int:status_db_id>")
def status_detail(status_db_id):
    """View a single collected status with all stored details."""
    session = get_session()
    try:
        # Session.get replaces the legacy Query.get (deprecated, SQLAlchemy 2.0).
        status = session.get(Status, status_db_id)
        if not status:
            flash("Status not found", "error")
            return redirect(url_for("statuses_list"))
        return render_template("status_detail.html", status=status)
    finally:
        session.close()
@app.route("/api/stats")
def api_stats():
    """JSON stats endpoint (for the analysis pipeline).

    Returns the total status count, a per-type breakdown, and — for every
    active account — its handle, collected-status count and last
    collection timestamp (ISO-8601 or null).
    """
    session = get_session()
    try:
        stats = {
            "total_statuses": session.query(func.count(Status.id)).scalar() or 0,
            "by_type": {},
            "accounts": [],
        }
        for stype in ["post", "reply", "mention", "reblog"]:
            stats["by_type"][stype] = (
                session.query(func.count(Status.id)).filter(Status.status_type == stype).scalar() or 0
            )

        # NOTE(review): one count query per account (N+1); acceptable for a
        # small monitored list.
        accounts = session.query(MonitoredAccount).filter_by(is_active=True).all()
        for acct in accounts:
            count = session.query(func.count(Status.id)).filter(Status.account_db_id == acct.id).scalar() or 0
            stats["accounts"].append({
                "handle": acct.handle,
                "status_count": count,
                "last_collected": acct.last_collected_at.isoformat() if acct.last_collected_at else None,
            })

        return jsonify(stats)
    finally:
        session.close()


@app.route("/api/statuses")
def api_statuses():
    """JSON status listing (for the analysis pipeline).

    Query parameters:
        page, per_page — pagination; both clamped to >= 1 (a negative or
                         zero value previously produced a negative OFFSET),
                         per_page capped at 500
        account_id     — restrict to one monitored account
        type           — status classification
        since          — ISO-8601 datetime lower bound; a malformed value
                         returns 400 instead of leaking a database error
    """
    session = get_session()
    try:
        page = max(request.args.get("page", 1, type=int), 1)
        per_page = min(max(request.args.get("per_page", 100, type=int), 1), 500)
        account_id = request.args.get("account_id", type=int)
        status_type = request.args.get("type", "")
        since = request.args.get("since", "")  # ISO datetime

        query = session.query(Status)

        if account_id:
            query = query.filter(Status.account_db_id == account_id)
        if status_type:
            query = query.filter(Status.status_type == status_type)
        if since:
            # Validate up front rather than passing a raw string into the
            # SQL layer (which previously raised on bad input).
            try:
                since_dt = datetime.fromisoformat(since)
            except ValueError:
                return jsonify({"error": "invalid 'since' datetime; use ISO-8601"}), 400
            query = query.filter(Status.created_at >= since_dt)

        total = query.count()
        statuses = (
            query.order_by(desc(Status.created_at))
            .offset((page - 1) * per_page)
            .limit(per_page)
            .all()
        )

        return jsonify({
            "total": total,
            "page": page,
            "per_page": per_page,
            "statuses": [
                {
                    "id": s.id,
                    "status_id": s.status_id,
                    "account": s.account.handle,
                    "url": s.url,
                    "content": s.content,
                    "text_content": s.text_content,
                    "visibility": s.visibility,
                    "created_at": s.created_at.isoformat() if s.created_at else None,
                    "language": s.language,
                    "status_type": s.status_type,
                    "in_reply_to_id": s.in_reply_to_id,
                    "replies_count": s.replies_count,
                    "reblogs_count": s.reblogs_count,
                    "favourites_count": s.favourites_count,
                    "mentions": [
                        {"acct": m.mentioned_acct, "url": m.mentioned_url}
                        for m in s.mentions
                    ],
                    "tags": [t.name for t in s.tags],
                }
                for s in statuses
            ],
        })
    finally:
        session.close()


@app.route("/export")
def export_csv():
    """Export (optionally filtered) statuses as a CSV attachment."""
    from io import StringIO
    import csv

    from flask import Response

    session = get_session()
    try:
        account_id = request.args.get("account_id", type=int)
        status_type = request.args.get("type", "")

        query = session.query(Status).join(MonitoredAccount)
        if account_id:
            query = query.filter(Status.account_db_id == account_id)
        if status_type:
            query = query.filter(Status.status_type == status_type)

        # NOTE(review): materializes every matching row in memory; switch to
        # a streamed response if exports grow large.
        statuses = query.order_by(desc(Status.created_at)).all()

        output = StringIO()
        writer = csv.writer(output)
        writer.writerow([
            "id", "account", "status_type", "created_at", "url",
            "text_content", "language", "visibility", "in_reply_to_id",
            "replies_count", "reblogs_count", "favourites_count",
            "mentions", "tags", "sensitive", "spoiler_text",
        ])

        for s in statuses:
            mentions_str = "; ".join(m.mentioned_acct for m in s.mentions)
            tags_str = "; ".join(t.name for t in s.tags)
            writer.writerow([
                s.status_id, s.account.handle, s.status_type,
                s.created_at.isoformat() if s.created_at else "",
                s.url, s.text_content, s.language, s.visibility,
                s.in_reply_to_id, s.replies_count, s.reblogs_count,
                s.favourites_count, mentions_str, tags_str,
                s.sensitive, s.spoiler_text,
            ])

        return Response(
            output.getvalue(),
            mimetype="text/csv",
            headers={"Content-Disposition": "attachment; filename=mastodon_statuses.csv"},
        )
    finally:
        session.close()
app.run(host="0.0.0.0", port=5000, debug=True) diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..7c2d970 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,50 @@ +version: "3.8" + +services: + db: + image: postgres:16-alpine + restart: unless-stopped + environment: + POSTGRES_DB: mastodon_collector + POSTGRES_USER: collector + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-collector_secret} + volumes: + - pgdata:/var/lib/postgresql/data + ports: + - "127.0.0.1:5434:5432" + healthcheck: + test: ["CMD-SHELL", "pg_isready -U collector -d mastodon_collector"] + interval: 5s + timeout: 5s + retries: 5 + + web: + build: . + restart: unless-stopped + command: gunicorn --bind 0.0.0.0:5000 --workers 2 --timeout 120 app.web:app + ports: + - "127.0.0.1:8585:5000" + environment: + DATABASE_URL: postgresql://collector:${POSTGRES_PASSWORD:-collector_secret}@db:5432/mastodon_collector + FLASK_SECRET_KEY: ${FLASK_SECRET_KEY:-change-me-in-production} + volumes: + - ./accounts.txt:/app/accounts.txt + depends_on: + db: + condition: service_healthy + + collector: + build: . + restart: unless-stopped + command: python -m app.collector + environment: + DATABASE_URL: postgresql://collector:${POSTGRES_PASSWORD:-collector_secret}@db:5432/mastodon_collector + POLL_INTERVAL_SECONDS: ${POLL_INTERVAL_SECONDS:-14400} + volumes: + - ./accounts.txt:/app/accounts.txt + depends_on: + db: + condition: service_healthy + +volumes: + pgdata: diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..b554053 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +flask==3.1.0 +gunicorn==23.0.0 +psycopg2-binary==2.9.10 +sqlalchemy==2.0.36 +requests==2.32.3 +apscheduler==3.10.4