commit 1783a48d7c1c9f42fa2cf63b347a2efa89e412d9
Author: Pieter
Date: Mon Feb 9 08:05:54 2026 +0100
Initial commit: Mastodon collector application
Add Flask-based application for collecting and archiving Mastodon posts from configured accounts.
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..24a9226
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,6 @@
+__pycache__
+*.pyc
+.env
+.git
+.gitignore
+README.md
diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..1904194
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,8 @@
+# PostgreSQL Configuration
+POSTGRES_PASSWORD=your_secure_password_here
+
+# Flask Configuration
+FLASK_SECRET_KEY=your_secure_secret_key_here
+
+# Polling Configuration
+POLL_INTERVAL_SECONDS=14400
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..595ac22
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,115 @@
+# Environment variables and secrets
+.env
+.env.local
+.env.*.local
+*.secret
+secrets/
+credentials/
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# Virtual environments
+venv/
+env/
+ENV/
+env.bak/
+venv.bak/
+.venv/
+
+# IDEs
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+.DS_Store
+
+# Database files
+*.sqlite
+*.sqlite3
+*.db
+*.db-journal
+*.db-shm
+*.db-wal
+postgres_data/
+pgdata/
+
+# Logs
+*.log
+logs/
+*.log.*
+
+# Docker volumes and local data
+docker-compose.override.yml
+.docker/
+volumes/
+
+# Certificates and keys
+*.pem
+*.key
+*.crt
+*.cer
+*.p12
+*.pfx
+
+# Backup files
+*.bak
+*.backup
+*.tmp
+*~
+
+# OS generated files
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
+
+# Testing
+.pytest_cache/
+.coverage
+htmlcov/
+.tox/
+.hypothesis/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# pyenv
+.python-version
+
+# pipenv
+Pipfile.lock
+
+# Poetry
+poetry.lock
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..b1b4a17
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,20 @@
+FROM python:3.12-slim
+
+WORKDIR /app
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ libpq-dev gcc \
+ && rm -rf /var/lib/apt/lists/*
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY . .
+
+# Create empty accounts file if it doesn't exist
+RUN touch /app/accounts.txt
+
+EXPOSE 5000
+
+CMD ["python", "-m", "app.collector"]
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..8b8a56c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,91 @@
+# Mastodon Collector
+
+Collects posts, replies, and mentions from a list of Mastodon accounts and stores them in PostgreSQL. Includes a web UI for account management and data browsing, plus JSON/CSV APIs for your analysis pipeline.
+
+## Quick Start
+
+```bash
+# 1. Add accounts to monitor
+echo "@user@mastodon.social" >> accounts.txt
+
+# 2. Start everything
+docker compose up -d
+
+# 3. Open the dashboard
+open http://localhost:8585
+```
+
+## Architecture
+
+| Service | Description | Port |
+|---------------|------------------------------------------------|-------|
+| **db** | PostgreSQL 16 | 5432 |
+| **web** | Flask dashboard (Gunicorn) | 8585 |
+| **collector** | Background service, polls every 4 hours | — |
+
+## Adding Accounts
+
+Two methods:
+
+1. **Text file** — edit `accounts.txt`, one handle per line (`@user@instance.social`). Picked up on next collection cycle.
+2. **Web UI** — go to http://localhost:8585/accounts and use the form.
+
+## Configuration
+
+Edit `.env` to customize:
+
+```
+POSTGRES_PASSWORD=collector_secret # Change for production
+FLASK_SECRET_KEY=change-me-in-production
+POLL_INTERVAL_SECONDS=14400 # Default: 4 hours (14400s)
+```
+
+## API Endpoints
+
+For plugging into your analysis pipeline:
+
+| Endpoint | Description |
+|-----------------------|--------------------------------------|
+| `GET /api/stats` | Overview stats (counts by type) |
+| `GET /api/statuses` | Paginated statuses as JSON |
+| `GET /export` | Download all statuses as CSV |
+
+### `/api/statuses` parameters
+
+- `page` — page number (default: 1)
+- `per_page` — results per page (default: 100, max: 500)
+- `account_id` — filter by internal account ID
+- `type` — filter by status type: `post`, `reply`, `mention`, `reblog`
+- `since` — ISO datetime, only return statuses after this time
+
+## Database Schema
+
+Main tables:
+
+- `monitored_accounts` — accounts being tracked
+- `statuses` — collected posts with plain text + HTML content
+- `mentions` — who was @-mentioned in each status
+- `media_attachments` — images/videos attached to statuses
+- `tags` — hashtags used
+- `collection_logs` — audit trail of each collection run
+
+Each status stores `raw_json` with the full Mastodon API response for future analysis needs.
+
+## Moving to a Server
+
+```bash
+# Copy the project
+scp -r mastodon-collector/ user@server:~/
+
+# On the server
+cd mastodon-collector
+# Edit .env with production secrets
+docker compose up -d
+```
+
+## Stopping
+
+```bash
+docker compose down # Stop services, keep data
+docker compose down -v # Stop services AND delete database
+```
diff --git a/app/__init__.py b/app/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/__main__.py b/app/__main__.py
new file mode 100644
index 0000000..616c627
--- /dev/null
+++ b/app/__main__.py
@@ -0,0 +1,4 @@
+"""Allow running the collector with `python -m app`."""
+from app.collector import main
+
+main()
diff --git a/app/collector.py b/app/collector.py
new file mode 100644
index 0000000..0c163b2
--- /dev/null
+++ b/app/collector.py
@@ -0,0 +1,306 @@
+"""
+Collector service — periodically polls Mastodon for new statuses from monitored accounts.
+Runs as a standalone process via `python -m app.collector`.
+"""
+
+import logging
+import os
+import sys
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+
+from apscheduler.schedulers.blocking import BlockingScheduler
+
+from app.db import (
+ init_db,
+ get_session,
+ MonitoredAccount,
+ Status,
+ Mention,
+ MediaAttachment,
+ Tag,
+ CollectionLog,
+)
+from app.mastodon_api import (
+ lookup_account,
+ get_account_statuses,
+ parse_status,
+ MastodonAPIError,
+ RateLimitError,
+)
+
+logging.basicConfig(
+ level=logging.INFO,
+ format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+ handlers=[logging.StreamHandler(sys.stdout)],
+)
+logger = logging.getLogger("collector")
+
+POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL_SECONDS", 14400))
+ACCOUNTS_FILE = os.environ.get("ACCOUNTS_FILE", "/app/accounts.txt")
+
+
+def load_accounts_from_file(filepath: str) -> list[tuple[str, str]]:
+ """Parse accounts.txt and return list of (username, instance) tuples."""
+ accounts = []
+ path = Path(filepath)
+ if not path.exists():
+ logger.warning("Accounts file not found: %s", filepath)
+ return accounts
+
+ for line in path.read_text().splitlines():
+ line = line.strip()
+ if not line or line.startswith("#"):
+ continue
+ # Expected format: @user@instance.social or user@instance.social
+ line = line.lstrip("@")
+ if "@" not in line:
+ logger.warning("Skipping malformed account line: %s", line)
+ continue
+ parts = line.split("@", 1)
+ if len(parts) == 2 and parts[0] and parts[1]:
+ accounts.append((parts[0], parts[1]))
+ else:
+ logger.warning("Skipping malformed account line: %s", line)
+ return accounts
+
+
+def sync_monitored_accounts(session) -> list[MonitoredAccount]:
+ """
+ Sync accounts from the file + database.
+ Accounts added via web UI are already in the DB.
+ Accounts in the file get added if missing.
+ Returns all active monitored accounts.
+ """
+ file_accounts = load_accounts_from_file(ACCOUNTS_FILE)
+
+ for username, instance in file_accounts:
+ existing = (
+ session.query(MonitoredAccount)
+ .filter_by(username=username, instance=instance)
+ .first()
+ )
+ if not existing:
+ logger.info("Adding account from file: @%s@%s", username, instance)
+ acct = MonitoredAccount(username=username, instance=instance, is_active=True)
+ session.add(acct)
+
+ session.commit()
+ return session.query(MonitoredAccount).filter_by(is_active=True).all()
+
+
+def resolve_account(session, account: MonitoredAccount) -> bool:
+ """Look up the Mastodon account ID if we don't have it yet."""
+ if account.account_id:
+ return True
+
+ try:
+ data = lookup_account(account.instance, account.username)
+ account.account_id = data["id"]
+ account.display_name = data.get("display_name", "")
+ account.avatar_url = data.get("avatar", "")
+ account.note = data.get("note", "")
+ session.commit()
+ logger.info("Resolved %s → account_id=%s", account.handle, account.account_id)
+ return True
+ except MastodonAPIError as e:
+ logger.error("Failed to resolve %s: %s", account.handle, e)
+ return False
+
+
+def store_status(session, account: MonitoredAccount, parsed: dict) -> bool:
+ """Store a parsed status in the database. Returns True if new, False if duplicate."""
+ # Check for duplicate
+ existing = (
+ session.query(Status)
+ .filter_by(status_id=parsed["status_id"], account_db_id=account.id)
+ .first()
+ )
+ if existing:
+ # Update interaction counts in case they changed
+ existing.replies_count = parsed["replies_count"]
+ existing.reblogs_count = parsed["reblogs_count"]
+ existing.favourites_count = parsed["favourites_count"]
+ return False
+
+ status = Status(
+ status_id=parsed["status_id"],
+ account_db_id=account.id,
+ uri=parsed["uri"],
+ url=parsed["url"],
+ content=parsed["content"],
+ text_content=parsed["text_content"],
+ visibility=parsed["visibility"],
+ created_at=parsed["created_at"],
+ language=parsed["language"],
+ sensitive=parsed["sensitive"],
+ spoiler_text=parsed["spoiler_text"],
+ in_reply_to_id=parsed["in_reply_to_id"],
+ in_reply_to_account_id=parsed["in_reply_to_account_id"],
+ conversation_id=parsed["conversation_id"],
+ replies_count=parsed["replies_count"],
+ reblogs_count=parsed["reblogs_count"],
+ favourites_count=parsed["favourites_count"],
+ status_type=parsed["status_type"],
+ raw_json=parsed["raw_json"],
+ )
+ session.add(status)
+ session.flush() # get status.id
+
+ # Store mentions
+ for m in parsed["mentions"]:
+ session.add(Mention(
+ status_db_id=status.id,
+ mentioned_account_id=m["mentioned_account_id"],
+ mentioned_username=m["mentioned_username"],
+ mentioned_acct=m["mentioned_acct"],
+ mentioned_url=m["mentioned_url"],
+ ))
+
+ # Store media
+ for ma in parsed["media_attachments"]:
+ session.add(MediaAttachment(
+ status_db_id=status.id,
+ media_id=ma["media_id"],
+ media_type=ma["media_type"],
+ url=ma["url"],
+ preview_url=ma["preview_url"],
+ description=ma["description"],
+ ))
+
+ # Store tags
+ for t in parsed["tags"]:
+ session.add(Tag(
+ status_db_id=status.id,
+ name=t["name"],
+ url=t["url"],
+ ))
+
+ return True
+
+
+def collect_account(session, account: MonitoredAccount) -> int:
+ """Collect new statuses for a single account. Returns count of new statuses."""
+ log = CollectionLog(account_db_id=account.id, status="running")
+ session.add(log)
+ session.commit()
+
+ try:
+ if not resolve_account(session, account):
+ log.status = "error"
+ log.error = "Could not resolve account ID"
+ log.finished_at = datetime.now(timezone.utc)
+ session.commit()
+ return 0
+
+ logger.info("Collecting statuses for %s (since_id=%s)", account.handle, account.last_status_id)
+
+ raw_statuses = get_account_statuses(
+ instance=account.instance,
+ account_id=account.account_id,
+ since_id=account.last_status_id,
+ )
+
+ new_count = 0
+ newest_id = account.last_status_id
+
+ for raw in raw_statuses:
+ parsed = parse_status(raw, account.account_id)
+ is_new = store_status(session, account, parsed)
+ if is_new:
+ new_count += 1
+
+ # Track the newest status ID
+ sid = parsed["status_id"]
+ if newest_id is None or sid > newest_id:
+ newest_id = sid
+
+ if newest_id:
+ account.last_status_id = newest_id
+ account.last_collected_at = datetime.now(timezone.utc)
+
+ log.statuses_collected = new_count
+ log.status = "success"
+ log.finished_at = datetime.now(timezone.utc)
+ session.commit()
+
+ logger.info("Collected %d new statuses for %s (total fetched: %d)",
+ new_count, account.handle, len(raw_statuses))
+ return new_count
+
+ except RateLimitError as e:
+ log.status = "error"
+ log.error = f"Rate limited: {e}"
+ log.finished_at = datetime.now(timezone.utc)
+ session.commit()
+ logger.warning("Rate limited while collecting %s: %s", account.handle, e)
+ time.sleep(e.retry_after)
+ return 0
+
+ except MastodonAPIError as e:
+ log.status = "error"
+ log.error = str(e)
+ log.finished_at = datetime.now(timezone.utc)
+ session.commit()
+ logger.error("API error collecting %s: %s", account.handle, e)
+ return 0
+
+ except Exception as e:
+ log.status = "error"
+ log.error = str(e)
+ log.finished_at = datetime.now(timezone.utc)
+ session.commit()
+ logger.exception("Unexpected error collecting %s", account.handle)
+ return 0
+
+
+def run_collection_cycle():
+ """Run one full collection cycle across all monitored accounts."""
+ logger.info("=== Starting collection cycle ===")
+ session = get_session()
+
+ try:
+ accounts = sync_monitored_accounts(session)
+ logger.info("Monitoring %d active accounts", len(accounts))
+
+ total_new = 0
+ for account in accounts:
+ new = collect_account(session, account)
+ total_new += new
+ time.sleep(1) # Brief pause between accounts to be polite
+
+ logger.info("=== Collection cycle complete: %d new statuses across %d accounts ===",
+ total_new, len(accounts))
+
+ except Exception:
+ logger.exception("Fatal error in collection cycle")
+ finally:
+ session.close()
+
+
+def main():
+ """Entry point: initialize DB and start the scheduler."""
+ logger.info("Mastodon Collector starting up...")
+ logger.info("Poll interval: %d seconds (%d hours)", POLL_INTERVAL, POLL_INTERVAL // 3600)
+
+ init_db()
+ logger.info("Database initialized")
+
+ # Run one collection immediately on startup
+ run_collection_cycle()
+
+ # Schedule recurring collection
+ scheduler = BlockingScheduler()
+ scheduler.add_job(run_collection_cycle, "interval", seconds=POLL_INTERVAL)
+ logger.info("Scheduler started — next run in %d seconds", POLL_INTERVAL)
+
+ try:
+ scheduler.start()
+ except (KeyboardInterrupt, SystemExit):
+ logger.info("Collector shutting down")
+ scheduler.shutdown()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/app/db.py b/app/db.py
new file mode 100644
index 0000000..f2cd94d
--- /dev/null
+++ b/app/db.py
@@ -0,0 +1,186 @@
+"""Database models and session management."""
+
+import os
+from datetime import datetime, timezone
+
+from sqlalchemy import (
+ create_engine,
+ Column,
+ Integer,
+ BigInteger,
+ String,
+ Text,
+ Boolean,
+ DateTime,
+ ForeignKey,
+ Index,
+ UniqueConstraint,
+ JSON,
+)
+from sqlalchemy.orm import declarative_base, sessionmaker, relationship
+
+DATABASE_URL = os.environ.get(
+ "DATABASE_URL", "postgresql://collector:collector_secret@localhost:5432/mastodon_collector"
+)
+
+engine = create_engine(DATABASE_URL, pool_pre_ping=True, pool_size=5, max_overflow=10)
+SessionLocal = sessionmaker(bind=engine)
+Base = declarative_base()
+
+
+class MonitoredAccount(Base):
+ """An account we are monitoring."""
+
+ __tablename__ = "monitored_accounts"
+
+ id = Column(Integer, primary_key=True, autoincrement=True)
+ username = Column(String(255), nullable=False) # e.g. "user"
+ instance = Column(String(255), nullable=False) # e.g. "mastodon.social"
+ account_id = Column(String(64), nullable=True) # Mastodon numeric account ID on that instance
+ display_name = Column(String(512), nullable=True)
+ avatar_url = Column(Text, nullable=True)
+ is_active = Column(Boolean, default=True, nullable=False)
+ last_collected_at = Column(DateTime(timezone=True), nullable=True)
+ last_status_id = Column(String(64), nullable=True) # For pagination: newest status ID we've seen
+ created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
+ note = Column(Text, nullable=True) # Bio / description
+
+ statuses = relationship("Status", back_populates="account", lazy="dynamic")
+
+ __table_args__ = (
+ UniqueConstraint("username", "instance", name="uq_account_handle"),
+ )
+
+ @property
+ def handle(self):
+ return f"@{self.username}@{self.instance}"
+
+ def __repr__(self):
+ return f"<MonitoredAccount {self.handle}>"
+
+
+class Status(Base):
+ """A single post / toot collected from Mastodon."""
+
+ __tablename__ = "statuses"
+
+ id = Column(Integer, primary_key=True, autoincrement=True)
+ status_id = Column(String(64), nullable=False) # Mastodon status ID
+ account_db_id = Column(Integer, ForeignKey("monitored_accounts.id"), nullable=False)
+ uri = Column(Text, nullable=False) # Canonical ActivityPub URI
+ url = Column(Text, nullable=True) # Human-readable URL
+ content = Column(Text, nullable=False) # HTML content
+ text_content = Column(Text, nullable=True) # Stripped plain-text content
+ visibility = Column(String(32), nullable=True) # public, unlisted, private, direct
+ created_at = Column(DateTime(timezone=True), nullable=False)
+ collected_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
+ language = Column(String(16), nullable=True)
+ sensitive = Column(Boolean, default=False)
+ spoiler_text = Column(Text, nullable=True)
+
+ # Reply / conversation tracking
+ in_reply_to_id = Column(String(64), nullable=True) # Status ID being replied to
+ in_reply_to_account_id = Column(String(64), nullable=True)
+ conversation_id = Column(String(64), nullable=True)
+
+ # Interaction counts
+ replies_count = Column(Integer, default=0)
+ reblogs_count = Column(Integer, default=0)
+ favourites_count = Column(Integer, default=0)
+
+ # Classification for your analysis pipeline
+ status_type = Column(String(32), nullable=False, default="post") # post, reply, mention, reblog
+
+ # Store the full JSON for future reference
+ raw_json = Column(JSON, nullable=True)
+
+ # Relationships
+ account = relationship("MonitoredAccount", back_populates="statuses")
+ mentions = relationship("Mention", back_populates="status", cascade="all, delete-orphan")
+ media_attachments = relationship("MediaAttachment", back_populates="status", cascade="all, delete-orphan")
+ tags = relationship("Tag", back_populates="status", cascade="all, delete-orphan")
+
+ __table_args__ = (
+ UniqueConstraint("status_id", "account_db_id", name="uq_status_per_account"),
+ Index("ix_status_created", "created_at"),
+ Index("ix_status_type", "status_type"),
+ Index("ix_status_account", "account_db_id"),
+ Index("ix_status_conversation", "conversation_id"),
+ )
+
+ def __repr__(self):
+ return f"<Status {self.status_id} ({self.status_type})>"
+
+
+class Mention(Base):
+ """A mention within a status (who was @-mentioned)."""
+
+ __tablename__ = "mentions"
+
+ id = Column(Integer, primary_key=True, autoincrement=True)
+ status_db_id = Column(Integer, ForeignKey("statuses.id", ondelete="CASCADE"), nullable=False)
+ mentioned_account_id = Column(String(64), nullable=True)
+ mentioned_username = Column(String(255), nullable=False)
+ mentioned_acct = Column(String(512), nullable=False) # full user@instance
+ mentioned_url = Column(Text, nullable=True)
+
+ status = relationship("Status", back_populates="mentions")
+
+
+class MediaAttachment(Base):
+ """Media attached to a status."""
+
+ __tablename__ = "media_attachments"
+
+ id = Column(Integer, primary_key=True, autoincrement=True)
+ status_db_id = Column(Integer, ForeignKey("statuses.id", ondelete="CASCADE"), nullable=False)
+ media_id = Column(String(64), nullable=True)
+ media_type = Column(String(32), nullable=True) # image, video, gifv, audio
+ url = Column(Text, nullable=True)
+ preview_url = Column(Text, nullable=True)
+ description = Column(Text, nullable=True) # alt text
+
+ status = relationship("Status", back_populates="media_attachments")
+
+
+class Tag(Base):
+ """A hashtag used in a status."""
+
+ __tablename__ = "tags"
+
+ id = Column(Integer, primary_key=True, autoincrement=True)
+ status_db_id = Column(Integer, ForeignKey("statuses.id", ondelete="CASCADE"), nullable=False)
+ name = Column(String(255), nullable=False)
+ url = Column(Text, nullable=True)
+
+ status = relationship("Status", back_populates="tags")
+
+ __table_args__ = (
+ Index("ix_tag_name", "name"),
+ )
+
+
+class CollectionLog(Base):
+ """Log of each collection run for monitoring."""
+
+ __tablename__ = "collection_logs"
+
+ id = Column(Integer, primary_key=True, autoincrement=True)
+ account_db_id = Column(Integer, ForeignKey("monitored_accounts.id"), nullable=True)
+ started_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
+ finished_at = Column(DateTime(timezone=True), nullable=True)
+ statuses_collected = Column(Integer, default=0)
+ error = Column(Text, nullable=True)
+ status = Column(String(32), default="running") # running, success, error
+
+ account = relationship("MonitoredAccount")
+
+
+def init_db():
+ """Create all tables."""
+ Base.metadata.create_all(engine)
+
+
+def get_session():
+ """Get a new database session."""
+ return SessionLocal()
diff --git a/app/mastodon_api.py b/app/mastodon_api.py
new file mode 100644
index 0000000..9fd7026
--- /dev/null
+++ b/app/mastodon_api.py
@@ -0,0 +1,226 @@
+"""Mastodon public API client — no authentication required."""
+
+import logging
+import re
+import time
+from html import unescape
+from typing import Optional
+from urllib.parse import urljoin
+
+import requests
+
+logger = logging.getLogger(__name__)
+
+# Respect rate limits: Mastodon returns 300 requests per 5 min by default
+DEFAULT_TIMEOUT = 30
+MAX_RETRIES = 3
+RETRY_BACKOFF = 5 # seconds
+
+
+class MastodonAPIError(Exception):
+ pass
+
+
+class RateLimitError(MastodonAPIError):
+ def __init__(self, retry_after: float = 60):
+ self.retry_after = retry_after
+ super().__init__(f"Rate limited, retry after {retry_after}s")
+
+
+def _strip_html(html: str) -> str:
+ """Strip HTML tags and decode entities to get plain text."""
+ # Replace <br> and </p> with newlines
+ text = re.sub(r"<br\s*/?>", "\n", html)
+ text = re.sub(r"</p>", "\n", text)
+ # Remove all remaining tags
+ text = re.sub(r"<[^>]+>", "", text)
+ return unescape(text).strip()
+
+
+def _api_get(instance: str, path: str, params: Optional[dict] = None) -> requests.Response:
+ """Make a GET request to a Mastodon instance's public API."""
+ url = f"https://{instance}{path}"
+ headers = {"Accept": "application/json", "User-Agent": "MastodonCollector/1.0"}
+
+ for attempt in range(MAX_RETRIES):
+ try:
+ resp = requests.get(url, params=params, headers=headers, timeout=DEFAULT_TIMEOUT)
+
+ if resp.status_code == 429:
+ retry_after = float(resp.headers.get("X-RateLimit-Reset", 60))
+ # If it's an ISO timestamp, calculate delta
+ if retry_after > 1_000_000:
+ retry_after = 60
+ logger.warning("Rate limited by %s, waiting %.0fs", instance, retry_after)
+ raise RateLimitError(retry_after)
+
+ if resp.status_code == 404:
+ raise MastodonAPIError(f"Not found: {url}")
+
+ resp.raise_for_status()
+ return resp
+
+ except RateLimitError:
+ raise
+ except requests.RequestException as e:
+ if attempt < MAX_RETRIES - 1:
+ wait = RETRY_BACKOFF * (attempt + 1)
+ logger.warning("Request to %s failed (attempt %d/%d): %s — retrying in %ds",
+ url, attempt + 1, MAX_RETRIES, e, wait)
+ time.sleep(wait)
+ else:
+ raise MastodonAPIError(f"Failed after {MAX_RETRIES} attempts: {e}") from e
+
+ raise MastodonAPIError("Unexpected retry exhaustion")
+
+
+def lookup_account(instance: str, username: str) -> dict:
+ """Look up an account on an instance by username. Returns the account JSON."""
+ # Try the v1 lookup endpoint first (available on most instances)
+ try:
+ resp = _api_get(instance, "/api/v1/accounts/lookup", {"acct": username})
+ return resp.json()
+ except MastodonAPIError:
+ pass
+
+ # Fallback: search for the account
+ resp = _api_get(instance, "/api/v2/search", {"q": f"@{username}@{instance}", "type": "accounts", "limit": 1})
+ data = resp.json()
+ accounts = data.get("accounts", [])
+ if not accounts:
+ raise MastodonAPIError(f"Account @{username} not found on {instance}")
+ return accounts[0]
+
+
+def get_account_statuses(
+ instance: str,
+ account_id: str,
+ since_id: Optional[str] = None,
+ limit: int = 40,
+ exclude_reblogs: bool = False,
+) -> list[dict]:
+ """
+ Fetch statuses from an account. Handles pagination to get all new statuses.
+ Returns list of status dicts, oldest first.
+ """
+ all_statuses = []
+ params = {"limit": min(limit, 40)}
+ if since_id:
+ params["since_id"] = since_id
+ if exclude_reblogs:
+ params["exclude_reblogs"] = "true"
+
+ path = f"/api/v1/accounts/{account_id}/statuses"
+
+ # Paginate through results
+ max_pages = 25 # safety limit
+ page = 0
+
+ while page < max_pages:
+ resp = _api_get(instance, path, params)
+ statuses = resp.json()
+
+ if not statuses:
+ break
+
+ all_statuses.extend(statuses)
+ page += 1
+
+ # Check Link header for next page
+ link_header = resp.headers.get("Link", "")
+ next_match = re.search(r'<([^>]+)>;\s*rel="next"', link_header)
+ if not next_match:
+ break
+
+ # Parse the next URL for max_id
+ next_url = next_match.group(1)
+ max_id_match = re.search(r"max_id=(\d+)", next_url)
+ if not max_id_match:
+ break
+
+ params["max_id"] = max_id_match.group(1)
+ # Remove since_id for subsequent pages — we're paginating backwards
+ # Actually we keep since_id as the floor
+ time.sleep(0.5) # Be polite between pages
+
+ # Return oldest first so we can process chronologically
+ all_statuses.reverse()
+ return all_statuses
+
+
+def get_status_context(instance: str, status_id: str) -> dict:
+ """Get the context (ancestors + descendants) of a status. Useful for threading."""
+ resp = _api_get(instance, f"/api/v1/statuses/{status_id}/context")
+ return resp.json()
+
+
+def classify_status(status: dict, monitored_account_id: str) -> str:
+ """
+ Classify a status as: post, reply, mention, or reblog.
+ - reblog: the status is a boost of another status
+ - reply: the status is in reply to another status
+ - mention: the status mentions other accounts (but is not a reply)
+ - post: a standalone original post
+ """
+ if status.get("reblog"):
+ return "reblog"
+ if status.get("in_reply_to_id"):
+ return "reply"
+ mentions = status.get("mentions", [])
+ if mentions:
+ # Only classify as "mention" if it mentions someone other than self
+ other_mentions = [m for m in mentions if m.get("id") != monitored_account_id]
+ if other_mentions:
+ return "mention"
+ return "post"
+
+
+def parse_status(status: dict, monitored_account_id: str) -> dict:
+ """Parse a raw Mastodon status JSON into a flat dict for storage."""
+ # If it's a reblog, we store the original content but flag it
+ actual = status.get("reblog") or status
+ content_html = actual.get("content", "")
+
+ return {
+ "status_id": status["id"],
+ "uri": status.get("uri", ""),
+ "url": status.get("url") or actual.get("url", ""),
+ "content": content_html,
+ "text_content": _strip_html(content_html),
+ "visibility": status.get("visibility", "public"),
+ "created_at": status.get("created_at"),
+ "language": status.get("language") or actual.get("language"),
+ "sensitive": status.get("sensitive", False),
+ "spoiler_text": status.get("spoiler_text", ""),
+ "in_reply_to_id": status.get("in_reply_to_id"),
+ "in_reply_to_account_id": status.get("in_reply_to_account_id"),
+ "conversation_id": status.get("conversation", {}).get("id") if isinstance(status.get("conversation"), dict) else None,
+ "replies_count": status.get("replies_count", 0),
+ "reblogs_count": status.get("reblogs_count", 0),
+ "favourites_count": status.get("favourites_count", 0),
+ "status_type": classify_status(status, monitored_account_id),
+ "mentions": [
+ {
+ "mentioned_account_id": m.get("id"),
+ "mentioned_username": m.get("username", ""),
+ "mentioned_acct": m.get("acct", ""),
+ "mentioned_url": m.get("url", ""),
+ }
+ for m in (actual.get("mentions") or [])
+ ],
+ "media_attachments": [
+ {
+ "media_id": ma.get("id"),
+ "media_type": ma.get("type"),
+ "url": ma.get("url"),
+ "preview_url": ma.get("preview_url"),
+ "description": ma.get("description"),
+ }
+ for ma in (actual.get("media_attachments") or [])
+ ],
+ "tags": [
+ {"name": t.get("name", ""), "url": t.get("url", "")}
+ for t in (actual.get("tags") or [])
+ ],
+ "raw_json": status,
+ }
diff --git a/app/templates/accounts.html b/app/templates/accounts.html
new file mode 100644
index 0000000..720621b
--- /dev/null
+++ b/app/templates/accounts.html
@@ -0,0 +1,77 @@
+{% extends "base.html" %}
+{% block title %}Accounts — Mastodon Collector{% endblock %}
+
+{% block content %}
+
+
Monitored Accounts
+
+
+
+
Add Account
+
+
+ You can also add accounts by editing accounts.txt — the collector picks them up automatically.
+
+
+
+
+
+
+
+ Handle
+ Display Name
+ Account ID
+ Status
+ Last Collected
+ Actions
+
+
+
+ {% for acct in accounts %}
+
+
+
+ {{ acct.handle }}
+
+
+ {{ acct.display_name or '—' }}
+ {{ acct.account_id or 'unresolved' }}
+
+ {% if acct.is_active %}
+ Active
+ {% else %}
+ Paused
+ {% endif %}
+
+
+ {{ acct.last_collected_at.strftime('%Y-%m-%d %H:%M') if acct.last_collected_at else 'Never' }}
+
+
+
+
+
+
+
+
+ {% endfor %}
+ {% if not accounts %}
+
+
+ No accounts yet. Add one above or edit accounts.txt.
+
+
+ {% endif %}
+
+
+
+{% endblock %}
diff --git a/app/templates/base.html b/app/templates/base.html
new file mode 100644
index 0000000..d370f5a
--- /dev/null
+++ b/app/templates/base.html
@@ -0,0 +1,262 @@
+
+
+
+
+
+ {% block title %}Mastodon Collector{% endblock %}
+
+
+
+
+
+
+
+
+
+ {% with messages = get_flashed_messages(with_categories=true) %}
+ {% for category, message in messages %}
+
{{ message }}
+ {% endfor %}
+ {% endwith %}
+
+ {% block content %}{% endblock %}
+
+
+
+
diff --git a/app/templates/index.html b/app/templates/index.html
new file mode 100644
index 0000000..e2e04d0
--- /dev/null
+++ b/app/templates/index.html
@@ -0,0 +1,123 @@
+{% extends "base.html" %}
+{% block title %}Dashboard — Mastodon Collector{% endblock %}
+
+{% block content %}
+Dashboard
+
+
+
+
{{ total_statuses }}
+
Total Statuses
+
+
+
{{ total_posts }}
+
Posts
+
+
+
{{ total_replies }}
+
Replies
+
+
+
{{ total_mentions }}
+
Mentions
+
+
+
{{ total_reblogs }}
+
Reblogs
+
+
+
{{ account_stats|length }}
+
Monitored Accounts
+
+
+
+
+
Monitored Accounts
+
+
+
+ Account
+ Instance
+ Status
+ Collected
+ Last Run
+
+
+
+ {% for item in account_stats %}
+
+
+
+ {{ item.account.handle }}
+
+ {% if item.account.display_name %}
+ — {{ item.account.display_name }}
+ {% endif %}
+
+ {{ item.account.instance }}
+
+ {% if item.account.is_active %}
+ Active
+ {% else %}
+ Paused
+ {% endif %}
+
+ {{ item.status_count }}
+
+ {% if item.account.last_collected_at %}
+ {{ item.account.last_collected_at.strftime('%Y-%m-%d %H:%M') }}
+ {% if item.last_log %}
+ {{ item.last_log.status }}
+ {% endif %}
+ {% else %}
+ Never
+ {% endif %}
+
+
+ {% endfor %}
+ {% if not account_stats %}
+
+
+ No accounts being monitored yet.
+ Add some accounts .
+
+
+ {% endif %}
+
+
+
+
+{% if recent_logs %}
+
+
Recent Collection Runs
+
+
+
+ Time
+ Account
+ Status
+ Collected
+ Error
+
+
+
+ {% for log in recent_logs %}
+
+ {{ log.started_at.strftime('%Y-%m-%d %H:%M:%S') if log.started_at }}
+
+ {% if log.account %}
+ {{ log.account.handle }}
+ {% else %}
+ —
+ {% endif %}
+
+ {{ log.status }}
+ {{ log.statuses_collected }}
+ {{ log.error or '—' }}
+
+ {% endfor %}
+
+
+
+{% endif %}
+{% endblock %}
diff --git a/app/templates/status_detail.html b/app/templates/status_detail.html
new file mode 100644
index 0000000..f60972f
--- /dev/null
+++ b/app/templates/status_detail.html
@@ -0,0 +1,140 @@
+{% extends "base.html" %}
+{% block title %}Status Detail — Mastodon Collector{% endblock %}
+
+{% block content %}
+
+
+
+
+
+ {{ status.status_type }}
+
+ {{ status.created_at.strftime('%Y-%m-%d %H:%M:%S UTC') if status.created_at }}
+
+
+
+
+
+
+
+ Account
+ {{ status.account.handle }}{% if status.account.display_name %} — {{ status.account.display_name }}{% endif %}
+
+
+ Visibility
+ {{ status.visibility }}
+
+
+ Language
+ {{ status.language or 'Unknown' }}
+
+ {% if status.in_reply_to_id %}
+
+ In Reply To
+ Status {{ status.in_reply_to_id }}{% if status.in_reply_to_account_id %} (account {{ status.in_reply_to_account_id }}){% endif %}
+
+ {% endif %}
+ {% if status.conversation_id %}
+
+ Conversation
+ {{ status.conversation_id }}
+
+ {% endif %}
+
+ Interactions
+ ↩ {{ status.replies_count }} replies ⟳ {{ status.reblogs_count }} reblogs ★ {{ status.favourites_count }} favourites
+
+ {% if status.sensitive %}
+
+ Sensitive
+ Yes{% if status.spoiler_text %} — {{ status.spoiler_text }}{% endif %}
+
+ {% endif %}
+
+ Mastodon Status ID
+ {{ status.status_id }}
+
+
+ URI
+ {{ status.uri }}
+
+
+
+
+
+
Content (HTML)
+
+ {{ status.content | safe }}
+
+
+
+
+
Content (Plain Text)
+
{{ status.text_content }}
+
+
+{% if status.mentions %}
+
+
Mentions ({{ status.mentions|length }})
+
+
+ Account URL
+
+
+ {% for m in status.mentions %}
+
+ @{{ m.mentioned_acct }}
+ {{ m.mentioned_url }}
+
+ {% endfor %}
+
+
+
+{% endif %}
+
+{% if status.media_attachments %}
+
+
Media Attachments ({{ status.media_attachments|length }})
+
+
+ Type Description URL
+
+
+ {% for ma in status.media_attachments %}
+
+ {{ ma.media_type }}
+ {{ ma.description or '—' }}
+ View ↗
+
+ {% endfor %}
+
+
+
+{% endif %}
+
+{% if status.tags %}
+
+
Tags
+
+ {% for t in status.tags %}
+ #{{ t.name }}
+ {% endfor %}
+
+
+{% endif %}
+
+
+
Raw JSON
+
+ Click to expand
+ {{ status.raw_json | tojson(indent=2) }}
+
+
+{% endblock %}
diff --git a/app/templates/statuses.html b/app/templates/statuses.html
new file mode 100644
index 0000000..d7a64af
--- /dev/null
+++ b/app/templates/statuses.html
@@ -0,0 +1,112 @@
+{% extends "base.html" %}
+{% block title %}Statuses — Mastodon Collector{% endblock %}
+
+{% block content %}
+
+
+
+
+
+
+
+
+ Date
+ Account
+ Type
+ Content
+ Interactions
+
+
+
+
+ {% for s in statuses %}
+
+
+ {{ s.created_at.strftime('%Y-%m-%d %H:%M') if s.created_at }}
+
+ {{ s.account.handle }}
+
+ {{ s.status_type }}
+
+
+
+ {{ s.text_content[:200] }}{% if s.text_content and s.text_content|length > 200 %}...{% endif %}
+
+ {% if s.tags %}
+
+ {% for t in s.tags %}
+ #{{ t.name }}
+ {% endfor %}
+
+ {% endif %}
+
+
+ ↩ {{ s.replies_count }} ⟳ {{ s.reblogs_count }} ★ {{ s.favourites_count }}
+
+
+ View
+
+
+ {% endfor %}
+ {% if not statuses %}
+
+
+ No statuses found. The collector runs every {{ (config.get('POLL_INTERVAL_SECONDS', 14400)|int // 3600) }} hours, or you can wait for the first collection cycle.
+
+
+ {% endif %}
+
+
+
+
+{% if total_pages > 1 %}
+
+{% endif %}
+{% endblock %}
diff --git a/app/web.py b/app/web.py
new file mode 100644
index 0000000..698f717
--- /dev/null
+++ b/app/web.py
@@ -0,0 +1,383 @@
+"""Flask web application for managing monitored accounts and viewing collected data."""
+
+import os
+import logging
+from datetime import datetime, timezone
+
+from flask import Flask, render_template, request, redirect, url_for, flash, jsonify
+from sqlalchemy import func, desc
+
+from app.db import (
+ init_db,
+ get_session,
+ MonitoredAccount,
+ Status,
+ Mention,
+ CollectionLog,
+)
+from app.mastodon_api import lookup_account, MastodonAPIError
+
# Root logger at INFO so request/route messages reach container stdout.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = Flask(__name__)
# NOTE(review): falls back to a well-known dev key when FLASK_SECRET_KEY is
# unset, which makes session cookies / flash messages forgeable. The compose
# file passes a real key; confirm the env var is always set in production.
app.secret_key = os.environ.get("FLASK_SECRET_KEY", "dev-secret-key")

# Initialize database on startup
with app.app_context():
    init_db()
+
+
@app.route("/")
def index():
    """Dashboard overview.

    Renders aggregate status counts, per-account collection stats, and the
    20 most recent collection-log entries.

    Improvement over the original: the four per-type COUNT queries and the
    one-COUNT-per-account loop (N+1) are each replaced by a single grouped
    query; rendered values are identical.
    """
    session = get_session()
    try:
        accounts = (
            session.query(MonitoredAccount)
            .order_by(MonitoredAccount.instance, MonitoredAccount.username)
            .all()
        )

        total_statuses = session.query(func.count(Status.id)).scalar() or 0

        # One grouped query replaces four separate per-type COUNT queries.
        type_counts = dict(
            session.query(Status.status_type, func.count(Status.id))
            .group_by(Status.status_type)
            .all()
        )

        # One grouped query replaces a COUNT per monitored account.
        counts_by_account = dict(
            session.query(Status.account_db_id, func.count(Status.id))
            .group_by(Status.account_db_id)
            .all()
        )

        # Per-account stats; the latest log is still fetched per account,
        # which is cheap for the expected handful of monitored accounts.
        account_stats = []
        for acct in accounts:
            last_log = (
                session.query(CollectionLog)
                .filter_by(account_db_id=acct.id)
                .order_by(desc(CollectionLog.started_at))
                .first()
            )
            account_stats.append({
                "account": acct,
                "status_count": counts_by_account.get(acct.id, 0),
                "last_log": last_log,
            })

        # Recent collection logs (newest first)
        recent_logs = (
            session.query(CollectionLog)
            .order_by(desc(CollectionLog.started_at))
            .limit(20)
            .all()
        )

        return render_template(
            "index.html",
            account_stats=account_stats,
            total_statuses=total_statuses,
            total_posts=type_counts.get("post", 0),
            total_replies=type_counts.get("reply", 0),
            total_mentions=type_counts.get("mention", 0),
            total_reblogs=type_counts.get("reblog", 0),
            recent_logs=recent_logs,
        )
    finally:
        session.close()
+
+
@app.route("/accounts")
def accounts_list():
    """Render the management page listing every monitored account."""
    session = get_session()
    try:
        all_accounts = (
            session.query(MonitoredAccount)
            .order_by(MonitoredAccount.instance, MonitoredAccount.username)
            .all()
        )
        return render_template("accounts.html", accounts=all_accounts)
    finally:
        session.close()
+
+
@app.route("/accounts/add", methods=["POST"])
def accounts_add():
    """Register a new @user@instance handle for collection.

    Re-activates the account if it already exists but is paused; otherwise
    tries to resolve the profile via the instance API and stores a bare
    record when resolution fails.
    """
    raw = request.form.get("handle", "").strip().lstrip("@")
    username, sep, instance = raw.partition("@")
    # Both halves must be non-empty and the separator present.
    if not (sep and username and instance):
        flash("Invalid handle format. Use @user@instance.social", "error")
        return redirect(url_for("accounts_list"))

    session = get_session()
    try:
        existing = (
            session.query(MonitoredAccount)
            .filter_by(username=username, instance=instance)
            .first()
        )
        if existing is not None:
            if existing.is_active:
                flash(f"{existing.handle} is already being monitored", "info")
            else:
                existing.is_active = True
                session.commit()
                flash(f"Re-activated {existing.handle}", "success")
            return redirect(url_for("accounts_list"))

        # Resolve profile details up front; fall back to a bare record when
        # the instance can't be reached right now.
        try:
            profile = lookup_account(instance, username)
            acct = MonitoredAccount(
                username=username,
                instance=instance,
                account_id=profile["id"],
                display_name=profile.get("display_name", ""),
                avatar_url=profile.get("avatar", ""),
                note=profile.get("note", ""),
                is_active=True,
            )
        except MastodonAPIError as e:
            logger.warning("Could not resolve account @%s@%s: %s — adding anyway", username, instance, e)
            acct = MonitoredAccount(username=username, instance=instance, is_active=True)

        session.add(acct)
        session.commit()
        flash(f"Added {acct.handle} to monitoring list", "success")
        return redirect(url_for("accounts_list"))
    finally:
        session.close()
+
+
@app.route("/accounts/<int:account_id>/toggle", methods=["POST"])
def accounts_toggle(account_id):
    """Toggle an account's active (monitored/paused) flag.

    Fixes: the route rule was missing its URL converter
    ("/accounts//toggle"), so ``account_id`` was never bound and the view
    could not be invoked — restored as ``<int:account_id>``. Also uses
    ``Session.get()`` (SQLAlchemy 2.0) instead of the legacy
    ``Query.get()``, and flashes an error for an unknown id instead of
    redirecting silently.
    """
    session = get_session()
    try:
        acct = session.get(MonitoredAccount, account_id)
        if acct is not None:
            acct.is_active = not acct.is_active
            session.commit()
            state = "activated" if acct.is_active else "paused"
            flash(f"{state.capitalize()} monitoring for {acct.handle}", "success")
        else:
            flash("Account not found", "error")
        return redirect(url_for("accounts_list"))
    finally:
        session.close()
+
+
@app.route("/accounts/<int:account_id>/delete", methods=["POST"])
def accounts_delete(account_id):
    """Delete a monitored account together with all of its collected data.

    Fixes: the route rule was missing its URL converter
    ("/accounts//delete"), so ``account_id`` was never bound — restored as
    ``<int:account_id>``. Also uses ``Session.get()`` (SQLAlchemy 2.0)
    instead of the legacy ``Query.get()``, and flashes an error for an
    unknown id instead of redirecting silently.
    """
    session = get_session()
    try:
        acct = session.get(MonitoredAccount, account_id)
        if acct is None:
            flash("Account not found", "error")
        else:
            handle = acct.handle
            # Delete associated statuses (cascades to mentions, media, tags)
            session.query(Status).filter_by(account_db_id=acct.id).delete()
            session.query(CollectionLog).filter_by(account_db_id=acct.id).delete()
            session.delete(acct)
            session.commit()
            flash(f"Deleted {handle} and all collected data", "success")
        return redirect(url_for("accounts_list"))
    finally:
        session.close()
+
+
@app.route("/statuses")
def statuses_list():
    """Browse collected statuses with account/type/text filters and paging.

    Fixes: ``page`` and ``per_page`` are now clamped (page >= 1,
    1 <= per_page <= 500), so a crafted query string can no longer produce
    a negative OFFSET or an unbounded result set; the JSON endpoint
    ``api_statuses`` already capped per_page at 500.
    """
    session = get_session()
    try:
        page = max(1, request.args.get("page", 1, type=int) or 1)
        per_page = request.args.get("per_page", 50, type=int) or 50
        per_page = max(1, min(per_page, 500))
        account_id = request.args.get("account_id", type=int)
        status_type = request.args.get("type", "")
        search = request.args.get("q", "").strip()

        query = session.query(Status).join(MonitoredAccount)

        if account_id:
            query = query.filter(Status.account_db_id == account_id)
        if status_type:
            query = query.filter(Status.status_type == status_type)
        if search:
            # Case-insensitive substring match on the extracted plain text.
            query = query.filter(Status.text_content.ilike(f"%{search}%"))

        total = query.count()
        statuses = (
            query.order_by(desc(Status.created_at))
            .offset((page - 1) * per_page)
            .limit(per_page)
            .all()
        )

        accounts = session.query(MonitoredAccount).order_by(MonitoredAccount.username).all()
        total_pages = max(1, (total + per_page - 1) // per_page)

        return render_template(
            "statuses.html",
            statuses=statuses,
            accounts=accounts,
            page=page,
            per_page=per_page,
            total=total,
            total_pages=total_pages,
            current_account_id=account_id,
            current_type=status_type,
            search=search,
        )
    finally:
        session.close()
+
+
@app.route("/statuses/<int:status_db_id>")
def status_detail(status_db_id):
    """View a single collected status with all details.

    Fixes: the route rule "/statuses/" was missing its URL converter, so
    ``status_db_id`` was never bound — restored as ``<int:status_db_id>``.
    Also uses ``Session.get()`` (SQLAlchemy 2.0) instead of the legacy
    ``Query.get()``.
    """
    session = get_session()
    try:
        status = session.get(Status, status_db_id)
        if status is None:
            flash("Status not found", "error")
            return redirect(url_for("statuses_list"))
        return render_template("status_detail.html", status=status)
    finally:
        session.close()
+
+
@app.route("/api/stats")
def api_stats():
    """JSON API endpoint for stats (useful for your analysis pipeline)."""
    session = get_session()
    try:
        def count_statuses(*criteria):
            # COUNT(Status.id) with any filter criteria applied.
            q = session.query(func.count(Status.id))
            for crit in criteria:
                q = q.filter(crit)
            return q.scalar() or 0

        active_accounts = session.query(MonitoredAccount).filter_by(is_active=True).all()

        stats = {
            "total_statuses": count_statuses(),
            "by_type": {
                stype: count_statuses(Status.status_type == stype)
                for stype in ["post", "reply", "mention", "reblog"]
            },
            "accounts": [
                {
                    "handle": acct.handle,
                    "status_count": count_statuses(Status.account_db_id == acct.id),
                    "last_collected": acct.last_collected_at.isoformat() if acct.last_collected_at else None,
                }
                for acct in active_accounts
            ],
        }
        return jsonify(stats)
    finally:
        session.close()
+
+
@app.route("/api/statuses")
def api_statuses():
    """JSON listing of collected statuses for downstream analysis pipelines.

    Query params: page, per_page (capped at 500), account_id, type, and
    since (ISO-8601 datetime).

    Fixes: ``since`` is now validated with ``datetime.fromisoformat`` —
    previously the raw query string was compared against a datetime column,
    so a malformed value surfaced as a 500 at query time; it now returns a
    400 with an error body. ``page`` and ``per_page`` are clamped to >= 1
    so negative values cannot produce a negative OFFSET.
    """
    session = get_session()
    try:
        page = max(1, request.args.get("page", 1, type=int) or 1)
        per_page = max(1, min(request.args.get("per_page", 100, type=int), 500))
        account_id = request.args.get("account_id", type=int)
        status_type = request.args.get("type", "")
        since = request.args.get("since", "")  # ISO datetime

        query = session.query(Status)

        if account_id:
            query = query.filter(Status.account_db_id == account_id)
        if status_type:
            query = query.filter(Status.status_type == status_type)
        if since:
            try:
                since_dt = datetime.fromisoformat(since)
            except ValueError:
                return jsonify({"error": "invalid 'since' value; expected ISO-8601 datetime"}), 400
            query = query.filter(Status.created_at >= since_dt)

        total = query.count()
        statuses = (
            query.order_by(desc(Status.created_at))
            .offset((page - 1) * per_page)
            .limit(per_page)
            .all()
        )

        def serialize(s):
            # Flatten one Status row (plus its mentions/tags) to a JSON-safe dict.
            return {
                "id": s.id,
                "status_id": s.status_id,
                "account": s.account.handle,
                "url": s.url,
                "content": s.content,
                "text_content": s.text_content,
                "visibility": s.visibility,
                "created_at": s.created_at.isoformat() if s.created_at else None,
                "language": s.language,
                "status_type": s.status_type,
                "in_reply_to_id": s.in_reply_to_id,
                "replies_count": s.replies_count,
                "reblogs_count": s.reblogs_count,
                "favourites_count": s.favourites_count,
                "mentions": [
                    {"acct": m.mentioned_acct, "url": m.mentioned_url}
                    for m in s.mentions
                ],
                "tags": [t.name for t in s.tags],
            }

        return jsonify({
            "total": total,
            "page": page,
            "per_page": per_page,
            "statuses": [serialize(s) for s in statuses],
        })
    finally:
        session.close()
+
+
@app.route("/export")
def export_csv():
    """Export statuses as CSV for analysis."""
    import csv
    from io import StringIO

    from flask import Response

    session = get_session()
    try:
        account_id = request.args.get("account_id", type=int)
        status_type = request.args.get("type", "")

        query = session.query(Status).join(MonitoredAccount)
        if account_id:
            query = query.filter(Status.account_db_id == account_id)
        if status_type:
            query = query.filter(Status.status_type == status_type)

        buffer = StringIO()
        writer = csv.writer(buffer)
        # Header row — column order is part of the export contract.
        writer.writerow([
            "id", "account", "status_type", "created_at", "url",
            "text_content", "language", "visibility", "in_reply_to_id",
            "replies_count", "reblogs_count", "favourites_count",
            "mentions", "tags", "sensitive", "spoiler_text",
        ])

        for s in query.order_by(desc(Status.created_at)).all():
            writer.writerow([
                s.status_id,
                s.account.handle,
                s.status_type,
                s.created_at.isoformat() if s.created_at else "",
                s.url,
                s.text_content,
                s.language,
                s.visibility,
                s.in_reply_to_id,
                s.replies_count,
                s.reblogs_count,
                s.favourites_count,
                "; ".join(m.mentioned_acct for m in s.mentions),
                "; ".join(t.name for t in s.tags),
                s.sensitive,
                s.spoiler_text,
            ])

        return Response(
            buffer.getvalue(),
            mimetype="text/csv",
            headers={"Content-Disposition": "attachment; filename=mastodon_statuses.csv"},
        )
    finally:
        session.close()
+
+
if __name__ == "__main__":
    # Dev-server entry point only: docker-compose runs this app under
    # gunicorn (app.web:app) instead. NOTE(review): debug=True on 0.0.0.0
    # exposes the Werkzeug debugger — ensure this path is never used in
    # production.
    app.run(host="0.0.0.0", port=5000, debug=True)
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..7c2d970
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,50 @@
# Three services: Postgres storage (db), the Flask UI served by gunicorn
# (web), and the polling collector. Secrets come from the host environment
# or a .env file; every ${VAR:-default} below has a dev-only fallback.
version: "3.8"  # NOTE(review): ignored by Compose v2 — harmless but obsolete

services:
  db:
    image: postgres:16-alpine
    restart: unless-stopped
    environment:
      POSTGRES_DB: mastodon_collector
      POSTGRES_USER: collector
      # Dev-only fallback password; set POSTGRES_PASSWORD in .env for real use.
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-collector_secret}
    volumes:
      - pgdata:/var/lib/postgresql/data
    ports:
      # Loopback-only; host port 5434 avoids clashing with a local Postgres.
      - "127.0.0.1:5434:5432"
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U collector -d mastodon_collector"]
      interval: 5s
      timeout: 5s
      retries: 5

  web:
    build: .
    restart: unless-stopped
    # Serve the Flask app via gunicorn; generous timeout for slow DB queries.
    command: gunicorn --bind 0.0.0.0:5000 --workers 2 --timeout 120 app.web:app
    ports:
      # UI reachable only from the host at http://127.0.0.1:8585
      - "127.0.0.1:8585:5000"
    environment:
      DATABASE_URL: postgresql://collector:${POSTGRES_PASSWORD:-collector_secret}@db:5432/mastodon_collector
      FLASK_SECRET_KEY: ${FLASK_SECRET_KEY:-change-me-in-production}
    volumes:
      - ./accounts.txt:/app/accounts.txt
    depends_on:
      db:
        condition: service_healthy

  collector:
    build: .
    restart: unless-stopped
    # Long-running poller; same image as web, different entry point.
    command: python -m app.collector
    environment:
      DATABASE_URL: postgresql://collector:${POSTGRES_PASSWORD:-collector_secret}@db:5432/mastodon_collector
      # Seconds between collection cycles (default 4 hours).
      POLL_INTERVAL_SECONDS: ${POLL_INTERVAL_SECONDS:-14400}
    volumes:
      - ./accounts.txt:/app/accounts.txt
    depends_on:
      db:
        condition: service_healthy

volumes:
  pgdata:
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..b554053
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+flask==3.1.0
+gunicorn==23.0.0
+psycopg2-binary==2.9.10
+sqlalchemy==2.0.36
+requests==2.32.3
+apscheduler==3.10.4