Initial commit: Mastodon collector application
Add Flask-based application for collecting and archiving Mastodon posts from configured accounts. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
commit
1783a48d7c
18 changed files with 2115 additions and 0 deletions
6
.dockerignore
Normal file
6
.dockerignore
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
__pycache__
|
||||
*.pyc
|
||||
.env
|
||||
.git
|
||||
.gitignore
|
||||
README.md
|
||||
8
.env.example
Normal file
8
.env.example
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
# PostgreSQL Configuration
|
||||
POSTGRES_PASSWORD=your_secure_password_here
|
||||
|
||||
# Flask Configuration
|
||||
FLASK_SECRET_KEY=your_secure_secret_key_here
|
||||
|
||||
# Polling Configuration
|
||||
POLL_INTERVAL_SECONDS=14400
|
||||
115
.gitignore
vendored
Normal file
115
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,115 @@
|
|||
# Environment variables and secrets
|
||||
.env
|
||||
.env.local
|
||||
.env.*.local
|
||||
*.secret
|
||||
secrets/
|
||||
credentials/
|
||||
|
||||
# Python
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
*.so
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
pip-wheel-metadata/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# Virtual environments
|
||||
venv/
|
||||
env/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
.venv/
|
||||
|
||||
# IDEs
|
||||
.vscode/
|
||||
.idea/
|
||||
*.swp
|
||||
*.swo
|
||||
*~
|
||||
.DS_Store
|
||||
|
||||
# Database files
|
||||
*.sqlite
|
||||
*.sqlite3
|
||||
*.db
|
||||
*.db-journal
|
||||
*.db-shm
|
||||
*.db-wal
|
||||
postgres_data/
|
||||
pgdata/
|
||||
|
||||
# Logs
|
||||
*.log
|
||||
logs/
|
||||
*.log.*
|
||||
|
||||
# Docker volumes and local data
|
||||
docker-compose.override.yml
|
||||
.docker/
|
||||
volumes/
|
||||
|
||||
# Certificates and keys
|
||||
*.pem
|
||||
*.key
|
||||
*.crt
|
||||
*.cer
|
||||
*.p12
|
||||
*.pfx
|
||||
|
||||
# Backup files
|
||||
*.bak
|
||||
*.backup
|
||||
*.tmp
|
||||
*~
|
||||
|
||||
# OS generated files
|
||||
.DS_Store
|
||||
.DS_Store?
|
||||
._*
|
||||
.Spotlight-V100
|
||||
.Trashes
|
||||
ehthumbs.db
|
||||
Thumbs.db
|
||||
|
||||
# Testing
|
||||
.pytest_cache/
|
||||
.coverage
|
||||
htmlcov/
|
||||
.tox/
|
||||
.hypothesis/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# pyenv
|
||||
.python-version
|
||||
|
||||
# pipenv
|
||||
Pipfile.lock
|
||||
|
||||
# Poetry
|
||||
poetry.lock
|
||||
20
Dockerfile
Normal file
20
Dockerfile
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
FROM python:3.12-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Install system dependencies
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
libpq-dev gcc \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
COPY . .
|
||||
|
||||
# Create empty accounts file if it doesn't exist
|
||||
RUN touch /app/accounts.txt
|
||||
|
||||
EXPOSE 5000
|
||||
|
||||
CMD ["python", "-m", "app.collector"]
|
||||
91
README.md
Normal file
91
README.md
Normal file
|
|
@ -0,0 +1,91 @@
|
|||
# Mastodon Collector
|
||||
|
||||
Collects posts, replies, and mentions from a list of Mastodon accounts and stores them in PostgreSQL. Includes a web UI for account management and data browsing, plus JSON/CSV APIs for your analysis pipeline.
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# 1. Add accounts to monitor
|
||||
echo "@user@mastodon.social" >> accounts.txt
|
||||
|
||||
# 2. Start everything
|
||||
docker compose up -d
|
||||
|
||||
# 3. Open the dashboard
|
||||
open http://localhost:8585
|
||||
```
|
||||
|
||||
## Architecture
|
||||
|
||||
| Service | Description | Port |
|
||||
|---------------|------------------------------------------------|-------|
|
||||
| **db** | PostgreSQL 16 | 5432 |
|
||||
| **web** | Flask dashboard (Gunicorn) | 8585 |
|
||||
| **collector** | Background service, polls every 4 hours | — |
|
||||
|
||||
## Adding Accounts
|
||||
|
||||
Two methods:
|
||||
|
||||
1. **Text file** — edit `accounts.txt`, one handle per line (`@user@instance.social`). Picked up on next collection cycle.
|
||||
2. **Web UI** — go to http://localhost:8585/accounts and use the form.
|
||||
|
||||
## Configuration
|
||||
|
||||
Edit `.env` to customize:
|
||||
|
||||
```
|
||||
POSTGRES_PASSWORD=collector_secret # Change for production
|
||||
FLASK_SECRET_KEY=change-me-in-production
|
||||
POLL_INTERVAL_SECONDS=14400 # Default: 4 hours (14400s)
|
||||
```
|
||||
|
||||
## API Endpoints
|
||||
|
||||
For plugging into your analysis pipeline:
|
||||
|
||||
| Endpoint | Description |
|
||||
|-----------------------|--------------------------------------|
|
||||
| `GET /api/stats` | Overview stats (counts by type) |
|
||||
| `GET /api/statuses` | Paginated statuses as JSON |
|
||||
| `GET /export` | Download all statuses as CSV |
|
||||
|
||||
### `/api/statuses` parameters
|
||||
|
||||
- `page` — page number (default: 1)
|
||||
- `per_page` — results per page (default: 100, max: 500)
|
||||
- `account_id` — filter by internal account ID
|
||||
- `type` — filter by status type: `post`, `reply`, `mention`, `reblog`
|
||||
- `since` — ISO datetime, only return statuses after this time
|
||||
|
||||
## Database Schema
|
||||
|
||||
Main tables:
|
||||
|
||||
- `monitored_accounts` — accounts being tracked
|
||||
- `statuses` — collected posts with plain text + HTML content
|
||||
- `mentions` — who was @-mentioned in each status
|
||||
- `media_attachments` — images/videos attached to statuses
|
||||
- `tags` — hashtags used
|
||||
- `collection_logs` — audit trail of each collection run
|
||||
|
||||
Each status stores `raw_json` with the full Mastodon API response for future analysis needs.
|
||||
|
||||
## Moving to a Server
|
||||
|
||||
```bash
|
||||
# Copy the project
|
||||
scp -r mastodon-collector/ user@server:~/
|
||||
|
||||
# On the server
|
||||
cd mastodon-collector
|
||||
# Edit .env with production secrets
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
## Stopping
|
||||
|
||||
```bash
|
||||
docker compose down # Stop services, keep data
|
||||
docker compose down -v # Stop services AND delete database
|
||||
```
|
||||
0
app/__init__.py
Normal file
0
app/__init__.py
Normal file
4
app/__main__.py
Normal file
4
app/__main__.py
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
"""Allow running the collector with `python -m app`."""
|
||||
from app.collector import main
|
||||
|
||||
main()
|
||||
306
app/collector.py
Normal file
306
app/collector.py
Normal file
|
|
@ -0,0 +1,306 @@
|
|||
"""
|
||||
Collector service — periodically polls Mastodon for new statuses from monitored accounts.
|
||||
Runs as a standalone process via `python -m app.collector`.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from apscheduler.schedulers.blocking import BlockingScheduler
|
||||
|
||||
from app.db import (
|
||||
init_db,
|
||||
get_session,
|
||||
MonitoredAccount,
|
||||
Status,
|
||||
Mention,
|
||||
MediaAttachment,
|
||||
Tag,
|
||||
CollectionLog,
|
||||
)
|
||||
from app.mastodon_api import (
|
||||
lookup_account,
|
||||
get_account_statuses,
|
||||
parse_status,
|
||||
MastodonAPIError,
|
||||
RateLimitError,
|
||||
)
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
|
||||
handlers=[logging.StreamHandler(sys.stdout)],
|
||||
)
|
||||
logger = logging.getLogger("collector")
|
||||
|
||||
POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL_SECONDS", 14400))
|
||||
ACCOUNTS_FILE = os.environ.get("ACCOUNTS_FILE", "/app/accounts.txt")
|
||||
|
||||
|
||||
def load_accounts_from_file(filepath: str) -> list[tuple[str, str]]:
|
||||
"""Parse accounts.txt and return list of (username, instance) tuples."""
|
||||
accounts = []
|
||||
path = Path(filepath)
|
||||
if not path.exists():
|
||||
logger.warning("Accounts file not found: %s", filepath)
|
||||
return accounts
|
||||
|
||||
for line in path.read_text().splitlines():
|
||||
line = line.strip()
|
||||
if not line or line.startswith("#"):
|
||||
continue
|
||||
# Expected format: @user@instance.social or user@instance.social
|
||||
line = line.lstrip("@")
|
||||
if "@" not in line:
|
||||
logger.warning("Skipping malformed account line: %s", line)
|
||||
continue
|
||||
parts = line.split("@", 1)
|
||||
if len(parts) == 2 and parts[0] and parts[1]:
|
||||
accounts.append((parts[0], parts[1]))
|
||||
else:
|
||||
logger.warning("Skipping malformed account line: %s", line)
|
||||
return accounts
|
||||
|
||||
|
||||
def sync_monitored_accounts(session) -> list[MonitoredAccount]:
|
||||
"""
|
||||
Sync accounts from the file + database.
|
||||
Accounts added via web UI are already in the DB.
|
||||
Accounts in the file get added if missing.
|
||||
Returns all active monitored accounts.
|
||||
"""
|
||||
file_accounts = load_accounts_from_file(ACCOUNTS_FILE)
|
||||
|
||||
for username, instance in file_accounts:
|
||||
existing = (
|
||||
session.query(MonitoredAccount)
|
||||
.filter_by(username=username, instance=instance)
|
||||
.first()
|
||||
)
|
||||
if not existing:
|
||||
logger.info("Adding account from file: @%s@%s", username, instance)
|
||||
acct = MonitoredAccount(username=username, instance=instance, is_active=True)
|
||||
session.add(acct)
|
||||
|
||||
session.commit()
|
||||
return session.query(MonitoredAccount).filter_by(is_active=True).all()
|
||||
|
||||
|
||||
def resolve_account(session, account: MonitoredAccount) -> bool:
|
||||
"""Look up the Mastodon account ID if we don't have it yet."""
|
||||
if account.account_id:
|
||||
return True
|
||||
|
||||
try:
|
||||
data = lookup_account(account.instance, account.username)
|
||||
account.account_id = data["id"]
|
||||
account.display_name = data.get("display_name", "")
|
||||
account.avatar_url = data.get("avatar", "")
|
||||
account.note = data.get("note", "")
|
||||
session.commit()
|
||||
logger.info("Resolved %s → account_id=%s", account.handle, account.account_id)
|
||||
return True
|
||||
except MastodonAPIError as e:
|
||||
logger.error("Failed to resolve %s: %s", account.handle, e)
|
||||
return False
|
||||
|
||||
|
||||
def store_status(session, account: MonitoredAccount, parsed: dict) -> bool:
|
||||
"""Store a parsed status in the database. Returns True if new, False if duplicate."""
|
||||
# Check for duplicate
|
||||
existing = (
|
||||
session.query(Status)
|
||||
.filter_by(status_id=parsed["status_id"], account_db_id=account.id)
|
||||
.first()
|
||||
)
|
||||
if existing:
|
||||
# Update interaction counts in case they changed
|
||||
existing.replies_count = parsed["replies_count"]
|
||||
existing.reblogs_count = parsed["reblogs_count"]
|
||||
existing.favourites_count = parsed["favourites_count"]
|
||||
return False
|
||||
|
||||
status = Status(
|
||||
status_id=parsed["status_id"],
|
||||
account_db_id=account.id,
|
||||
uri=parsed["uri"],
|
||||
url=parsed["url"],
|
||||
content=parsed["content"],
|
||||
text_content=parsed["text_content"],
|
||||
visibility=parsed["visibility"],
|
||||
created_at=parsed["created_at"],
|
||||
language=parsed["language"],
|
||||
sensitive=parsed["sensitive"],
|
||||
spoiler_text=parsed["spoiler_text"],
|
||||
in_reply_to_id=parsed["in_reply_to_id"],
|
||||
in_reply_to_account_id=parsed["in_reply_to_account_id"],
|
||||
conversation_id=parsed["conversation_id"],
|
||||
replies_count=parsed["replies_count"],
|
||||
reblogs_count=parsed["reblogs_count"],
|
||||
favourites_count=parsed["favourites_count"],
|
||||
status_type=parsed["status_type"],
|
||||
raw_json=parsed["raw_json"],
|
||||
)
|
||||
session.add(status)
|
||||
session.flush() # get status.id
|
||||
|
||||
# Store mentions
|
||||
for m in parsed["mentions"]:
|
||||
session.add(Mention(
|
||||
status_db_id=status.id,
|
||||
mentioned_account_id=m["mentioned_account_id"],
|
||||
mentioned_username=m["mentioned_username"],
|
||||
mentioned_acct=m["mentioned_acct"],
|
||||
mentioned_url=m["mentioned_url"],
|
||||
))
|
||||
|
||||
# Store media
|
||||
for ma in parsed["media_attachments"]:
|
||||
session.add(MediaAttachment(
|
||||
status_db_id=status.id,
|
||||
media_id=ma["media_id"],
|
||||
media_type=ma["media_type"],
|
||||
url=ma["url"],
|
||||
preview_url=ma["preview_url"],
|
||||
description=ma["description"],
|
||||
))
|
||||
|
||||
# Store tags
|
||||
for t in parsed["tags"]:
|
||||
session.add(Tag(
|
||||
status_db_id=status.id,
|
||||
name=t["name"],
|
||||
url=t["url"],
|
||||
))
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def collect_account(session, account: MonitoredAccount) -> int:
|
||||
"""Collect new statuses for a single account. Returns count of new statuses."""
|
||||
log = CollectionLog(account_db_id=account.id, status="running")
|
||||
session.add(log)
|
||||
session.commit()
|
||||
|
||||
try:
|
||||
if not resolve_account(session, account):
|
||||
log.status = "error"
|
||||
log.error = "Could not resolve account ID"
|
||||
log.finished_at = datetime.now(timezone.utc)
|
||||
session.commit()
|
||||
return 0
|
||||
|
||||
logger.info("Collecting statuses for %s (since_id=%s)", account.handle, account.last_status_id)
|
||||
|
||||
raw_statuses = get_account_statuses(
|
||||
instance=account.instance,
|
||||
account_id=account.account_id,
|
||||
since_id=account.last_status_id,
|
||||
)
|
||||
|
||||
new_count = 0
|
||||
newest_id = account.last_status_id
|
||||
|
||||
for raw in raw_statuses:
|
||||
parsed = parse_status(raw, account.account_id)
|
||||
is_new = store_status(session, account, parsed)
|
||||
if is_new:
|
||||
new_count += 1
|
||||
|
||||
# Track the newest status ID
|
||||
sid = parsed["status_id"]
|
||||
if newest_id is None or sid > newest_id:
|
||||
newest_id = sid
|
||||
|
||||
if newest_id:
|
||||
account.last_status_id = newest_id
|
||||
account.last_collected_at = datetime.now(timezone.utc)
|
||||
|
||||
log.statuses_collected = new_count
|
||||
log.status = "success"
|
||||
log.finished_at = datetime.now(timezone.utc)
|
||||
session.commit()
|
||||
|
||||
logger.info("Collected %d new statuses for %s (total fetched: %d)",
|
||||
new_count, account.handle, len(raw_statuses))
|
||||
return new_count
|
||||
|
||||
except RateLimitError as e:
|
||||
log.status = "error"
|
||||
log.error = f"Rate limited: {e}"
|
||||
log.finished_at = datetime.now(timezone.utc)
|
||||
session.commit()
|
||||
logger.warning("Rate limited while collecting %s: %s", account.handle, e)
|
||||
time.sleep(e.retry_after)
|
||||
return 0
|
||||
|
||||
except MastodonAPIError as e:
|
||||
log.status = "error"
|
||||
log.error = str(e)
|
||||
log.finished_at = datetime.now(timezone.utc)
|
||||
session.commit()
|
||||
logger.error("API error collecting %s: %s", account.handle, e)
|
||||
return 0
|
||||
|
||||
except Exception as e:
|
||||
log.status = "error"
|
||||
log.error = str(e)
|
||||
log.finished_at = datetime.now(timezone.utc)
|
||||
session.commit()
|
||||
logger.exception("Unexpected error collecting %s", account.handle)
|
||||
return 0
|
||||
|
||||
|
||||
def run_collection_cycle():
|
||||
"""Run one full collection cycle across all monitored accounts."""
|
||||
logger.info("=== Starting collection cycle ===")
|
||||
session = get_session()
|
||||
|
||||
try:
|
||||
accounts = sync_monitored_accounts(session)
|
||||
logger.info("Monitoring %d active accounts", len(accounts))
|
||||
|
||||
total_new = 0
|
||||
for account in accounts:
|
||||
new = collect_account(session, account)
|
||||
total_new += new
|
||||
time.sleep(1) # Brief pause between accounts to be polite
|
||||
|
||||
logger.info("=== Collection cycle complete: %d new statuses across %d accounts ===",
|
||||
total_new, len(accounts))
|
||||
|
||||
except Exception:
|
||||
logger.exception("Fatal error in collection cycle")
|
||||
finally:
|
||||
session.close()
|
||||
|
||||
|
||||
def main():
|
||||
"""Entry point: initialize DB and start the scheduler."""
|
||||
logger.info("Mastodon Collector starting up...")
|
||||
logger.info("Poll interval: %d seconds (%d hours)", POLL_INTERVAL, POLL_INTERVAL // 3600)
|
||||
|
||||
init_db()
|
||||
logger.info("Database initialized")
|
||||
|
||||
# Run one collection immediately on startup
|
||||
run_collection_cycle()
|
||||
|
||||
# Schedule recurring collection
|
||||
scheduler = BlockingScheduler()
|
||||
scheduler.add_job(run_collection_cycle, "interval", seconds=POLL_INTERVAL)
|
||||
logger.info("Scheduler started — next run in %d seconds", POLL_INTERVAL)
|
||||
|
||||
try:
|
||||
scheduler.start()
|
||||
except (KeyboardInterrupt, SystemExit):
|
||||
logger.info("Collector shutting down")
|
||||
scheduler.shutdown()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
186
app/db.py
Normal file
186
app/db.py
Normal file
|
|
@ -0,0 +1,186 @@
|
|||
"""Database models and session management."""
|
||||
|
||||
import os
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from sqlalchemy import (
|
||||
create_engine,
|
||||
Column,
|
||||
Integer,
|
||||
BigInteger,
|
||||
String,
|
||||
Text,
|
||||
Boolean,
|
||||
DateTime,
|
||||
ForeignKey,
|
||||
Index,
|
||||
UniqueConstraint,
|
||||
JSON,
|
||||
)
|
||||
from sqlalchemy.orm import declarative_base, sessionmaker, relationship
|
||||
|
||||
DATABASE_URL = os.environ.get(
|
||||
"DATABASE_URL", "postgresql://collector:collector_secret@localhost:5432/mastodon_collector"
|
||||
)
|
||||
|
||||
engine = create_engine(DATABASE_URL, pool_pre_ping=True, pool_size=5, max_overflow=10)
|
||||
SessionLocal = sessionmaker(bind=engine)
|
||||
Base = declarative_base()
|
||||
|
||||
|
||||
class MonitoredAccount(Base):
|
||||
"""An account we are monitoring."""
|
||||
|
||||
__tablename__ = "monitored_accounts"
|
||||
|
||||
id = Column(Integer, primary_key=True, autoincrement=True)
|
||||
username = Column(String(255), nullable=False) # e.g. "user"
|
||||
instance = Column(String(255), nullable=False) # e.g. "mastodon.social"
|
||||
account_id = Column(String(64), nullable=True) # Mastodon numeric account ID on that instance
|
||||
display_name = Column(String(512), nullable=True)
|
||||
avatar_url = Column(Text, nullable=True)
|
||||
is_active = Column(Boolean, default=True, nullable=False)
|
||||
last_collected_at = Column(DateTime(timezone=True), nullable=True)
|
||||
last_status_id = Column(String(64), nullable=True) # For pagination: newest status ID we've seen
|
||||
created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
|
||||
note = Column(Text, nullable=True) # Bio / description
|
||||
|
||||
statuses = relationship("Status", back_populates="account", lazy="dynamic")
|
||||
|
||||
__table_args__ = (
|
||||
UniqueConstraint("username", "instance", name="uq_account_handle"),
|
||||
)
|
||||
|
||||
@property
|
||||
def handle(self):
|
||||
return f"@{self.username}@{self.instance}"
|
||||
|
||||
def __repr__(self):
|
||||
return f"<MonitoredAccount {self.handle}>"
|
||||
|
||||
|
||||
class Status(Base):
|
||||
"""A single post / toot collected from Mastodon."""
|
||||
|
||||
__tablename__ = "statuses"
|
||||
|
||||
id = Column(Integer, primary_key=True, autoincrement=True)
|
||||
status_id = Column(String(64), nullable=False) # Mastodon status ID
|
||||
account_db_id = Column(Integer, ForeignKey("monitored_accounts.id"), nullable=False)
|
||||
uri = Column(Text, nullable=False) # Canonical ActivityPub URI
|
||||
url = Column(Text, nullable=True) # Human-readable URL
|
||||
content = Column(Text, nullable=False) # HTML content
|
||||
text_content = Column(Text, nullable=True) # Stripped plain-text content
|
||||
visibility = Column(String(32), nullable=True) # public, unlisted, private, direct
|
||||
created_at = Column(DateTime(timezone=True), nullable=False)
|
||||
collected_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
|
||||
language = Column(String(16), nullable=True)
|
||||
sensitive = Column(Boolean, default=False)
|
||||
spoiler_text = Column(Text, nullable=True)
|
||||
|
||||
# Reply / conversation tracking
|
||||
in_reply_to_id = Column(String(64), nullable=True) # Status ID being replied to
|
||||
in_reply_to_account_id = Column(String(64), nullable=True)
|
||||
conversation_id = Column(String(64), nullable=True)
|
||||
|
||||
# Interaction counts
|
||||
replies_count = Column(Integer, default=0)
|
||||
reblogs_count = Column(Integer, default=0)
|
||||
favourites_count = Column(Integer, default=0)
|
||||
|
||||
# Classification for your analysis pipeline
|
||||
status_type = Column(String(32), nullable=False, default="post") # post, reply, mention, reblog
|
||||
|
||||
# Store the full JSON for future reference
|
||||
raw_json = Column(JSON, nullable=True)
|
||||
|
||||
# Relationships
|
||||
account = relationship("MonitoredAccount", back_populates="statuses")
|
||||
mentions = relationship("Mention", back_populates="status", cascade="all, delete-orphan")
|
||||
media_attachments = relationship("MediaAttachment", back_populates="status", cascade="all, delete-orphan")
|
||||
tags = relationship("Tag", back_populates="status", cascade="all, delete-orphan")
|
||||
|
||||
__table_args__ = (
|
||||
UniqueConstraint("status_id", "account_db_id", name="uq_status_per_account"),
|
||||
Index("ix_status_created", "created_at"),
|
||||
Index("ix_status_type", "status_type"),
|
||||
Index("ix_status_account", "account_db_id"),
|
||||
Index("ix_status_conversation", "conversation_id"),
|
||||
)
|
||||
|
||||
def __repr__(self):
|
||||
return f"<Status {self.status_id} type={self.status_type}>"
|
||||
|
||||
|
||||
class Mention(Base):
|
||||
"""A mention within a status (who was @-mentioned)."""
|
||||
|
||||
__tablename__ = "mentions"
|
||||
|
||||
id = Column(Integer, primary_key=True, autoincrement=True)
|
||||
status_db_id = Column(Integer, ForeignKey("statuses.id", ondelete="CASCADE"), nullable=False)
|
||||
mentioned_account_id = Column(String(64), nullable=True)
|
||||
mentioned_username = Column(String(255), nullable=False)
|
||||
mentioned_acct = Column(String(512), nullable=False) # full user@instance
|
||||
mentioned_url = Column(Text, nullable=True)
|
||||
|
||||
status = relationship("Status", back_populates="mentions")
|
||||
|
||||
|
||||
class MediaAttachment(Base):
|
||||
"""Media attached to a status."""
|
||||
|
||||
__tablename__ = "media_attachments"
|
||||
|
||||
id = Column(Integer, primary_key=True, autoincrement=True)
|
||||
status_db_id = Column(Integer, ForeignKey("statuses.id", ondelete="CASCADE"), nullable=False)
|
||||
media_id = Column(String(64), nullable=True)
|
||||
media_type = Column(String(32), nullable=True) # image, video, gifv, audio
|
||||
url = Column(Text, nullable=True)
|
||||
preview_url = Column(Text, nullable=True)
|
||||
description = Column(Text, nullable=True) # alt text
|
||||
|
||||
status = relationship("Status", back_populates="media_attachments")
|
||||
|
||||
|
||||
class Tag(Base):
|
||||
"""A hashtag used in a status."""
|
||||
|
||||
__tablename__ = "tags"
|
||||
|
||||
id = Column(Integer, primary_key=True, autoincrement=True)
|
||||
status_db_id = Column(Integer, ForeignKey("statuses.id", ondelete="CASCADE"), nullable=False)
|
||||
name = Column(String(255), nullable=False)
|
||||
url = Column(Text, nullable=True)
|
||||
|
||||
status = relationship("Status", back_populates="tags")
|
||||
|
||||
__table_args__ = (
|
||||
Index("ix_tag_name", "name"),
|
||||
)
|
||||
|
||||
|
||||
class CollectionLog(Base):
|
||||
"""Log of each collection run for monitoring."""
|
||||
|
||||
__tablename__ = "collection_logs"
|
||||
|
||||
id = Column(Integer, primary_key=True, autoincrement=True)
|
||||
account_db_id = Column(Integer, ForeignKey("monitored_accounts.id"), nullable=True)
|
||||
started_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
|
||||
finished_at = Column(DateTime(timezone=True), nullable=True)
|
||||
statuses_collected = Column(Integer, default=0)
|
||||
error = Column(Text, nullable=True)
|
||||
status = Column(String(32), default="running") # running, success, error
|
||||
|
||||
account = relationship("MonitoredAccount")
|
||||
|
||||
|
||||
def init_db():
|
||||
"""Create all tables."""
|
||||
Base.metadata.create_all(engine)
|
||||
|
||||
|
||||
def get_session():
|
||||
"""Get a new database session."""
|
||||
return SessionLocal()
|
||||
226
app/mastodon_api.py
Normal file
226
app/mastodon_api.py
Normal file
|
|
@ -0,0 +1,226 @@
|
|||
"""Mastodon public API client — no authentication required."""
|
||||
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
from html import unescape
|
||||
from typing import Optional
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import requests
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Respect rate limits: Mastodon returns 300 requests per 5 min by default
|
||||
DEFAULT_TIMEOUT = 30
|
||||
MAX_RETRIES = 3
|
||||
RETRY_BACKOFF = 5 # seconds
|
||||
|
||||
|
||||
class MastodonAPIError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class RateLimitError(MastodonAPIError):
|
||||
def __init__(self, retry_after: float = 60):
|
||||
self.retry_after = retry_after
|
||||
super().__init__(f"Rate limited, retry after {retry_after}s")
|
||||
|
||||
|
||||
def _strip_html(html: str) -> str:
|
||||
"""Strip HTML tags and decode entities to get plain text."""
|
||||
# Replace <br> and </p> with newlines
|
||||
text = re.sub(r"<br\s*/?>", "\n", html)
|
||||
text = re.sub(r"</p>", "\n", text)
|
||||
# Remove all remaining tags
|
||||
text = re.sub(r"<[^>]+>", "", text)
|
||||
return unescape(text).strip()
|
||||
|
||||
|
||||
def _api_get(instance: str, path: str, params: Optional[dict] = None) -> requests.Response:
|
||||
"""Make a GET request to a Mastodon instance's public API."""
|
||||
url = f"https://{instance}{path}"
|
||||
headers = {"Accept": "application/json", "User-Agent": "MastodonCollector/1.0"}
|
||||
|
||||
for attempt in range(MAX_RETRIES):
|
||||
try:
|
||||
resp = requests.get(url, params=params, headers=headers, timeout=DEFAULT_TIMEOUT)
|
||||
|
||||
if resp.status_code == 429:
|
||||
retry_after = float(resp.headers.get("X-RateLimit-Reset", 60))
|
||||
# If it's an ISO timestamp, calculate delta
|
||||
if retry_after > 1_000_000:
|
||||
retry_after = 60
|
||||
logger.warning("Rate limited by %s, waiting %.0fs", instance, retry_after)
|
||||
raise RateLimitError(retry_after)
|
||||
|
||||
if resp.status_code == 404:
|
||||
raise MastodonAPIError(f"Not found: {url}")
|
||||
|
||||
resp.raise_for_status()
|
||||
return resp
|
||||
|
||||
except RateLimitError:
|
||||
raise
|
||||
except requests.RequestException as e:
|
||||
if attempt < MAX_RETRIES - 1:
|
||||
wait = RETRY_BACKOFF * (attempt + 1)
|
||||
logger.warning("Request to %s failed (attempt %d/%d): %s — retrying in %ds",
|
||||
url, attempt + 1, MAX_RETRIES, e, wait)
|
||||
time.sleep(wait)
|
||||
else:
|
||||
raise MastodonAPIError(f"Failed after {MAX_RETRIES} attempts: {e}") from e
|
||||
|
||||
raise MastodonAPIError("Unexpected retry exhaustion")
|
||||
|
||||
|
||||
def lookup_account(instance: str, username: str) -> dict:
|
||||
"""Look up an account on an instance by username. Returns the account JSON."""
|
||||
# Try the v1 lookup endpoint first (available on most instances)
|
||||
try:
|
||||
resp = _api_get(instance, "/api/v1/accounts/lookup", {"acct": username})
|
||||
return resp.json()
|
||||
except MastodonAPIError:
|
||||
pass
|
||||
|
||||
# Fallback: search for the account
|
||||
resp = _api_get(instance, "/api/v2/search", {"q": f"@{username}@{instance}", "type": "accounts", "limit": 1})
|
||||
data = resp.json()
|
||||
accounts = data.get("accounts", [])
|
||||
if not accounts:
|
||||
raise MastodonAPIError(f"Account @{username} not found on {instance}")
|
||||
return accounts[0]
|
||||
|
||||
|
||||
def get_account_statuses(
|
||||
instance: str,
|
||||
account_id: str,
|
||||
since_id: Optional[str] = None,
|
||||
limit: int = 40,
|
||||
exclude_reblogs: bool = False,
|
||||
) -> list[dict]:
|
||||
"""
|
||||
Fetch statuses from an account. Handles pagination to get all new statuses.
|
||||
Returns list of status dicts, oldest first.
|
||||
"""
|
||||
all_statuses = []
|
||||
params = {"limit": min(limit, 40)}
|
||||
if since_id:
|
||||
params["since_id"] = since_id
|
||||
if exclude_reblogs:
|
||||
params["exclude_reblogs"] = "true"
|
||||
|
||||
path = f"/api/v1/accounts/{account_id}/statuses"
|
||||
|
||||
# Paginate through results
|
||||
max_pages = 25 # safety limit
|
||||
page = 0
|
||||
|
||||
while page < max_pages:
|
||||
resp = _api_get(instance, path, params)
|
||||
statuses = resp.json()
|
||||
|
||||
if not statuses:
|
||||
break
|
||||
|
||||
all_statuses.extend(statuses)
|
||||
page += 1
|
||||
|
||||
# Check Link header for next page
|
||||
link_header = resp.headers.get("Link", "")
|
||||
next_match = re.search(r'<([^>]+)>;\s*rel="next"', link_header)
|
||||
if not next_match:
|
||||
break
|
||||
|
||||
# Parse the next URL for max_id
|
||||
next_url = next_match.group(1)
|
||||
max_id_match = re.search(r"max_id=(\d+)", next_url)
|
||||
if not max_id_match:
|
||||
break
|
||||
|
||||
params["max_id"] = max_id_match.group(1)
|
||||
# Remove since_id for subsequent pages — we're paginating backwards
|
||||
# Actually we keep since_id as the floor
|
||||
time.sleep(0.5) # Be polite between pages
|
||||
|
||||
# Return oldest first so we can process chronologically
|
||||
all_statuses.reverse()
|
||||
return all_statuses
|
||||
|
||||
|
||||
def get_status_context(instance: str, status_id: str) -> dict:
|
||||
"""Get the context (ancestors + descendants) of a status. Useful for threading."""
|
||||
resp = _api_get(instance, f"/api/v1/statuses/{status_id}/context")
|
||||
return resp.json()
|
||||
|
||||
|
||||
def classify_status(status: dict, monitored_account_id: str) -> str:
|
||||
"""
|
||||
Classify a status as: post, reply, mention, or reblog.
|
||||
- reblog: the status is a boost of another status
|
||||
- reply: the status is in reply to another status
|
||||
- mention: the status mentions other accounts (but is not a reply)
|
||||
- post: a standalone original post
|
||||
"""
|
||||
if status.get("reblog"):
|
||||
return "reblog"
|
||||
if status.get("in_reply_to_id"):
|
||||
return "reply"
|
||||
mentions = status.get("mentions", [])
|
||||
if mentions:
|
||||
# Only classify as "mention" if it mentions someone other than self
|
||||
other_mentions = [m for m in mentions if m.get("id") != monitored_account_id]
|
||||
if other_mentions:
|
||||
return "mention"
|
||||
return "post"
|
||||
|
||||
|
||||
def parse_status(status: dict, monitored_account_id: str) -> dict:
|
||||
"""Parse a raw Mastodon status JSON into a flat dict for storage."""
|
||||
# If it's a reblog, we store the original content but flag it
|
||||
actual = status.get("reblog") or status
|
||||
content_html = actual.get("content", "")
|
||||
|
||||
return {
|
||||
"status_id": status["id"],
|
||||
"uri": status.get("uri", ""),
|
||||
"url": status.get("url") or actual.get("url", ""),
|
||||
"content": content_html,
|
||||
"text_content": _strip_html(content_html),
|
||||
"visibility": status.get("visibility", "public"),
|
||||
"created_at": status.get("created_at"),
|
||||
"language": status.get("language") or actual.get("language"),
|
||||
"sensitive": status.get("sensitive", False),
|
||||
"spoiler_text": status.get("spoiler_text", ""),
|
||||
"in_reply_to_id": status.get("in_reply_to_id"),
|
||||
"in_reply_to_account_id": status.get("in_reply_to_account_id"),
|
||||
"conversation_id": status.get("conversation", {}).get("id") if isinstance(status.get("conversation"), dict) else None,
|
||||
"replies_count": status.get("replies_count", 0),
|
||||
"reblogs_count": status.get("reblogs_count", 0),
|
||||
"favourites_count": status.get("favourites_count", 0),
|
||||
"status_type": classify_status(status, monitored_account_id),
|
||||
"mentions": [
|
||||
{
|
||||
"mentioned_account_id": m.get("id"),
|
||||
"mentioned_username": m.get("username", ""),
|
||||
"mentioned_acct": m.get("acct", ""),
|
||||
"mentioned_url": m.get("url", ""),
|
||||
}
|
||||
for m in (actual.get("mentions") or [])
|
||||
],
|
||||
"media_attachments": [
|
||||
{
|
||||
"media_id": ma.get("id"),
|
||||
"media_type": ma.get("type"),
|
||||
"url": ma.get("url"),
|
||||
"preview_url": ma.get("preview_url"),
|
||||
"description": ma.get("description"),
|
||||
}
|
||||
for ma in (actual.get("media_attachments") or [])
|
||||
],
|
||||
"tags": [
|
||||
{"name": t.get("name", ""), "url": t.get("url", "")}
|
||||
for t in (actual.get("tags") or [])
|
||||
],
|
||||
"raw_json": status,
|
||||
}
|
||||
77
app/templates/accounts.html
Normal file
77
app/templates/accounts.html
Normal file
|
|
@ -0,0 +1,77 @@
|
|||
{% extends "base.html" %}
|
||||
{% block title %}Accounts — Mastodon Collector{% endblock %}
|
||||
|
||||
{% block content %}
|
||||
<div class="flex justify-between items-center mb-4">
|
||||
<h1>Monitored Accounts</h1>
|
||||
</div>
|
||||
|
||||
<div class="card mb-4">
|
||||
<h2>Add Account</h2>
|
||||
<form method="POST" action="{{ url_for('accounts_add') }}" class="form-inline mt-2">
|
||||
<input type="text" name="handle" placeholder="@user@instance.social" style="width: 320px;" required>
|
||||
<button type="submit" class="btn btn-primary">Add Account</button>
|
||||
</form>
|
||||
<p class="text-muted text-sm mt-2">
|
||||
You can also add accounts by editing <code>accounts.txt</code> — the collector picks them up automatically.
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div class="card">
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Handle</th>
|
||||
<th>Display Name</th>
|
||||
<th>Account ID</th>
|
||||
<th>Status</th>
|
||||
<th>Last Collected</th>
|
||||
<th>Actions</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for acct in accounts %}
|
||||
<tr>
|
||||
<td>
|
||||
<a href="{{ url_for('statuses_list', account_id=acct.id) }}" style="color: var(--accent);">
|
||||
{{ acct.handle }}
|
||||
</a>
|
||||
</td>
|
||||
<td>{{ acct.display_name or '—' }}</td>
|
||||
<td class="text-muted text-sm">{{ acct.account_id or 'unresolved' }}</td>
|
||||
<td>
|
||||
{% if acct.is_active %}
|
||||
<span class="badge badge-active">Active</span>
|
||||
{% else %}
|
||||
<span class="badge badge-paused">Paused</span>
|
||||
{% endif %}
|
||||
</td>
|
||||
<td class="text-muted text-sm">
|
||||
{{ acct.last_collected_at.strftime('%Y-%m-%d %H:%M') if acct.last_collected_at else 'Never' }}
|
||||
</td>
|
||||
<td>
|
||||
<div class="flex gap-2">
|
||||
<form method="POST" action="{{ url_for('accounts_toggle', account_id=acct.id) }}">
|
||||
<button type="submit" class="btn btn-outline btn-sm">
|
||||
{{ 'Pause' if acct.is_active else 'Resume' }}
|
||||
</button>
|
||||
</form>
|
||||
<form method="POST" action="{{ url_for('accounts_delete', account_id=acct.id) }}"
|
||||
onsubmit="return confirm('Delete {{ acct.handle }} and ALL collected data? This cannot be undone.')">
|
||||
<button type="submit" class="btn btn-danger btn-sm">Delete</button>
|
||||
</form>
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
{% if not accounts %}
|
||||
<tr>
|
||||
<td colspan="6" class="text-muted" style="text-align:center; padding: 24px;">
|
||||
No accounts yet. Add one above or edit <code>accounts.txt</code>.
|
||||
</td>
|
||||
</tr>
|
||||
{% endif %}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
{% endblock %}
|
||||
262
app/templates/base.html
Normal file
262
app/templates/base.html
Normal file
|
|
@ -0,0 +1,262 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>{% block title %}Mastodon Collector{% endblock %}</title>
|
||||
<style>
|
||||
:root {
|
||||
--bg: #1a1a2e;
|
||||
--bg-card: #16213e;
|
||||
--bg-hover: #1a2745;
|
||||
--text: #e0e0e0;
|
||||
--text-muted: #8892a4;
|
||||
--accent: #e2dbff;
|
||||
--accent-hover: #ffffff;
|
||||
--link: #7dd3fc;
|
||||
--success: #2ecc71;
|
||||
--warning: #f39c12;
|
||||
--danger: #e74c3c;
|
||||
--border: #2a3a5c;
|
||||
--tag-post: #3498db;
|
||||
--tag-reply: #e67e22;
|
||||
--tag-mention: #9b59b6;
|
||||
--tag-reblog: #1abc9c;
|
||||
}
|
||||
|
||||
* { margin: 0; padding: 0; box-sizing: border-box; }
|
||||
|
||||
body {
|
||||
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
|
||||
background: var(--bg);
|
||||
color: var(--text);
|
||||
line-height: 1.6;
|
||||
}
|
||||
|
||||
.container { max-width: 1200px; margin: 0 auto; padding: 0 20px; }
|
||||
|
||||
nav {
|
||||
background: var(--bg-card);
|
||||
border-bottom: 1px solid var(--border);
|
||||
padding: 12px 0;
|
||||
position: sticky;
|
||||
top: 0;
|
||||
z-index: 100;
|
||||
}
|
||||
nav .container {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 24px;
|
||||
}
|
||||
nav .logo {
|
||||
font-size: 18px;
|
||||
font-weight: 700;
|
||||
color: var(--accent);
|
||||
text-decoration: none;
|
||||
}
|
||||
nav a {
|
||||
color: var(--text-muted);
|
||||
text-decoration: none;
|
||||
font-size: 14px;
|
||||
padding: 6px 12px;
|
||||
border-radius: 6px;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
nav a:hover, nav a.active {
|
||||
color: var(--text);
|
||||
background: var(--bg-hover);
|
||||
}
|
||||
|
||||
main { padding: 24px 0; }
|
||||
|
||||
h1 { font-size: 24px; margin-bottom: 20px; }
|
||||
h2 { font-size: 18px; margin-bottom: 12px; color: var(--text-muted); }
|
||||
|
||||
.flash {
|
||||
padding: 12px 16px;
|
||||
border-radius: 8px;
|
||||
margin-bottom: 16px;
|
||||
font-size: 14px;
|
||||
}
|
||||
.flash.success { background: rgba(46, 204, 113, 0.15); border: 1px solid var(--success); color: var(--success); }
|
||||
.flash.error { background: rgba(231, 76, 60, 0.15); border: 1px solid var(--danger); color: var(--danger); }
|
||||
.flash.info { background: rgba(108, 99, 255, 0.15); border: 1px solid var(--accent); color: var(--accent); }
|
||||
|
||||
.stats-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
|
||||
gap: 16px;
|
||||
margin-bottom: 24px;
|
||||
}
|
||||
.stat-card {
|
||||
background: var(--bg-card);
|
||||
border: 1px solid var(--border);
|
||||
border-radius: 10px;
|
||||
padding: 20px;
|
||||
text-align: center;
|
||||
}
|
||||
.stat-card .number {
|
||||
font-size: 32px;
|
||||
font-weight: 700;
|
||||
color: var(--accent);
|
||||
}
|
||||
.stat-card .label {
|
||||
font-size: 13px;
|
||||
color: var(--text-muted);
|
||||
margin-top: 4px;
|
||||
}
|
||||
|
||||
.card {
|
||||
background: var(--bg-card);
|
||||
border: 1px solid var(--border);
|
||||
border-radius: 10px;
|
||||
padding: 20px;
|
||||
margin-bottom: 16px;
|
||||
}
|
||||
|
||||
table {
|
||||
width: 100%;
|
||||
border-collapse: collapse;
|
||||
font-size: 14px;
|
||||
}
|
||||
th, td {
|
||||
padding: 10px 12px;
|
||||
text-align: left;
|
||||
border-bottom: 1px solid var(--border);
|
||||
}
|
||||
th {
|
||||
color: var(--text-muted);
|
||||
font-weight: 600;
|
||||
font-size: 12px;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.5px;
|
||||
}
|
||||
tr:hover { background: var(--bg-hover); }
|
||||
td a { color: var(--link); text-decoration: none; }
|
||||
td a:hover { color: #ffffff; text-decoration: underline; }
|
||||
|
||||
.badge {
|
||||
display: inline-block;
|
||||
padding: 2px 8px;
|
||||
border-radius: 12px;
|
||||
font-size: 11px;
|
||||
font-weight: 600;
|
||||
text-transform: uppercase;
|
||||
}
|
||||
.badge-post { background: rgba(52, 152, 219, 0.2); color: var(--tag-post); }
|
||||
.badge-reply { background: rgba(230, 126, 34, 0.2); color: var(--tag-reply); }
|
||||
.badge-mention { background: rgba(155, 89, 182, 0.2); color: var(--tag-mention); }
|
||||
.badge-reblog { background: rgba(26, 188, 156, 0.2); color: var(--tag-reblog); }
|
||||
.badge-active { background: rgba(46, 204, 113, 0.2); color: var(--success); }
|
||||
.badge-paused { background: rgba(243, 156, 18, 0.2); color: var(--warning); }
|
||||
.badge-success { background: rgba(46, 204, 113, 0.2); color: var(--success); }
|
||||
.badge-error { background: rgba(231, 76, 60, 0.2); color: var(--danger); }
|
||||
.badge-running { background: rgba(108, 99, 255, 0.2); color: var(--accent); }
|
||||
|
||||
.btn {
|
||||
display: inline-block;
|
||||
padding: 8px 16px;
|
||||
border-radius: 6px;
|
||||
font-size: 14px;
|
||||
font-weight: 500;
|
||||
text-decoration: none;
|
||||
border: none;
|
||||
cursor: pointer;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
.btn-primary { background: var(--accent); color: white; }
|
||||
.btn-primary:hover { background: var(--accent-hover); }
|
||||
.btn-sm { padding: 4px 10px; font-size: 12px; }
|
||||
.btn-danger { background: var(--danger); color: white; }
|
||||
.btn-danger:hover { background: #c0392b; }
|
||||
.btn-outline {
|
||||
background: transparent;
|
||||
border: 1px solid var(--border);
|
||||
color: var(--text-muted);
|
||||
}
|
||||
.btn-outline:hover { border-color: var(--accent); color: var(--accent); }
|
||||
|
||||
input[type="text"], input[type="search"], select {
|
||||
background: var(--bg);
|
||||
border: 1px solid var(--border);
|
||||
border-radius: 6px;
|
||||
padding: 8px 12px;
|
||||
color: var(--text);
|
||||
font-size: 14px;
|
||||
outline: none;
|
||||
}
|
||||
input:focus, select:focus { border-color: var(--accent); }
|
||||
|
||||
.form-inline {
|
||||
display: flex;
|
||||
gap: 8px;
|
||||
align-items: center;
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
|
||||
.status-content {
|
||||
font-size: 14px;
|
||||
line-height: 1.7;
|
||||
word-break: break-word;
|
||||
}
|
||||
.status-content a { color: var(--accent); }
|
||||
|
||||
.pagination {
|
||||
display: flex;
|
||||
gap: 8px;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
margin-top: 20px;
|
||||
}
|
||||
.pagination a {
|
||||
color: var(--text-muted);
|
||||
text-decoration: none;
|
||||
padding: 6px 12px;
|
||||
border: 1px solid var(--border);
|
||||
border-radius: 6px;
|
||||
}
|
||||
.pagination a:hover { border-color: var(--accent); color: var(--accent); }
|
||||
.pagination .current {
|
||||
background: var(--accent);
|
||||
color: white;
|
||||
padding: 6px 12px;
|
||||
border-radius: 6px;
|
||||
}
|
||||
|
||||
.text-muted { color: var(--text-muted); }
|
||||
.text-sm { font-size: 13px; }
|
||||
.mt-2 { margin-top: 8px; }
|
||||
.mt-4 { margin-top: 16px; }
|
||||
.mb-4 { margin-bottom: 16px; }
|
||||
.flex { display: flex; }
|
||||
.gap-2 { gap: 8px; }
|
||||
.items-center { align-items: center; }
|
||||
.justify-between { justify-content: space-between; }
|
||||
.truncate { white-space: nowrap; overflow: hidden; text-overflow: ellipsis; max-width: 400px; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<nav>
|
||||
<div class="container">
|
||||
<a href="{{ url_for('index') }}" class="logo">Mastodon Collector</a>
|
||||
<a href="{{ url_for('index') }}" class="{{ 'active' if request.endpoint == 'index' }}">Dashboard</a>
|
||||
<a href="{{ url_for('accounts_list') }}" class="{{ 'active' if request.endpoint == 'accounts_list' }}">Accounts</a>
|
||||
<a href="{{ url_for('statuses_list') }}" class="{{ 'active' if request.endpoint == 'statuses_list' }}">Statuses</a>
|
||||
<a href="{{ url_for('export_csv') }}">Export CSV</a>
|
||||
<a href="{{ url_for('api_stats') }}">API</a>
|
||||
</div>
|
||||
</nav>
|
||||
|
||||
<main>
|
||||
<div class="container">
|
||||
{% with messages = get_flashed_messages(with_categories=true) %}
|
||||
{% for category, message in messages %}
|
||||
<div class="flash {{ category }}">{{ message }}</div>
|
||||
{% endfor %}
|
||||
{% endwith %}
|
||||
|
||||
{% block content %}{% endblock %}
|
||||
</div>
|
||||
</main>
|
||||
</body>
|
||||
</html>
|
||||
123
app/templates/index.html
Normal file
123
app/templates/index.html
Normal file
|
|
@ -0,0 +1,123 @@
|
|||
{% extends "base.html" %}
|
||||
{% block title %}Dashboard — Mastodon Collector{% endblock %}
|
||||
|
||||
{% block content %}
|
||||
<h1>Dashboard</h1>
|
||||
|
||||
<div class="stats-grid">
|
||||
<div class="stat-card">
|
||||
<div class="number">{{ total_statuses }}</div>
|
||||
<div class="label">Total Statuses</div>
|
||||
</div>
|
||||
<div class="stat-card">
|
||||
<div class="number">{{ total_posts }}</div>
|
||||
<div class="label">Posts</div>
|
||||
</div>
|
||||
<div class="stat-card">
|
||||
<div class="number">{{ total_replies }}</div>
|
||||
<div class="label">Replies</div>
|
||||
</div>
|
||||
<div class="stat-card">
|
||||
<div class="number">{{ total_mentions }}</div>
|
||||
<div class="label">Mentions</div>
|
||||
</div>
|
||||
<div class="stat-card">
|
||||
<div class="number">{{ total_reblogs }}</div>
|
||||
<div class="label">Reblogs</div>
|
||||
</div>
|
||||
<div class="stat-card">
|
||||
<div class="number">{{ account_stats|length }}</div>
|
||||
<div class="label">Monitored Accounts</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card">
|
||||
<h2>Monitored Accounts</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Account</th>
|
||||
<th>Instance</th>
|
||||
<th>Status</th>
|
||||
<th>Collected</th>
|
||||
<th>Last Run</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for item in account_stats %}
|
||||
<tr>
|
||||
<td>
|
||||
<a href="{{ url_for('statuses_list', account_id=item.account.id) }}">
|
||||
{{ item.account.handle }}
|
||||
</a>
|
||||
{% if item.account.display_name %}
|
||||
<span class="text-muted text-sm">— {{ item.account.display_name }}</span>
|
||||
{% endif %}
|
||||
</td>
|
||||
<td class="text-muted">{{ item.account.instance }}</td>
|
||||
<td>
|
||||
{% if item.account.is_active %}
|
||||
<span class="badge badge-active">Active</span>
|
||||
{% else %}
|
||||
<span class="badge badge-paused">Paused</span>
|
||||
{% endif %}
|
||||
</td>
|
||||
<td>{{ item.status_count }}</td>
|
||||
<td class="text-muted text-sm">
|
||||
{% if item.account.last_collected_at %}
|
||||
{{ item.account.last_collected_at.strftime('%Y-%m-%d %H:%M') }}
|
||||
{% if item.last_log %}
|
||||
<span class="badge badge-{{ item.last_log.status }}">{{ item.last_log.status }}</span>
|
||||
{% endif %}
|
||||
{% else %}
|
||||
Never
|
||||
{% endif %}
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
{% if not account_stats %}
|
||||
<tr>
|
||||
<td colspan="5" class="text-muted" style="text-align:center; padding: 24px;">
|
||||
No accounts being monitored yet.
|
||||
<a href="{{ url_for('accounts_list') }}" style="color: var(--accent);">Add some accounts</a>.
|
||||
</td>
|
||||
</tr>
|
||||
{% endif %}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
|
||||
{% if recent_logs %}
|
||||
<div class="card mt-4">
|
||||
<h2>Recent Collection Runs</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Time</th>
|
||||
<th>Account</th>
|
||||
<th>Status</th>
|
||||
<th>Collected</th>
|
||||
<th>Error</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for log in recent_logs %}
|
||||
<tr>
|
||||
<td class="text-sm">{{ log.started_at.strftime('%Y-%m-%d %H:%M:%S') if log.started_at }}</td>
|
||||
<td>
|
||||
{% if log.account %}
|
||||
{{ log.account.handle }}
|
||||
{% else %}
|
||||
—
|
||||
{% endif %}
|
||||
</td>
|
||||
<td><span class="badge badge-{{ log.status }}">{{ log.status }}</span></td>
|
||||
<td>{{ log.statuses_collected }}</td>
|
||||
<td class="text-muted text-sm truncate">{{ log.error or '—' }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
{% endif %}
|
||||
{% endblock %}
|
||||
140
app/templates/status_detail.html
Normal file
140
app/templates/status_detail.html
Normal file
|
|
@ -0,0 +1,140 @@
|
|||
{% extends "base.html" %}
|
||||
{% block title %}Status Detail — Mastodon Collector{% endblock %}
|
||||
|
||||
{% block content %}
|
||||
<div class="flex items-center gap-2 mb-4">
|
||||
<a href="{{ url_for('statuses_list') }}" class="btn btn-outline btn-sm">← Back</a>
|
||||
<h1>Status Detail</h1>
|
||||
</div>
|
||||
|
||||
<div class="card mb-4">
|
||||
<div class="flex justify-between items-center mb-4">
|
||||
<div>
|
||||
<span class="badge badge-{{ status.status_type }}">{{ status.status_type }}</span>
|
||||
<span class="text-muted text-sm" style="margin-left: 8px;">
|
||||
{{ status.created_at.strftime('%Y-%m-%d %H:%M:%S UTC') if status.created_at }}
|
||||
</span>
|
||||
</div>
|
||||
<div>
|
||||
{% if status.url %}
|
||||
<a href="{{ status.url }}" target="_blank" class="btn btn-outline btn-sm">View on Mastodon ↗</a>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th style="width: 160px;">Account</th>
|
||||
<td>{{ status.account.handle }}{% if status.account.display_name %} — {{ status.account.display_name }}{% endif %}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Visibility</th>
|
||||
<td>{{ status.visibility }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Language</th>
|
||||
<td>{{ status.language or 'Unknown' }}</td>
|
||||
</tr>
|
||||
{% if status.in_reply_to_id %}
|
||||
<tr>
|
||||
<th>In Reply To</th>
|
||||
<td>Status {{ status.in_reply_to_id }}{% if status.in_reply_to_account_id %} (account {{ status.in_reply_to_account_id }}){% endif %}</td>
|
||||
</tr>
|
||||
{% endif %}
|
||||
{% if status.conversation_id %}
|
||||
<tr>
|
||||
<th>Conversation</th>
|
||||
<td>{{ status.conversation_id }}</td>
|
||||
</tr>
|
||||
{% endif %}
|
||||
<tr>
|
||||
<th>Interactions</th>
|
||||
<td>↩ {{ status.replies_count }} replies ⟳ {{ status.reblogs_count }} reblogs ★ {{ status.favourites_count }} favourites</td>
|
||||
</tr>
|
||||
{% if status.sensitive %}
|
||||
<tr>
|
||||
<th>Sensitive</th>
|
||||
<td>Yes{% if status.spoiler_text %} — {{ status.spoiler_text }}{% endif %}</td>
|
||||
</tr>
|
||||
{% endif %}
|
||||
<tr>
|
||||
<th>Mastodon Status ID</th>
|
||||
<td class="text-muted">{{ status.status_id }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>URI</th>
|
||||
<td class="text-muted text-sm">{{ status.uri }}</td>
|
||||
</tr>
|
||||
</table>
|
||||
</div>
|
||||
|
||||
<div class="card mb-4">
|
||||
<h2>Content (HTML)</h2>
|
||||
<div class="status-content mt-2" style="padding: 16px; background: var(--bg); border-radius: 8px;">
|
||||
{{ status.content | safe }}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card mb-4">
|
||||
<h2>Content (Plain Text)</h2>
|
||||
<div class="mt-2" style="padding: 16px; background: var(--bg); border-radius: 8px; white-space: pre-wrap; font-family: monospace; font-size: 13px;">{{ status.text_content }}</div>
|
||||
</div>
|
||||
|
||||
{% if status.mentions %}
|
||||
<div class="card mb-4">
|
||||
<h2>Mentions ({{ status.mentions|length }})</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>Account</th><th>URL</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for m in status.mentions %}
|
||||
<tr>
|
||||
<td>@{{ m.mentioned_acct }}</td>
|
||||
<td class="text-muted text-sm">{{ m.mentioned_url }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
{% if status.media_attachments %}
|
||||
<div class="card mb-4">
|
||||
<h2>Media Attachments ({{ status.media_attachments|length }})</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>Type</th><th>Description</th><th>URL</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for ma in status.media_attachments %}
|
||||
<tr>
|
||||
<td>{{ ma.media_type }}</td>
|
||||
<td>{{ ma.description or '—' }}</td>
|
||||
<td class="text-sm"><a href="{{ ma.url }}" target="_blank" style="color: var(--accent);">View ↗</a></td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
{% if status.tags %}
|
||||
<div class="card mb-4">
|
||||
<h2>Tags</h2>
|
||||
<div class="mt-2">
|
||||
{% for t in status.tags %}
|
||||
<span class="badge" style="background: var(--bg); margin: 2px;">#{{ t.name }}</span>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
<div class="card">
|
||||
<h2>Raw JSON</h2>
|
||||
<details>
|
||||
<summary class="text-muted" style="cursor: pointer; padding: 8px 0;">Click to expand</summary>
|
||||
<pre style="padding: 16px; background: var(--bg); border-radius: 8px; overflow-x: auto; font-size: 12px; max-height: 600px; overflow-y: auto;">{{ status.raw_json | tojson(indent=2) }}</pre>
|
||||
</details>
|
||||
</div>
|
||||
{% endblock %}
|
||||
112
app/templates/statuses.html
Normal file
112
app/templates/statuses.html
Normal file
|
|
@ -0,0 +1,112 @@
|
|||
{% extends "base.html" %}
|
||||
{% block title %}Statuses — Mastodon Collector{% endblock %}
|
||||
|
||||
{% block content %}
|
||||
<div class="flex justify-between items-center mb-4">
|
||||
<h1>Collected Statuses <span class="text-muted text-sm">({{ total }})</span></h1>
|
||||
<a href="{{ url_for('export_csv', account_id=current_account_id or '', type=current_type or '') }}"
|
||||
class="btn btn-outline btn-sm">Export CSV</a>
|
||||
</div>
|
||||
|
||||
<div class="card mb-4">
|
||||
<form method="GET" action="{{ url_for('statuses_list') }}" class="form-inline">
|
||||
<select name="account_id">
|
||||
<option value="">All accounts</option>
|
||||
{% for acct in accounts %}
|
||||
<option value="{{ acct.id }}" {{ 'selected' if current_account_id == acct.id }}>
|
||||
{{ acct.handle }}
|
||||
</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
|
||||
<select name="type">
|
||||
<option value="">All types</option>
|
||||
<option value="post" {{ 'selected' if current_type == 'post' }}>Posts</option>
|
||||
<option value="reply" {{ 'selected' if current_type == 'reply' }}>Replies</option>
|
||||
<option value="mention" {{ 'selected' if current_type == 'mention' }}>Mentions</option>
|
||||
<option value="reblog" {{ 'selected' if current_type == 'reblog' }}>Reblogs</option>
|
||||
</select>
|
||||
|
||||
<input type="search" name="q" placeholder="Search text content..." value="{{ search }}" style="width: 260px;">
|
||||
<button type="submit" class="btn btn-primary btn-sm">Filter</button>
|
||||
{% if current_account_id or current_type or search %}
|
||||
<a href="{{ url_for('statuses_list') }}" class="btn btn-outline btn-sm">Clear</a>
|
||||
{% endif %}
|
||||
</form>
|
||||
</div>
|
||||
|
||||
<div class="card">
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Date</th>
|
||||
<th>Account</th>
|
||||
<th>Type</th>
|
||||
<th>Content</th>
|
||||
<th>Interactions</th>
|
||||
<th></th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for s in statuses %}
|
||||
<tr>
|
||||
<td class="text-sm" style="white-space:nowrap;">
|
||||
{{ s.created_at.strftime('%Y-%m-%d %H:%M') if s.created_at }}
|
||||
</td>
|
||||
<td class="text-sm">{{ s.account.handle }}</td>
|
||||
<td>
|
||||
<span class="badge badge-{{ s.status_type }}">{{ s.status_type }}</span>
|
||||
</td>
|
||||
<td>
|
||||
<div class="truncate status-content">
|
||||
{{ s.text_content[:200] }}{% if s.text_content and s.text_content|length > 200 %}...{% endif %}
|
||||
</div>
|
||||
{% if s.tags %}
|
||||
<div class="text-sm text-muted mt-2">
|
||||
{% for t in s.tags %}
|
||||
<span>#{{ t.name }}</span>
|
||||
{% endfor %}
|
||||
</div>
|
||||
{% endif %}
|
||||
</td>
|
||||
<td class="text-sm text-muted" style="white-space:nowrap;">
|
||||
↩ {{ s.replies_count }} ⟳ {{ s.reblogs_count }} ★ {{ s.favourites_count }}
|
||||
</td>
|
||||
<td>
|
||||
<a href="{{ url_for('status_detail', status_db_id=s.id) }}" class="btn btn-outline btn-sm">View</a>
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
{% if not statuses %}
|
||||
<tr>
|
||||
<td colspan="6" class="text-muted" style="text-align:center; padding: 24px;">
|
||||
No statuses found. The collector runs every {{ (config.get('POLL_INTERVAL_SECONDS', 14400)|int // 3600) }} hours, or you can wait for the first collection cycle.
|
||||
</td>
|
||||
</tr>
|
||||
{% endif %}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
|
||||
{% if total_pages > 1 %}
|
||||
<div class="pagination">
|
||||
{% if page > 1 %}
|
||||
<a href="{{ url_for('statuses_list', page=page-1, account_id=current_account_id, type=current_type, q=search) }}">← Prev</a>
|
||||
{% endif %}
|
||||
|
||||
{% for p in range(1, total_pages + 1) %}
|
||||
{% if p == page %}
|
||||
<span class="current">{{ p }}</span>
|
||||
{% elif p <= 3 or p >= total_pages - 2 or (p >= page - 2 and p <= page + 2) %}
|
||||
<a href="{{ url_for('statuses_list', page=p, account_id=current_account_id, type=current_type, q=search) }}">{{ p }}</a>
|
||||
{% elif p == 4 or p == total_pages - 3 %}
|
||||
<span class="text-muted">…</span>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
|
||||
{% if page < total_pages %}
|
||||
<a href="{{ url_for('statuses_list', page=page+1, account_id=current_account_id, type=current_type, q=search) }}">Next →</a>
|
||||
{% endif %}
|
||||
</div>
|
||||
{% endif %}
|
||||
{% endblock %}
|
||||
383
app/web.py
Normal file
383
app/web.py
Normal file
|
|
@ -0,0 +1,383 @@
|
|||
"""Flask web application for managing monitored accounts and viewing collected data."""
|
||||
|
||||
import os
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from flask import Flask, render_template, request, redirect, url_for, flash, jsonify
|
||||
from sqlalchemy import func, desc
|
||||
|
||||
from app.db import (
|
||||
init_db,
|
||||
get_session,
|
||||
MonitoredAccount,
|
||||
Status,
|
||||
Mention,
|
||||
CollectionLog,
|
||||
)
|
||||
from app.mastodon_api import lookup_account, MastodonAPIError
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
app = Flask(__name__)
|
||||
app.secret_key = os.environ.get("FLASK_SECRET_KEY", "dev-secret-key")
|
||||
|
||||
# Initialize database on startup
|
||||
with app.app_context():
|
||||
init_db()
|
||||
|
||||
|
||||
@app.route("/")
|
||||
def index():
|
||||
"""Dashboard overview."""
|
||||
session = get_session()
|
||||
try:
|
||||
accounts = session.query(MonitoredAccount).order_by(MonitoredAccount.instance, MonitoredAccount.username).all()
|
||||
total_statuses = session.query(func.count(Status.id)).scalar() or 0
|
||||
total_posts = session.query(func.count(Status.id)).filter(Status.status_type == "post").scalar() or 0
|
||||
total_replies = session.query(func.count(Status.id)).filter(Status.status_type == "reply").scalar() or 0
|
||||
total_mentions = session.query(func.count(Status.id)).filter(Status.status_type == "mention").scalar() or 0
|
||||
total_reblogs = session.query(func.count(Status.id)).filter(Status.status_type == "reblog").scalar() or 0
|
||||
|
||||
# Per-account stats
|
||||
account_stats = []
|
||||
for acct in accounts:
|
||||
count = session.query(func.count(Status.id)).filter(Status.account_db_id == acct.id).scalar() or 0
|
||||
last_log = (
|
||||
session.query(CollectionLog)
|
||||
.filter_by(account_db_id=acct.id)
|
||||
.order_by(desc(CollectionLog.started_at))
|
||||
.first()
|
||||
)
|
||||
account_stats.append({
|
||||
"account": acct,
|
||||
"status_count": count,
|
||||
"last_log": last_log,
|
||||
})
|
||||
|
||||
# Recent collection logs
|
||||
recent_logs = (
|
||||
session.query(CollectionLog)
|
||||
.order_by(desc(CollectionLog.started_at))
|
||||
.limit(20)
|
||||
.all()
|
||||
)
|
||||
|
||||
return render_template(
|
||||
"index.html",
|
||||
account_stats=account_stats,
|
||||
total_statuses=total_statuses,
|
||||
total_posts=total_posts,
|
||||
total_replies=total_replies,
|
||||
total_mentions=total_mentions,
|
||||
total_reblogs=total_reblogs,
|
||||
recent_logs=recent_logs,
|
||||
)
|
||||
finally:
|
||||
session.close()
|
||||
|
||||
|
||||
@app.route("/accounts")
|
||||
def accounts_list():
|
||||
"""List all monitored accounts."""
|
||||
session = get_session()
|
||||
try:
|
||||
accounts = session.query(MonitoredAccount).order_by(MonitoredAccount.instance, MonitoredAccount.username).all()
|
||||
return render_template("accounts.html", accounts=accounts)
|
||||
finally:
|
||||
session.close()
|
||||
|
||||
|
||||
@app.route("/accounts/add", methods=["POST"])
|
||||
def accounts_add():
|
||||
"""Add a new account to monitor."""
|
||||
handle = request.form.get("handle", "").strip().lstrip("@")
|
||||
if "@" not in handle:
|
||||
flash("Invalid handle format. Use @user@instance.social", "error")
|
||||
return redirect(url_for("accounts_list"))
|
||||
|
||||
username, instance = handle.split("@", 1)
|
||||
if not username or not instance:
|
||||
flash("Invalid handle format. Use @user@instance.social", "error")
|
||||
return redirect(url_for("accounts_list"))
|
||||
|
||||
session = get_session()
|
||||
try:
|
||||
existing = session.query(MonitoredAccount).filter_by(username=username, instance=instance).first()
|
||||
if existing:
|
||||
if not existing.is_active:
|
||||
existing.is_active = True
|
||||
session.commit()
|
||||
flash(f"Re-activated {existing.handle}", "success")
|
||||
else:
|
||||
flash(f"{existing.handle} is already being monitored", "info")
|
||||
return redirect(url_for("accounts_list"))
|
||||
|
||||
# Try to resolve the account first
|
||||
try:
|
||||
data = lookup_account(instance, username)
|
||||
acct = MonitoredAccount(
|
||||
username=username,
|
||||
instance=instance,
|
||||
account_id=data["id"],
|
||||
display_name=data.get("display_name", ""),
|
||||
avatar_url=data.get("avatar", ""),
|
||||
note=data.get("note", ""),
|
||||
is_active=True,
|
||||
)
|
||||
except MastodonAPIError as e:
|
||||
logger.warning("Could not resolve account @%s@%s: %s — adding anyway", username, instance, e)
|
||||
acct = MonitoredAccount(
|
||||
username=username,
|
||||
instance=instance,
|
||||
is_active=True,
|
||||
)
|
||||
|
||||
session.add(acct)
|
||||
session.commit()
|
||||
flash(f"Added {acct.handle} to monitoring list", "success")
|
||||
return redirect(url_for("accounts_list"))
|
||||
|
||||
finally:
|
||||
session.close()
|
||||
|
||||
|
||||
@app.route("/accounts/<int:account_id>/toggle", methods=["POST"])
|
||||
def accounts_toggle(account_id):
|
||||
"""Toggle an account's active status."""
|
||||
session = get_session()
|
||||
try:
|
||||
acct = session.query(MonitoredAccount).get(account_id)
|
||||
if acct:
|
||||
acct.is_active = not acct.is_active
|
||||
session.commit()
|
||||
state = "activated" if acct.is_active else "paused"
|
||||
flash(f"{state.capitalize()} monitoring for {acct.handle}", "success")
|
||||
return redirect(url_for("accounts_list"))
|
||||
finally:
|
||||
session.close()
|
||||
|
||||
|
||||
@app.route("/accounts/<int:account_id>/delete", methods=["POST"])
|
||||
def accounts_delete(account_id):
|
||||
"""Delete an account and all its collected data."""
|
||||
session = get_session()
|
||||
try:
|
||||
acct = session.query(MonitoredAccount).get(account_id)
|
||||
if acct:
|
||||
handle = acct.handle
|
||||
# Delete associated statuses (cascades to mentions, media, tags)
|
||||
session.query(Status).filter_by(account_db_id=acct.id).delete()
|
||||
session.query(CollectionLog).filter_by(account_db_id=acct.id).delete()
|
||||
session.delete(acct)
|
||||
session.commit()
|
||||
flash(f"Deleted {handle} and all collected data", "success")
|
||||
return redirect(url_for("accounts_list"))
|
||||
finally:
|
||||
session.close()
|
||||
|
||||
|
||||
@app.route("/statuses")
|
||||
def statuses_list():
|
||||
"""Browse collected statuses with filters."""
|
||||
session = get_session()
|
||||
try:
|
||||
page = request.args.get("page", 1, type=int)
|
||||
per_page = request.args.get("per_page", 50, type=int)
|
||||
account_id = request.args.get("account_id", type=int)
|
||||
status_type = request.args.get("type", "")
|
||||
search = request.args.get("q", "").strip()
|
||||
|
||||
query = session.query(Status).join(MonitoredAccount)
|
||||
|
||||
if account_id:
|
||||
query = query.filter(Status.account_db_id == account_id)
|
||||
if status_type:
|
||||
query = query.filter(Status.status_type == status_type)
|
||||
if search:
|
||||
query = query.filter(Status.text_content.ilike(f"%{search}%"))
|
||||
|
||||
total = query.count()
|
||||
statuses = (
|
||||
query.order_by(desc(Status.created_at))
|
||||
.offset((page - 1) * per_page)
|
||||
.limit(per_page)
|
||||
.all()
|
||||
)
|
||||
|
||||
accounts = session.query(MonitoredAccount).order_by(MonitoredAccount.username).all()
|
||||
total_pages = max(1, (total + per_page - 1) // per_page)
|
||||
|
||||
return render_template(
|
||||
"statuses.html",
|
||||
statuses=statuses,
|
||||
accounts=accounts,
|
||||
page=page,
|
||||
per_page=per_page,
|
||||
total=total,
|
||||
total_pages=total_pages,
|
||||
current_account_id=account_id,
|
||||
current_type=status_type,
|
||||
search=search,
|
||||
)
|
||||
finally:
|
||||
session.close()
|
||||
|
||||
|
||||
@app.route("/statuses/<int:status_db_id>")
|
||||
def status_detail(status_db_id):
|
||||
"""View a single status with all details."""
|
||||
session = get_session()
|
||||
try:
|
||||
status = session.query(Status).get(status_db_id)
|
||||
if not status:
|
||||
flash("Status not found", "error")
|
||||
return redirect(url_for("statuses_list"))
|
||||
return render_template("status_detail.html", status=status)
|
||||
finally:
|
||||
session.close()
|
||||
|
||||
|
||||
@app.route("/api/stats")
|
||||
def api_stats():
|
||||
"""JSON API endpoint for stats (useful for your analysis pipeline)."""
|
||||
session = get_session()
|
||||
try:
|
||||
stats = {
|
||||
"total_statuses": session.query(func.count(Status.id)).scalar() or 0,
|
||||
"by_type": {},
|
||||
"accounts": [],
|
||||
}
|
||||
for stype in ["post", "reply", "mention", "reblog"]:
|
||||
stats["by_type"][stype] = (
|
||||
session.query(func.count(Status.id)).filter(Status.status_type == stype).scalar() or 0
|
||||
)
|
||||
|
||||
accounts = session.query(MonitoredAccount).filter_by(is_active=True).all()
|
||||
for acct in accounts:
|
||||
count = session.query(func.count(Status.id)).filter(Status.account_db_id == acct.id).scalar() or 0
|
||||
stats["accounts"].append({
|
||||
"handle": acct.handle,
|
||||
"status_count": count,
|
||||
"last_collected": acct.last_collected_at.isoformat() if acct.last_collected_at else None,
|
||||
})
|
||||
|
||||
return jsonify(stats)
|
||||
finally:
|
||||
session.close()
|
||||
|
||||
|
||||
@app.route("/api/statuses")
|
||||
def api_statuses():
|
||||
"""JSON API endpoint for statuses (for your analysis pipeline)."""
|
||||
session = get_session()
|
||||
try:
|
||||
page = request.args.get("page", 1, type=int)
|
||||
per_page = min(request.args.get("per_page", 100, type=int), 500)
|
||||
account_id = request.args.get("account_id", type=int)
|
||||
status_type = request.args.get("type", "")
|
||||
since = request.args.get("since", "") # ISO datetime
|
||||
|
||||
query = session.query(Status)
|
||||
|
||||
if account_id:
|
||||
query = query.filter(Status.account_db_id == account_id)
|
||||
if status_type:
|
||||
query = query.filter(Status.status_type == status_type)
|
||||
if since:
|
||||
query = query.filter(Status.created_at >= since)
|
||||
|
||||
total = query.count()
|
||||
statuses = (
|
||||
query.order_by(desc(Status.created_at))
|
||||
.offset((page - 1) * per_page)
|
||||
.limit(per_page)
|
||||
.all()
|
||||
)
|
||||
|
||||
return jsonify({
|
||||
"total": total,
|
||||
"page": page,
|
||||
"per_page": per_page,
|
||||
"statuses": [
|
||||
{
|
||||
"id": s.id,
|
||||
"status_id": s.status_id,
|
||||
"account": s.account.handle,
|
||||
"url": s.url,
|
||||
"content": s.content,
|
||||
"text_content": s.text_content,
|
||||
"visibility": s.visibility,
|
||||
"created_at": s.created_at.isoformat() if s.created_at else None,
|
||||
"language": s.language,
|
||||
"status_type": s.status_type,
|
||||
"in_reply_to_id": s.in_reply_to_id,
|
||||
"replies_count": s.replies_count,
|
||||
"reblogs_count": s.reblogs_count,
|
||||
"favourites_count": s.favourites_count,
|
||||
"mentions": [
|
||||
{"acct": m.mentioned_acct, "url": m.mentioned_url}
|
||||
for m in s.mentions
|
||||
],
|
||||
"tags": [t.name for t in s.tags],
|
||||
}
|
||||
for s in statuses
|
||||
],
|
||||
})
|
||||
finally:
|
||||
session.close()
|
||||
|
||||
|
||||
@app.route("/export")
|
||||
def export_csv():
|
||||
"""Export statuses as CSV for analysis."""
|
||||
from io import StringIO
|
||||
import csv
|
||||
|
||||
session = get_session()
|
||||
try:
|
||||
account_id = request.args.get("account_id", type=int)
|
||||
status_type = request.args.get("type", "")
|
||||
|
||||
query = session.query(Status).join(MonitoredAccount)
|
||||
if account_id:
|
||||
query = query.filter(Status.account_db_id == account_id)
|
||||
if status_type:
|
||||
query = query.filter(Status.status_type == status_type)
|
||||
|
||||
statuses = query.order_by(desc(Status.created_at)).all()
|
||||
|
||||
output = StringIO()
|
||||
writer = csv.writer(output)
|
||||
writer.writerow([
|
||||
"id", "account", "status_type", "created_at", "url",
|
||||
"text_content", "language", "visibility", "in_reply_to_id",
|
||||
"replies_count", "reblogs_count", "favourites_count",
|
||||
"mentions", "tags", "sensitive", "spoiler_text",
|
||||
])
|
||||
|
||||
for s in statuses:
|
||||
mentions_str = "; ".join(m.mentioned_acct for m in s.mentions)
|
||||
tags_str = "; ".join(t.name for t in s.tags)
|
||||
writer.writerow([
|
||||
s.status_id, s.account.handle, s.status_type,
|
||||
s.created_at.isoformat() if s.created_at else "",
|
||||
s.url, s.text_content, s.language, s.visibility,
|
||||
s.in_reply_to_id, s.replies_count, s.reblogs_count,
|
||||
s.favourites_count, mentions_str, tags_str,
|
||||
s.sensitive, s.spoiler_text,
|
||||
])
|
||||
|
||||
from flask import Response
|
||||
return Response(
|
||||
output.getvalue(),
|
||||
mimetype="text/csv",
|
||||
headers={"Content-Disposition": "attachment; filename=mastodon_statuses.csv"},
|
||||
)
|
||||
finally:
|
||||
session.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run(host="0.0.0.0", port=5000, debug=True)
|
||||
50
docker-compose.yml
Normal file
50
docker-compose.yml
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
version: "3.8"
|
||||
|
||||
services:
|
||||
db:
|
||||
image: postgres:16-alpine
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
POSTGRES_DB: mastodon_collector
|
||||
POSTGRES_USER: collector
|
||||
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-collector_secret}
|
||||
volumes:
|
||||
- pgdata:/var/lib/postgresql/data
|
||||
ports:
|
||||
- "127.0.0.1:5434:5432"
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "pg_isready -U collector -d mastodon_collector"]
|
||||
interval: 5s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
|
||||
web:
|
||||
build: .
|
||||
restart: unless-stopped
|
||||
command: gunicorn --bind 0.0.0.0:5000 --workers 2 --timeout 120 app.web:app
|
||||
ports:
|
||||
- "127.0.0.1:8585:5000"
|
||||
environment:
|
||||
DATABASE_URL: postgresql://collector:${POSTGRES_PASSWORD:-collector_secret}@db:5432/mastodon_collector
|
||||
FLASK_SECRET_KEY: ${FLASK_SECRET_KEY:-change-me-in-production}
|
||||
volumes:
|
||||
- ./accounts.txt:/app/accounts.txt
|
||||
depends_on:
|
||||
db:
|
||||
condition: service_healthy
|
||||
|
||||
collector:
|
||||
build: .
|
||||
restart: unless-stopped
|
||||
command: python -m app.collector
|
||||
environment:
|
||||
DATABASE_URL: postgresql://collector:${POSTGRES_PASSWORD:-collector_secret}@db:5432/mastodon_collector
|
||||
POLL_INTERVAL_SECONDS: ${POLL_INTERVAL_SECONDS:-14400}
|
||||
volumes:
|
||||
- ./accounts.txt:/app/accounts.txt
|
||||
depends_on:
|
||||
db:
|
||||
condition: service_healthy
|
||||
|
||||
volumes:
|
||||
pgdata:
|
||||
6
requirements.txt
Normal file
6
requirements.txt
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
flask==3.1.0
|
||||
gunicorn==23.0.0
|
||||
psycopg2-binary==2.9.10
|
||||
sqlalchemy==2.0.36
|
||||
requests==2.32.3
|
||||
apscheduler==3.10.4
|
||||
Loading…
Add table
Reference in a new issue