""" Collector service — periodically polls Mastodon for new statuses from monitored accounts. Runs as a standalone process via `python -m app.collector`. """ import logging import os import sys import time from datetime import datetime, timezone from pathlib import Path from apscheduler.schedulers.blocking import BlockingScheduler from app.db import ( init_db, get_session, MonitoredAccount, Status, Mention, MediaAttachment, Tag, CollectionLog, ) from app.mastodon_api import ( lookup_account, get_account_statuses, parse_status, MastodonAPIError, RateLimitError, ) logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", handlers=[logging.StreamHandler(sys.stdout)], ) logger = logging.getLogger("collector") POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL_SECONDS", 14400)) ACCOUNTS_FILE = os.environ.get("ACCOUNTS_FILE", "/app/accounts.txt") def load_accounts_from_file(filepath: str) -> list[tuple[str, str]]: """Parse accounts.txt and return list of (username, instance) tuples.""" accounts = [] path = Path(filepath) if not path.exists(): logger.warning("Accounts file not found: %s", filepath) return accounts for line in path.read_text().splitlines(): line = line.strip() if not line or line.startswith("#"): continue # Expected format: @user@instance.social or user@instance.social line = line.lstrip("@") if "@" not in line: logger.warning("Skipping malformed account line: %s", line) continue parts = line.split("@", 1) if len(parts) == 2 and parts[0] and parts[1]: accounts.append((parts[0], parts[1])) else: logger.warning("Skipping malformed account line: %s", line) return accounts def sync_monitored_accounts(session) -> list[MonitoredAccount]: """ Sync accounts from the file + database. Accounts added via web UI are already in the DB. Accounts in the file get added if missing. Returns all active monitored accounts. """ file_accounts = load_accounts_from_file(ACCOUNTS_FILE) for username, instance in file_accounts: existing = ( session.query(MonitoredAccount) .filter_by(username=username, instance=instance) .first() ) if not existing: logger.info("Adding account from file: @%s@%s", username, instance) acct = MonitoredAccount(username=username, instance=instance, is_active=True) session.add(acct) session.commit() return session.query(MonitoredAccount).filter_by(is_active=True).all() def resolve_account(session, account: MonitoredAccount) -> bool: """Look up the Mastodon account ID if we don't have it yet.""" if account.account_id: return True try: data = lookup_account(account.instance, account.username) account.account_id = data["id"] account.display_name = data.get("display_name", "") account.avatar_url = data.get("avatar", "") account.note = data.get("note", "") session.commit() logger.info("Resolved %s → account_id=%s", account.handle, account.account_id) return True except MastodonAPIError as e: logger.error("Failed to resolve %s: %s", account.handle, e) return False def store_status(session, account: MonitoredAccount, parsed: dict) -> bool: """Store a parsed status in the database. 

def sync_monitored_accounts(session) -> list[MonitoredAccount]:
    """
    Sync accounts from the file + database.

    Accounts added via the web UI are already in the DB. Accounts in the
    file get added if missing. Returns all active monitored accounts.
    """
    file_accounts = load_accounts_from_file(ACCOUNTS_FILE)
    for username, instance in file_accounts:
        existing = (
            session.query(MonitoredAccount)
            .filter_by(username=username, instance=instance)
            .first()
        )
        if not existing:
            logger.info("Adding account from file: @%s@%s", username, instance)
            acct = MonitoredAccount(username=username, instance=instance, is_active=True)
            session.add(acct)
            session.commit()
    return session.query(MonitoredAccount).filter_by(is_active=True).all()


def resolve_account(session, account: MonitoredAccount) -> bool:
    """Look up the Mastodon account ID if we don't have it yet."""
    if account.account_id:
        return True
    try:
        data = lookup_account(account.instance, account.username)
        account.account_id = data["id"]
        account.display_name = data.get("display_name", "")
        account.avatar_url = data.get("avatar", "")
        account.note = data.get("note", "")
        session.commit()
        logger.info("Resolved %s → account_id=%s", account.handle, account.account_id)
        return True
    except MastodonAPIError as e:
        logger.error("Failed to resolve %s: %s", account.handle, e)
        return False


def store_status(session, account: MonitoredAccount, parsed: dict) -> bool:
    """Store a parsed status in the database.

    Returns True if new, False if duplicate.
    """
    # Check for duplicate
    existing = (
        session.query(Status)
        .filter_by(status_id=parsed["status_id"], account_db_id=account.id)
        .first()
    )
    if existing:
        # Update interaction counts in case they changed
        existing.replies_count = parsed["replies_count"]
        existing.reblogs_count = parsed["reblogs_count"]
        existing.favourites_count = parsed["favourites_count"]
        return False

    status = Status(
        status_id=parsed["status_id"],
        account_db_id=account.id,
        uri=parsed["uri"],
        url=parsed["url"],
        content=parsed["content"],
        text_content=parsed["text_content"],
        visibility=parsed["visibility"],
        created_at=parsed["created_at"],
        language=parsed["language"],
        sensitive=parsed["sensitive"],
        spoiler_text=parsed["spoiler_text"],
        in_reply_to_id=parsed["in_reply_to_id"],
        in_reply_to_account_id=parsed["in_reply_to_account_id"],
        conversation_id=parsed["conversation_id"],
        replies_count=parsed["replies_count"],
        reblogs_count=parsed["reblogs_count"],
        favourites_count=parsed["favourites_count"],
        status_type=parsed["status_type"],
        raw_json=parsed["raw_json"],
    )
    session.add(status)
    session.flush()  # get status.id

    # Store mentions
    for m in parsed["mentions"]:
        session.add(Mention(
            status_db_id=status.id,
            mentioned_account_id=m["mentioned_account_id"],
            mentioned_username=m["mentioned_username"],
            mentioned_acct=m["mentioned_acct"],
            mentioned_url=m["mentioned_url"],
        ))

    # Store media
    for ma in parsed["media_attachments"]:
        session.add(MediaAttachment(
            status_db_id=status.id,
            media_id=ma["media_id"],
            media_type=ma["media_type"],
            url=ma["url"],
            preview_url=ma["preview_url"],
            description=ma["description"],
        ))

    # Store tags
    for t in parsed["tags"]:
        session.add(Tag(
            status_db_id=status.id,
            name=t["name"],
            url=t["url"],
        ))

    return True
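
# Mastodon status IDs are snowflake-style numeric strings on mainline servers,
# so comparing them as plain strings misorders IDs of different lengths
# ("99" > "100"). This helper gives collect_account() a numeric ordering and
# falls back to string order for any ID that is unexpectedly non-numeric
# (a defensive assumption; some forks may use other ID schemes).
def status_id_sort_key(status_id: str) -> tuple:
    """Return a sort key that orders Mastodon status IDs chronologically."""
    try:
        return (0, int(status_id), "")
    except (TypeError, ValueError):
        return (1, 0, str(status_id))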

def collect_account(session, account: MonitoredAccount) -> int:
    """Collect new statuses for a single account.

    Returns the count of new statuses.
    """
    log = CollectionLog(account_db_id=account.id, status="running")
    session.add(log)
    session.commit()

    try:
        if not resolve_account(session, account):
            log.status = "error"
            log.error = "Could not resolve account ID"
            log.finished_at = datetime.now(timezone.utc)
            session.commit()
            return 0

        logger.info("Collecting statuses for %s (since_id=%s)",
                    account.handle, account.last_status_id)
        raw_statuses = get_account_statuses(
            instance=account.instance,
            account_id=account.account_id,
            since_id=account.last_status_id,
        )

        new_count = 0
        newest_id = account.last_status_id
        for raw in raw_statuses:
            parsed = parse_status(raw, account.account_id)
            is_new = store_status(session, account, parsed)
            if is_new:
                new_count += 1
            # Track the newest status ID, comparing numerically via
            # status_id_sort_key (string comparison misorders IDs of
            # different lengths)
            sid = parsed["status_id"]
            if newest_id is None or status_id_sort_key(sid) > status_id_sort_key(newest_id):
                newest_id = sid

        if newest_id:
            account.last_status_id = newest_id
        account.last_collected_at = datetime.now(timezone.utc)
        log.statuses_collected = new_count
        log.status = "success"
        log.finished_at = datetime.now(timezone.utc)
        session.commit()
        logger.info("Collected %d new statuses for %s (total fetched: %d)",
                    new_count, account.handle, len(raw_statuses))
        return new_count

    except RateLimitError as e:
        log.status = "error"
        log.error = f"Rate limited: {e}"
        log.finished_at = datetime.now(timezone.utc)
        session.commit()
        logger.warning("Rate limited while collecting %s: %s", account.handle, e)
        time.sleep(e.retry_after)
        return 0
    except MastodonAPIError as e:
        log.status = "error"
        log.error = str(e)
        log.finished_at = datetime.now(timezone.utc)
        session.commit()
        logger.error("API error collecting %s: %s", account.handle, e)
        return 0
    except Exception as e:
        log.status = "error"
        log.error = str(e)
        log.finished_at = datetime.now(timezone.utc)
        session.commit()
        logger.exception("Unexpected error collecting %s", account.handle)
        return 0


def run_collection_cycle():
    """Run one full collection cycle across all monitored accounts."""
    logger.info("=== Starting collection cycle ===")
    session = get_session()
    try:
        accounts = sync_monitored_accounts(session)
        logger.info("Monitoring %d active accounts", len(accounts))
        total_new = 0
        for account in accounts:
            new = collect_account(session, account)
            total_new += new
            time.sleep(1)  # Brief pause between accounts to be polite
        logger.info("=== Collection cycle complete: %d new statuses across %d accounts ===",
                    total_new, len(accounts))
    except Exception:
        logger.exception("Fatal error in collection cycle")
    finally:
        session.close()


def main():
    """Entry point: initialize the DB and start the scheduler."""
    logger.info("Mastodon Collector starting up...")
    logger.info("Poll interval: %d seconds (%.1f hours)", POLL_INTERVAL, POLL_INTERVAL / 3600)
    init_db()
    logger.info("Database initialized")

    # Run one collection immediately on startup
    run_collection_cycle()

    # Schedule recurring collection
    scheduler = BlockingScheduler()
    scheduler.add_job(run_collection_cycle, "interval", seconds=POLL_INTERVAL)
    logger.info("Scheduler started — next run in %d seconds", POLL_INTERVAL)
    try:
        scheduler.start()
    except (KeyboardInterrupt, SystemExit):
        logger.info("Collector shutting down")
        scheduler.shutdown()


if __name__ == "__main__":
    main()
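
# For a one-off cycle without the scheduler (useful when debugging), the
# collection can be invoked directly; as in main(), init_db() must run first:
#
#     python -c "from app.db import init_db; from app.collector import run_collection_cycle; init_db(); run_collection_cycle()"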