diff --git a/.gitignore b/.gitignore index 628546d..bb351a0 100644 --- a/.gitignore +++ b/.gitignore @@ -48,6 +48,9 @@ venv.bak/ .DS_Store .claude/ +# Local documentation +LOCAL_OPERATIONS.md + # Database files *.sqlite *.sqlite3 diff --git a/README.md b/README.md index a3a6b6a..0e483ee 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Mastodon Collector -Collects posts, replies, and mentions from a list of Mastodon accounts and stores them in PostgreSQL. Includes automated toxicity analysis using OpenAI GPT-4o-mini, a web UI for account management, data browsing, and manual review of flagged content, plus JSON/CSV APIs for your analysis pipeline. +Collects posts, replies, and mentions from a list of Mastodon accounts and stores them in PostgreSQL. Includes automated toxicity analysis using an LLM API, a web UI for account management, data browsing, and manual review of flagged content, plus JSON/CSV APIs for your analysis pipeline. ## Quick Start @@ -38,7 +38,7 @@ Edit `.env` to customize: POSTGRES_PASSWORD=collector_secret # Change for production FLASK_SECRET_KEY=change-me-in-production POLL_INTERVAL_SECONDS=14400 # Default: 4 hours (14400s) -OPENAI_API_KEY=sk-... # Required for toxicity analysis +LLM_API_KEY=sk-... # Required for toxicity analysis ``` ## Toxicity Analysis @@ -47,7 +47,7 @@ The system includes automated toxicity detection and manual review capabilities: ### Features -- **Automated Classification**: Uses OpenAI GPT-4o-mini to analyze posts across 12 toxicity dimensions: +- **Automated Classification**: Uses an LLM API to analyze posts across 12 toxicity dimensions: - General toxicity, threats, hate speech - Racism, antisemitism, islamophobia - Sexism, homophobia, ableism diff --git a/app/analyzer/analyzer.py b/app/analyzer/analyzer.py index a1c6731..fb52d37 100644 --- a/app/analyzer/analyzer.py +++ b/app/analyzer/analyzer.py @@ -1,7 +1,7 @@ """Main toxicity analysis orchestrator. 
Runs as a one-shot batch process: fetches unscored statuses, -classifies them in batches with GPT-4o-mini, and stores scores in PostgreSQL. +classifies them in batches with an LLM API, and stores scores in PostgreSQL. Usage: python -m app.analyzer @@ -120,7 +120,7 @@ async def run() -> None: db = AnalyzerDB(config.database_url) classifier = ToxicityClassifier( - api_key=config.openai_api_key, + api_key=config.llm_api_key, model=config.model, ) diff --git a/app/analyzer/classifier.py b/app/analyzer/classifier.py index a3de131..07d80ec 100644 --- a/app/analyzer/classifier.py +++ b/app/analyzer/classifier.py @@ -1,6 +1,6 @@ -"""OpenAI-powered toxicity classifier. +"""LLM-powered toxicity classifier. -Sends batches of Mastodon statuses to GPT-4o-mini for multi-category toxicity scoring. +Sends batches of Mastodon statuses to an LLM API for multi-category toxicity scoring. Returns a list of dicts of category → score (0.0–1.0). """ @@ -181,7 +181,7 @@ def parse_batch_response(raw: str, batch_size: int) -> list[ToxicityScores]: class ToxicityClassifier: - """Async OpenAI-based toxicity classifier with batch support.""" + """Async LLM-based toxicity classifier with batch support.""" def __init__(self, api_key: str, model: str = "gpt-4o-mini"): self.client = AsyncOpenAI(api_key=api_key) diff --git a/app/analyzer/config.py b/app/analyzer/config.py index 64b1cd2..7f0d486 100644 --- a/app/analyzer/config.py +++ b/app/analyzer/config.py @@ -11,7 +11,7 @@ class AnalyzerConfig: """Configuration for the toxicity analyzer.""" database_url: str - openai_api_key: str + llm_api_key: str model: str = "gpt-4o-mini" batch_size: int = 10 concurrency: int = 5 @@ -28,7 +28,7 @@ class AnalyzerConfig: """Load configuration from environment variables.""" return cls( database_url=os.environ["DATABASE_URL"], - openai_api_key=os.environ["OPENAI_API_KEY"], + llm_api_key=os.environ["LLM_API_KEY"], model=os.getenv("ANALYZER_MODEL", "gpt-4o-mini"), batch_size=int(os.getenv("ANALYZER_BATCH_SIZE", "10")), 
concurrency=int(os.getenv("ANALYZER_CONCURRENCY", "5")), diff --git a/docker-compose.yml b/docker-compose.yml index 06674f8..755fbc6 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -40,7 +40,7 @@ services: environment: DATABASE_URL: postgresql://collector:${POSTGRES_PASSWORD:-collector_secret}@db:5432/mastodon_collector POLL_INTERVAL_SECONDS: ${POLL_INTERVAL_SECONDS:-14400} - OPENAI_API_KEY: ${OPENAI_API_KEY} + LLM_API_KEY: ${LLM_API_KEY} volumes: - ./accounts.txt:/app/accounts.txt depends_on: diff --git a/requirements.txt b/requirements.txt index 735ed42..d05196b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,5 +5,5 @@ sqlalchemy==2.0.36 requests==2.32.3 apscheduler==3.10.4 beautifulsoup4==4.12.3 -openai==1.58.1 +openai==1.58.1 # OpenAI-compatible API client (works with any provider exposing an OpenAI-compatible endpoint) asyncpg==0.30.0