Add generic LLM provider terminology

- Update all documentation to use "LLM API" instead of "OpenAI GPT-4o-mini"
- Rename OPENAI_API_KEY to LLM_API_KEY in configuration
- Update code comments to reflect generic LLM usage
- Keep OpenAI-compatible client library (supports any LLM provider)
- Add LOCAL_OPERATIONS.md and accounts.txt to .gitignore
This commit is contained in:
Pieter 2026-04-18 20:27:09 +02:00
parent 870a0710b5
commit 754fddef12
7 changed files with 15 additions and 12 deletions

3
.gitignore vendored
View file

@ -48,6 +48,9 @@ venv.bak/
.DS_Store .DS_Store
.claude/ .claude/
# Local documentation
LOCAL_OPERATIONS.md
# Database files # Database files
*.sqlite *.sqlite
*.sqlite3 *.sqlite3

View file

@ -1,6 +1,6 @@
# Mastodon Collector # Mastodon Collector
Collects posts, replies, and mentions from a list of Mastodon accounts and stores them in PostgreSQL. Includes automated toxicity analysis using OpenAI GPT-4o-mini, a web UI for account management, data browsing, and manual review of flagged content, plus JSON/CSV APIs for your analysis pipeline. Collects posts, replies, and mentions from a list of Mastodon accounts and stores them in PostgreSQL. Includes automated toxicity analysis using an LLM API, a web UI for account management, data browsing, and manual review of flagged content, plus JSON/CSV APIs for your analysis pipeline.
## Quick Start ## Quick Start
@ -38,7 +38,7 @@ Edit `.env` to customize:
POSTGRES_PASSWORD=collector_secret # Change for production POSTGRES_PASSWORD=collector_secret # Change for production
FLASK_SECRET_KEY=change-me-in-production FLASK_SECRET_KEY=change-me-in-production
POLL_INTERVAL_SECONDS=14400 # Default: 4 hours (14400s) POLL_INTERVAL_SECONDS=14400 # Default: 4 hours (14400s)
OPENAI_API_KEY=sk-... # Required for toxicity analysis LLM_API_KEY=sk-... # Required for toxicity analysis
``` ```
## Toxicity Analysis ## Toxicity Analysis
@ -47,7 +47,7 @@ The system includes automated toxicity detection and manual review capabilities:
### Features ### Features
- **Automated Classification**: Uses OpenAI GPT-4o-mini to analyze posts across 12 toxicity dimensions: - **Automated Classification**: Uses an LLM API to analyze posts across 12 toxicity dimensions:
- General toxicity, threats, hate speech - General toxicity, threats, hate speech
- Racism, antisemitism, islamophobia - Racism, antisemitism, islamophobia
- Sexism, homophobia, ableism - Sexism, homophobia, ableism

View file

@ -1,7 +1,7 @@
"""Main toxicity analysis orchestrator. """Main toxicity analysis orchestrator.
Runs as a one-shot batch process: fetches unscored statuses, Runs as a one-shot batch process: fetches unscored statuses,
classifies them in batches with GPT-4o-mini, and stores scores in PostgreSQL. classifies them in batches with an LLM API, and stores scores in PostgreSQL.
Usage: Usage:
python -m app.analyzer python -m app.analyzer
@ -120,7 +120,7 @@ async def run() -> None:
db = AnalyzerDB(config.database_url) db = AnalyzerDB(config.database_url)
classifier = ToxicityClassifier( classifier = ToxicityClassifier(
api_key=config.openai_api_key, api_key=config.llm_api_key,
model=config.model, model=config.model,
) )

View file

@ -1,6 +1,6 @@
"""OpenAI-powered toxicity classifier. """LLM-powered toxicity classifier.
Sends batches of Mastodon statuses to GPT-4o-mini for multi-category toxicity scoring. Sends batches of Mastodon statuses to an LLM API for multi-category toxicity scoring.
Returns a list of dicts mapping category to score (0.0-1.0). Returns a list of dicts mapping category to score (0.0-1.0).
""" """
@ -181,7 +181,7 @@ def parse_batch_response(raw: str, batch_size: int) -> list[ToxicityScores]:
class ToxicityClassifier: class ToxicityClassifier:
"""Async OpenAI-based toxicity classifier with batch support.""" """Async LLM-based toxicity classifier with batch support."""
def __init__(self, api_key: str, model: str = "gpt-4o-mini"): def __init__(self, api_key: str, model: str = "gpt-4o-mini"):
self.client = AsyncOpenAI(api_key=api_key) self.client = AsyncOpenAI(api_key=api_key)

View file

@ -11,7 +11,7 @@ class AnalyzerConfig:
"""Configuration for the toxicity analyzer.""" """Configuration for the toxicity analyzer."""
database_url: str database_url: str
openai_api_key: str llm_api_key: str
model: str = "gpt-4o-mini" model: str = "gpt-4o-mini"
batch_size: int = 10 batch_size: int = 10
concurrency: int = 5 concurrency: int = 5
@ -28,7 +28,7 @@ class AnalyzerConfig:
"""Load configuration from environment variables.""" """Load configuration from environment variables."""
return cls( return cls(
database_url=os.environ["DATABASE_URL"], database_url=os.environ["DATABASE_URL"],
openai_api_key=os.environ["OPENAI_API_KEY"], llm_api_key=os.environ["LLM_API_KEY"],
model=os.getenv("ANALYZER_MODEL", "gpt-4o-mini"), model=os.getenv("ANALYZER_MODEL", "gpt-4o-mini"),
batch_size=int(os.getenv("ANALYZER_BATCH_SIZE", "10")), batch_size=int(os.getenv("ANALYZER_BATCH_SIZE", "10")),
concurrency=int(os.getenv("ANALYZER_CONCURRENCY", "5")), concurrency=int(os.getenv("ANALYZER_CONCURRENCY", "5")),

View file

@ -40,7 +40,7 @@ services:
environment: environment:
DATABASE_URL: postgresql://collector:${POSTGRES_PASSWORD:-collector_secret}@db:5432/mastodon_collector DATABASE_URL: postgresql://collector:${POSTGRES_PASSWORD:-collector_secret}@db:5432/mastodon_collector
POLL_INTERVAL_SECONDS: ${POLL_INTERVAL_SECONDS:-14400} POLL_INTERVAL_SECONDS: ${POLL_INTERVAL_SECONDS:-14400}
OPENAI_API_KEY: ${OPENAI_API_KEY} LLM_API_KEY: ${LLM_API_KEY}
volumes: volumes:
- ./accounts.txt:/app/accounts.txt - ./accounts.txt:/app/accounts.txt
depends_on: depends_on:

View file

@ -5,5 +5,5 @@ sqlalchemy==2.0.36
requests==2.32.3 requests==2.32.3
apscheduler==3.10.4 apscheduler==3.10.4
beautifulsoup4==4.12.3 beautifulsoup4==4.12.3
openai==1.58.1 openai==1.58.1 # OpenAI-compatible API client (supports any LLM provider)
asyncpg==0.30.0 asyncpg==0.30.0