Add generic LLM provider terminology
- Update all documentation to use "LLM API" instead of "OpenAI GPT-4o-mini"
- Rename OPENAI_API_KEY to LLM_API_KEY in configuration
- Update code comments to reflect generic LLM usage
- Keep OpenAI-compatible client library (supports any LLM provider)
- Add LOCAL_OPERATIONS.md and accounts.txt to .gitignore
This commit is contained in:
parent
870a0710b5
commit
754fddef12
7 changed files with 15 additions and 12 deletions
3
.gitignore
vendored
3
.gitignore
vendored
|
|
@ -48,6 +48,9 @@ venv.bak/
|
||||||
.DS_Store
|
.DS_Store
|
||||||
.claude/
|
.claude/
|
||||||
|
|
||||||
|
# Local documentation
|
||||||
|
LOCAL_OPERATIONS.md
|
||||||
|
|
||||||
# Database files
|
# Database files
|
||||||
*.sqlite
|
*.sqlite
|
||||||
*.sqlite3
|
*.sqlite3
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
# Mastodon Collector
|
# Mastodon Collector
|
||||||
|
|
||||||
Collects posts, replies, and mentions from a list of Mastodon accounts and stores them in PostgreSQL. Includes automated toxicity analysis using OpenAI GPT-4o-mini, a web UI for account management, data browsing, and manual review of flagged content, plus JSON/CSV APIs for your analysis pipeline.
|
Collects posts, replies, and mentions from a list of Mastodon accounts and stores them in PostgreSQL. Includes automated toxicity analysis using an LLM API, a web UI for account management, data browsing, and manual review of flagged content, plus JSON/CSV APIs for your analysis pipeline.
|
||||||
|
|
||||||
## Quick Start
|
## Quick Start
|
||||||
|
|
||||||
|
|
@ -38,7 +38,7 @@ Edit `.env` to customize:
|
||||||
POSTGRES_PASSWORD=collector_secret # Change for production
|
POSTGRES_PASSWORD=collector_secret # Change for production
|
||||||
FLASK_SECRET_KEY=change-me-in-production
|
FLASK_SECRET_KEY=change-me-in-production
|
||||||
POLL_INTERVAL_SECONDS=14400 # Default: 4 hours (14400s)
|
POLL_INTERVAL_SECONDS=14400 # Default: 4 hours (14400s)
|
||||||
OPENAI_API_KEY=sk-... # Required for toxicity analysis
|
LLM_API_KEY=sk-... # Required for toxicity analysis
|
||||||
```
|
```
|
||||||
|
|
||||||
## Toxicity Analysis
|
## Toxicity Analysis
|
||||||
|
|
@ -47,7 +47,7 @@ The system includes automated toxicity detection and manual review capabilities:
|
||||||
|
|
||||||
### Features
|
### Features
|
||||||
|
|
||||||
- **Automated Classification**: Uses OpenAI GPT-4o-mini to analyze posts across 12 toxicity dimensions:
|
- **Automated Classification**: Uses an LLM API to analyze posts across 12 toxicity dimensions:
|
||||||
- General toxicity, threats, hate speech
|
- General toxicity, threats, hate speech
|
||||||
- Racism, antisemitism, islamophobia
|
- Racism, antisemitism, islamophobia
|
||||||
- Sexism, homophobia, ableism
|
- Sexism, homophobia, ableism
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
"""Main toxicity analysis orchestrator.
|
"""Main toxicity analysis orchestrator.
|
||||||
|
|
||||||
Runs as a one-shot batch process: fetches unscored statuses,
|
Runs as a one-shot batch process: fetches unscored statuses,
|
||||||
classifies them in batches with GPT-4o-mini, and stores scores in PostgreSQL.
|
classifies them in batches with an LLM API, and stores scores in PostgreSQL.
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
python -m app.analyzer
|
python -m app.analyzer
|
||||||
|
|
@ -120,7 +120,7 @@ async def run() -> None:
|
||||||
|
|
||||||
db = AnalyzerDB(config.database_url)
|
db = AnalyzerDB(config.database_url)
|
||||||
classifier = ToxicityClassifier(
|
classifier = ToxicityClassifier(
|
||||||
api_key=config.openai_api_key,
|
api_key=config.llm_api_key,
|
||||||
model=config.model,
|
model=config.model,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
"""OpenAI-powered toxicity classifier.
|
"""LLM-powered toxicity classifier.
|
||||||
|
|
||||||
Sends batches of Mastodon statuses to GPT-4o-mini for multi-category toxicity scoring.
|
Sends batches of Mastodon statuses to an LLM API for multi-category toxicity scoring.
|
||||||
Returns a list of dicts of category → score (0.0–1.0).
|
Returns a list of dicts of category → score (0.0–1.0).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
@ -181,7 +181,7 @@ def parse_batch_response(raw: str, batch_size: int) -> list[ToxicityScores]:
|
||||||
|
|
||||||
|
|
||||||
class ToxicityClassifier:
|
class ToxicityClassifier:
|
||||||
"""Async OpenAI-based toxicity classifier with batch support."""
|
"""Async LLM-based toxicity classifier with batch support."""
|
||||||
|
|
||||||
def __init__(self, api_key: str, model: str = "gpt-4o-mini"):
|
def __init__(self, api_key: str, model: str = "gpt-4o-mini"):
|
||||||
self.client = AsyncOpenAI(api_key=api_key)
|
self.client = AsyncOpenAI(api_key=api_key)
|
||||||
|
|
|
||||||
|
|
@ -11,7 +11,7 @@ class AnalyzerConfig:
|
||||||
"""Configuration for the toxicity analyzer."""
|
"""Configuration for the toxicity analyzer."""
|
||||||
|
|
||||||
database_url: str
|
database_url: str
|
||||||
openai_api_key: str
|
llm_api_key: str
|
||||||
model: str = "gpt-4o-mini"
|
model: str = "gpt-4o-mini"
|
||||||
batch_size: int = 10
|
batch_size: int = 10
|
||||||
concurrency: int = 5
|
concurrency: int = 5
|
||||||
|
|
@ -28,7 +28,7 @@ class AnalyzerConfig:
|
||||||
"""Load configuration from environment variables."""
|
"""Load configuration from environment variables."""
|
||||||
return cls(
|
return cls(
|
||||||
database_url=os.environ["DATABASE_URL"],
|
database_url=os.environ["DATABASE_URL"],
|
||||||
openai_api_key=os.environ["OPENAI_API_KEY"],
|
llm_api_key=os.environ["LLM_API_KEY"],
|
||||||
model=os.getenv("ANALYZER_MODEL", "gpt-4o-mini"),
|
model=os.getenv("ANALYZER_MODEL", "gpt-4o-mini"),
|
||||||
batch_size=int(os.getenv("ANALYZER_BATCH_SIZE", "10")),
|
batch_size=int(os.getenv("ANALYZER_BATCH_SIZE", "10")),
|
||||||
concurrency=int(os.getenv("ANALYZER_CONCURRENCY", "5")),
|
concurrency=int(os.getenv("ANALYZER_CONCURRENCY", "5")),
|
||||||
|
|
|
||||||
|
|
@ -40,7 +40,7 @@ services:
|
||||||
environment:
|
environment:
|
||||||
DATABASE_URL: postgresql://collector:${POSTGRES_PASSWORD:-collector_secret}@db:5432/mastodon_collector
|
DATABASE_URL: postgresql://collector:${POSTGRES_PASSWORD:-collector_secret}@db:5432/mastodon_collector
|
||||||
POLL_INTERVAL_SECONDS: ${POLL_INTERVAL_SECONDS:-14400}
|
POLL_INTERVAL_SECONDS: ${POLL_INTERVAL_SECONDS:-14400}
|
||||||
OPENAI_API_KEY: ${OPENAI_API_KEY}
|
LLM_API_KEY: ${LLM_API_KEY}
|
||||||
volumes:
|
volumes:
|
||||||
- ./accounts.txt:/app/accounts.txt
|
- ./accounts.txt:/app/accounts.txt
|
||||||
depends_on:
|
depends_on:
|
||||||
|
|
|
||||||
|
|
@ -5,5 +5,5 @@ sqlalchemy==2.0.36
|
||||||
requests==2.32.3
|
requests==2.32.3
|
||||||
apscheduler==3.10.4
|
apscheduler==3.10.4
|
||||||
beautifulsoup4==4.12.3
|
beautifulsoup4==4.12.3
|
||||||
openai==1.58.1
|
openai==1.58.1 # OpenAI-compatible API client (supports any LLM provider)
|
||||||
asyncpg==0.30.0
|
asyncpg==0.30.0
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue