Compare commits
No commits in common. "754fddef12071c8fe8ca49840b28fb72108e06e0" and "8a357766fac819167874eba160d0ce85949bdaa7" have entirely different histories.
754fddef12
...
8a357766fa
9 changed files with 133 additions and 15 deletions
21
.claude/settings.local.json
Normal file
21
.claude/settings.local.json
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
{
|
||||
"permissions": {
|
||||
"allow": [
|
||||
"Bash(git push:*)",
|
||||
"Read(//tmp/bluesky-collector/**)",
|
||||
"Bash(mkdir -p \"/Users/pieter/Nextcloud-Hetzner/PXS Cloud/Projects/26004 HEIO 2/04 Applications/mastodon-collector/app/analyzer\")",
|
||||
"Bash(docker-compose build)",
|
||||
"Bash(docker compose build)",
|
||||
"Bash(docker compose up -d)",
|
||||
"Bash(docker exec mastodon-collector-collector-1 bash -c \"ANALYZER_LIMIT=100 python -m app.analyzer\")",
|
||||
"Bash(docker compose build collector)",
|
||||
"Bash(docker compose up -d collector)",
|
||||
"Bash(docker compose build web)",
|
||||
"Bash(docker compose up -d web)",
|
||||
"Bash(curl -s -o /dev/null -w \"%{http_code}\" http://localhost:8585/analysis)",
|
||||
"Bash(docker logs mastodon-collector-web-1 --tail 30)"
|
||||
],
|
||||
"deny": [],
|
||||
"ask": []
|
||||
}
|
||||
}
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
|
|
@ -48,9 +48,6 @@ venv.bak/
|
|||
.DS_Store
|
||||
.claude/
|
||||
|
||||
# Local documentation
|
||||
LOCAL_OPERATIONS.md
|
||||
|
||||
# Database files
|
||||
*.sqlite
|
||||
*.sqlite3
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
# Mastodon Collector
|
||||
|
||||
Collects posts, replies, and mentions from a list of Mastodon accounts and stores them in PostgreSQL. Includes automated toxicity analysis using an LLM API, a web UI for account management, data browsing, and manual review of flagged content, plus JSON/CSV APIs for your analysis pipeline.
|
||||
Collects posts, replies, and mentions from a list of Mastodon accounts and stores them in PostgreSQL. Includes automated toxicity analysis using LLMs, a web UI for account management, data browsing, and manual review of flagged content, plus JSON/CSV APIs for your analysis pipeline.
|
||||
|
||||
## Quick Start
|
||||
|
||||
|
|
@ -38,7 +38,7 @@ Edit `.env` to customize:
|
|||
POSTGRES_PASSWORD=collector_secret # Change for production
|
||||
FLASK_SECRET_KEY=change-me-in-production
|
||||
POLL_INTERVAL_SECONDS=14400 # Default: 4 hours (14400s)
|
||||
LLM_API_KEY=sk-... # Required for toxicity analysis
|
||||
OPENAI_API_KEY=sk-... # Required for toxicity analysis
|
||||
```
|
||||
|
||||
## Toxicity Analysis
|
||||
|
|
@ -47,7 +47,7 @@ The system includes automated toxicity detection and manual review capabilities:
|
|||
|
||||
### Features
|
||||
|
||||
- **Automated Classification**: Uses an LLM API to analyze posts across 12 toxicity dimensions:
|
||||
- **Automated Classification**: Uses an LLM to analyze posts across 12 toxicity dimensions:
|
||||
- General toxicity, threats, hate speech
|
||||
- Racism, antisemitism, islamophobia
|
||||
- Sexism, homophobia, ableism
|
||||
|
|
|
|||
100
accounts.txt
Normal file
100
accounts.txt
Normal file
|
|
@ -0,0 +1,100 @@
|
|||
# Tweede Kamer
|
||||
@HenriBontenbalCDA@mastodon.social
|
||||
@laurensdassen@mastodon.nl
|
||||
@barbarakathmann@mstdn.social
|
||||
@JesseKlaver@mstdn.social
|
||||
@JanPaternotte@mastodon.online
|
||||
@katipiri@respublicae.eu
|
||||
@Annemarijke@mastodon.social
|
||||
@LisaWesterveld@mastodon.social
|
||||
|
||||
# Eerste Kamer
|
||||
@daanroovers@mastodon.social
|
||||
|
||||
# Partijen landelijk
|
||||
@BIJ1@social.bij1.org
|
||||
@D66@mastodon.social
|
||||
@PartijvoordeDieren@mastodon.social
|
||||
@Piratenpartij@mastodon.social
|
||||
@voltnederland@mastodon.nl
|
||||
|
||||
# Kabinet
|
||||
@MinisterBZK@social.overheid.nl
|
||||
@staatssecretarisbzk@social.overheid.nl
|
||||
|
||||
# Raadsleden & wethouders
|
||||
@elisabethijmker@amsterdam.nl
|
||||
@onno@waag.social
|
||||
@erikwesselius@mastodon.social
|
||||
@paullieverse@mastodon.nl
|
||||
@ErikJonker@mastodon.social
|
||||
@tvanelferen@mastodon.social
|
||||
@joepbc@mastodon.social
|
||||
@lynchantropen@mastodon.social
|
||||
@MirjamHubert@mastodon.nl
|
||||
|
||||
# Partijen gemeentelijk
|
||||
@PvdDAlkmaar@mastodon.social
|
||||
@ArnhemNijmegenBIJ1@social.bij1.org
|
||||
@GroenLinks026@mastodon.nl
|
||||
@pga_asten@mastodon.nl
|
||||
@groenlinks020@mastodon.social
|
||||
@pvddamsterdam@mastodon.social
|
||||
@d66debilt@mastodon.social
|
||||
@D66Bunnik@mastodon.online
|
||||
@ppdelft@mastodon.pirateparty.be
|
||||
@PvdDDelft@mastodon.social
|
||||
@PvdDDenBosch@mastodon.nl
|
||||
@DenHaagBIJ1@social.bij1.org
|
||||
@voltenschede@tukkers.online
|
||||
@PRO_Heeze_Leende@mastodon.nl
|
||||
@groenlinkshengelo@mastodon.social
|
||||
@Groenlinkspvdahouten@mastodon.social
|
||||
@volthouten@mastodon.social
|
||||
@GroenLinksPvdAKampen@mastodon.nl
|
||||
@D66Nijmegen@mastodon.nl
|
||||
@glnijmegen@mastodon.nl
|
||||
@PvdDNijmegen@mastodon.nl
|
||||
@glmeppel@mastodon.nl
|
||||
@D66Ooststellingwerf@mastodon.social
|
||||
@PvdAGLDRV@mastodon.nl
|
||||
@UtrechtBIJ1@social.bij1.org
|
||||
@d66vught@mastodon.nl
|
||||
@ProgressiefWoerden@mastodon.nl
|
||||
|
||||
# Gemeentelijk overig
|
||||
@WheelieNick@mastodon.nl
|
||||
@hotelbreakfast@mastodon.social
|
||||
@johannesbeers@mastodon.social
|
||||
@TiciaVerveer@mastodon.social
|
||||
@RZondervan@mastodon.green
|
||||
|
||||
# Partijen provinciaal
|
||||
@D66Brabant@mastodon.nl
|
||||
@Statenfractie_PvdD_Drenthe@mastodon.social
|
||||
@PvdDStatenfractie_Fryslan@mastodon.social
|
||||
@GroenLinksPU@mastodon.nl
|
||||
|
||||
# Statenleden
|
||||
@SiskaPeeks@mstdn.social
|
||||
@Marjolein@mastodon.social
|
||||
|
||||
# Europees Parlement
|
||||
@bartgroothuis@mastodon.online
|
||||
@kimvsparrentak@eupolicy.social
|
||||
|
||||
# Waterschappen leden
|
||||
@veerleslegers@mastodon.social
|
||||
@PensioNien@todon.nl
|
||||
@fabianzoon@mastodon.nl
|
||||
@win_scheijde@mastodon.social
|
||||
@Matthijs85@mastodon.social
|
||||
|
||||
# Waterschappen partijen
|
||||
@PvdDHHNK@mastodon.nl
|
||||
|
||||
# Overige politici
|
||||
@alexandravanhuffelen@mastodon.social
|
||||
|
||||
# Oud-Tweede Kamerleden
|
||||
@mariekekoekkoek@mastodon.nl
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
"""Main toxicity analysis orchestrator.
|
||||
|
||||
Runs as a one-shot batch process: fetches unscored statuses,
|
||||
classifies them in batches with an LLM API, and stores scores in PostgreSQL.
|
||||
classifies them in batches with GPT-4o-mini, and stores scores in PostgreSQL.
|
||||
|
||||
Usage:
|
||||
python -m app.analyzer
|
||||
|
|
@ -120,7 +120,7 @@ async def run() -> None:
|
|||
|
||||
db = AnalyzerDB(config.database_url)
|
||||
classifier = ToxicityClassifier(
|
||||
api_key=config.llm_api_key,
|
||||
api_key=config.openai_api_key,
|
||||
model=config.model,
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
"""LLM-powered toxicity classifier.
|
||||
"""OpenAI-powered toxicity classifier.
|
||||
|
||||
Sends batches of Mastodon statuses to an LLM API for multi-category toxicity scoring.
|
||||
Sends batches of Mastodon statuses to GPT-4o-mini for multi-category toxicity scoring.
|
||||
Returns a list of dicts of category → score (0.0–1.0).
|
||||
"""
|
||||
|
||||
|
|
@ -181,7 +181,7 @@ def parse_batch_response(raw: str, batch_size: int) -> list[ToxicityScores]:
|
|||
|
||||
|
||||
class ToxicityClassifier:
|
||||
"""Async LLM-based toxicity classifier with batch support."""
|
||||
"""Async OpenAI-based toxicity classifier with batch support."""
|
||||
|
||||
def __init__(self, api_key: str, model: str = "gpt-4o-mini"):
|
||||
self.client = AsyncOpenAI(api_key=api_key)
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ class AnalyzerConfig:
|
|||
"""Configuration for the toxicity analyzer."""
|
||||
|
||||
database_url: str
|
||||
llm_api_key: str
|
||||
openai_api_key: str
|
||||
model: str = "gpt-4o-mini"
|
||||
batch_size: int = 10
|
||||
concurrency: int = 5
|
||||
|
|
@ -28,7 +28,7 @@ class AnalyzerConfig:
|
|||
"""Load configuration from environment variables."""
|
||||
return cls(
|
||||
database_url=os.environ["DATABASE_URL"],
|
||||
llm_api_key=os.environ["LLM_API_KEY"],
|
||||
openai_api_key=os.environ["OPENAI_API_KEY"],
|
||||
model=os.getenv("ANALYZER_MODEL", "gpt-4o-mini"),
|
||||
batch_size=int(os.getenv("ANALYZER_BATCH_SIZE", "10")),
|
||||
concurrency=int(os.getenv("ANALYZER_CONCURRENCY", "5")),
|
||||
|
|
|
|||
|
|
@ -40,7 +40,7 @@ services:
|
|||
environment:
|
||||
DATABASE_URL: postgresql://collector:${POSTGRES_PASSWORD:-collector_secret}@db:5432/mastodon_collector
|
||||
POLL_INTERVAL_SECONDS: ${POLL_INTERVAL_SECONDS:-14400}
|
||||
LLM_API_KEY: ${LLM_API_KEY}
|
||||
OPENAI_API_KEY: ${OPENAI_API_KEY}
|
||||
volumes:
|
||||
- ./accounts.txt:/app/accounts.txt
|
||||
depends_on:
|
||||
|
|
|
|||
|
|
@ -5,5 +5,5 @@ sqlalchemy==2.0.36
|
|||
requests==2.32.3
|
||||
apscheduler==3.10.4
|
||||
beautifulsoup4==4.12.3
|
||||
openai==1.58.1 # OpenAI-compatible API client (supports any LLM provider)
|
||||
openai==1.58.1
|
||||
asyncpg==0.30.0
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue