Compare commits

...

No commits in common. "754fddef12071c8fe8ca49840b28fb72108e06e0" and "8a357766fac819167874eba160d0ce85949bdaa7" have entirely different histories.

9 changed files with 133 additions and 15 deletions

View file

@ -0,0 +1,21 @@
{
"permissions": {
"allow": [
"Bash(git push:*)",
"Read(//tmp/bluesky-collector/**)",
"Bash(mkdir -p \"/Users/pieter/Nextcloud-Hetzner/PXS Cloud/Projects/26004 HEIO 2/04 Applications/mastodon-collector/app/analyzer\")",
"Bash(docker-compose build)",
"Bash(docker compose build)",
"Bash(docker compose up -d)",
"Bash(docker exec mastodon-collector-collector-1 bash -c \"ANALYZER_LIMIT=100 python -m app.analyzer\")",
"Bash(docker compose build collector)",
"Bash(docker compose up -d collector)",
"Bash(docker compose build web)",
"Bash(docker compose up -d web)",
"Bash(curl -s -o /dev/null -w \"%{http_code}\" http://localhost:8585/analysis)",
"Bash(docker logs mastodon-collector-web-1 --tail 30)"
],
"deny": [],
"ask": []
}
}

3
.gitignore vendored
View file

@ -48,9 +48,6 @@ venv.bak/
.DS_Store
.claude/
# Local documentation
LOCAL_OPERATIONS.md
# Database files
*.sqlite
*.sqlite3

View file

@ -1,6 +1,6 @@
# Mastodon Collector
Collects posts, replies, and mentions from a list of Mastodon accounts and stores them in PostgreSQL. Includes automated toxicity analysis using an LLM API, a web UI for account management, data browsing, and manual review of flagged content, plus JSON/CSV APIs for your analysis pipeline.
Collects posts, replies, and mentions from a list of Mastodon accounts and stores them in PostgreSQL. Includes automated toxicity analysis using LLMs, a web UI for account management, data browsing, and manual review of flagged content, plus JSON/CSV APIs for your analysis pipeline.
## Quick Start
@ -38,7 +38,7 @@ Edit `.env` to customize:
POSTGRES_PASSWORD=collector_secret # Change for production
FLASK_SECRET_KEY=change-me-in-production
POLL_INTERVAL_SECONDS=14400 # Default: 4 hours (14400s)
LLM_API_KEY=sk-... # Required for toxicity analysis
OPENAI_API_KEY=sk-... # Required for toxicity analysis
```
## Toxicity Analysis
@ -47,7 +47,7 @@ The system includes automated toxicity detection and manual review capabilities:
### Features
- **Automated Classification**: Uses an LLM API to analyze posts across 12 toxicity dimensions:
- **Automated Classification**: Uses an LLM to analyze posts across 12 toxicity dimensions:
- General toxicity, threats, hate speech
- Racism, antisemitism, islamophobia
- Sexism, homophobia, ableism

100
accounts.txt Normal file
View file

@ -0,0 +1,100 @@
# Tweede Kamer
@HenriBontenbalCDA@mastodon.social
@laurensdassen@mastodon.nl
@barbarakathmann@mstdn.social
@JesseKlaver@mstdn.social
@JanPaternotte@mastodon.online
@katipiri@respublicae.eu
@Annemarijke@mastodon.social
@LisaWesterveld@mastodon.social
# Eerste Kamer
@daanroovers@mastodon.social
# Partijen landelijk
@BIJ1@social.bij1.org
@D66@mastodon.social
@PartijvoordeDieren@mastodon.social
@Piratenpartij@mastodon.social
@voltnederland@mastodon.nl
# Kabinet
@MinisterBZK@social.overheid.nl
@staatssecretarisbzk@social.overheid.nl
# Raadsleden & wethouders
@elisabethijmker@amsterdam.nl
@onno@waag.social
@erikwesselius@mastodon.social
@paullieverse@mastodon.nl
@ErikJonker@mastodon.social
@tvanelferen@mastodon.social
@joepbc@mastodon.social
@lynchantropen@mastodon.social
@MirjamHubert@mastodon.nl
# Partijen gemeentelijk
@PvdDAlkmaar@mastodon.social
@ArnhemNijmegenBIJ1@social.bij1.org
@GroenLinks026@mastodon.nl
@pga_asten@mastodon.nl
@groenlinks020@mastodon.social
@pvddamsterdam@mastodon.social
@d66debilt@mastodon.social
@D66Bunnik@mastodon.online
@ppdelft@mastodon.pirateparty.be
@PvdDDelft@mastodon.social
@PvdDDenBosch@mastodon.nl
@DenHaagBIJ1@social.bij1.org
@voltenschede@tukkers.online
@PRO_Heeze_Leende@mastodon.nl
@groenlinkshengelo@mastodon.social
@Groenlinkspvdahouten@mastodon.social
@volthouten@mastodon.social
@GroenLinksPvdAKampen@mastodon.nl
@D66Nijmegen@mastodon.nl
@glnijmegen@mastodon.nl
@PvdDNijmegen@mastodon.nl
@glmeppel@mastodon.nl
@D66Ooststellingwerf@mastodon.social
@PvdAGLDRV@mastodon.nl
@UtrechtBIJ1@social.bij1.org
@d66vught@mastodon.nl
@ProgressiefWoerden@mastodon.nl
# Gemeentelijk overig
@WheelieNick@mastodon.nl
@hotelbreakfast@mastodon.social
@johannesbeers@mastodon.social
@TiciaVerveer@mastodon.social
@RZondervan@mastodon.green
# Partijen provinciaal
@D66Brabant@mastodon.nl
@Statenfractie_PvdD_Drenthe@mastodon.social
@PvdDStatenfractie_Fryslan@mastodon.social
@GroenLinksPU@mastodon.nl
# Statenleden
@SiskaPeeks@mstdn.social
@Marjolein@mastodon.social
# Europees Parlement
@bartgroothuis@mastodon.online
@kimvsparrentak@eupolicy.social
# Waterschappen leden
@veerleslegers@mastodon.social
@PensioNien@todon.nl
@fabianzoon@mastodon.nl
@win_scheijde@mastodon.social
@Matthijs85@mastodon.social
# Waterschappen partijen
@PvdDHHNK@mastodon.nl
# Overige politici
@alexandravanhuffelen@mastodon.social
# Oud-Tweede Kamerleden
@mariekekoekkoek@mastodon.nl

View file

@ -1,7 +1,7 @@
"""Main toxicity analysis orchestrator.
Runs as a one-shot batch process: fetches unscored statuses,
classifies them in batches with an LLM API, and stores scores in PostgreSQL.
classifies them in batches with GPT-4o-mini, and stores scores in PostgreSQL.
Usage:
python -m app.analyzer
@ -120,7 +120,7 @@ async def run() -> None:
db = AnalyzerDB(config.database_url)
classifier = ToxicityClassifier(
api_key=config.llm_api_key,
api_key=config.openai_api_key,
model=config.model,
)

View file

@ -1,6 +1,6 @@
"""LLM-powered toxicity classifier.
"""OpenAI-powered toxicity classifier.
Sends batches of Mastodon statuses to an LLM API for multi-category toxicity scoring.
Sends batches of Mastodon statuses to GPT-4o-mini for multi-category toxicity scoring.
Returns a list of dicts mapping category to score (0.0–1.0).
"""
@ -181,7 +181,7 @@ def parse_batch_response(raw: str, batch_size: int) -> list[ToxicityScores]:
class ToxicityClassifier:
"""Async LLM-based toxicity classifier with batch support."""
"""Async OpenAI-based toxicity classifier with batch support."""
def __init__(self, api_key: str, model: str = "gpt-4o-mini"):
self.client = AsyncOpenAI(api_key=api_key)

View file

@ -11,7 +11,7 @@ class AnalyzerConfig:
"""Configuration for the toxicity analyzer."""
database_url: str
llm_api_key: str
openai_api_key: str
model: str = "gpt-4o-mini"
batch_size: int = 10
concurrency: int = 5
@ -28,7 +28,7 @@ class AnalyzerConfig:
"""Load configuration from environment variables."""
return cls(
database_url=os.environ["DATABASE_URL"],
llm_api_key=os.environ["LLM_API_KEY"],
openai_api_key=os.environ["OPENAI_API_KEY"],
model=os.getenv("ANALYZER_MODEL", "gpt-4o-mini"),
batch_size=int(os.getenv("ANALYZER_BATCH_SIZE", "10")),
concurrency=int(os.getenv("ANALYZER_CONCURRENCY", "5")),

View file

@ -40,7 +40,7 @@ services:
environment:
DATABASE_URL: postgresql://collector:${POSTGRES_PASSWORD:-collector_secret}@db:5432/mastodon_collector
POLL_INTERVAL_SECONDS: ${POLL_INTERVAL_SECONDS:-14400}
LLM_API_KEY: ${LLM_API_KEY}
OPENAI_API_KEY: ${OPENAI_API_KEY}
volumes:
- ./accounts.txt:/app/accounts.txt
depends_on:

View file

@ -5,5 +5,5 @@ sqlalchemy==2.0.36
requests==2.32.3
apscheduler==3.10.4
beautifulsoup4==4.12.3
openai==1.58.1 # OpenAI-compatible API client (supports any LLM provider)
openai==1.58.1
asyncpg==0.30.0