Compare commits
No commits in common. "8a357766fac819167874eba160d0ce85949bdaa7" and "754fddef12071c8fe8ca49840b28fb72108e06e0" have entirely different histories.
8a357766fa
...
754fddef12
9 changed files with 15 additions and 133 deletions
|
|
@ -1,21 +0,0 @@
|
||||||
{
|
|
||||||
"permissions": {
|
|
||||||
"allow": [
|
|
||||||
"Bash(git push:*)",
|
|
||||||
"Read(//tmp/bluesky-collector/**)",
|
|
||||||
"Bash(mkdir -p \"/Users/pieter/Nextcloud-Hetzner/PXS Cloud/Projects/26004 HEIO 2/04 Applications/mastodon-collector/app/analyzer\")",
|
|
||||||
"Bash(docker-compose build)",
|
|
||||||
"Bash(docker compose build)",
|
|
||||||
"Bash(docker compose up -d)",
|
|
||||||
"Bash(docker exec mastodon-collector-collector-1 bash -c \"ANALYZER_LIMIT=100 python -m app.analyzer\")",
|
|
||||||
"Bash(docker compose build collector)",
|
|
||||||
"Bash(docker compose up -d collector)",
|
|
||||||
"Bash(docker compose build web)",
|
|
||||||
"Bash(docker compose up -d web)",
|
|
||||||
"Bash(curl -s -o /dev/null -w \"%{http_code}\" http://localhost:8585/analysis)",
|
|
||||||
"Bash(docker logs mastodon-collector-web-1 --tail 30)"
|
|
||||||
],
|
|
||||||
"deny": [],
|
|
||||||
"ask": []
|
|
||||||
}
|
|
||||||
}
|
|
||||||
3
.gitignore
vendored
3
.gitignore
vendored
|
|
@ -48,6 +48,9 @@ venv.bak/
|
||||||
.DS_Store
|
.DS_Store
|
||||||
.claude/
|
.claude/
|
||||||
|
|
||||||
|
# Local documentation
|
||||||
|
LOCAL_OPERATIONS.md
|
||||||
|
|
||||||
# Database files
|
# Database files
|
||||||
*.sqlite
|
*.sqlite
|
||||||
*.sqlite3
|
*.sqlite3
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
# Mastodon Collector
|
# Mastodon Collector
|
||||||
|
|
||||||
Collects posts, replies, and mentions from a list of Mastodon accounts and stores them in PostgreSQL. Includes automated toxicity analysis using LLMs, a web UI for account management, data browsing, and manual review of flagged content, plus JSON/CSV APIs for your analysis pipeline.
|
Collects posts, replies, and mentions from a list of Mastodon accounts and stores them in PostgreSQL. Includes automated toxicity analysis using an LLM API, a web UI for account management, data browsing, and manual review of flagged content, plus JSON/CSV APIs for your analysis pipeline.
|
||||||
|
|
||||||
## Quick Start
|
## Quick Start
|
||||||
|
|
||||||
|
|
@ -38,7 +38,7 @@ Edit `.env` to customize:
|
||||||
POSTGRES_PASSWORD=collector_secret # Change for production
|
POSTGRES_PASSWORD=collector_secret # Change for production
|
||||||
FLASK_SECRET_KEY=change-me-in-production
|
FLASK_SECRET_KEY=change-me-in-production
|
||||||
POLL_INTERVAL_SECONDS=14400 # Default: 4 hours (14400s)
|
POLL_INTERVAL_SECONDS=14400 # Default: 4 hours (14400s)
|
||||||
OPENAI_API_KEY=sk-... # Required for toxicity analysis
|
LLM_API_KEY=sk-... # Required for toxicity analysis
|
||||||
```
|
```
|
||||||
|
|
||||||
## Toxicity Analysis
|
## Toxicity Analysis
|
||||||
|
|
@ -47,7 +47,7 @@ The system includes automated toxicity detection and manual review capabilities:
|
||||||
|
|
||||||
### Features
|
### Features
|
||||||
|
|
||||||
- **Automated Classification**: Uses an LLM to analyze posts across 12 toxicity dimensions:
|
- **Automated Classification**: Uses an LLM API to analyze posts across 12 toxicity dimensions:
|
||||||
- General toxicity, threats, hate speech
|
- General toxicity, threats, hate speech
|
||||||
- Racism, antisemitism, islamophobia
|
- Racism, antisemitism, islamophobia
|
||||||
- Sexism, homophobia, ableism
|
- Sexism, homophobia, ableism
|
||||||
|
|
|
||||||
100
accounts.txt
100
accounts.txt
|
|
@ -1,100 +0,0 @@
|
||||||
# Tweede Kamer
|
|
||||||
@HenriBontenbalCDA@mastodon.social
|
|
||||||
@laurensdassen@mastodon.nl
|
|
||||||
@barbarakathmann@mstdn.social
|
|
||||||
@JesseKlaver@mstdn.social
|
|
||||||
@JanPaternotte@mastodon.online
|
|
||||||
@katipiri@respublicae.eu
|
|
||||||
@Annemarijke@mastodon.social
|
|
||||||
@LisaWesterveld@mastodon.social
|
|
||||||
|
|
||||||
# Eerste Kamer
|
|
||||||
@daanroovers@mastodon.social
|
|
||||||
|
|
||||||
# Partijen landelijk
|
|
||||||
@BIJ1@social.bij1.org
|
|
||||||
@D66@mastodon.social
|
|
||||||
@PartijvoordeDieren@mastodon.social
|
|
||||||
@Piratenpartij@mastodon.social
|
|
||||||
@voltnederland@mastodon.nl
|
|
||||||
|
|
||||||
# Kabinet
|
|
||||||
@MinisterBZK@social.overheid.nl
|
|
||||||
@staatssecretarisbzk@social.overheid.nl
|
|
||||||
|
|
||||||
# Raadsleden & wethouders
|
|
||||||
@elisabethijmker@amsterdam.nl
|
|
||||||
@onno@waag.social
|
|
||||||
@erikwesselius@mastodon.social
|
|
||||||
@paullieverse@mastodon.nl
|
|
||||||
@ErikJonker@mastodon.social
|
|
||||||
@tvanelferen@mastodon.social
|
|
||||||
@joepbc@mastodon.social
|
|
||||||
@lynchantropen@mastodon.social
|
|
||||||
@MirjamHubert@mastodon.nl
|
|
||||||
|
|
||||||
# Partijen gemeentelijk
|
|
||||||
@PvdDAlkmaar@mastodon.social
|
|
||||||
@ArnhemNijmegenBIJ1@social.bij1.org
|
|
||||||
@GroenLinks026@mastodon.nl
|
|
||||||
@pga_asten@mastodon.nl
|
|
||||||
@groenlinks020@mastodon.social
|
|
||||||
@pvddamsterdam@mastodon.social
|
|
||||||
@d66debilt@mastodon.social
|
|
||||||
@D66Bunnik@mastodon.online
|
|
||||||
@ppdelft@mastodon.pirateparty.be
|
|
||||||
@PvdDDelft@mastodon.social
|
|
||||||
@PvdDDenBosch@mastodon.nl
|
|
||||||
@DenHaagBIJ1@social.bij1.org
|
|
||||||
@voltenschede@tukkers.online
|
|
||||||
@PRO_Heeze_Leende@mastodon.nl
|
|
||||||
@groenlinkshengelo@mastodon.social
|
|
||||||
@Groenlinkspvdahouten@mastodon.social
|
|
||||||
@volthouten@mastodon.social
|
|
||||||
@GroenLinksPvdAKampen@mastodon.nl
|
|
||||||
@D66Nijmegen@mastodon.nl
|
|
||||||
@glnijmegen@mastodon.nl
|
|
||||||
@PvdDNijmegen@mastodon.nl
|
|
||||||
@glmeppel@mastodon.nl
|
|
||||||
@D66Ooststellingwerf@mastodon.social
|
|
||||||
@PvdAGLDRV@mastodon.nl
|
|
||||||
@UtrechtBIJ1@social.bij1.org
|
|
||||||
@d66vught@mastodon.nl
|
|
||||||
@ProgressiefWoerden@mastodon.nl
|
|
||||||
|
|
||||||
# Gemeentelijk overig
|
|
||||||
@WheelieNick@mastodon.nl
|
|
||||||
@hotelbreakfast@mastodon.social
|
|
||||||
@johannesbeers@mastodon.social
|
|
||||||
@TiciaVerveer@mastodon.social
|
|
||||||
@RZondervan@mastodon.green
|
|
||||||
|
|
||||||
# Partijen provinciaal
|
|
||||||
@D66Brabant@mastodon.nl
|
|
||||||
@Statenfractie_PvdD_Drenthe@mastodon.social
|
|
||||||
@PvdDStatenfractie_Fryslan@mastodon.social
|
|
||||||
@GroenLinksPU@mastodon.nl
|
|
||||||
|
|
||||||
# Statenleden
|
|
||||||
@SiskaPeeks@mstdn.social
|
|
||||||
@Marjolein@mastodon.social
|
|
||||||
|
|
||||||
# Europees Parlement
|
|
||||||
@bartgroothuis@mastodon.online
|
|
||||||
@kimvsparrentak@eupolicy.social
|
|
||||||
|
|
||||||
# Waterschappen leden
|
|
||||||
@veerleslegers@mastodon.social
|
|
||||||
@PensioNien@todon.nl
|
|
||||||
@fabianzoon@mastodon.nl
|
|
||||||
@win_scheijde@mastodon.social
|
|
||||||
@Matthijs85@mastodon.social
|
|
||||||
|
|
||||||
# Waterschappen partijen
|
|
||||||
@PvdDHHNK@mastodon.nl
|
|
||||||
|
|
||||||
# Overige politici
|
|
||||||
@alexandravanhuffelen@mastodon.social
|
|
||||||
|
|
||||||
# Oud-Tweede Kamerleden
|
|
||||||
@mariekekoekkoek@mastodon.nl
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
"""Main toxicity analysis orchestrator.
|
"""Main toxicity analysis orchestrator.
|
||||||
|
|
||||||
Runs as a one-shot batch process: fetches unscored statuses,
|
Runs as a one-shot batch process: fetches unscored statuses,
|
||||||
classifies them in batches with GPT-4o-mini, and stores scores in PostgreSQL.
|
classifies them in batches with an LLM API, and stores scores in PostgreSQL.
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
python -m app.analyzer
|
python -m app.analyzer
|
||||||
|
|
@ -120,7 +120,7 @@ async def run() -> None:
|
||||||
|
|
||||||
db = AnalyzerDB(config.database_url)
|
db = AnalyzerDB(config.database_url)
|
||||||
classifier = ToxicityClassifier(
|
classifier = ToxicityClassifier(
|
||||||
api_key=config.openai_api_key,
|
api_key=config.llm_api_key,
|
||||||
model=config.model,
|
model=config.model,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
"""OpenAI-powered toxicity classifier.
|
"""LLM-powered toxicity classifier.
|
||||||
|
|
||||||
Sends batches of Mastodon statuses to GPT-4o-mini for multi-category toxicity scoring.
|
Sends batches of Mastodon statuses to an LLM API for multi-category toxicity scoring.
|
||||||
Returns a list of dicts of category → score (0.0–1.0).
|
Returns a list of dicts of category → score (0.0–1.0).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
@ -181,7 +181,7 @@ def parse_batch_response(raw: str, batch_size: int) -> list[ToxicityScores]:
|
||||||
|
|
||||||
|
|
||||||
class ToxicityClassifier:
|
class ToxicityClassifier:
|
||||||
"""Async OpenAI-based toxicity classifier with batch support."""
|
"""Async LLM-based toxicity classifier with batch support."""
|
||||||
|
|
||||||
def __init__(self, api_key: str, model: str = "gpt-4o-mini"):
|
def __init__(self, api_key: str, model: str = "gpt-4o-mini"):
|
||||||
self.client = AsyncOpenAI(api_key=api_key)
|
self.client = AsyncOpenAI(api_key=api_key)
|
||||||
|
|
|
||||||
|
|
@ -11,7 +11,7 @@ class AnalyzerConfig:
|
||||||
"""Configuration for the toxicity analyzer."""
|
"""Configuration for the toxicity analyzer."""
|
||||||
|
|
||||||
database_url: str
|
database_url: str
|
||||||
openai_api_key: str
|
llm_api_key: str
|
||||||
model: str = "gpt-4o-mini"
|
model: str = "gpt-4o-mini"
|
||||||
batch_size: int = 10
|
batch_size: int = 10
|
||||||
concurrency: int = 5
|
concurrency: int = 5
|
||||||
|
|
@ -28,7 +28,7 @@ class AnalyzerConfig:
|
||||||
"""Load configuration from environment variables."""
|
"""Load configuration from environment variables."""
|
||||||
return cls(
|
return cls(
|
||||||
database_url=os.environ["DATABASE_URL"],
|
database_url=os.environ["DATABASE_URL"],
|
||||||
openai_api_key=os.environ["OPENAI_API_KEY"],
|
llm_api_key=os.environ["LLM_API_KEY"],
|
||||||
model=os.getenv("ANALYZER_MODEL", "gpt-4o-mini"),
|
model=os.getenv("ANALYZER_MODEL", "gpt-4o-mini"),
|
||||||
batch_size=int(os.getenv("ANALYZER_BATCH_SIZE", "10")),
|
batch_size=int(os.getenv("ANALYZER_BATCH_SIZE", "10")),
|
||||||
concurrency=int(os.getenv("ANALYZER_CONCURRENCY", "5")),
|
concurrency=int(os.getenv("ANALYZER_CONCURRENCY", "5")),
|
||||||
|
|
|
||||||
|
|
@ -40,7 +40,7 @@ services:
|
||||||
environment:
|
environment:
|
||||||
DATABASE_URL: postgresql://collector:${POSTGRES_PASSWORD:-collector_secret}@db:5432/mastodon_collector
|
DATABASE_URL: postgresql://collector:${POSTGRES_PASSWORD:-collector_secret}@db:5432/mastodon_collector
|
||||||
POLL_INTERVAL_SECONDS: ${POLL_INTERVAL_SECONDS:-14400}
|
POLL_INTERVAL_SECONDS: ${POLL_INTERVAL_SECONDS:-14400}
|
||||||
OPENAI_API_KEY: ${OPENAI_API_KEY}
|
LLM_API_KEY: ${LLM_API_KEY}
|
||||||
volumes:
|
volumes:
|
||||||
- ./accounts.txt:/app/accounts.txt
|
- ./accounts.txt:/app/accounts.txt
|
||||||
depends_on:
|
depends_on:
|
||||||
|
|
|
||||||
|
|
@ -5,5 +5,5 @@ sqlalchemy==2.0.36
|
||||||
requests==2.32.3
|
requests==2.32.3
|
||||||
apscheduler==3.10.4
|
apscheduler==3.10.4
|
||||||
beautifulsoup4==4.12.3
|
beautifulsoup4==4.12.3
|
||||||
openai==1.58.1
|
openai==1.58.1 # OpenAI-compatible API client (supports any LLM provider)
|
||||||
asyncpg==0.30.0
|
asyncpg==0.30.0
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue