Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
801a752
feat: add bio-bait spam detection with profile bio scanning
rezhajulio May 1, 2026
b764bed
chore: sanitize spam example references
rezhajulio May 1, 2026
7097a1e
fix: narrow promo hints and word-boundary matching for bio link detec…
rezhajulio May 14, 2026
811c4ce
feat: add bio-bait monitor-only mode with owner alerts and metrics
rezhajulio May 14, 2026
f36a3a0
fix: bio-bait review - trusted bypass, monitor-only alert semantics, …
rezhajulio May 18, 2026
6d67a41
Fix bio-bait routing: remove TEXT|CAPTION filter restriction
rezhajulio May 18, 2026
8e3bb01
fix: bio-bait review - cache eviction, f-string logging, shared white…
rezhajulio May 18, 2026
b2fa84b
chore: ignore local worktrees directory
rezhajulio May 22, 2026
3202610
feat(config): add strict plugin toggle validation
rezhajulio May 22, 2026
125f731
fix(config): apply plugins_default in single-group fallback
rezhajulio May 22, 2026
70dffb1
feat(plugins): add plugin contracts and toggle resolver
rezhajulio May 22, 2026
bcfda23
fix(plugins): align manifest types and export definitions
rezhajulio May 22, 2026
c65f563
feat(plugins): add built-in wrappers with fixed registration order
rezhajulio May 22, 2026
eece630
feat(main): register handlers and jobs via plugin manager
rezhajulio May 22, 2026
8d1ce65
feat(plugins): enforce per-group plugin enable map at runtime
rezhajulio May 22, 2026
90a8c2d
fix(plugins): apply runtime gating to group-scoped handlers
rezhajulio May 22, 2026
075a673
docs(config): add plugin toggle examples for env and groups
rezhajulio May 22, 2026
ec2f004
refactor(plugins): unify plugin name registry to definitions.py
rezhajulio May 22, 2026
d4145d0
fix: resolve DeepSeek V4 Pro review issues for plugin system
rezhajulio May 22, 2026
7177a95
fix: preserve admin cache on startup failure via preload_admin_ids
rezhajulio May 31, 2026
1e11192
fix: clone handlers in captcha plugin to prevent mutation
rezhajulio May 31, 2026
54e50f2
docs: add ADMIN_COMMANDS constant to document guard_plugin skip
rezhajulio May 31, 2026
062268c
fix: add negative caching for bio fetch failures (5 min TTL)
rezhajulio May 31, 2026
24f0815
fix: clarify PLUGINS_DEFAULT test expectations for env vs constructor
rezhajulio May 31, 2026
c734fce
chore: remove unused import in test
rezhajulio May 31, 2026
c1e7b3d
chore: add docs/ to gitignore, update project guidelines
rezhajulio May 31, 2026
3a115d3
fix: move bio_bait_spam from group 6 to group 4 before profile_monitor
rezhajulio Jun 10, 2026
e9e1ffa
fix: reconcile MANIFEST_ORDER with handler_group metadata and docstrings
rezhajulio Jun 10, 2026
aefbdc8
fix: rewrite captcha-clone tests with patched get_handlers for real m…
rezhajulio Jun 10, 2026
ab6de9f
fix: use real stub handlers instead of MagicMock for copy.copy clone …
rezhajulio Jun 10, 2026
c708fd7
fix: nest init_group_registry patch and fix tautological assertion
rezhajulio Jun 10, 2026
092cd7f
fix: remove forbidden @pytest.mark.asyncio decorators
rezhajulio Jun 10, 2026
19948b7
fix: replace importlib.reload with import identity check
rezhajulio Jun 10, 2026
07169e4
refactor: extract shared validate_plugin_map helper
rezhajulio Jun 10, 2026
bace2dd
fix: use typed TelegramAdminFetchError instead of bare Exception
rezhajulio Jun 10, 2026
4019444
docs: clarify job registrars return empty list by design
rezhajulio Jun 10, 2026
e888cc8
test: add guard_plugin channel/args tests and register_all failure test
rezhajulio Jun 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,29 @@ DUPLICATE_SPAM_MIN_LENGTH=20
# 0.95 catches minor edits, 0.97 only near-exact copies, 0.90 is more aggressive
DUPLICATE_SPAM_SIMILARITY=0.95

# Enable/disable bio bait detection (true/false)
BIO_BAIT_ENABLED=true

# Monitor-only mode for bio bait detection (true/false)
# When true: no message deletion, no user restriction, and no warning-topic notification;
# only metrics are recorded and the owner alert is sent.
BIO_BAIT_MONITOR_ONLY=false

# Owner/admin chat ID to receive bio bait monitor alerts (optional)
# Example: 57747812
# BIO_BAIT_ALERT_CHAT_ID=57747812

# Path to groups.json for multi-group support (optional)
# If this file exists, per-group settings are loaded from it instead of the
# GROUP_ID/WARNING_TOPIC_ID/etc. fields above. See groups.json.example.
# GROUPS_CONFIG_PATH=groups.json

# Default plugin enable/disable map for all groups (optional, single-group mode)
# JSON object mapping built-in plugin names to booleans.
# Plugins not listed inherit their built-in default (enabled).
# Keys must match known plugin names (e.g. "captcha", "dm", "verify").
# Example: PLUGINS_DEFAULT={"captcha":true,"dm":false}
# PLUGINS_DEFAULT={"captcha":true,"dm":false}

# Logfire Configuration (optional - for production logging)
# Get your token from https://logfire.pydantic.dev
LOGFIRE_TOKEN=your_logfire_token_here
Expand Down
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,7 @@ __pycache__/
data/
.vscode
# AGENTS.md
.worktrees/

# Agent/planning docs
docs/
4 changes: 4 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -212,3 +212,7 @@ if user.id not in admin_ids:
- Captcha callback data encodes group_id: `captcha_verify_{group_id}_{user_id}` to avoid ambiguity
- Scheduler iterates all groups with per-group exception isolation
- DM handler scans all groups in registry for user membership and unrestriction

## Policy

- Never mention AI usage, code generation tools, or automated assistance in commit messages, PR descriptions, code comments, or documentation
22 changes: 19 additions & 3 deletions groups.json.example
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,15 @@
"duplicate_spam_window_seconds": 120,
"duplicate_spam_threshold": 2,
"duplicate_spam_min_length": 20,
"duplicate_spam_similarity": 0.95
"duplicate_spam_similarity": 0.95,
"bio_bait_enabled": true,
"bio_bait_monitor_only": false,
"bio_bait_alert_chat_id": null,
"plugins": {
"captcha": false,
"dm": true,
"verify": true
}
},
{
"group_id": -1009876543210,
Expand All @@ -33,6 +41,14 @@
"duplicate_spam_window_seconds": 60,
"duplicate_spam_threshold": 2,
"duplicate_spam_min_length": 20,
"duplicate_spam_similarity": 0.90
"duplicate_spam_similarity": 0.90,
"bio_bait_enabled": true,
"bio_bait_monitor_only": false,
"bio_bait_alert_chat_id": null,
"plugins": {
"contact_spam": false,
"duplicate_spam": false,
"profile_monitor": true
}
}
]
]
39 changes: 34 additions & 5 deletions src/bot/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,18 @@
(production, staging) via the BOT_ENV environment variable.
"""

import json
import logging
import os
from datetime import timedelta
from functools import lru_cache
from pathlib import Path

from pydantic import field_validator
from pydantic_settings import BaseSettings, SettingsConfigDict

logger = logging.getLogger(__name__)


def get_env_file() -> str | None:
"""
Determine which .env file to load based on BOT_ENV environment variable.
Expand All @@ -32,7 +33,7 @@ def get_env_file() -> str | None:
"staging": ".env.staging",
}
env_file = env_files.get(env, ".env")

# Return path only if file exists, otherwise return None
# Pydantic will load from environment variables if no .env file
if Path(env_file).exists():
Expand All @@ -42,7 +43,6 @@ def get_env_file() -> str | None:
logger.debug(f"No .env file found at {env_file}, loading from environment variables")
return None


class Settings(BaseSettings):
"""
Application settings loaded from environment variables.
Expand Down Expand Up @@ -85,19 +85,45 @@ class Settings(BaseSettings):
duplicate_spam_threshold: int = 2
duplicate_spam_min_length: int = 20
duplicate_spam_similarity: float = 0.95
bio_bait_enabled: bool = True
bio_bait_monitor_only: bool = False
bio_bait_alert_chat_id: int | None = None
groups_config_path: str = "groups.json"
logfire_token: str | None = None
logfire_service_name: str = "pythonid-bot"
logfire_environment: str = "production"
logfire_enabled: bool = True
log_level: str = "INFO"
plugins_default: dict[str, bool] = {}

model_config = SettingsConfigDict(
env_file=get_env_file(),
env_file_encoding="utf-8",
extra="ignore",
)

@field_validator("plugins_default", mode="before")
@classmethod
def parse_and_validate_plugins_default(cls, v: object) -> dict[str, bool]:
    """Parse the PLUGINS_DEFAULT setting and validate its keys/values.

    Args:
        v: Raw value as received before field coercion. A dict is used as-is,
            a string is decoded as JSON, and a list is rejected outright.

    Returns:
        The result of ``validate_plugin_map`` for the parsed mapping, or an
        empty dict when the value is a blank string or any other type
        (e.g. ``None``).

    Raises:
        ValueError: If the string is not valid JSON, if the decoded JSON is
            not an object, or if a list is supplied.
    """
    if isinstance(v, list):
        raise ValueError("PLUGINS_DEFAULT must be a JSON object, got array")

    if isinstance(v, str):
        # A blank value means "no overrides".
        if not v.strip():
            return {}
        try:
            parsed = json.loads(v)
        except json.JSONDecodeError as exc:
            # Chain the decoder error so the offending position stays visible
            # in the traceback.
            raise ValueError("PLUGINS_DEFAULT must be a valid JSON string") from exc
        if not isinstance(parsed, dict):
            raise ValueError("PLUGINS_DEFAULT must be a JSON object")
    elif isinstance(v, dict):
        parsed = v
    else:
        # Any other input type is treated as "not configured".
        return {}

    # NOTE(review): imported locally, presumably to avoid a circular import
    # between the config and plugin packages; confirm before hoisting.
    from bot.plugins.config import validate_plugin_map

    return validate_plugin_map(parsed)

def model_post_init(self, __context):
"""Validate and log non-sensitive configuration values after initialization."""
if self.group_id >= 0:
Expand All @@ -115,7 +141,7 @@ def model_post_init(self, __context):
env = os.getenv("BOT_ENV", "production")
if self.logfire_environment == "production" and env == "staging":
self.logfire_environment = "staging"

logger.info("Configuration loaded successfully")
logger.debug(f"group_id: {self.group_id}")
logger.debug(f"warning_topic_id: {self.warning_topic_id}")
Expand All @@ -127,9 +153,13 @@ def model_post_init(self, __context):
logger.debug(f"captcha_timeout_seconds: {self.captcha_timeout_seconds}")
logger.debug(f"new_user_probation_hours: {self.new_user_probation_hours}")
logger.debug(f"new_user_violation_threshold: {self.new_user_violation_threshold}")
logger.debug(f"bio_bait_enabled: {self.bio_bait_enabled}")
logger.debug(f"bio_bait_monitor_only: {self.bio_bait_monitor_only}")
logger.debug(f"bio_bait_alert_chat_id: {self.bio_bait_alert_chat_id}")
logger.debug(f"telegram_bot_token: {'***' + self.telegram_bot_token[-4:]}") # Mask sensitive token
logger.debug(f"logfire_enabled: {self.logfire_enabled}")
logger.debug(f"logfire_environment: {self.logfire_environment}")
logger.debug(f"plugins_default: {self.plugins_default}")

@property
def probation_timedelta(self) -> timedelta:
Expand All @@ -143,7 +173,6 @@ def warning_time_threshold_timedelta(self) -> timedelta:
def captcha_timeout_timedelta(self) -> timedelta:
return timedelta(seconds=self.captcha_timeout_seconds)


@lru_cache
def get_settings() -> Settings:
"""
Expand Down
45 changes: 45 additions & 0 deletions src/bot/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,51 @@ def format_hours_display(hours: int) -> str:
"📌 [Peraturan Grup]({rules_link})"
)

# Bio bait spam notification (e.g. "cek bio aku" / "lihat byoh")
# Placeholders: {user_mention}, {rules_link}.
# This variant includes the sentence stating that the user has been restricted.
BIO_BAIT_SPAM_NOTIFICATION = (
"🚫 *Spam Bio Bait Terdeteksi*\n\n"
"Pesan dari {user_mention} telah dihapus karena berisi ajakan "
"untuk mengecek bio/profil, pola yang umum dipakai untuk spam/promosi/scam.\n\n"
"Pengguna telah dibatasi.\n\n"
"📌 [Peraturan Grup]({rules_link})"
)

# Same text as BIO_BAIT_SPAM_NOTIFICATION, minus the "user has been restricted" sentence.
# Placeholders: {user_mention}, {rules_link}.
BIO_BAIT_SPAM_NOTIFICATION_NO_RESTRICT = (
"🚫 *Spam Bio Bait Terdeteksi*\n\n"
"Pesan dari {user_mention} telah dihapus karena berisi ajakan "
"untuk mengecek bio/profil, pola yang umum dipakai untuk spam/promosi/scam.\n\n"
"📌 [Peraturan Grup]({rules_link})"
)

# Bio profile link spam (user's profile bio contains promo/scam links)
# Placeholders: {user_mention}, {rules_link}.
# This variant includes the sentence stating that the user has been restricted.
BIO_LINK_SPAM_NOTIFICATION = (
"🚫 *Spam Bio Profil Terdeteksi*\n\n"
"Pesan dari {user_mention} telah dihapus karena akun ini memiliki "
"bio profil dengan tautan/mention Telegram mencurigakan.\n\n"
"Pengguna telah dibatasi.\n\n"
"📌 [Peraturan Grup]({rules_link})"
)

# Same text as BIO_LINK_SPAM_NOTIFICATION, minus the "user has been restricted" sentence.
# Placeholders: {user_mention}, {rules_link}.
BIO_LINK_SPAM_NOTIFICATION_NO_RESTRICT = (
"🚫 *Spam Bio Profil Terdeteksi*\n\n"
"Pesan dari {user_mention} telah dihapus karena akun ini memiliki "
"bio profil dengan tautan/mention Telegram mencurigakan.\n\n"
"📌 [Peraturan Grup]({rules_link})"
)

# Monitor-only alert for owner/admin chat when bio bait match is detected.
# Sent without parse_mode to preserve raw message/bio content for forensic review.
# Placeholders: {reason}, {group_id}, {user_id}, {user_name}, {username},
# {message_text}, {profile_bio}.
BIO_BAIT_MONITOR_ALERT = (
"[BIO BAIT MONITOR]\n"
"Reason: {reason}\n"
"Group ID: {group_id}\n"
"User ID: {user_id}\n"
"User: {user_name}\n"
"Username: {username}\n"
"Message:\n{message_text}\n\n"
"Profile Bio:\n{profile_bio}"
)

# Whitelisted URL domains for new user probation
# These domains are allowed even during probation period
# Matches exact domain or subdomains (e.g., "github.com" matches "www.github.com")
Expand Down
28 changes: 19 additions & 9 deletions src/bot/group_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@

logger = logging.getLogger(__name__)


class GroupConfig(BaseModel):
"""
Per-group configuration settings.
Expand All @@ -41,6 +40,10 @@ class GroupConfig(BaseModel):
duplicate_spam_threshold: int = 2
duplicate_spam_min_length: int = 20
duplicate_spam_similarity: float = 0.95
bio_bait_enabled: bool = True
bio_bait_monitor_only: bool = False
bio_bait_alert_chat_id: int | None = None
plugins: dict[str, bool] | None = None

@field_validator("group_id")
@classmethod
Expand Down Expand Up @@ -77,6 +80,17 @@ def probation_hours_must_be_non_negative(cls, v: int) -> int:
raise ValueError("new_user_probation_hours must be >= 0")
return v

@field_validator("plugins", mode="before")
@classmethod
def validate_plugins(cls, v: object) -> dict[str, bool] | None:
    """Validate the optional per-group plugin toggle map.

    ``None`` is passed through unchanged, a dict is handed to
    ``validate_plugin_map`` and its result returned, and any other type is
    rejected with ``ValueError``.
    """
    from bot.plugins.config import validate_plugin_map

    if v is not None and not isinstance(v, dict):
        raise ValueError("plugins must be a dict or None")
    return None if v is None else validate_plugin_map(v)

@property
def probation_timedelta(self) -> timedelta:
return timedelta(hours=self.new_user_probation_hours)
Expand All @@ -89,7 +103,6 @@ def warning_time_threshold_timedelta(self) -> timedelta:
def captcha_timeout_timedelta(self) -> timedelta:
return timedelta(seconds=self.captcha_timeout_seconds)


class GroupRegistry:
"""
Registry of monitored groups.
Expand All @@ -115,7 +128,6 @@ def all_groups(self) -> list[GroupConfig]:
def is_monitored(self, group_id: int) -> bool:
return group_id in self._groups


def load_groups_from_json(path: str) -> list[GroupConfig]:
"""
Parse a groups.json file into a list of GroupConfig objects.
Expand Down Expand Up @@ -151,7 +163,6 @@ def load_groups_from_json(path: str) -> list[GroupConfig]:

return configs


def build_group_registry(settings: object) -> GroupRegistry:
"""
Build a GroupRegistry from settings.
Expand Down Expand Up @@ -193,12 +204,15 @@ def build_group_registry(settings: object) -> GroupRegistry:
duplicate_spam_threshold=settings.duplicate_spam_threshold,
duplicate_spam_min_length=settings.duplicate_spam_min_length,
duplicate_spam_similarity=settings.duplicate_spam_similarity,
bio_bait_enabled=getattr(settings, "bio_bait_enabled", True),
bio_bait_monitor_only=getattr(settings, "bio_bait_monitor_only", False),
bio_bait_alert_chat_id=getattr(settings, "bio_bait_alert_chat_id", None),
plugins=getattr(settings, "plugins_default", None),
)
registry.register(config)

return registry


def get_group_config_for_update(update: Update) -> GroupConfig | None:
"""
Get the GroupConfig for the group in the given Update.
Expand All @@ -219,11 +233,9 @@ def get_group_config_for_update(update: Update) -> GroupConfig | None:
logger.error("Group registry not initialized; skipping update")
return None


# Module-level singleton
_registry: GroupRegistry | None = None


def init_group_registry(settings: object) -> GroupRegistry:
"""
Initialize the global group registry singleton.
Expand All @@ -240,7 +252,6 @@ def init_group_registry(settings: object) -> GroupRegistry:
_registry = build_group_registry(settings)
return _registry


def get_group_registry() -> GroupRegistry:
"""
Get the global group registry singleton.
Expand All @@ -255,7 +266,6 @@ def get_group_registry() -> GroupRegistry:
raise RuntimeError("Group registry not initialized. Call init_group_registry() first.")
return _registry


def reset_group_registry() -> None:
"""Reset the group registry singleton (for testing)."""
global _registry
Expand Down
Loading
Loading