# wechat_msg_crawler/collector/config.py
#
# 70 lines
# 2.9 KiB
# Python

"""采集器配置"""
import json
import os
from dataclasses import dataclass, field, asdict
from dotenv import load_dotenv
# Called at import time, before the config class below evaluates its
# os.environ-based defaults (presumably populates the environment from a
# .env file via python-dotenv -- confirm against the deployment setup).
load_dotenv()
# Default JSON config file location under the current user's home directory;
# used by load()/save() when no explicit path is given.
DEFAULT_CONFIG_PATH = os.path.expanduser("~/.wechat-cli/collect-chats/config.json")
@dataclass
class CollectorConfig:
    """Collector settings, persisted as a JSON file.

    The MySQL and COS defaults are read from environment variables when the
    class body is evaluated (i.e. at import time); all other defaults are
    literals. ``load`` / ``save`` round-trip every field through JSON.
    """

    # ---- MySQL ----
    mysql_host: str = os.environ.get("MYSQL_HOST", "localhost")
    mysql_port: int = int(os.environ.get("MYSQL_PORT", "3306"))
    mysql_user: str = os.environ.get("MYSQL_USER", "root")
    mysql_password: str = os.environ.get("MYSQL_PASSWORD", "")
    mysql_database: str = os.environ.get("MYSQL_DATABASE", "")
    mysql_table: str = os.environ.get("MYSQL_TABLE", "wechat_group_message")

    # ---- Tencent COS ----
    cos_secret_id: str = os.environ.get("COS_SECRET_ID", "")
    cos_secret_key: str = os.environ.get("COS_SECRET_KEY", "")
    cos_bucket: str = os.environ.get("COS_BUCKET", "")
    cos_region: str = os.environ.get("COS_REGION", "ap-beijing")
    cos_download_domain: str = os.environ.get("COS_DOWNLOAD_DOMAIN", "")
    cos_base_path: str = os.environ.get("COS_BASE_PATH", "")

    # ---- Scan strategy ----
    min_interval: float = 5.0  # scan interval for hot groups (seconds)
    base_interval: float = 10.0  # scan interval for warm groups
    max_interval: float = 60.0  # max interval for cold groups (keeps ingestion <= 1 min)
    backoff_factor: float = 1.2  # backoff factor for cold groups
    batch_size: int = 10  # max number of groups scanned per cycle
    messages_per_scan: int = 200  # max messages fetched per group per scan
    jitter_max: float = 1.0  # upper bound of the random delay between groups (seconds)
    cycle_sleep: float = 1.0  # sleep between cycles (seconds)
    discovery_interval: float = 180.0  # group-chat discovery interval (seconds)
    hot_threshold: int = 300  # latest message < N seconds old counts as hot
    warm_threshold: int = 3600  # latest message < N seconds old counts as warm

    # ---- Filtering ----
    whitelist: list = field(default_factory=list)  # empty = collect everything
    blacklist: list = field(default_factory=list)  # groups to skip (regex supported)

    # ---- Backfill ----
    backfill_interval: float = 60.0  # periodic scan interval (seconds), default 1 minute
    backfill_lookback_days: int = 7  # number of days to look back
    backfill_enabled: bool = True  # whether backfill is enabled

    # ---- Misc ----
    log_level: str = "DEBUG"

    @classmethod
    def load(cls, path=None):
        """Build a config from a JSON file, falling back to defaults.

        Args:
            path: JSON file to read; ``DEFAULT_CONFIG_PATH`` when falsy.

        Returns:
            A ``CollectorConfig``. If ``path`` is not an existing file, an
            instance with all defaults is returned. Keys in the file that are
            not dataclass fields are ignored; values are passed through
            without type conversion.
        """
        path = path or DEFAULT_CONFIG_PATH
        if not os.path.isfile(path):
            return cls()
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
        # Drop unknown keys so files with extra entries do not break __init__.
        known = {k: v for k, v in data.items() if k in cls.__dataclass_fields__}
        return cls(**known)

    def save(self, path=None):
        """Write every field to a JSON file, creating parent directories.

        Args:
            path: Destination file; ``DEFAULT_CONFIG_PATH`` when falsy.

        NOTE: all fields are serialized, including ``mysql_password`` and
        ``cos_secret_key``, in plain text.
        """
        path = path or DEFAULT_CONFIG_PATH
        parent = os.path.dirname(path)
        # A bare filename has an empty dirname, and os.makedirs("") raises
        # FileNotFoundError, so only create directories when there is one.
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(path, "w", encoding="utf-8") as f:
            json.dump(asdict(self), f, ensure_ascii=False, indent=2)