ai_member_xiaoxi/scripts/check_account_userid.py
2026-05-26 08:00:01 +08:00

92 lines
3.7 KiB
Python

import json
import os
import subprocess
# Elasticsearch endpoint and HTTP basic-auth credentials ("user:password")
# passed to curl via -u. Both can be overridden through the environment so the
# script can be pointed at another cluster or given rotated credentials without
# editing the source.
# SECURITY NOTE(review): the fallback credential below is committed in plain
# text. Rotate it and drop the default once ES_AUTH is provisioned everywhere
# this script runs.
ES_URL = os.environ.get(
    "ES_URL", "https://es-7vd7jcu9.public.tencentelasticsearch.com:9200"
)
AUTH = os.environ.get("ES_AUTH", "elastic:F%?QDcWes7N2WTuiYD11")
def count_query(must_clauses):
    """Return how many ``growth_activity_behavior`` documents match all clauses.

    Args:
        must_clauses: List of Elasticsearch query clauses placed under
            ``bool.must``. An empty list matches every document.

    Returns:
        The ``count`` value from the ``_count`` API response.

    Raises:
        RuntimeError: If curl exits with a non-zero status, or the response
            carries no ``count`` field (for example an Elasticsearch error
            body such as an authentication failure).
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    q = {"query": {"bool": {"must": must_clauses}}}
    # -S keeps curl's error message on stderr even though -s silences progress.
    cmd = ["curl", "-s", "-S", "-u", AUTH, "-H", "Content-Type: application/json",
           f"{ES_URL}/growth_activity_behavior/_count", "-d", json.dumps(q)]
    r = subprocess.run(cmd, capture_output=True, text=True)
    if r.returncode != 0:
        raise RuntimeError(
            f"curl exited with status {r.returncode}: {r.stderr.strip()}"
        )
    body = json.loads(r.stdout)
    # An error response has no "count"; reporting it as 0 would make a failed
    # request indistinguishable from an empty result set.
    if "count" not in body:
        raise RuntimeError(f"unexpected _count response: {r.stdout.strip()}")
    return body["count"]
# Total number of documents in the index (an empty must list matches all).
total = count_query([])

# The four accountId x userId combinations.
# NOTE: the "non-zero" clauses are must_not(term == 0), so they also match
# documents where the field is missing; the ">0" label in the output really
# means "not equal to 0".
nonzero_account = {"bool": {"must_not": [{"term": {"accountId": 0}}]}}
nonzero_user = {"bool": {"must_not": [{"term": {"userId": 0}}]}}
zero_account = {"term": {"accountId": 0}}
zero_user = {"term": {"userId": 0}}

a0_u0 = count_query([zero_account, zero_user])
a0_ux = count_query([zero_account, nonzero_user])
ax_u0 = count_query([nonzero_account, zero_user])
ax_ux = count_query([nonzero_account, nonzero_user])

print("=== growth_activity_behavior 中 accountId x userId 组合 ===\n")
print(f"{'accountId':>12} {'userId':>12} {'条数':>10} {'占比':>10}")
print("-" * 48)
combo_rows = (
    ("=0", "=0", a0_u0),
    ("=0", ">0", a0_ux),
    (">0", "=0", ax_u0),
    (">0", ">0", ax_ux),
)
for account_label, user_label, combo_count in combo_rows:
    # Guard against an empty index: report 0.0% instead of dividing by zero.
    share = combo_count / total * 100 if total else 0.0
    print(f"{account_label:>12} {user_label:>12} {combo_count:>10} {share:>9.1f}%")
print(f"{'总计':>12} {'':>12} {total:>10}")
# Break the four combinations down per behavior / subBehavior using a nested
# Elasticsearch terms aggregation (top 10 behaviors, top 10 subBehaviors each).
print("\n=== 按 behavior + subBehavior 细分 ===\n")

# Filter sub-aggregation name -> clauses that must all match.
combo_clauses = {
    "a0_u0": [zero_account, zero_user],
    "a0_ux": [zero_account, nonzero_user],
    "ax_u0": [nonzero_account, zero_user],
    "ax_ux": [nonzero_account, nonzero_user],
}
combo_aggs = {
    combo: {"filter": {"bool": {"must": clauses}}}
    for combo, clauses in combo_clauses.items()
}
sub_behavior_agg = {
    "terms": {"field": "subBehavior", "size": 10},
    "aggs": combo_aggs,
}
agg_query = {
    "size": 0,  # only aggregation buckets are needed, no hits
    "aggs": {
        "behaviors": {
            "terms": {"field": "behavior", "size": 10},
            "aggs": {"subs": sub_behavior_agg},
        }
    },
}

search_url = f"{ES_URL}/growth_activity_behavior/_search"
cmd = ["curl", "-s", "-u", AUTH, "-H", "Content-Type: application/json",
       search_url, "-d", json.dumps(agg_query)]
r = subprocess.run(cmd, capture_output=True, text=True)
data = json.loads(r.stdout)

count_titles = ("总计", "a=0,u=0", "a=0,u>0", "a>0,u=0", "a>0,u>0")
header = f"{'behavior / subBehavior':<40} " + " ".join(
    f"{title:>8}" for title in count_titles
)
print(header)
print("-" * len(header))

for behavior_bucket in data["aggregations"]["behaviors"]["buckets"]:
    for sub_bucket in behavior_bucket["subs"]["buckets"]:
        label = f"{behavior_bucket['key']} / {sub_bucket['key']}"
        # Bucket total first, then the four combination counts in column order.
        row_counts = [sub_bucket["doc_count"]]
        row_counts.extend(sub_bucket[combo]["doc_count"] for combo in combo_clauses)
        print(f"{label:<40} " + " ".join(f"{value:>8}" for value in row_counts))
# Count documents that have no userId field at all.
print("\n=== userId 字段不存在 的记录 ===")
missing_user_clause = {"bool": {"must_not": [{"exists": {"field": "userId"}}]}}
u_nonexist = count_query([missing_user_clause])
print(f"userId 字段不存在: {u_nonexist}")
# Fetch a few sample documents with userId=0 for manual inspection.
sample_query = {
    "query": {"term": {"userId": 0}},
    "_source": True,
    # By default Elasticsearch 7+ stops counting at 10,000 hits, so
    # hits.total.value would only be a lower bound; request the exact total.
    "track_total_hits": True,
}
cmd = ["curl", "-s", "-u", AUTH, "-H", "Content-Type: application/json",
       f"{ES_URL}/growth_activity_behavior/_search?size=3",
       "-d", json.dumps(sample_query)]
r = subprocess.run(cmd, capture_output=True, text=True)
data = json.loads(r.stdout)
print(f"\nuserId=0 记录数: {data['hits']['total']['value']}")
for h in data['hits']['hits']:
    # ensure_ascii=False keeps non-ASCII field values readable in the output.
    print(f" {json.dumps(h['_source'], ensure_ascii=False)}")