# ai_member_xiaoxi/scripts/check_account_userid.py

import json
import os
import subprocess

# Elasticsearch endpoint and HTTP basic-auth credentials ("user:password",
# the form passed to curl's -u option below).
# Both can be overridden with the ES_URL / ES_AUTH environment variables; the
# literals are kept only as backward-compatible fallbacks.
# SECURITY: the AUTH fallback is a credential committed to source control.
# Rotate it and drop the literal once ES_AUTH is provisioned everywhere.
ES_URL = os.environ.get(
    "ES_URL", "https://es-7vd7jcu9.public.tencentelasticsearch.com:9200"
)
AUTH = os.environ.get("ES_AUTH", "elastic:F%?QDcWes7N2WTuiYD11")

def count_query(must_clauses: list) -> int:
    """Count documents in ``growth_activity_behavior`` matching every clause.

    The clauses are wrapped in a ``bool``/``must`` query and posted to the
    index's ``_count`` endpoint through ``curl``.

    Args:
        must_clauses: List of Elasticsearch query clauses that must all
            match. An empty list is sent unchanged; the script uses that
            form to obtain the overall document total.

    Returns:
        The ``count`` value from the response.

    Raises:
        RuntimeError: If curl exits with a non-zero status, the response
            body is not JSON, or the response has no ``count`` key (for
            example an Elasticsearch error payload). Previously the last
            case was silently reported as a count of 0.
    """
    q = {"query": {"bool": {"must": must_clauses}}}
    # -S keeps curl's own error message on stderr even though -s silences
    # the progress meter, so a failure can be reported below.
    cmd = ["curl", "-sS", "-u", AUTH, "-H", "Content-Type: application/json",
           f"{ES_URL}/growth_activity_behavior/_count", "-d", json.dumps(q)]
    r = subprocess.run(cmd, capture_output=True, text=True)
    if r.returncode != 0:
        raise RuntimeError(
            f"curl exited with status {r.returncode}: {r.stderr.strip()}"
        )
    try:
        body = json.loads(r.stdout)
    except json.JSONDecodeError as err:
        raise RuntimeError(
            f"_count returned a non-JSON response: {r.stdout[:500]!r}"
        ) from err
    if not isinstance(body, dict) or "count" not in body:
        # An error payload carries no "count"; treating it as 0 would
        # silently corrupt every statistic derived from this call.
        raise RuntimeError(f"_count response has no 'count': {r.stdout[:500]}")
    return body["count"]

# Overall number of documents in the index (no filter clauses).
total = count_query([])

# The four accountId x userId combinations.
# NOTE: the "non-zero" filters are `must_not term == 0`, i.e. they exclude only
# documents whose value equals 0. Documents where the field is missing (or
# negative) therefore also land in the rows labelled ">0".
nonzero_account = {"bool": {"must_not": [{"term": {"accountId": 0}}]}}
nonzero_user = {"bool": {"must_not": [{"term": {"userId": 0}}]}}
zero_account = {"term": {"accountId": 0}}
zero_user = {"term": {"userId": 0}}

a0_u0 = count_query([zero_account, zero_user])
a0_ux = count_query([zero_account, nonzero_user])
ax_u0 = count_query([nonzero_account, zero_user])
ax_ux = count_query([nonzero_account, nonzero_user])

# Guard the percentage denominator: with an empty index `total` is 0 and the
# original `count / total` raised ZeroDivisionError. Every count is 0 in that
# case, so dividing by 1 prints 0.0% for each row.
pct_base = total if total else 1

print("=== growth_activity_behavior 中 accountId x userId 组合 ===\n")
print(f"{'accountId':>12} {'userId':>12} {'条数':>10} {'占比':>10}")
print("-" * 48)
for account_label, user_label, combo_count in (
    ("=0", "=0", a0_u0),
    ("=0", ">0", a0_ux),
    (">0", "=0", ax_u0),
    (">0", ">0", ax_ux),
):
    print(f"{account_label:>12} {user_label:>12} {combo_count:>10} "
          f"{combo_count / pct_base * 100:>9.1f}%")
print(f"{'总计':>12} {'':>12} {total:>10}")

# Break the four accountId/userId combinations down by behavior and
# subBehavior with a nested terms aggregation. Both terms aggregations ask for
# size 10, so at most 10 buckets are returned per level.
print("\n=== 按 behavior + subBehavior 细分 ===\n")

# Aggregation name -> clauses that must all match for that combination.
combo_filters = {
    "a0_u0": [zero_account, zero_user],
    "a0_ux": [zero_account, nonzero_user],
    "ax_u0": [nonzero_account, zero_user],
    "ax_ux": [nonzero_account, nonzero_user],
}
sub_behavior_agg = {
    "terms": {"field": "subBehavior", "size": 10},
    "aggs": {
        combo_name: {"filter": {"bool": {"must": clauses}}}
        for combo_name, clauses in combo_filters.items()
    },
}
agg_query = {
    "size": 0,  # only the aggregations are needed, no hits
    "aggs": {
        "behaviors": {
            "terms": {"field": "behavior", "size": 10},
            "aggs": {"subs": sub_behavior_agg},
        },
    },
}

search_response = subprocess.run(
    [
        "curl", "-s", "-u", AUTH, "-H", "Content-Type: application/json",
        f"{ES_URL}/growth_activity_behavior/_search", "-d", json.dumps(agg_query),
    ],
    capture_output=True,
    text=True,
)
data = json.loads(search_response.stdout)

column_titles = ("总计", "a=0,u=0", "a=0,u>0", "a>0,u=0", "a>0,u>0")
header = f"{'behavior / subBehavior':<40} " + " ".join(
    f"{title:>8}" for title in column_titles
)
print(header)
print("-" * len(header))

# One output row per (behavior, subBehavior) bucket pair: the bucket's own
# doc_count followed by the four combination counts.
for behavior_bucket in data["aggregations"]["behaviors"]["buckets"]:
    for sub_bucket in behavior_bucket["subs"]["buckets"]:
        label = f"{behavior_bucket['key']} / {sub_bucket['key']}"
        row_counts = [sub_bucket["doc_count"]]
        row_counts.extend(
            sub_bucket[combo_name]["doc_count"] for combo_name in combo_filters
        )
        cells = " ".join(f"{value:>8}" for value in row_counts)
        print(f"{label:<40} {cells}")

# Count the documents that have no userId field at all.
print("\n=== userId 字段不存在 的记录 ===")
missing_user_id_clause = {
    "bool": {"must_not": [{"exists": {"field": "userId"}}]},
}
u_nonexist = count_query([missing_user_id_clause])
print("userId 字段不存在: " + str(u_nonexist))

# Fetch a few sample documents with userId == 0 and print the total number of
# matches followed by each sample's _source.
sample_query = {
    "query": {"term": {"userId": 0}},
    "_source": True,
    # By default Elasticsearch stops counting hits at 10,000, so
    # hits.total.value would only be a lower bound for larger result sets.
    # Request exact tracking so the printed record count is accurate.
    "track_total_hits": True,
}
cmd = ["curl", "-s", "-u", AUTH, "-H", "Content-Type: application/json",
       f"{ES_URL}/growth_activity_behavior/_search?size=3",
       "-d", json.dumps(sample_query)]
r = subprocess.run(cmd, capture_output=True, text=True)
data = json.loads(r.stdout)
print(f"\nuserId=0 记录数: {data['hits']['total']['value']}")
for h in data['hits']['hits']:
    # ensure_ascii=False keeps non-ASCII field values readable in the output.
    print(f"  {json.dumps(h['_source'], ensure_ascii=False)}")