import json, subprocess ES_URL = "https://es-7vd7jcu9.public.tencentelasticsearch.com:9200" AUTH = "elastic:F%?QDcWes7N2WTuiYD11" def count_query(must_clauses): q = {"query": {"bool": {"must": must_clauses}}} cmd = ["curl", "-s", "-u", AUTH, "-H", "Content-Type: application/json", f"{ES_URL}/growth_activity_behavior/_count", "-d", json.dumps(q)] r = subprocess.run(cmd, capture_output=True, text=True) return json.loads(r.stdout).get("count", 0) total = count_query([]) # 四种组合 nonzero_account = {"bool": {"must_not": [{"term": {"accountId": 0}}]}} nonzero_user = {"bool": {"must_not": [{"term": {"userId": 0}}]}} zero_account = {"term": {"accountId": 0}} zero_user = {"term": {"userId": 0}} a0_u0 = count_query([zero_account, zero_user]) a0_ux = count_query([zero_account, nonzero_user]) ax_u0 = count_query([nonzero_account, zero_user]) ax_ux = count_query([nonzero_account, nonzero_user]) print("=== growth_activity_behavior 中 accountId x userId 组合 ===\n") print(f"{'accountId':>12} {'userId':>12} {'条数':>10} {'占比':>10}") print("-" * 48) print(f"{'=0':>12} {'=0':>12} {a0_u0:>10} {a0_u0/total*100:>9.1f}%") print(f"{'=0':>12} {'>0':>12} {a0_ux:>10} {a0_ux/total*100:>9.1f}%") print(f"{'>0':>12} {'=0':>12} {ax_u0:>10} {ax_u0/total*100:>9.1f}%") print(f"{'>0':>12} {'>0':>12} {ax_ux:>10} {ax_ux/total*100:>9.1f}%") print(f"{'总计':>12} {'':>12} {total:>10}") # 按 subBehavior 细分 - ES aggregation print("\n=== 按 behavior + subBehavior 细分 ===\n") agg_query = { "size": 0, "aggs": { "behaviors": { "terms": {"field": "behavior", "size": 10}, "aggs": { "subs": { "terms": {"field": "subBehavior", "size": 10}, "aggs": { "a0_u0": {"filter": {"bool": {"must": [zero_account, zero_user]}}}, "a0_ux": {"filter": {"bool": {"must": [zero_account, nonzero_user]}}}, "ax_u0": {"filter": {"bool": {"must": [nonzero_account, zero_user]}}}, "ax_ux": {"filter": {"bool": {"must": [nonzero_account, nonzero_user]}}} } } } } } } cmd = ["curl", "-s", "-u", AUTH, "-H", "Content-Type: application/json", f"{ES_URL}/growth_activity_behavior/_search", "-d", json.dumps(agg_query)] r = subprocess.run(cmd, capture_output=True, text=True) data = json.loads(r.stdout) header = f"{'behavior / subBehavior':<40} {'总计':>8} {'a=0,u=0':>8} {'a=0,u>0':>8} {'a>0,u=0':>8} {'a>0,u>0':>8}" print(header) print("-" * len(header)) for bb in data["aggregations"]["behaviors"]["buckets"]: bname = bb["key"] for sb in bb["subs"]["buckets"]: sname = sb["key"] n = sb["doc_count"] a0u0 = sb["a0_u0"]["doc_count"] a0ux = sb["a0_ux"]["doc_count"] axu0 = sb["ax_u0"]["doc_count"] axux = sb["ax_ux"]["doc_count"] label = f"{bname} / {sname}" print(f"{label:<40} {n:>8} {a0u0:>8} {a0ux:>8} {axu0:>8} {axux:>8}") # userId 不存在的记录 print(f"\n=== userId 字段不存在 的记录 ===") u_nonexist = count_query([{"bool": {"must_not": [{"exists": {"field": "userId"}}]}}]) print(f"userId 字段不存在: {u_nonexist}") # 抽 userId=0 的样本 cmd = ["curl", "-s", "-u", AUTH, "-H", "Content-Type: application/json", f"{ES_URL}/growth_activity_behavior/_search?size=3", "-d", '{"query":{"term":{"userId":0}},"_source":true}'] r = subprocess.run(cmd, capture_output=True, text=True) data = json.loads(r.stdout) print(f"\nuserId=0 记录数: {data['hits']['total']['value']}") for h in data['hits']['hits']: print(f" {json.dumps(h['_source'], ensure_ascii=False)}")