92 lines
3.7 KiB
Python
92 lines
3.7 KiB
Python
import json
import os
import subprocess
|
|
|
|
# Connection settings. Each value can be overridden through an environment
# variable (ES_URL / ES_AUTH); the literals are only fallback defaults so the
# script keeps working unchanged when nothing is set.
ES_URL = os.environ.get(
    "ES_URL", "https://es-7vd7jcu9.public.tencentelasticsearch.com:9200"
)
# "user:password" string handed to `curl -u`.
# SECURITY(review): a live-looking credential is committed in source here.
# Rotate it and drop this fallback once callers supply ES_AUTH instead.
AUTH = os.environ.get("ES_AUTH", "elastic:F%?QDcWes7N2WTuiYD11")
|
|
|
|
def count_query(must_clauses):
    """Return how many documents in ``growth_activity_behavior`` match all clauses.

    The request is sent with ``curl`` to the index's ``_count`` endpoint.

    Args:
        must_clauses: List of Elasticsearch query clauses. They are placed in
            the ``must`` array of a ``bool`` query, so a document has to
            satisfy every one of them. The script passes an empty list to
            obtain the total document count.

    Returns:
        The ``count`` field of the response (0 if the field is absent).

    Raises:
        RuntimeError: If curl exits with a non-zero status or Elasticsearch
            answers with an error payload.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    q = {"query": {"bool": {"must": must_clauses}}}
    # -S keeps curl's own error message on stderr even though -s is set.
    cmd = ["curl", "-sS", "-u", AUTH, "-H", "Content-Type: application/json",
           f"{ES_URL}/growth_activity_behavior/_count", "-d", json.dumps(q)]
    r = subprocess.run(cmd, capture_output=True, text=True)
    if r.returncode != 0:
        raise RuntimeError(
            f"curl exited with status {r.returncode}: {r.stderr.strip()}"
        )
    body = json.loads(r.stdout)
    # An error response has no "count"; reporting it as 0 would silently
    # corrupt every statistic derived from this value.
    if "error" in body:
        raise RuntimeError(f"Elasticsearch _count request failed: {body['error']}")
    return body.get("count", 0)
|
|
|
|
# Total number of documents in the index (no filter clauses).
total = count_query([])

# The four accountId x userId combinations.
# NOTE(review): the "non-zero" clauses are `must_not term 0`, so they
# presumably also match documents where the field is missing or negative,
# while the table labels those rows ">0" -- confirm this is intended.
nonzero_account = {"bool": {"must_not": [{"term": {"accountId": 0}}]}}
nonzero_user = {"bool": {"must_not": [{"term": {"userId": 0}}]}}
zero_account = {"term": {"accountId": 0}}
zero_user = {"term": {"userId": 0}}

a0_u0 = count_query([zero_account, zero_user])
a0_ux = count_query([zero_account, nonzero_user])
ax_u0 = count_query([nonzero_account, zero_user])
ax_ux = count_query([nonzero_account, nonzero_user])

print("=== growth_activity_behavior 中 accountId x userId 组合 ===\n")
print(f"{'accountId':>12} {'userId':>12} {'条数':>10} {'占比':>10}")
print("-" * 48)
for acct_label, user_label, cnt in (
    ("=0", "=0", a0_u0),
    ("=0", ">0", a0_ux),
    (">0", "=0", ax_u0),
    (">0", ">0", ax_ux),
):
    # An empty index would otherwise raise ZeroDivisionError here.
    share = cnt / total * 100 if total else 0.0
    print(f"{acct_label:>12} {user_label:>12} {cnt:>10} {share:>9.1f}%")
print(f"{'总计':>12} {'':>12} {total:>10}")
|
|
|
|
# Break the same four combinations down per behavior / subBehavior with
# nested Elasticsearch terms aggregations.
print("\n=== 按 behavior + subBehavior 细分 ===\n")
agg_query = {
    # size 0: only aggregation results are needed, no hits.
    "size": 0,
    "aggs": {
        "behaviors": {
            # At most 10 buckets are requested per level (size: 10).
            "terms": {"field": "behavior", "size": 10},
            "aggs": {
                "subs": {
                    "terms": {"field": "subBehavior", "size": 10},
                    "aggs": {
                        "a0_u0": {"filter": {"bool": {"must": [zero_account, zero_user]}}},
                        "a0_ux": {"filter": {"bool": {"must": [zero_account, nonzero_user]}}},
                        "ax_u0": {"filter": {"bool": {"must": [nonzero_account, zero_user]}}},
                        "ax_ux": {"filter": {"bool": {"must": [nonzero_account, nonzero_user]}}},
                    },
                },
            },
        },
    },
}

# -S keeps curl's own error message on stderr even though -s is set.
cmd = ["curl", "-sS", "-u", AUTH, "-H", "Content-Type: application/json",
       f"{ES_URL}/growth_activity_behavior/_search", "-d", json.dumps(agg_query)]
r = subprocess.run(cmd, capture_output=True, text=True)
if r.returncode != 0:
    raise RuntimeError(f"curl exited with status {r.returncode}: {r.stderr.strip()}")
data = json.loads(r.stdout)
# Surface the server's error instead of failing later with KeyError.
if "error" in data:
    raise RuntimeError(f"Elasticsearch aggregation request failed: {data['error']}")

header = f"{'behavior / subBehavior':<40} {'总计':>8} {'a=0,u=0':>8} {'a=0,u>0':>8} {'a>0,u=0':>8} {'a>0,u>0':>8}"
print(header)
print("-" * len(header))

# One output row per (behavior, subBehavior) bucket pair.
for bb in data["aggregations"]["behaviors"]["buckets"]:
    bname = bb["key"]
    for sb in bb["subs"]["buckets"]:
        sname = sb["key"]
        n = sb["doc_count"]
        a0u0 = sb["a0_u0"]["doc_count"]
        a0ux = sb["a0_ux"]["doc_count"]
        axu0 = sb["ax_u0"]["doc_count"]
        axux = sb["ax_ux"]["doc_count"]
        label = f"{bname} / {sname}"
        print(f"{label:<40} {n:>8} {a0u0:>8} {a0ux:>8} {axu0:>8} {axux:>8}")
|
|
|
|
# Documents that have no userId field at all.
print("\n=== userId 字段不存在 的记录 ===")
missing_user_clause = {
    "bool": {
        "must_not": [
            {"exists": {"field": "userId"}},
        ],
    },
}
u_nonexist = count_query([missing_user_clause])
print(f"userId 字段不存在: {u_nonexist}")
|
|
|
|
# Fetch a small sample (size=3) of documents whose userId is 0.
sample_query = {
    "query": {"term": {"userId": 0}},
    "_source": True,
    # By default hits.total.value is only tracked up to 10,000, so the count
    # printed below could be a lower bound; request the exact total.
    "track_total_hits": True,
}
# -S keeps curl's own error message on stderr even though -s is set.
cmd = ["curl", "-sS", "-u", AUTH, "-H", "Content-Type: application/json",
       f"{ES_URL}/growth_activity_behavior/_search?size=3",
       "-d", json.dumps(sample_query)]
r = subprocess.run(cmd, capture_output=True, text=True)
if r.returncode != 0:
    raise RuntimeError(f"curl exited with status {r.returncode}: {r.stderr.strip()}")
data = json.loads(r.stdout)
# Surface the server's error instead of failing later with KeyError.
if "error" in data:
    raise RuntimeError(f"Elasticsearch search request failed: {data['error']}")
print(f"\nuserId=0 记录数: {data['hits']['total']['value']}")
for h in data['hits']['hits']:
    print(f" {json.dumps(h['_source'], ensure_ascii=False)}")
|