ai_member_xiaoxi/scripts/age_learning_payment_analysis.py
2026-06-09 08:00:01 +08:00

264 lines
11 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
按孩子年龄区段分析:用户付费情况、上课时长、退费率
输出 Excel 报表
"""
import psycopg2
import pandas as pd
from datetime import datetime, date
import os
PG_CONFIG = {
'host': 'bj-postgres-16pob4sg.sql.tencentcdb.com',
'port': 28591,
'user': 'ai_member',
'password': 'LdfjdjL83h3h3^$&**YGG*',
'dbname': 'vala_bi',
}
OUTPUT = '/root/.openclaw/workspace/output/age_learning_payment_analysis.xlsx'
def get_conn():
return psycopg2.connect(**PG_CONFIG)
def calc_age(birthday_str):
"""从生日字符串计算当前年龄"""
if not birthday_str or birthday_str == '':
return None
try:
# 尝试多种日期格式
for fmt in ['%Y-%m-%d', '%Y-%m-%d %H:%M:%S', '%Y/%m/%d', '%Y-%m-%dT%H:%M:%S']:
try:
bd = datetime.strptime(birthday_str.strip(), fmt).date()
break
except ValueError:
continue
else:
# 尝试 YYYY-M-D 格式
parts = birthday_str.strip().split('-')
if len(parts) == 3:
bd = date(int(parts[0]), int(parts[1]), int(parts[2]))
else:
return None
today = date.today()
age = today.year - bd.year - ((today.month, today.day) < (bd.month, bd.day))
return age
except:
return None
def age_group(age):
"""年龄分组"""
if age is None:
return '未知'
if age <= 3:
return '0-3岁'
elif age <= 5:
return '4-5岁'
elif age <= 7:
return '6-7岁'
elif age <= 9:
return '8-9岁'
elif age <= 11:
return '10-11岁'
elif age <= 14:
return '12-14岁'
else:
return '15岁以上'
def main():
conn = get_conn()
print("1/5 获取角色生日数据...")
chars_df = pd.read_sql("""
SELECT c.id AS char_id, c.account_id, c.birthday, c.status, c.deleted_at,
a.status AS account_status
FROM bi_vala_app_character c
JOIN bi_vala_app_account a ON c.account_id = a.id
WHERE c.status = 1 AND c.deleted_at IS NULL
AND a.status = 1 AND a.deleted_at IS NULL
AND c.birthday IS NOT NULL AND c.birthday != ''
""", conn)
print(f" 有效角色数: {len(chars_df)}")
# 计算年龄
chars_df['age'] = chars_df['birthday'].apply(calc_age)
chars_df['age_group'] = chars_df['age'].apply(age_group)
print("2/5 获取订单数据...")
orders_df = pd.read_sql("""
SELECT o.account_id, o.id AS order_id, o.trade_no,
o.pay_amount_int::numeric/100 AS pay_amount,
o.order_status, o.key_from, o.pay_success_date
FROM bi_vala_order o
JOIN bi_vala_app_account a ON o.account_id = a.id
WHERE a.status = 1 AND a.deleted_at IS NULL
AND o.order_status IN (3, 4)
AND o.pay_success_date IS NOT NULL
""", conn)
print(f" 订单数: {len(orders_df)}")
print("3/5 获取退款数据...")
refunds_df = pd.read_sql("""
SELECT r.trade_no, r.refund_amount_int::numeric/100 AS refund_amount,
r.status AS refund_status
FROM bi_refund_order r
WHERE r.status = 3
""", conn)
print(f" 退款数: {len(refunds_df)}")
print("4/5 获取学习时长数据...")
learning_df = pd.read_sql("""
SELECT ul.user_id AS char_id, SUM(ul.learning_time) AS total_learning_seconds
FROM user_learning ul
GROUP BY ul.user_id
""", conn)
print(f" 有学习记录的角色数: {len(learning_df)}")
conn.close()
print("5/5 关联计算...")
# 关联订单到角色(通过 account_id
# 一个 account 可能有多个角色,这里按 account 维度统计付费
account_orders = orders_df.groupby('account_id').agg(
order_count=('order_id', 'count'),
total_gmv=('pay_amount', 'sum'),
has_order=('order_id', lambda x: 1),
).reset_index()
# 退款关联
refund_trade_nos = set(refunds_df['trade_no'].tolist())
orders_df['is_refunded'] = orders_df['trade_no'].apply(lambda x: x in refund_trade_nos)
# 按 account 统计退款
account_refund = orders_df.groupby('account_id').agg(
refund_order_count=('is_refunded', 'sum'),
total_refund_amount=('pay_amount', lambda x: x[orders_df.loc[x.index, 'is_refunded']].sum()),
).reset_index()
account_refund['all_refunded'] = account_refund.apply(
lambda r: 1 if r['refund_order_count'] >= r['refund_order_count'] else 0, axis=1
)
# 合并到角色
chars_df = chars_df.merge(account_orders[['account_id', 'order_count', 'total_gmv', 'has_order']],
on='account_id', how='left')
chars_df = chars_df.merge(account_refund[['account_id', 'refund_order_count', 'total_refund_amount']],
on='account_id', how='left')
chars_df['order_count'] = chars_df['order_count'].fillna(0).astype(int)
chars_df['total_gmv'] = chars_df['total_gmv'].fillna(0)
chars_df['has_order'] = chars_df['has_order'].fillna(0).astype(int)
chars_df['refund_order_count'] = chars_df['refund_order_count'].fillna(0).astype(int)
chars_df['total_refund_amount'] = chars_df['total_refund_amount'].fillna(0)
chars_df['gsv'] = chars_df['total_gmv'] - chars_df['total_refund_amount']
chars_df['is_paid'] = (chars_df['has_order'] == 1).astype(int)
chars_df['is_all_refunded'] = ((chars_df['order_count'] > 0) & (chars_df['refund_order_count'] >= chars_df['order_count'])).astype(int)
# 关联学习时长
chars_df = chars_df.merge(learning_df[['char_id', 'total_learning_seconds']],
on='char_id', how='left')
chars_df['total_learning_seconds'] = chars_df['total_learning_seconds'].fillna(0)
chars_df['total_learning_min'] = chars_df['total_learning_seconds'] / 60.0
# 按年龄组汇总
age_order = ['0-3岁', '4-5岁', '6-7岁', '8-9岁', '10-11岁', '12-14岁', '15岁以上', '未知']
results = []
for ag in age_order:
subset = chars_df[chars_df['age_group'] == ag]
if len(subset) == 0:
continue
total_chars = len(subset)
total_accounts = subset['account_id'].nunique()
paid_accounts = subset[subset['is_paid'] == 1]['account_id'].nunique()
all_refunded_accounts = subset[subset['is_all_refunded'] == 1]['account_id'].nunique()
# 付费率 = 付费account数 / 总account数
pay_rate = paid_accounts / total_accounts * 100 if total_accounts > 0 else 0
# 退费率 = 全部退款的account数 / 付费account数
refund_rate = all_refunded_accounts / paid_accounts * 100 if paid_accounts > 0 else 0
# GMV / GSV按角色汇总的account去重
paid_subset = subset[subset['is_paid'] == 1]
# 按account去重取GMV
account_gmv = paid_subset.groupby('account_id')['total_gmv'].first().sum()
account_gsv = paid_subset.groupby('account_id')['gsv'].first().sum()
# 人均GMV
avg_gmv_per_paid = account_gmv / paid_accounts if paid_accounts > 0 else 0
# 学习时长
avg_learning_min = subset['total_learning_min'].mean()
median_learning_min = subset['total_learning_min'].median()
learned_chars = (subset['total_learning_seconds'] > 0).sum()
learn_rate = learned_chars / total_chars * 100 if total_chars > 0 else 0
# 有学习的角色平均学习时长
learned_subset = subset[subset['total_learning_seconds'] > 0]
avg_learn_min_learned = learned_subset['total_learning_min'].mean() if len(learned_subset) > 0 else 0
results.append({
'年龄组': ag,
'角色数': total_chars,
'用户数(account)': total_accounts,
'付费用户数': paid_accounts,
'付费率': round(pay_rate, 1),
'全部退款用户数': all_refunded_accounts,
'退费率(全额退)': round(refund_rate, 1),
'GMV(元)': round(account_gmv, 0),
'GSV(元)': round(account_gsv, 0),
'人均GMV(付费用户)': round(avg_gmv_per_paid, 0),
'有学习记录角色数': learned_chars,
'学习参与率': round(learn_rate, 1),
'全员平均学习时长(分钟)': round(avg_learning_min, 1),
'有学习角色平均时长(分钟)': round(avg_learn_min_learned, 1),
'中位学习时长(分钟)': round(median_learning_min, 1),
})
result_df = pd.DataFrame(results)
# 汇总行
total_row = {
'年龄组': '合计',
'角色数': chars_df['char_id'].nunique(),
'用户数(account)': chars_df['account_id'].nunique(),
'付费用户数': chars_df[chars_df['is_paid'] == 1]['account_id'].nunique(),
'付费率': round(chars_df[chars_df['is_paid'] == 1]['account_id'].nunique() / chars_df['account_id'].nunique() * 100, 1),
'全部退款用户数': chars_df[chars_df['is_all_refunded'] == 1]['account_id'].nunique(),
'退费率(全额退)': round(chars_df[chars_df['is_all_refunded'] == 1]['account_id'].nunique() / max(chars_df[chars_df['is_paid'] == 1]['account_id'].nunique(), 1) * 100, 1),
'GMV(元)': round(chars_df[chars_df['is_paid'] == 1].groupby('account_id')['total_gmv'].first().sum(), 0),
'GSV(元)': round(chars_df[chars_df['is_paid'] == 1].groupby('account_id')['gsv'].first().sum(), 0),
'人均GMV(付费用户)': round(chars_df[chars_df['is_paid'] == 1].groupby('account_id')['total_gmv'].first().mean(), 0),
'有学习记录角色数': (chars_df['total_learning_seconds'] > 0).sum(),
'学习参与率': round((chars_df['total_learning_seconds'] > 0).sum() / len(chars_df) * 100, 1),
'全员平均学习时长(分钟)': round(chars_df['total_learning_min'].mean(), 1),
'有学习角色平均时长(分钟)': round(chars_df[chars_df['total_learning_seconds'] > 0]['total_learning_min'].mean(), 1),
'中位学习时长(分钟)': round(chars_df['total_learning_min'].median(), 1),
}
result_df = pd.concat([result_df, pd.DataFrame([total_row])], ignore_index=True)
# 写入 Excel
with pd.ExcelWriter(OUTPUT, engine='openpyxl') as writer:
result_df.to_excel(writer, sheet_name='年龄分析', index=False)
# 年龄分布明细
age_dist = chars_df.groupby('age').agg(
角色数=('char_id', 'count'),
付费角色数=('is_paid', 'sum'),
).reset_index()
age_dist['付费率'] = round(age_dist['付费角色数'] / age_dist['角色数'] * 100, 1)
age_dist = age_dist.sort_values('age')
age_dist.to_excel(writer, sheet_name='年龄分布明细', index=False)
print(f"\n✅ 报表已生成: {OUTPUT}")
print("\n=== 按年龄组汇总 ===")
print(result_df.to_string(index=False))
return OUTPUT
if __name__ == '__main__':
main()