264 lines
11 KiB
Python
264 lines
11 KiB
Python
#!/usr/bin/env python3
|
||
"""
Analyze users by child age bracket: payment status, learning time, and refund rate.
Writes the results to an Excel report.
"""
|
||
import psycopg2
|
||
import pandas as pd
|
||
from datetime import datetime, date
|
||
import os
|
||
|
||
# Connection settings passed as keyword arguments to psycopg2.connect().
# Every value can be overridden through an environment variable so that
# credentials do not have to live in the source tree; the literals are kept
# only as backward-compatible fallbacks.
# NOTE(review): the fallback password is a secret committed to source control.
# Rotate it and remove the literal once VALA_PG_PASSWORD is set wherever this
# script runs.
PG_CONFIG = {
    'host': os.environ.get('VALA_PG_HOST', 'bj-postgres-16pob4sg.sql.tencentcdb.com'),
    'port': int(os.environ.get('VALA_PG_PORT', 28591)),
    'user': os.environ.get('VALA_PG_USER', 'ai_member'),
    'password': os.environ.get('VALA_PG_PASSWORD', 'LdfjdjL83h3h3^$&**YGG*'),
    'dbname': os.environ.get('VALA_PG_DBNAME', 'vala_bi'),
}

# Destination path of the generated Excel workbook.
OUTPUT = os.environ.get(
    'VALA_REPORT_OUTPUT',
    '/root/.openclaw/workspace/output/age_learning_payment_analysis.xlsx',
)
|
||
|
||
def get_conn():
    """Open and return a new psycopg2 connection built from PG_CONFIG."""
    connection = psycopg2.connect(**PG_CONFIG)
    return connection
|
||
|
||
def _parse_birthday(value):
    """Convert a birthday value to a ``date``; return None when it cannot be parsed."""
    if isinstance(value, date):  # datetime is a date subclass, so it lands here too
        try:
            # Rebuild a plain date so datetime-like values lose their time part
            # and values with non-integer fields are rejected.
            return date(value.year, value.month, value.day)
        except (TypeError, ValueError):
            return None
    if not isinstance(value, str):
        return None

    text = value.strip()
    if not text:
        return None

    for fmt in ('%Y-%m-%d', '%Y-%m-%d %H:%M:%S', '%Y/%m/%d', '%Y-%m-%dT%H:%M:%S'):
        try:
            return datetime.strptime(text, fmt).date()
        except ValueError:
            continue

    # Lenient fallback for dash-separated values that strptime rejected,
    # e.g. components padded with spaces such as "2015- 6- 5".
    parts = text.split('-')
    if len(parts) != 3:
        return None
    try:
        return date(int(parts[0]), int(parts[1]), int(parts[2]))
    except ValueError:
        return None


def calc_age(birthday_str, today=None):
    """Return the age in completed years for a birthday, or None if unusable.

    Args:
        birthday_str: Birthday as text in ``YYYY-MM-DD``,
            ``YYYY-MM-DD HH:MM:SS``, ``YYYY/MM/DD`` or ``YYYY-MM-DDTHH:MM:SS``
            form (surrounding whitespace is ignored), or an already parsed
            ``date``/``datetime`` object.
        today: Reference date for the calculation. Defaults to
            ``date.today()``; a ``datetime`` is reduced to its date.

    Returns:
        The number of completed years as an int. None when the value is
        missing, empty, not parseable, or lies after ``today`` (a negative
        age is meaningless and would otherwise be bucketed as a toddler).
    """
    born = _parse_birthday(birthday_str)
    if born is None:
        return None

    if today is None:
        today = date.today()
    elif isinstance(today, datetime):
        today = today.date()

    # Subtract one year when this year's birthday has not happened yet.
    birthday_passed = (today.month, today.day) >= (born.month, born.day)
    age = today.year - born.year - (0 if birthday_passed else 1)
    return age if age >= 0 else None
|
||
|
||
def age_group(age):
    """Map an age in years to its report bracket label.

    Args:
        age: Age in years (int or float), None, or NaN.

    Returns:
        One of '0-3岁', '4-5岁', '6-7岁', '8-9岁', '10-11岁', '12-14岁',
        '15岁以上', or '未知' when the age is None, NaN, or negative.
    """
    # pandas turns a None stored next to integers into float NaN. NaN fails
    # every "<=" comparison below and would otherwise end up in the oldest
    # bracket; "age != age" is true only for NaN.
    if age is None or age != age or age < 0:
        return '未知'

    brackets = (
        (3, '0-3岁'),
        (5, '4-5岁'),
        (7, '6-7岁'),
        (9, '8-9岁'),
        (11, '10-11岁'),
        (14, '12-14岁'),
    )
    for upper_bound, label in brackets:
        if age <= upper_bound:
            return label
    return '15岁以上'
|
||
|
||
def _load_frames():
    """Run the four source queries and return (chars, orders, refunds, learning).

    The connection is closed even when one of the queries raises.
    """
    conn = get_conn()
    try:
        print("1/5 获取角色生日数据...")
        chars_df = pd.read_sql("""
            SELECT c.id AS char_id, c.account_id, c.birthday, c.status, c.deleted_at,
                   a.status AS account_status
            FROM bi_vala_app_character c
            JOIN bi_vala_app_account a ON c.account_id = a.id
            WHERE c.status = 1 AND c.deleted_at IS NULL
              AND a.status = 1 AND a.deleted_at IS NULL
              AND c.birthday IS NOT NULL AND c.birthday != ''
        """, conn)
        print(f" 有效角色数: {len(chars_df)}")

        print("2/5 获取订单数据...")
        orders_df = pd.read_sql("""
            SELECT o.account_id, o.id AS order_id, o.trade_no,
                   o.pay_amount_int::numeric/100 AS pay_amount,
                   o.order_status, o.key_from, o.pay_success_date
            FROM bi_vala_order o
            JOIN bi_vala_app_account a ON o.account_id = a.id
            WHERE a.status = 1 AND a.deleted_at IS NULL
              AND o.order_status IN (3, 4)
              AND o.pay_success_date IS NOT NULL
        """, conn)
        print(f" 订单数: {len(orders_df)}")

        print("3/5 获取退款数据...")
        refunds_df = pd.read_sql("""
            SELECT r.trade_no, r.refund_amount_int::numeric/100 AS refund_amount,
                   r.status AS refund_status
            FROM bi_refund_order r
            WHERE r.status = 3
        """, conn)
        print(f" 退款数: {len(refunds_df)}")

        print("4/5 获取学习时长数据...")
        learning_df = pd.read_sql("""
            SELECT ul.user_id AS char_id, SUM(ul.learning_time) AS total_learning_seconds
            FROM user_learning ul
            GROUP BY ul.user_id
        """, conn)
        print(f" 有学习记录的角色数: {len(learning_df)}")
    finally:
        conn.close()

    return chars_df, orders_df, refunds_df, learning_df


def _summarize_group(label, subset):
    """Build one report row (a dict keyed by column title) for a set of characters.

    Payment figures are counted per distinct account_id, learning figures per
    character row. Every ratio falls back to 0 when its denominator is 0.
    """
    total_chars = len(subset)
    total_accounts = subset['account_id'].nunique()

    paid_subset = subset[subset['is_paid'] == 1]
    paid_accounts = paid_subset['account_id'].nunique()
    all_refunded_accounts = subset[subset['is_all_refunded'] == 1]['account_id'].nunique()

    # Pay rate = paying accounts / all accounts.
    pay_rate = paid_accounts / total_accounts * 100 if total_accounts > 0 else 0
    # Refund rate = accounts whose every order was refunded / paying accounts.
    refund_rate = all_refunded_accounts / paid_accounts * 100 if paid_accounts > 0 else 0

    # The order totals are repeated on every character of an account, so take
    # one value per account before summing.
    account_gmv = paid_subset.groupby('account_id')['total_gmv'].first().sum()
    account_gsv = paid_subset.groupby('account_id')['gsv'].first().sum()
    avg_gmv_per_paid = account_gmv / paid_accounts if paid_accounts > 0 else 0

    learned_subset = subset[subset['total_learning_seconds'] > 0]
    learned_chars = len(learned_subset)
    learn_rate = learned_chars / total_chars * 100 if total_chars > 0 else 0
    avg_learning_min = subset['total_learning_min'].mean()
    median_learning_min = subset['total_learning_min'].median()
    avg_learn_min_learned = learned_subset['total_learning_min'].mean() if learned_chars > 0 else 0

    return {
        '年龄组': label,
        '角色数': total_chars,
        '用户数(account)': total_accounts,
        '付费用户数': paid_accounts,
        '付费率': round(pay_rate, 1),
        '全部退款用户数': all_refunded_accounts,
        '退费率(全额退)': round(refund_rate, 1),
        'GMV(元)': round(account_gmv, 0),
        'GSV(元)': round(account_gsv, 0),
        '人均GMV(付费用户)': round(avg_gmv_per_paid, 0),
        '有学习记录角色数': learned_chars,
        '学习参与率': round(learn_rate, 1),
        '全员平均学习时长(分钟)': round(avg_learning_min, 1),
        '有学习角色平均时长(分钟)': round(avg_learn_min_learned, 1),
        '中位学习时长(分钟)': round(median_learning_min, 1),
    }


def main():
    """Build the age-bracket payment / learning / refund report and write it to OUTPUT.

    Loads characters, orders, refunds and learning time from the database,
    attaches account-level order and refund totals plus per-character learning
    time to each character, aggregates by age bracket, and writes two sheets
    ('年龄分析' and '年龄分布明细') to the Excel file at OUTPUT.

    Returns:
        The OUTPUT path.
    """
    chars_df, orders_df, refunds_df, learning_df = _load_frames()

    # Derive the bracket from the raw Python ages: once stored in a column next
    # to integers, a None age becomes NaN, which the bracket comparison would
    # not recognise as "unknown".
    ages = [calc_age(birthday) for birthday in chars_df['birthday']]
    chars_df['age'] = ages
    chars_df['age_group'] = [age_group(age) for age in ages]

    print("5/5 关联计算...")

    # An order counts as refunded when its trade_no has a refund row.
    # NOTE(review): the refunded amount is the order's pay_amount, not
    # refund_amount from bi_refund_order, so a partial refund is treated as a
    # full one -- confirm this is intended.
    refund_trade_nos = set(refunds_df['trade_no'].tolist())
    orders_df['is_refunded'] = orders_df['trade_no'].isin(refund_trade_nos)
    orders_df['refunded_amount'] = orders_df['pay_amount'].where(orders_df['is_refunded'], 0)

    # Orders belong to accounts, and an account may own several characters, so
    # payment is aggregated per account and then copied onto each character.
    account_stats = orders_df.groupby('account_id').agg(
        order_count=('order_id', 'count'),
        total_gmv=('pay_amount', 'sum'),
        refund_order_count=('is_refunded', 'sum'),
        total_refund_amount=('refunded_amount', 'sum'),
    ).reset_index()
    account_stats['has_order'] = 1

    chars_df = chars_df.merge(account_stats, on='account_id', how='left')

    # Accounts without any order come out of the left join as NaN.
    chars_df['order_count'] = chars_df['order_count'].fillna(0).astype(int)
    chars_df['total_gmv'] = chars_df['total_gmv'].fillna(0)
    chars_df['has_order'] = chars_df['has_order'].fillna(0).astype(int)
    chars_df['refund_order_count'] = chars_df['refund_order_count'].fillna(0).astype(int)
    chars_df['total_refund_amount'] = chars_df['total_refund_amount'].fillna(0)
    chars_df['gsv'] = chars_df['total_gmv'] - chars_df['total_refund_amount']
    chars_df['is_paid'] = (chars_df['has_order'] == 1).astype(int)
    chars_df['is_all_refunded'] = (
        (chars_df['order_count'] > 0)
        & (chars_df['refund_order_count'] >= chars_df['order_count'])
    ).astype(int)

    # Learning time is recorded per character.
    chars_df = chars_df.merge(learning_df[['char_id', 'total_learning_seconds']],
                              on='char_id', how='left')
    chars_df['total_learning_seconds'] = chars_df['total_learning_seconds'].fillna(0)
    chars_df['total_learning_min'] = chars_df['total_learning_seconds'] / 60.0

    # One row per non-empty bracket, in display order.
    age_order = ['0-3岁', '4-5岁', '6-7岁', '8-9岁', '10-11岁', '12-14岁', '15岁以上', '未知']
    results = []
    for ag in age_order:
        subset = chars_df[chars_df['age_group'] == ag]
        if len(subset) == 0:
            continue
        results.append(_summarize_group(ag, subset))

    # Grand-total row. An account with characters in several brackets is
    # counted in each bracket above but only once here, so the bracket rows
    # need not add up to this row.
    total_row = _summarize_group('合计', chars_df)
    total_row['角色数'] = chars_df['char_id'].nunique()
    results.append(total_row)

    result_df = pd.DataFrame(results)

    out_dir = os.path.dirname(OUTPUT)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with pd.ExcelWriter(OUTPUT, engine='openpyxl') as writer:
        result_df.to_excel(writer, sheet_name='年龄分析', index=False)

        # Per-age detail; here the pay rate is per character, not per account.
        age_dist = chars_df.groupby('age').agg(
            角色数=('char_id', 'count'),
            付费角色数=('is_paid', 'sum'),
        ).reset_index()
        age_dist['付费率'] = (age_dist['付费角色数'] / age_dist['角色数'] * 100).round(1)
        age_dist = age_dist.sort_values('age')
        age_dist.to_excel(writer, sheet_name='年龄分布明细', index=False)

    print(f"\n✅ 报表已生成: {OUTPUT}")
    print("\n=== 按年龄组汇总 ===")
    print(result_df.to_string(index=False))

    return OUTPUT
|
||
|
||
# Script entry point: build the report only when this file is run directly.
if __name__ == '__main__':
    main()
|