#!/usr/bin/env python3 """ 按孩子年龄区段分析:用户付费情况、上课时长、退费率 输出 Excel 报表 """ import psycopg2 import pandas as pd from datetime import datetime, date import os PG_CONFIG = { 'host': 'bj-postgres-16pob4sg.sql.tencentcdb.com', 'port': 28591, 'user': 'ai_member', 'password': 'LdfjdjL83h3h3^$&**YGG*', 'dbname': 'vala_bi', } OUTPUT = '/root/.openclaw/workspace/output/age_learning_payment_analysis.xlsx' def get_conn(): return psycopg2.connect(**PG_CONFIG) def calc_age(birthday_str): """从生日字符串计算当前年龄""" if not birthday_str or birthday_str == '': return None try: # 尝试多种日期格式 for fmt in ['%Y-%m-%d', '%Y-%m-%d %H:%M:%S', '%Y/%m/%d', '%Y-%m-%dT%H:%M:%S']: try: bd = datetime.strptime(birthday_str.strip(), fmt).date() break except ValueError: continue else: # 尝试 YYYY-M-D 格式 parts = birthday_str.strip().split('-') if len(parts) == 3: bd = date(int(parts[0]), int(parts[1]), int(parts[2])) else: return None today = date.today() age = today.year - bd.year - ((today.month, today.day) < (bd.month, bd.day)) return age except: return None def age_group(age): """年龄分组""" if age is None: return '未知' if age <= 3: return '0-3岁' elif age <= 5: return '4-5岁' elif age <= 7: return '6-7岁' elif age <= 9: return '8-9岁' elif age <= 11: return '10-11岁' elif age <= 14: return '12-14岁' else: return '15岁以上' def main(): conn = get_conn() print("1/5 获取角色生日数据...") chars_df = pd.read_sql(""" SELECT c.id AS char_id, c.account_id, c.birthday, c.status, c.deleted_at, a.status AS account_status FROM bi_vala_app_character c JOIN bi_vala_app_account a ON c.account_id = a.id WHERE c.status = 1 AND c.deleted_at IS NULL AND a.status = 1 AND a.deleted_at IS NULL AND c.birthday IS NOT NULL AND c.birthday != '' """, conn) print(f" 有效角色数: {len(chars_df)}") # 计算年龄 chars_df['age'] = chars_df['birthday'].apply(calc_age) chars_df['age_group'] = chars_df['age'].apply(age_group) print("2/5 获取订单数据...") orders_df = pd.read_sql(""" SELECT o.account_id, o.id AS order_id, o.trade_no, o.pay_amount_int::numeric/100 AS pay_amount, o.order_status, o.key_from, o.pay_success_date FROM bi_vala_order o JOIN bi_vala_app_account a ON o.account_id = a.id WHERE a.status = 1 AND a.deleted_at IS NULL AND o.order_status IN (3, 4) AND o.pay_success_date IS NOT NULL """, conn) print(f" 订单数: {len(orders_df)}") print("3/5 获取退款数据...") refunds_df = pd.read_sql(""" SELECT r.trade_no, r.refund_amount_int::numeric/100 AS refund_amount, r.status AS refund_status FROM bi_refund_order r WHERE r.status = 3 """, conn) print(f" 退款数: {len(refunds_df)}") print("4/5 获取学习时长数据...") learning_df = pd.read_sql(""" SELECT ul.user_id AS char_id, SUM(ul.learning_time) AS total_learning_seconds FROM user_learning ul GROUP BY ul.user_id """, conn) print(f" 有学习记录的角色数: {len(learning_df)}") conn.close() print("5/5 关联计算...") # 关联订单到角色(通过 account_id) # 一个 account 可能有多个角色,这里按 account 维度统计付费 account_orders = orders_df.groupby('account_id').agg( order_count=('order_id', 'count'), total_gmv=('pay_amount', 'sum'), has_order=('order_id', lambda x: 1), ).reset_index() # 退款关联 refund_trade_nos = set(refunds_df['trade_no'].tolist()) orders_df['is_refunded'] = orders_df['trade_no'].apply(lambda x: x in refund_trade_nos) # 按 account 统计退款 account_refund = orders_df.groupby('account_id').agg( refund_order_count=('is_refunded', 'sum'), total_refund_amount=('pay_amount', lambda x: x[orders_df.loc[x.index, 'is_refunded']].sum()), ).reset_index() account_refund['all_refunded'] = account_refund.apply( lambda r: 1 if r['refund_order_count'] >= r['refund_order_count'] else 0, axis=1 ) # 合并到角色 chars_df = chars_df.merge(account_orders[['account_id', 'order_count', 'total_gmv', 'has_order']], on='account_id', how='left') chars_df = chars_df.merge(account_refund[['account_id', 'refund_order_count', 'total_refund_amount']], on='account_id', how='left') chars_df['order_count'] = chars_df['order_count'].fillna(0).astype(int) chars_df['total_gmv'] = chars_df['total_gmv'].fillna(0) chars_df['has_order'] = chars_df['has_order'].fillna(0).astype(int) chars_df['refund_order_count'] = chars_df['refund_order_count'].fillna(0).astype(int) chars_df['total_refund_amount'] = chars_df['total_refund_amount'].fillna(0) chars_df['gsv'] = chars_df['total_gmv'] - chars_df['total_refund_amount'] chars_df['is_paid'] = (chars_df['has_order'] == 1).astype(int) chars_df['is_all_refunded'] = ((chars_df['order_count'] > 0) & (chars_df['refund_order_count'] >= chars_df['order_count'])).astype(int) # 关联学习时长 chars_df = chars_df.merge(learning_df[['char_id', 'total_learning_seconds']], on='char_id', how='left') chars_df['total_learning_seconds'] = chars_df['total_learning_seconds'].fillna(0) chars_df['total_learning_min'] = chars_df['total_learning_seconds'] / 60.0 # 按年龄组汇总 age_order = ['0-3岁', '4-5岁', '6-7岁', '8-9岁', '10-11岁', '12-14岁', '15岁以上', '未知'] results = [] for ag in age_order: subset = chars_df[chars_df['age_group'] == ag] if len(subset) == 0: continue total_chars = len(subset) total_accounts = subset['account_id'].nunique() paid_accounts = subset[subset['is_paid'] == 1]['account_id'].nunique() all_refunded_accounts = subset[subset['is_all_refunded'] == 1]['account_id'].nunique() # 付费率 = 付费account数 / 总account数 pay_rate = paid_accounts / total_accounts * 100 if total_accounts > 0 else 0 # 退费率 = 全部退款的account数 / 付费account数 refund_rate = all_refunded_accounts / paid_accounts * 100 if paid_accounts > 0 else 0 # GMV / GSV(按角色汇总的account去重) paid_subset = subset[subset['is_paid'] == 1] # 按account去重取GMV account_gmv = paid_subset.groupby('account_id')['total_gmv'].first().sum() account_gsv = paid_subset.groupby('account_id')['gsv'].first().sum() # 人均GMV avg_gmv_per_paid = account_gmv / paid_accounts if paid_accounts > 0 else 0 # 学习时长 avg_learning_min = subset['total_learning_min'].mean() median_learning_min = subset['total_learning_min'].median() learned_chars = (subset['total_learning_seconds'] > 0).sum() learn_rate = learned_chars / total_chars * 100 if total_chars > 0 else 0 # 有学习的角色平均学习时长 learned_subset = subset[subset['total_learning_seconds'] > 0] avg_learn_min_learned = learned_subset['total_learning_min'].mean() if len(learned_subset) > 0 else 0 results.append({ '年龄组': ag, '角色数': total_chars, '用户数(account)': total_accounts, '付费用户数': paid_accounts, '付费率': round(pay_rate, 1), '全部退款用户数': all_refunded_accounts, '退费率(全额退)': round(refund_rate, 1), 'GMV(元)': round(account_gmv, 0), 'GSV(元)': round(account_gsv, 0), '人均GMV(付费用户)': round(avg_gmv_per_paid, 0), '有学习记录角色数': learned_chars, '学习参与率': round(learn_rate, 1), '全员平均学习时长(分钟)': round(avg_learning_min, 1), '有学习角色平均时长(分钟)': round(avg_learn_min_learned, 1), '中位学习时长(分钟)': round(median_learning_min, 1), }) result_df = pd.DataFrame(results) # 汇总行 total_row = { '年龄组': '合计', '角色数': chars_df['char_id'].nunique(), '用户数(account)': chars_df['account_id'].nunique(), '付费用户数': chars_df[chars_df['is_paid'] == 1]['account_id'].nunique(), '付费率': round(chars_df[chars_df['is_paid'] == 1]['account_id'].nunique() / chars_df['account_id'].nunique() * 100, 1), '全部退款用户数': chars_df[chars_df['is_all_refunded'] == 1]['account_id'].nunique(), '退费率(全额退)': round(chars_df[chars_df['is_all_refunded'] == 1]['account_id'].nunique() / max(chars_df[chars_df['is_paid'] == 1]['account_id'].nunique(), 1) * 100, 1), 'GMV(元)': round(chars_df[chars_df['is_paid'] == 1].groupby('account_id')['total_gmv'].first().sum(), 0), 'GSV(元)': round(chars_df[chars_df['is_paid'] == 1].groupby('account_id')['gsv'].first().sum(), 0), '人均GMV(付费用户)': round(chars_df[chars_df['is_paid'] == 1].groupby('account_id')['total_gmv'].first().mean(), 0), '有学习记录角色数': (chars_df['total_learning_seconds'] > 0).sum(), '学习参与率': round((chars_df['total_learning_seconds'] > 0).sum() / len(chars_df) * 100, 1), '全员平均学习时长(分钟)': round(chars_df['total_learning_min'].mean(), 1), '有学习角色平均时长(分钟)': round(chars_df[chars_df['total_learning_seconds'] > 0]['total_learning_min'].mean(), 1), '中位学习时长(分钟)': round(chars_df['total_learning_min'].median(), 1), } result_df = pd.concat([result_df, pd.DataFrame([total_row])], ignore_index=True) # 写入 Excel with pd.ExcelWriter(OUTPUT, engine='openpyxl') as writer: result_df.to_excel(writer, sheet_name='年龄分析', index=False) # 年龄分布明细 age_dist = chars_df.groupby('age').agg( 角色数=('char_id', 'count'), 付费角色数=('is_paid', 'sum'), ).reset_index() age_dist['付费率'] = round(age_dist['付费角色数'] / age_dist['角色数'] * 100, 1) age_dist = age_dist.sort_values('age') age_dist.to_excel(writer, sheet_name='年龄分布明细', index=False) print(f"\n✅ 报表已生成: {OUTPUT}") print("\n=== 按年龄组汇总 ===") print(result_df.to_string(index=False)) return OUTPUT if __name__ == '__main__': main()