Cohort Analysis
1. Introduction
Cohort Analysis là gì?
Cohort Analysis nhóm users theo đặc điểm chung (thường là thời điểm acquisition) và theo dõi hành vi của họ qua thời gian. Đây là công cụ powerful để hiểu retention, engagement, và customer lifetime value.
1.1 Cohort Types
Text
1┌─────────────────────────────────────────────────────────┐2│ Types of Cohorts │3├─────────────────────────────────────────────────────────┤4│ │5│ ┌─────────────────────────────────────────────────┐ │6│ │ 📅 Time-based Cohorts │ │7│ │ • Acquisition cohort: Month/week signed up │ │8│ │ • First purchase cohort: First order month │ │9│ │ • Activation cohort: Completed onboarding │ │10│ └─────────────────────────────────────────────────┘ │11│ │12│ ┌─────────────────────────────────────────────────┐ │13│ │ 📊 Behavior-based Cohorts │ │14│ │ • Power users vs casual users │ │15│ │ • High spenders vs low spenders │ │16│ │ • Channel cohorts (organic vs paid) │ │17│ └─────────────────────────────────────────────────┘ │18│ │19│ ┌─────────────────────────────────────────────────┐ │20│ │ 🎯 Segment-based Cohorts │ │21│ │ • Product category │ │22│ │ • Geographic region │ │23│ │ • Customer tier (premium vs free) │ │24│ └─────────────────────────────────────────────────┘ │25│ │26└─────────────────────────────────────────────────────────┘2. Data Preparation
2.1 Create Sample Data
Python
1import pandas as pd2import numpy as np3import matplotlib.pyplot as plt4import seaborn as sns56# Generate sample transaction data7np.random.seed(42)89n_customers = 500010n_transactions = 500001112# Customer signup dates (spread over 12 months)13customers = pd.DataFrame({14 'customer_id': range(1, n_customers + 1),15 'signup_date': pd.to_datetime('2024-01-01') + pd.to_timedelta(16 np.random.randint(0, 365, n_customers), unit='D'17 )18})1920# Generate transactions21transactions = pd.DataFrame({22 'transaction_id': range(1, n_transactions + 1),23 'customer_id': np.random.randint(1, n_customers + 1, n_transactions),24 'transaction_date': pd.to_datetime('2024-01-01') + pd.to_timedelta(25 np.random.randint(0, 400, n_transactions), unit='D'26 ),27 'amount': np.random.lognormal(4, 1, n_transactions)28})2930# Merge to get signup date31transactions = transactions.merge(customers, on='customer_id')3233# Only keep transactions after signup34transactions = transactions[35 transactions['transaction_date'] >= transactions['signup_date']36]3738print(f"Customers: {len(customers)}")39print(f"Transactions: {len(transactions)}")40print(transactions.head())2.2 Calculate Cohort Fields
Python
1def assign_cohorts(df, signup_col, activity_col, freq='M'):2 """Assign cohort and period to each row"""3 df = df.copy()4 5 # Cohort: Month of signup6 df['cohort'] = df[signup_col].dt.to_period(freq)7 8 # Activity period9 df['activity_period'] = df[activity_col].dt.to_period(freq)10 11 # Period number (0, 1, 2, ...)12 df['period_number'] = (13 df['activity_period'].view('int64') - 14 df['cohort'].view('int64')15 )16 17 return df1819transactions = assign_cohorts(20 transactions, 21 'signup_date', 22 'transaction_date',23 freq='M'24)2526print(transactions[['customer_id', 'cohort', 'activity_period', 'period_number']].head(10))3. Retention Analysis
3.1 Build Retention Matrix
Python
1def build_retention_matrix(df, value_col='customer_id', agg_func='nunique'):2 """Build cohort retention matrix"""3 4 # Aggregate by cohort and period5 cohort_data = df.groupby(['cohort', 'period_number']).agg({6 value_col: agg_func7 }).reset_index()8 9 # Pivot to matrix10 cohort_matrix = cohort_data.pivot(11 index='cohort',12 columns='period_number',13 values=value_col14 )15 16 return cohort_matrix1718# Active users matrix19retention_counts = build_retention_matrix(transactions)20print("Retention Counts:")21print(retention_counts.iloc[:6, :7])2223# Calculate retention rate24cohort_sizes = retention_counts.iloc[:, 0]25retention_rate = retention_counts.divide(cohort_sizes, axis=0) * 1002627print("\nRetention Rate (%):")28print(retention_rate.iloc[:6, :7].round(1))3.2 Visualize Retention Heatmap
Python
1def plot_retention_heatmap(retention_matrix, title='Retention Rate (%)'):2 """Plot retention heatmap"""3 plt.figure(figsize=(14, 10))4 5 # Create annotations6 annot = retention_matrix.round(1).astype(str) + '%'7 8 sns.heatmap(9 retention_matrix,10 annot=annot,11 fmt='',12 cmap='YlGnBu',13 vmin=0,14 vmax=100,15 linewidths=0.5,16 cbar_kws={'label': 'Retention Rate (%)'}17 )18 19 plt.title(title, fontsize=14)20 plt.xlabel('Months Since First Purchase')21 plt.ylabel('Cohort (Month)')22 plt.tight_layout()23 plt.savefig('retention_heatmap.png', dpi=150)24 plt.show()2526plot_retention_heatmap(retention_rate.iloc[:12, :12])3.3 Retention Curves
Python
1def plot_retention_curves(retention_matrix, top_n=6):2 """Plot retention curves for multiple cohorts"""3 plt.figure(figsize=(12, 6))4 5 for cohort in retention_matrix.index[:top_n]:6 values = retention_matrix.loc[cohort].dropna()7 plt.plot(values.index, values.values, marker='o', label=str(cohort))8 9 plt.xlabel('Months Since Acquisition')10 plt.ylabel('Retention Rate (%)')11 plt.title('Retention Curves by Cohort')12 plt.legend(title='Cohort')13 plt.grid(True, alpha=0.3)14 plt.ylim(0, 105)15 plt.savefig('retention_curves.png', dpi=150)16 plt.show()1718plot_retention_curves(retention_rate)4. Revenue Cohorts
4.1 Revenue per Cohort
Python
1# Revenue matrix2revenue_matrix = build_retention_matrix(3 transactions, 4 value_col='amount', 5 agg_func='sum'6)78print("Total Revenue by Cohort and Period:")9print(revenue_matrix.iloc[:6, :7].round(0))1011# Cumulative revenue12cumulative_revenue = revenue_matrix.cumsum(axis=1)13print("\nCumulative Revenue:")14print(cumulative_revenue.iloc[:6, :7].round(0))4.2 Average Revenue per User (ARPU)
Python
1# ARPU = Total Revenue / Total Users in Cohort2cohort_users = transactions.groupby('cohort')['customer_id'].nunique()34arpu_matrix = revenue_matrix.divide(cohort_users, axis=0)5print("ARPU by Cohort and Period:")6print(arpu_matrix.iloc[:6, :7].round(2))78# Cumulative ARPU9cum_arpu = arpu_matrix.cumsum(axis=1)1011# Plot ARPU trends12plt.figure(figsize=(12, 6))13for cohort in cum_arpu.index[:6]:14 values = cum_arpu.loc[cohort].dropna()15 plt.plot(values.index, values.values, marker='o', label=str(cohort))1617plt.xlabel('Months Since Acquisition')18plt.ylabel('Cumulative ARPU ($)')19plt.title('Cumulative ARPU by Cohort')20plt.legend(title='Cohort')21plt.grid(True, alpha=0.3)22plt.show()4.3 Customer Lifetime Value (LTV)
Python
1def calculate_ltv_by_cohort(df, periods=12):2 """Calculate LTV for each cohort"""3 results = []4 5 for cohort in df['cohort'].unique():6 cohort_data = df[df['cohort'] == cohort]7 cohort_users = cohort_data['customer_id'].nunique()8 9 # Revenue for first N periods10 period_revenue = cohort_data[11 cohort_data['period_number'] < periods12 ].groupby('period_number')['amount'].sum()13 14 total_revenue = period_revenue.sum()15 ltv = total_revenue / cohort_users16 17 results.append({18 'cohort': cohort,19 'users': cohort_users,20 'total_revenue': total_revenue,21 'ltv': ltv22 })23 24 return pd.DataFrame(results)2526ltv_df = calculate_ltv_by_cohort(transactions, periods=12)27print("LTV by Cohort:")28print(ltv_df.sort_values('cohort').head(10))2930# Plot LTV trend31plt.figure(figsize=(12, 5))32plt.bar(ltv_df['cohort'].astype(str), ltv_df['ltv'])33plt.xlabel('Cohort')34plt.ylabel('12-Month LTV ($)')35plt.title('Customer Lifetime Value by Acquisition Cohort')36plt.xticks(rotation=45)37plt.tight_layout()38plt.show()5. Behavioral Cohorts
5.1 Purchase Frequency Cohorts
Python
1# Calculate purchase frequency per customer2customer_freq = transactions.groupby('customer_id').agg({3 'transaction_id': 'count',4 'amount': 'sum',5 'signup_date': 'first'6}).rename(columns={'transaction_id': 'frequency', 'amount': 'total_spent'})78# Create frequency segments9customer_freq['freq_segment'] = pd.cut(10 customer_freq['frequency'],11 bins=[0, 1, 3, 10, float('inf')],12 labels=['One-time', 'Occasional', 'Regular', 'Power User']13)1415print("Customer Segments:")16print(customer_freq['freq_segment'].value_counts())1718# Analyze by segment19segment_stats = customer_freq.groupby('freq_segment').agg({20 'frequency': ['count', 'mean'],21 'total_spent': ['mean', 'sum']22})23print("\nSegment Statistics:")24print(segment_stats)5.2 RFM-based Cohorts
Python
1# Calculate RFM metrics2reference_date = transactions['transaction_date'].max() + pd.Timedelta(days=1)34rfm = transactions.groupby('customer_id').agg({5 'transaction_date': lambda x: (reference_date - x.max()).days, # Recency6 'transaction_id': 'count', # Frequency7 'amount': 'sum' # Monetary8}).rename(columns={9 'transaction_date': 'recency',10 'transaction_id': 'frequency',11 'amount': 'monetary'12})1314# Create RFM scores (1-5)15rfm['R_score'] = pd.qcut(rfm['recency'], 5, labels=[5, 4, 3, 2, 1])16rfm['F_score'] = pd.qcut(rfm['frequency'].rank(method='first'), 5, labels=[1, 2, 3, 4, 5])17rfm['M_score'] = pd.qcut(rfm['monetary'].rank(method='first'), 5, labels=[1, 2, 3, 4, 5])1819# RFM segment20rfm['RFM_score'] = (rfm['R_score'].astype(str) + 21 rfm['F_score'].astype(str) + 22 rfm['M_score'].astype(str))2324# Simplified segments25def rfm_segment(row):26 if row['R_score'] >= 4 and row['F_score'] >= 4:27 return 'Champions'28 elif row['R_score'] >= 4 and row['F_score'] < 3:29 return 'New Customers'30 elif row['R_score'] < 3 and row['F_score'] >= 4:31 return 'At Risk'32 elif row['R_score'] < 2:33 return 'Lost'34 else:35 return 'Regular'3637rfm['segment'] = rfm.apply(rfm_segment, axis=1)3839print("RFM Segments:")40print(rfm['segment'].value_counts())5.3 Channel Cohorts
Python
1# Add channel to customers2channels = ['Organic', 'Paid Search', 'Social', 'Email', 'Referral']3customers['channel'] = np.random.choice(channels, len(customers), p=[0.3, 0.25, 0.2, 0.15, 0.1])45# Merge channel to transactions6transactions = transactions.merge(7 customers[['customer_id', 'channel']], 8 on='customer_id',9 how='left'10)1112# Analyze by channel13channel_analysis = transactions.groupby('channel').agg({14 'customer_id': 'nunique',15 'transaction_id': 'count',16 'amount': ['sum', 'mean']17}).round(2)18channel_analysis.columns = ['customers', 'transactions', 'total_revenue', 'avg_order']1920print("Channel Performance:")21print(channel_analysis)2223# Retention by channel24def channel_retention(df, channel):25 """Calculate retention for a specific channel"""26 channel_data = df[df['channel'] == channel]27 return build_retention_matrix(channel_data)2829# Compare channels30fig, axes = plt.subplots(2, 3, figsize=(15, 10))31axes = axes.flatten()3233for i, channel in enumerate(channels):34 channel_ret = channel_retention(transactions, channel)35 cohort_sizes = channel_ret.iloc[:, 0]36 channel_rate = channel_ret.divide(cohort_sizes, axis=0) * 10037 38 # Average retention by period39 avg_retention = channel_rate.mean()[:7]40 axes[i].bar(range(len(avg_retention)), avg_retention.values)41 axes[i].set_title(f'{channel}')42 axes[i].set_xlabel('Period')43 axes[i].set_ylabel('Retention %')44 axes[i].set_ylim(0, 100)4546axes[5].axis('off')47plt.suptitle('Average Retention Rate by Channel', y=1.02)48plt.tight_layout()49plt.show()6. Cohort Analysis SQL
6.1 Basic Retention Query
SQL
1-- Step 1: Assign cohorts2WITH customer_cohorts AS (3 SELECT 4 customer_id,5 DATE_TRUNC('month', MIN(order_date)) AS cohort_month6 FROM orders7 GROUP BY customer_id8),910-- Step 2: Join transactions with cohorts11transactions_with_cohort AS (12 SELECT 13 o.customer_id,14 c.cohort_month,15 DATE_TRUNC('month', o.order_date) AS activity_month,16 -- Period number17 EXTRACT(YEAR FROM o.order_date) * 12 + EXTRACT(MONTH FROM o.order_date) -18 EXTRACT(YEAR FROM c.cohort_month) * 12 - EXTRACT(MONTH FROM c.cohort_month) AS period_number19 FROM orders o20 JOIN customer_cohorts c ON o.customer_id = c.customer_id21),2223-- Step 3: Count active users per cohort and period24cohort_data AS (25 SELECT 26 cohort_month,27 period_number,28 COUNT(DISTINCT customer_id) AS active_users29 FROM transactions_with_cohort30 GROUP BY cohort_month, period_number31),3233-- Step 4: Get cohort sizes34cohort_sizes AS (35 SELECT 36 cohort_month,37 active_users AS cohort_size38 FROM cohort_data39 WHERE period_number = 040)4142-- Step 5: Calculate retention rate43SELECT 44 cd.cohort_month,45 cd.period_number,46 cd.active_users,47 cs.cohort_size,48 ROUND(cd.active_users * 100.0 / cs.cohort_size, 2) AS retention_rate49FROM cohort_data cd50JOIN cohort_sizes cs ON cd.cohort_month = cs.cohort_month51ORDER BY cd.cohort_month, cd.period_number;6.2 Revenue Cohort Query
SQL
1WITH customer_cohorts AS (2 SELECT 3 customer_id,4 DATE_TRUNC('month', MIN(order_date)) AS cohort_month5 FROM orders6 GROUP BY customer_id7),89cohort_revenue AS (10 SELECT 11 c.cohort_month,12 EXTRACT(YEAR FROM o.order_date) * 12 + EXTRACT(MONTH FROM o.order_date) -13 EXTRACT(YEAR FROM c.cohort_month) * 12 - EXTRACT(MONTH FROM c.cohort_month) AS period,14 COUNT(DISTINCT o.customer_id) AS customers,15 SUM(o.amount) AS revenue16 FROM orders o17 JOIN customer_cohorts c ON o.customer_id = c.customer_id18 GROUP BY c.cohort_month, period19)2021SELECT 22 cohort_month,23 period,24 customers,25 revenue,26 ROUND(revenue / customers, 2) AS arpu,27 SUM(revenue) OVER (28 PARTITION BY cohort_month 29 ORDER BY period30 ) AS cumulative_revenue31FROM cohort_revenue32WHERE period <= 1233ORDER BY cohort_month, period;6.3 Pivot Retention Matrix
SQL
1-- PostgreSQL: Create pivot table2WITH retention_data AS (3 -- ... previous CTEs to get retention rates4 SELECT cohort_month, period_number, retention_rate5 FROM cohort_analysis6)7SELECT 8 cohort_month,9 MAX(CASE WHEN period_number = 0 THEN retention_rate END) AS m0,10 MAX(CASE WHEN period_number = 1 THEN retention_rate END) AS m1,11 MAX(CASE WHEN period_number = 2 THEN retention_rate END) AS m2,12 MAX(CASE WHEN period_number = 3 THEN retention_rate END) AS m3,13 MAX(CASE WHEN period_number = 4 THEN retention_rate END) AS m4,14 MAX(CASE WHEN period_number = 5 THEN retention_rate END) AS m5,15 MAX(CASE WHEN period_number = 6 THEN retention_rate END) AS m616FROM retention_data17GROUP BY cohort_month18ORDER BY cohort_month;7. Advanced Metrics
7.1 Churn Analysis
Python
1def calculate_churn(df, churn_days=90):2 """Calculate churn rate by cohort"""3 reference_date = df['transaction_date'].max()4 5 # Last activity per customer6 last_activity = df.groupby('customer_id').agg({7 'transaction_date': 'max',8 'cohort': 'first'9 })10 11 # Days since last activity12 last_activity['days_inactive'] = (reference_date - last_activity['transaction_date']).dt.days13 last_activity['is_churned'] = last_activity['days_inactive'] > churn_days14 15 # Churn rate by cohort16 churn_by_cohort = last_activity.groupby('cohort').agg({17 'is_churned': ['sum', 'count', 'mean']18 })19 churn_by_cohort.columns = ['churned', 'total', 'churn_rate']20 churn_by_cohort['churn_rate'] = (churn_by_cohort['churn_rate'] * 100).round(2)21 22 return churn_by_cohort2324churn_df = calculate_churn(transactions, churn_days=60)25print("Churn Rate by Cohort:")26print(churn_df)7.2 Reactivation Rate
Python
1def analyze_reactivation(df, dormant_days=60):2 """Analyze customer reactivation"""3 # Sort by customer and date4 df_sorted = df.sort_values(['customer_id', 'transaction_date'])5 6 # Calculate days between transactions7 df_sorted['prev_transaction'] = df_sorted.groupby('customer_id')['transaction_date'].shift()8 df_sorted['days_gap'] = (df_sorted['transaction_date'] - df_sorted['prev_transaction']).dt.days9 10 # Identify reactivations (gap > dormant_days)11 df_sorted['is_reactivation'] = df_sorted['days_gap'] > dormant_days12 13 # Reactivation by month14 reactivations = df_sorted[df_sorted['is_reactivation']].groupby(15 df_sorted['transaction_date'].dt.to_period('M')16 ).agg({17 'customer_id': 'nunique',18 'amount': 'sum'19 }).rename(columns={20 'customer_id': 'reactivated_customers',21 'amount': 'reactivation_revenue'22 })23 24 return reactivations2526reactivation_df = analyze_reactivation(transactions)27print("Reactivation Analysis:")28print(reactivation_df.tail(10))7.3 Cohort Comparison
Python
1def compare_cohorts(df, cohort1, cohort2, metric='retention'):2 """Statistical comparison between two cohorts"""3 from scipy import stats4 5 c1_data = df[df['cohort'] == cohort1]6 c2_data = df[df['cohort'] == cohort2]7 8 if metric == 'retention':9 # Compare retention at period 310 c1_ret = c1_data.groupby('period_number')['customer_id'].nunique()11 c2_ret = c2_data.groupby('period_number')['customer_id'].nunique()12 13 # Calculate retention rates14 c1_rate = c1_ret / c1_ret.iloc[0] if len(c1_ret) > 0 else 015 c2_rate = c2_ret / c2_ret.iloc[0] if len(c2_ret) > 0 else 016 17 elif metric == 'revenue':18 c1_vals = c1_data.groupby('customer_id')['amount'].sum()19 c2_vals = c2_data.groupby('customer_id')['amount'].sum()20 21 # T-test22 t_stat, p_value = stats.ttest_ind(c1_vals, c2_vals)23 24 print(f"\nCohort Comparison: {cohort1} vs {cohort2}")25 print(f"Mean Revenue - {cohort1}: ${c1_vals.mean():.2f}")26 print(f"Mean Revenue - {cohort2}: ${c2_vals.mean():.2f}")27 print(f"T-statistic: {t_stat:.4f}")28 print(f"P-value: {p_value:.4f}")29 30 if p_value < 0.05:31 print("✓ Statistically significant difference")32 else:33 print("✗ No significant difference")3435# Compare two cohorts36compare_cohorts(transactions, '2024-01', '2024-06', metric='revenue')8. Thực hành
Cohort Analysis Project
Exercise: Complete Cohort Analysis
Python
1# Build a comprehensive cohort analysis:2# 1. Create retention matrix3# 2. Calculate revenue cohorts4# 3. Build LTV analysis5# 4. Create behavioral segments6# 5. Generate executive summary78# YOUR CODE HERE💡 Xem đáp án
Python
1import pandas as pd2import numpy as np3import matplotlib.pyplot as plt4import seaborn as sns56class CohortAnalyzer:7 def __init__(self, transactions_df, signup_col='signup_date', 8 activity_col='transaction_date', customer_col='customer_id'):9 self.df = transactions_df.copy()10 self.signup_col = signup_col11 self.activity_col = activity_col12 self.customer_col = customer_col13 self._prepare_data()14 15 def _prepare_data(self):16 """Prepare cohort fields"""17 self.df['cohort'] = self.df[self.signup_col].dt.to_period('M')18 self.df['activity_period'] = self.df[self.activity_col].dt.to_period('M')19 self.df['period_number'] = (20 self.df['activity_period'].view('int64') - 21 self.df['cohort'].view('int64')22 )23 24 def retention_matrix(self):25 """Build retention matrix"""26 matrix = self.df.groupby(['cohort', 'period_number'])[self.customer_col].nunique().unstack()27 cohort_sizes = matrix.iloc[:, 0]28 return matrix.divide(cohort_sizes, axis=0) * 10029 30 def revenue_matrix(self):31 """Build revenue matrix"""32 return self.df.groupby(['cohort', 'period_number'])['amount'].sum().unstack()33 34 def arpu_matrix(self):35 """Build ARPU matrix"""36 revenue = self.revenue_matrix()37 users = self.df.groupby(['cohort', 'period_number'])[self.customer_col].nunique().unstack()38 return revenue / users39 40 def ltv_by_cohort(self, periods=12):41 """Calculate LTV for each cohort"""42 results = []43 for cohort in self.df['cohort'].unique():44 cohort_data = self.df[self.df['cohort'] == cohort]45 cohort_users = cohort_data[self.customer_col].nunique()46 revenue = cohort_data[cohort_data['period_number'] < periods]['amount'].sum()47 results.append({48 'cohort': cohort,49 'users': cohort_users,50 'revenue': revenue,51 'ltv': revenue / cohort_users52 })53 return pd.DataFrame(results).sort_values('cohort')54 55 def segment_customers(self):56 """Create RFM segments"""57 ref_date = self.df[self.activity_col].max() + pd.Timedelta(days=1)58 59 rfm = self.df.groupby(self.customer_col).agg({60 self.activity_col: lambda x: (ref_date - x.max()).days,61 'transaction_id': 'count',62 'amount': 'sum'63 }).rename(columns={64 self.activity_col: 'recency',65 'transaction_id': 'frequency',66 'amount': 'monetary'67 })68 69 # Scores70 rfm['R'] = pd.qcut(rfm['recency'], 4, labels=[4, 3, 2, 1])71 rfm['F'] = pd.qcut(rfm['frequency'].rank(method='first'), 4, labels=[1, 2, 3, 4])72 rfm['M'] = pd.qcut(rfm['monetary'].rank(method='first'), 4, labels=[1, 2, 3, 4])73 74 # Segment75 def segment(row):76 r, f = int(row['R']), int(row['F'])77 if r >= 3 and f >= 3: return 'Champions'78 elif r >= 3 and f < 3: return 'New/Promising'79 elif r < 3 and f >= 3: return 'At Risk'80 elif r < 2: return 'Lost'81 else: return 'Regular'82 83 rfm['segment'] = rfm.apply(segment, axis=1)84 return rfm85 86 def plot_retention_heatmap(self, max_periods=12):87 """Plot retention heatmap"""88 retention = self.retention_matrix().iloc[:12, :max_periods]89 90 plt.figure(figsize=(14, 8))91 sns.heatmap(92 retention,93 annot=True,94 fmt='.1f',95 cmap='YlGnBu',96 vmin=0,97 vmax=10098 )99 plt.title('Retention Rate by Cohort (%)')100 plt.xlabel('Months Since First Purchase')101 plt.ylabel('Cohort')102 plt.tight_layout()103 plt.savefig('cohort_retention.png', dpi=150)104 plt.show()105 106 def generate_report(self):107 """Generate executive summary"""108 retention = self.retention_matrix()109 ltv = self.ltv_by_cohort()110 segments = self.segment_customers()111 112 print("=" * 60)113 print("COHORT ANALYSIS EXECUTIVE SUMMARY")114 print("=" * 60)115 116 # Overall metrics117 print(f"\n📊 OVERALL METRICS")118 print(f"Total Customers: {self.df[self.customer_col].nunique():,}")119 print(f"Total Transactions: {len(self.df):,}")120 print(f"Total Revenue: ${self.df['amount'].sum():,.2f}")121 122 # Retention summary123 print(f"\n📈 RETENTION SUMMARY")124 avg_m1 = retention[1].mean() if 1 in retention.columns else 0125 avg_m3 = retention[3].mean() if 3 in retention.columns else 0126 avg_m6 = retention[6].mean() if 6 in retention.columns else 0127 print(f"Month 1 Avg Retention: {avg_m1:.1f}%")128 print(f"Month 3 Avg Retention: {avg_m3:.1f}%")129 print(f"Month 6 Avg Retention: {avg_m6:.1f}%")130 131 # LTV summary132 print(f"\n💰 LTV SUMMARY")133 print(f"Average 12-Month LTV: ${ltv['ltv'].mean():.2f}")134 print(f"Best Cohort: {ltv.loc[ltv['ltv'].idxmax(), 'cohort']} (${ltv['ltv'].max():.2f})")135 print(f"Worst Cohort: {ltv.loc[ltv['ltv'].idxmin(), 'cohort']} (${ltv['ltv'].min():.2f})")136 137 # Segment summary138 print(f"\n👥 CUSTOMER SEGMENTS")139 segment_counts = segments['segment'].value_counts()140 for seg, count in segment_counts.items():141 pct = count / len(segments) * 100142 print(f" {seg}: {count:,} ({pct:.1f}%)")143 144 print("\n" + "=" * 60)145146147# Generate test data148np.random.seed(42)149n_cust, n_trans = 3000, 30000150151customers = pd.DataFrame({152 'customer_id': range(1, n_cust + 1),153 'signup_date': pd.to_datetime('2024-01-01') + pd.to_timedelta(154 np.random.randint(0, 300, n_cust), unit='D'155 )156})157158transactions = pd.DataFrame({159 'transaction_id': range(1, n_trans + 1),160 'customer_id': np.random.randint(1, n_cust + 1, n_trans),161 'transaction_date': pd.to_datetime('2024-01-01') + pd.to_timedelta(162 np.random.randint(0, 365, n_trans), unit='D'163 ),164 'amount': np.random.lognormal(4, 1, n_trans)165})166167transactions = transactions.merge(customers, on='customer_id')168transactions = transactions[transactions['transaction_date'] >= transactions['signup_date']]169170# Run analysis171analyzer = CohortAnalyzer(transactions)172173# Generate visualizations174analyzer.plot_retention_heatmap()175176# Generate report177analyzer.generate_report()9. Tổng kết
| Topic | Key Concepts |
|---|---|
| Cohort Types | Time-based, Behavioral, Segment-based |
| Retention | Retention matrix, retention curves |
| Revenue | ARPU, cumulative revenue, LTV |
| Segments | RFM analysis, behavioral segments |
| Metrics | Churn rate, reactivation rate |
Bài tiếp theo: Funnel Analysis
