Theory
Lesson 10/17

Cohort Analysis

Analyzing customer behavior over time and by cohort

Customer Cohort Analysis

1. Introduction

What is Cohort Analysis?

Cohort Analysis groups users by a shared characteristic (typically their acquisition time) and tracks their behavior over time. It is a powerful tool for understanding retention, engagement, and customer lifetime value.

1.1 Cohort Types

Types of Cohorts

📅 Time-based Cohorts
  • Acquisition cohort: Month/week signed up
  • First purchase cohort: First order month
  • Activation cohort: Completed onboarding

📊 Behavior-based Cohorts
  • Power users vs casual users
  • High spenders vs low spenders
  • Channel cohorts (organic vs paid)

🎯 Segment-based Cohorts
  • Product category
  • Geographic region
  • Customer tier (premium vs free)
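
In practice, each of these cohort types is just a label you attach to every customer and then group by. Below is a minimal sketch of all three on a toy frame; the column names are illustrative assumptions, not the dataset built in the next section.

Python
import pandas as pd

toy = pd.DataFrame({
    'customer_id': [1, 2, 3],
    'signup_date': pd.to_datetime(['2024-01-15', '2024-02-03', '2024-02-20']),
    'total_spent': [500.0, 80.0, 1200.0],
    'channel': ['Organic', 'Paid Search', 'Organic'],
})

# Time-based: acquisition month
toy['acquisition_cohort'] = toy['signup_date'].dt.to_period('M')

# Behavior-based: spend tier
toy['spend_cohort'] = pd.cut(toy['total_spent'], bins=[0, 100, 1000, float('inf')],
                             labels=['Low', 'Mid', 'High'])

# Segment-based: the acquisition channel is already a cohort label
print(toy[['customer_id', 'acquisition_cohort', 'spend_cohort', 'channel']])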

2. Data Preparation

2.1 Create Sample Data

Python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Generate sample transaction data
np.random.seed(42)

n_customers = 5000
n_transactions = 50000

# Customer signup dates (spread over 12 months)
customers = pd.DataFrame({
    'customer_id': range(1, n_customers + 1),
    'signup_date': pd.to_datetime('2024-01-01') + pd.to_timedelta(
        np.random.randint(0, 365, n_customers), unit='D'
    )
})

# Generate transactions
transactions = pd.DataFrame({
    'transaction_id': range(1, n_transactions + 1),
    'customer_id': np.random.randint(1, n_customers + 1, n_transactions),
    'transaction_date': pd.to_datetime('2024-01-01') + pd.to_timedelta(
        np.random.randint(0, 400, n_transactions), unit='D'
    ),
    'amount': np.random.lognormal(4, 1, n_transactions)
})

# Merge to get signup date
transactions = transactions.merge(customers, on='customer_id')

# Only keep transactions after signup
transactions = transactions[
    transactions['transaction_date'] >= transactions['signup_date']
]

print(f"Customers: {len(customers)}")
print(f"Transactions: {len(transactions)}")
print(transactions.head())

2.2 Calculate Cohort Fields

Python
def assign_cohorts(df, signup_col, activity_col, freq='M'):
    """Assign cohort and period to each row"""
    df = df.copy()

    # Cohort: month of signup
    df['cohort'] = df[signup_col].dt.to_period(freq)

    # Activity period
    df['activity_period'] = df[activity_col].dt.to_period(freq)

    # Period number (0, 1, 2, ...): whole periods between cohort and activity
    # (Period subtraction returns an offset; .n is the number of periods)
    df['period_number'] = (
        df['activity_period'] - df['cohort']
    ).apply(lambda offset: offset.n)

    return df


transactions = assign_cohorts(
    transactions,
    'signup_date',
    'transaction_date',
    freq='M'
)

print(transactions[['customer_id', 'cohort', 'activity_period', 'period_number']].head(10))
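
As a quick sanity check on the period arithmetic (a small sketch using only the columns created above), any transaction made in the signup month should land in period 0:

Python
# Sanity check: transactions made in the signup month must have period_number == 0
same_month = transactions['activity_period'] == transactions['cohort']
assert (transactions.loc[same_month, 'period_number'] == 0).all()
print(f"Share of transactions in the signup month: {same_month.mean():.1%}")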

3. Retention Analysis

3.1 Build Retention Matrix

Python
def build_retention_matrix(df, value_col='customer_id', agg_func='nunique'):
    """Build cohort retention matrix"""

    # Aggregate by cohort and period
    cohort_data = df.groupby(['cohort', 'period_number']).agg({
        value_col: agg_func
    }).reset_index()

    # Pivot to matrix
    cohort_matrix = cohort_data.pivot(
        index='cohort',
        columns='period_number',
        values=value_col
    )

    return cohort_matrix


# Active users matrix
retention_counts = build_retention_matrix(transactions)
print("Retention Counts:")
print(retention_counts.iloc[:6, :7])

# Calculate retention rate (period 0 = cohort size)
cohort_sizes = retention_counts.iloc[:, 0]
retention_rate = retention_counts.divide(cohort_sizes, axis=0) * 100

print("\nRetention Rate (%):")
print(retention_rate.iloc[:6, :7].round(1))
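
A convenient one-line summary of this matrix is the column-wise mean, i.e. average retention per period across cohorts; keep in mind that later periods are observed for fewer cohorts, so the tail is noisier. A minimal sketch using the retention_rate frame from above:

Python
# Average retention per period across cohorts (later periods cover fewer cohorts)
avg_retention_by_period = retention_rate.mean()
print("Average retention by period (%):")
print(avg_retention_by_period.head(7).round(1))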

3.2 Visualize Retention Heatmap

Python
def plot_retention_heatmap(retention_matrix, title='Retention Rate (%)'):
    """Plot retention heatmap"""
    plt.figure(figsize=(14, 10))

    # String annotations with a '%' suffix (unobserved cells stay blank)
    annot = retention_matrix.round(1).astype(str) + '%'
    annot = annot.mask(retention_matrix.isna(), '')

    sns.heatmap(
        retention_matrix,
        annot=annot,
        fmt='',
        cmap='YlGnBu',
        vmin=0,
        vmax=100,
        linewidths=0.5,
        cbar_kws={'label': 'Retention Rate (%)'}
    )

    plt.title(title, fontsize=14)
    plt.xlabel('Months Since Signup')
    plt.ylabel('Cohort (Signup Month)')
    plt.tight_layout()
    plt.savefig('retention_heatmap.png', dpi=150)
    plt.show()


plot_retention_heatmap(retention_rate.iloc[:12, :12])

3.3 Retention Curves

Python
def plot_retention_curves(retention_matrix, top_n=6):
    """Plot retention curves for multiple cohorts"""
    plt.figure(figsize=(12, 6))

    for cohort in retention_matrix.index[:top_n]:
        values = retention_matrix.loc[cohort].dropna()
        plt.plot(values.index, values.values, marker='o', label=str(cohort))

    plt.xlabel('Months Since Acquisition')
    plt.ylabel('Retention Rate (%)')
    plt.title('Retention Curves by Cohort')
    plt.legend(title='Cohort')
    plt.grid(True, alpha=0.3)
    plt.ylim(0, 105)
    plt.savefig('retention_curves.png', dpi=150)
    plt.show()


plot_retention_curves(retention_rate)

4. Revenue Cohorts

4.1 Revenue per Cohort

Python
# Revenue matrix
revenue_matrix = build_retention_matrix(
    transactions,
    value_col='amount',
    agg_func='sum'
)

print("Total Revenue by Cohort and Period:")
print(revenue_matrix.iloc[:6, :7].round(0))

# Cumulative revenue
cumulative_revenue = revenue_matrix.cumsum(axis=1)
print("\nCumulative Revenue:")
print(cumulative_revenue.iloc[:6, :7].round(0))
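
Analogous to the user retention rate, you can normalize each cohort's revenue by its period-0 revenue to see how revenue retention decays. This is a small extra view (not part of the original walkthrough) built from revenue_matrix:

Python
# Revenue retention: each period's revenue as a % of the cohort's period-0 revenue
first_period_revenue = revenue_matrix.iloc[:, 0]
revenue_retention = revenue_matrix.divide(first_period_revenue, axis=0) * 100
print("Revenue Retention (%):")
print(revenue_retention.iloc[:6, :7].round(1))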

4.2 Average Revenue per User (ARPU)

Python
# ARPU = Total Revenue / Total Users in Cohort
cohort_users = transactions.groupby('cohort')['customer_id'].nunique()

arpu_matrix = revenue_matrix.divide(cohort_users, axis=0)
print("ARPU by Cohort and Period:")
print(arpu_matrix.iloc[:6, :7].round(2))

# Cumulative ARPU
cum_arpu = arpu_matrix.cumsum(axis=1)

# Plot ARPU trends
plt.figure(figsize=(12, 6))
for cohort in cum_arpu.index[:6]:
    values = cum_arpu.loc[cohort].dropna()
    plt.plot(values.index, values.values, marker='o', label=str(cohort))

plt.xlabel('Months Since Acquisition')
plt.ylabel('Cumulative ARPU ($)')
plt.title('Cumulative ARPU by Cohort')
plt.legend(title='Cohort')
plt.grid(True, alpha=0.3)
plt.show()

4.3 Customer Lifetime Value (LTV)

Python
def calculate_ltv_by_cohort(df, periods=12):
    """Calculate LTV for each cohort"""
    results = []

    for cohort in df['cohort'].unique():
        cohort_data = df[df['cohort'] == cohort]
        cohort_users = cohort_data['customer_id'].nunique()

        # Revenue for first N periods
        period_revenue = cohort_data[
            cohort_data['period_number'] < periods
        ].groupby('period_number')['amount'].sum()

        total_revenue = period_revenue.sum()
        ltv = total_revenue / cohort_users

        results.append({
            'cohort': cohort,
            'users': cohort_users,
            'total_revenue': total_revenue,
            'ltv': ltv
        })

    return pd.DataFrame(results)


ltv_df = calculate_ltv_by_cohort(transactions, periods=12)
print("LTV by Cohort:")
print(ltv_df.sort_values('cohort').head(10))

# Plot LTV trend (sorted so the x-axis is chronological)
ltv_sorted = ltv_df.sort_values('cohort')
plt.figure(figsize=(12, 5))
plt.bar(ltv_sorted['cohort'].astype(str), ltv_sorted['ltv'])
plt.xlabel('Cohort')
plt.ylabel('12-Month LTV ($)')
plt.title('Customer Lifetime Value by Acquisition Cohort')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
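
One caveat: cohorts acquired less than 12 months before the end of the data have not had a full 12 months to spend, so their 12-month LTV is understated. A hedged sketch that keeps only mature cohorts (the cutoff logic assumes monthly cohorts as above):

Python
# Keep only cohorts observed for at least 12 full months (month 0 through month 11)
last_period = transactions['activity_period'].max()
mature_ltv = ltv_df[ltv_df['cohort'] <= last_period - 11].sort_values('cohort')
print("12-month LTV, mature cohorts only:")
print(mature_ltv)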

5. Behavioral Cohorts

5.1 Purchase Frequency Cohorts

Python
# Calculate purchase frequency per customer
customer_freq = transactions.groupby('customer_id').agg({
    'transaction_id': 'count',
    'amount': 'sum',
    'signup_date': 'first'
}).rename(columns={'transaction_id': 'frequency', 'amount': 'total_spent'})

# Create frequency segments
customer_freq['freq_segment'] = pd.cut(
    customer_freq['frequency'],
    bins=[0, 1, 3, 10, float('inf')],
    labels=['One-time', 'Occasional', 'Regular', 'Power User']
)

print("Customer Segments:")
print(customer_freq['freq_segment'].value_counts())

# Analyze by segment
segment_stats = customer_freq.groupby('freq_segment').agg({
    'frequency': ['count', 'mean'],
    'total_spent': ['mean', 'sum']
})
print("\nSegment Statistics:")
print(segment_stats)
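
To see whether this segment mix shifts across acquisition cohorts, you can cross-tabulate the two. The sketch below reuses the signup_date column kept in customer_freq and is an illustrative follow-up rather than part of the original exercise:

Python
# Segment mix by acquisition cohort (row-normalized %)
customer_freq['cohort'] = customer_freq['signup_date'].dt.to_period('M')
segment_mix = pd.crosstab(
    customer_freq['cohort'], customer_freq['freq_segment'], normalize='index'
) * 100
print("Segment mix by cohort (%):")
print(segment_mix.round(1).head(6))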

5.2 RFM-based Cohorts

Python
# Calculate RFM metrics
reference_date = transactions['transaction_date'].max() + pd.Timedelta(days=1)

rfm = transactions.groupby('customer_id').agg({
    'transaction_date': lambda x: (reference_date - x.max()).days,  # Recency
    'transaction_id': 'count',                                      # Frequency
    'amount': 'sum'                                                 # Monetary
}).rename(columns={
    'transaction_date': 'recency',
    'transaction_id': 'frequency',
    'amount': 'monetary'
})

# Create RFM scores (1-5)
rfm['R_score'] = pd.qcut(rfm['recency'], 5, labels=[5, 4, 3, 2, 1])
rfm['F_score'] = pd.qcut(rfm['frequency'].rank(method='first'), 5, labels=[1, 2, 3, 4, 5])
rfm['M_score'] = pd.qcut(rfm['monetary'].rank(method='first'), 5, labels=[1, 2, 3, 4, 5])

# RFM segment
rfm['RFM_score'] = (rfm['R_score'].astype(str) +
                    rfm['F_score'].astype(str) +
                    rfm['M_score'].astype(str))

# Simplified segments
def rfm_segment(row):
    if row['R_score'] >= 4 and row['F_score'] >= 4:
        return 'Champions'
    elif row['R_score'] >= 4 and row['F_score'] < 3:
        return 'New Customers'
    elif row['R_score'] < 3 and row['F_score'] >= 4:
        return 'At Risk'
    elif row['R_score'] < 2:
        return 'Lost'
    else:
        return 'Regular'

rfm['segment'] = rfm.apply(rfm_segment, axis=1)

print("RFM Segments:")
print(rfm['segment'].value_counts())

5.3 Channel Cohorts

Python
# Add acquisition channel to customers
channels = ['Organic', 'Paid Search', 'Social', 'Email', 'Referral']
customers['channel'] = np.random.choice(channels, len(customers), p=[0.3, 0.25, 0.2, 0.15, 0.1])

# Merge channel to transactions
transactions = transactions.merge(
    customers[['customer_id', 'channel']],
    on='customer_id',
    how='left'
)

# Analyze by channel
channel_analysis = transactions.groupby('channel').agg({
    'customer_id': 'nunique',
    'transaction_id': 'count',
    'amount': ['sum', 'mean']
}).round(2)
channel_analysis.columns = ['customers', 'transactions', 'total_revenue', 'avg_order']

print("Channel Performance:")
print(channel_analysis)

# Retention by channel
def channel_retention(df, channel):
    """Calculate retention for a specific channel"""
    channel_data = df[df['channel'] == channel]
    return build_retention_matrix(channel_data)

# Compare channels
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for i, channel in enumerate(channels):
    channel_ret = channel_retention(transactions, channel)
    cohort_sizes = channel_ret.iloc[:, 0]
    channel_rate = channel_ret.divide(cohort_sizes, axis=0) * 100

    # Average retention for the first 7 periods
    avg_retention = channel_rate.mean().iloc[:7]
    axes[i].bar(range(len(avg_retention)), avg_retention.values)
    axes[i].set_title(f'{channel}')
    axes[i].set_xlabel('Period')
    axes[i].set_ylabel('Retention %')
    axes[i].set_ylim(0, 100)

# Only 5 channels: hide the unused sixth subplot
axes[5].axis('off')
plt.suptitle('Average Retention Rate by Channel', y=1.02)
plt.tight_layout()
plt.show()

6. Cohort Analysis SQL

6.1 Basic Retention Query

SQL
-- Step 1: Assign cohorts
WITH customer_cohorts AS (
    SELECT
        customer_id,
        DATE_TRUNC('month', MIN(order_date)) AS cohort_month
    FROM orders
    GROUP BY customer_id
),

-- Step 2: Join transactions with cohorts
transactions_with_cohort AS (
    SELECT
        o.customer_id,
        c.cohort_month,
        DATE_TRUNC('month', o.order_date) AS activity_month,
        -- Period number
        EXTRACT(YEAR FROM o.order_date) * 12 + EXTRACT(MONTH FROM o.order_date) -
        EXTRACT(YEAR FROM c.cohort_month) * 12 - EXTRACT(MONTH FROM c.cohort_month) AS period_number
    FROM orders o
    JOIN customer_cohorts c ON o.customer_id = c.customer_id
),

-- Step 3: Count active users per cohort and period
cohort_data AS (
    SELECT
        cohort_month,
        period_number,
        COUNT(DISTINCT customer_id) AS active_users
    FROM transactions_with_cohort
    GROUP BY cohort_month, period_number
),

-- Step 4: Get cohort sizes
cohort_sizes AS (
    SELECT
        cohort_month,
        active_users AS cohort_size
    FROM cohort_data
    WHERE period_number = 0
)

-- Step 5: Calculate retention rate
SELECT
    cd.cohort_month,
    cd.period_number,
    cd.active_users,
    cs.cohort_size,
    ROUND(cd.active_users * 100.0 / cs.cohort_size, 2) AS retention_rate
FROM cohort_data cd
JOIN cohort_sizes cs ON cd.cohort_month = cs.cohort_month
ORDER BY cd.cohort_month, cd.period_number;
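
On PostgreSQL, the same period_number can also be written with AGE(), which some find easier to read than the year/month arithmetic. A sketch assuming the same customer_cohorts CTE and month-truncated dates:

SQL
-- PostgreSQL alternative: month difference via AGE()
SELECT
    o.customer_id,
    c.cohort_month,
    EXTRACT(YEAR FROM AGE(DATE_TRUNC('month', o.order_date), c.cohort_month)) * 12 +
    EXTRACT(MONTH FROM AGE(DATE_TRUNC('month', o.order_date), c.cohort_month)) AS period_number
FROM orders o
JOIN customer_cohorts c ON o.customer_id = c.customer_id;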

6.2 Revenue Cohort Query

SQL
WITH customer_cohorts AS (
    SELECT
        customer_id,
        DATE_TRUNC('month', MIN(order_date)) AS cohort_month
    FROM orders
    GROUP BY customer_id
),

cohort_revenue AS (
    SELECT
        c.cohort_month,
        EXTRACT(YEAR FROM o.order_date) * 12 + EXTRACT(MONTH FROM o.order_date) -
        EXTRACT(YEAR FROM c.cohort_month) * 12 - EXTRACT(MONTH FROM c.cohort_month) AS period,
        COUNT(DISTINCT o.customer_id) AS customers,
        SUM(o.amount) AS revenue
    FROM orders o
    JOIN customer_cohorts c ON o.customer_id = c.customer_id
    GROUP BY c.cohort_month, period
)

SELECT
    cohort_month,
    period,
    customers,
    revenue,
    ROUND(revenue / customers, 2) AS arpu,
    SUM(revenue) OVER (
        PARTITION BY cohort_month
        ORDER BY period
    ) AS cumulative_revenue
FROM cohort_revenue
WHERE period <= 12
ORDER BY cohort_month, period;

6.3 Pivot Retention Matrix

SQL
-- PostgreSQL: Create pivot table
WITH retention_data AS (
    -- ... previous CTEs to get retention rates
    SELECT cohort_month, period_number, retention_rate
    FROM cohort_analysis
)
SELECT
    cohort_month,
    MAX(CASE WHEN period_number = 0 THEN retention_rate END) AS m0,
    MAX(CASE WHEN period_number = 1 THEN retention_rate END) AS m1,
    MAX(CASE WHEN period_number = 2 THEN retention_rate END) AS m2,
    MAX(CASE WHEN period_number = 3 THEN retention_rate END) AS m3,
    MAX(CASE WHEN period_number = 4 THEN retention_rate END) AS m4,
    MAX(CASE WHEN period_number = 5 THEN retention_rate END) AS m5,
    MAX(CASE WHEN period_number = 6 THEN retention_rate END) AS m6
FROM retention_data
GROUP BY cohort_month
ORDER BY cohort_month;
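
On PostgreSQL 9.4+ the same pivot can be written with the FILTER clause instead of CASE, which reads a bit more directly; a sketch assuming the same retention_data CTE:

SQL
-- Same pivot using FILTER instead of CASE
SELECT
    cohort_month,
    MAX(retention_rate) FILTER (WHERE period_number = 0) AS m0,
    MAX(retention_rate) FILTER (WHERE period_number = 1) AS m1,
    MAX(retention_rate) FILTER (WHERE period_number = 2) AS m2,
    MAX(retention_rate) FILTER (WHERE period_number = 3) AS m3
FROM retention_data
GROUP BY cohort_month
ORDER BY cohort_month;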

7. Advanced Metrics

7.1 Churn Analysis

Python
def calculate_churn(df, churn_days=90):
    """Calculate churn rate by cohort"""
    reference_date = df['transaction_date'].max()

    # Last activity per customer
    last_activity = df.groupby('customer_id').agg({
        'transaction_date': 'max',
        'cohort': 'first'
    })

    # Days since last activity
    last_activity['days_inactive'] = (reference_date - last_activity['transaction_date']).dt.days
    last_activity['is_churned'] = last_activity['days_inactive'] > churn_days

    # Churn rate by cohort
    churn_by_cohort = last_activity.groupby('cohort').agg({
        'is_churned': ['sum', 'count', 'mean']
    })
    churn_by_cohort.columns = ['churned', 'total', 'churn_rate']
    churn_by_cohort['churn_rate'] = (churn_by_cohort['churn_rate'] * 100).round(2)

    return churn_by_cohort


churn_df = calculate_churn(transactions, churn_days=60)
print("Churn Rate by Cohort:")
print(churn_df)
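
Note that cohorts acquired less than churn_days before the reference date cannot have exceeded the inactivity window yet, so their churn looks artificially low. A small sketch that drops those young cohorts before reporting (using the 60-day window from the call above):

Python
# Roughly exclude cohorts whose signup month is within the churn window
reference_date = transactions['transaction_date'].max()
mature_cutoff = (reference_date - pd.Timedelta(days=60)).to_period('M')
print(churn_df[churn_df.index <= mature_cutoff])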

7.2 Reactivation Rate

Python
def analyze_reactivation(df, dormant_days=60):
    """Analyze customer reactivation"""
    # Sort by customer and date
    df_sorted = df.sort_values(['customer_id', 'transaction_date'])

    # Calculate days between consecutive transactions per customer
    df_sorted['prev_transaction'] = df_sorted.groupby('customer_id')['transaction_date'].shift()
    df_sorted['days_gap'] = (df_sorted['transaction_date'] - df_sorted['prev_transaction']).dt.days

    # Identify reactivations (gap > dormant_days)
    df_sorted['is_reactivation'] = df_sorted['days_gap'] > dormant_days

    # Reactivation by month
    reactivated = df_sorted[df_sorted['is_reactivation']]
    reactivations = reactivated.groupby(
        reactivated['transaction_date'].dt.to_period('M')
    ).agg({
        'customer_id': 'nunique',
        'amount': 'sum'
    }).rename(columns={
        'customer_id': 'reactivated_customers',
        'amount': 'reactivation_revenue'
    })

    return reactivations


reactivation_df = analyze_reactivation(transactions)
print("Reactivation Analysis:")
print(reactivation_df.tail(10))

7.3 Cohort Comparison

Python
def compare_cohorts(df, cohort1, cohort2, metric='retention'):
    """Statistical comparison between two cohorts"""
    from scipy import stats

    c1_data = df[df['cohort'] == cohort1]
    c2_data = df[df['cohort'] == cohort2]

    if metric == 'retention':
        # Compare retention at period 3: per customer, 1 if active in period 3 else 0
        def retained_at_period(cohort_data, period=3):
            active = set(cohort_data.loc[cohort_data['period_number'] == period, 'customer_id'])
            all_customers = cohort_data['customer_id'].unique()
            return pd.Series([1 if c in active else 0 for c in all_customers])

        c1_vals = retained_at_period(c1_data)
        c2_vals = retained_at_period(c2_data)
        label = 'retention at month 3'
    else:  # metric == 'revenue'
        # Compare total spend per customer
        c1_vals = c1_data.groupby('customer_id')['amount'].sum()
        c2_vals = c2_data.groupby('customer_id')['amount'].sum()
        label = 'revenue per customer'

    # Two-sample t-test
    t_stat, p_value = stats.ttest_ind(c1_vals, c2_vals)

    print(f"\nCohort Comparison ({label}): {cohort1} vs {cohort2}")
    print(f"Mean - {cohort1}: {c1_vals.mean():.2f}")
    print(f"Mean - {cohort2}: {c2_vals.mean():.2f}")
    print(f"T-statistic: {t_stat:.4f}")
    print(f"P-value: {p_value:.4f}")

    if p_value < 0.05:
        print("✓ Statistically significant difference")
    else:
        print("✗ No significant difference")


# Compare two cohorts
compare_cohorts(transactions, '2024-01', '2024-06', metric='revenue')
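
Because per-customer revenue here is lognormal and heavily skewed, a rank-based test is a reasonable companion to the t-test. A minimal sketch with scipy's Mann-Whitney U on the same two cohorts:

Python
from scipy import stats

# Rank-based alternative to the t-test for heavily skewed revenue
c1 = transactions[transactions['cohort'] == '2024-01'].groupby('customer_id')['amount'].sum()
c2 = transactions[transactions['cohort'] == '2024-06'].groupby('customer_id')['amount'].sum()
u_stat, p_value = stats.mannwhitneyu(c1, c2, alternative='two-sided')
print(f"Mann-Whitney U: {u_stat:.1f}, p-value: {p_value:.4f}")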

8. Practice

Cohort Analysis Project

Exercise: Complete Cohort Analysis

Python
# Build a comprehensive cohort analysis:
# 1. Create retention matrix
# 2. Calculate revenue cohorts
# 3. Build LTV analysis
# 4. Create behavioral segments
# 5. Generate executive summary

# YOUR CODE HERE
💡 View solution
Python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

class CohortAnalyzer:
    def __init__(self, transactions_df, signup_col='signup_date',
                 activity_col='transaction_date', customer_col='customer_id'):
        self.df = transactions_df.copy()
        self.signup_col = signup_col
        self.activity_col = activity_col
        self.customer_col = customer_col
        self._prepare_data()

    def _prepare_data(self):
        """Prepare cohort fields"""
        self.df['cohort'] = self.df[self.signup_col].dt.to_period('M')
        self.df['activity_period'] = self.df[self.activity_col].dt.to_period('M')
        # Period subtraction returns an offset; .n is the number of whole months
        self.df['period_number'] = (
            self.df['activity_period'] - self.df['cohort']
        ).apply(lambda offset: offset.n)

    def retention_matrix(self):
        """Build retention matrix (% of each cohort active per period)"""
        matrix = self.df.groupby(['cohort', 'period_number'])[self.customer_col].nunique().unstack()
        cohort_sizes = matrix.iloc[:, 0]
        return matrix.divide(cohort_sizes, axis=0) * 100

    def revenue_matrix(self):
        """Build revenue matrix"""
        return self.df.groupby(['cohort', 'period_number'])['amount'].sum().unstack()

    def arpu_matrix(self):
        """Build ARPU matrix"""
        revenue = self.revenue_matrix()
        users = self.df.groupby(['cohort', 'period_number'])[self.customer_col].nunique().unstack()
        return revenue / users

    def ltv_by_cohort(self, periods=12):
        """Calculate LTV for each cohort"""
        results = []
        for cohort in self.df['cohort'].unique():
            cohort_data = self.df[self.df['cohort'] == cohort]
            cohort_users = cohort_data[self.customer_col].nunique()
            revenue = cohort_data[cohort_data['period_number'] < periods]['amount'].sum()
            results.append({
                'cohort': cohort,
                'users': cohort_users,
                'revenue': revenue,
                'ltv': revenue / cohort_users
            })
        return pd.DataFrame(results).sort_values('cohort')

    def segment_customers(self):
        """Create RFM segments"""
        ref_date = self.df[self.activity_col].max() + pd.Timedelta(days=1)

        rfm = self.df.groupby(self.customer_col).agg({
            self.activity_col: lambda x: (ref_date - x.max()).days,
            'transaction_id': 'count',
            'amount': 'sum'
        }).rename(columns={
            self.activity_col: 'recency',
            'transaction_id': 'frequency',
            'amount': 'monetary'
        })

        # Scores (quartiles, 4 = best)
        rfm['R'] = pd.qcut(rfm['recency'], 4, labels=[4, 3, 2, 1])
        rfm['F'] = pd.qcut(rfm['frequency'].rank(method='first'), 4, labels=[1, 2, 3, 4])
        rfm['M'] = pd.qcut(rfm['monetary'].rank(method='first'), 4, labels=[1, 2, 3, 4])

        # Segment
        def segment(row):
            r, f = int(row['R']), int(row['F'])
            if r >= 3 and f >= 3: return 'Champions'
            elif r >= 3 and f < 3: return 'New/Promising'
            elif r < 3 and f >= 3: return 'At Risk'
            elif r < 2: return 'Lost'
            else: return 'Regular'

        rfm['segment'] = rfm.apply(segment, axis=1)
        return rfm

    def plot_retention_heatmap(self, max_periods=12):
        """Plot retention heatmap"""
        retention = self.retention_matrix().iloc[:12, :max_periods]

        plt.figure(figsize=(14, 8))
        sns.heatmap(
            retention,
            annot=True,
            fmt='.1f',
            cmap='YlGnBu',
            vmin=0,
            vmax=100
        )
        plt.title('Retention Rate by Cohort (%)')
        plt.xlabel('Months Since Signup')
        plt.ylabel('Cohort')
        plt.tight_layout()
        plt.savefig('cohort_retention.png', dpi=150)
        plt.show()

    def generate_report(self):
        """Generate executive summary"""
        retention = self.retention_matrix()
        ltv = self.ltv_by_cohort()
        segments = self.segment_customers()

        print("=" * 60)
        print("COHORT ANALYSIS EXECUTIVE SUMMARY")
        print("=" * 60)

        # Overall metrics
        print("\n📊 OVERALL METRICS")
        print(f"Total Customers: {self.df[self.customer_col].nunique():,}")
        print(f"Total Transactions: {len(self.df):,}")
        print(f"Total Revenue: ${self.df['amount'].sum():,.2f}")

        # Retention summary
        print("\n📈 RETENTION SUMMARY")
        avg_m1 = retention[1].mean() if 1 in retention.columns else 0
        avg_m3 = retention[3].mean() if 3 in retention.columns else 0
        avg_m6 = retention[6].mean() if 6 in retention.columns else 0
        print(f"Month 1 Avg Retention: {avg_m1:.1f}%")
        print(f"Month 3 Avg Retention: {avg_m3:.1f}%")
        print(f"Month 6 Avg Retention: {avg_m6:.1f}%")

        # LTV summary
        print("\n💰 LTV SUMMARY")
        print(f"Average 12-Month LTV: ${ltv['ltv'].mean():.2f}")
        print(f"Best Cohort: {ltv.loc[ltv['ltv'].idxmax(), 'cohort']} (${ltv['ltv'].max():.2f})")
        print(f"Worst Cohort: {ltv.loc[ltv['ltv'].idxmin(), 'cohort']} (${ltv['ltv'].min():.2f})")

        # Segment summary
        print("\n👥 CUSTOMER SEGMENTS")
        segment_counts = segments['segment'].value_counts()
        for seg, count in segment_counts.items():
            pct = count / len(segments) * 100
            print(f"  {seg}: {count:,} ({pct:.1f}%)")

        print("\n" + "=" * 60)

# Generate test data
np.random.seed(42)
n_cust, n_trans = 3000, 30000

customers = pd.DataFrame({
    'customer_id': range(1, n_cust + 1),
    'signup_date': pd.to_datetime('2024-01-01') + pd.to_timedelta(
        np.random.randint(0, 300, n_cust), unit='D'
    )
})

transactions = pd.DataFrame({
    'transaction_id': range(1, n_trans + 1),
    'customer_id': np.random.randint(1, n_cust + 1, n_trans),
    'transaction_date': pd.to_datetime('2024-01-01') + pd.to_timedelta(
        np.random.randint(0, 365, n_trans), unit='D'
    ),
    'amount': np.random.lognormal(4, 1, n_trans)
})

transactions = transactions.merge(customers, on='customer_id')
transactions = transactions[transactions['transaction_date'] >= transactions['signup_date']]

# Run analysis
analyzer = CohortAnalyzer(transactions)

# Generate visualizations
analyzer.plot_retention_heatmap()

# Generate report
analyzer.generate_report()

9. Summary

Topic          Key Concepts
Cohort Types   Time-based, Behavioral, Segment-based
Retention      Retention matrix, retention curves
Revenue        ARPU, cumulative revenue, LTV
Segments       RFM analysis, behavioral segments
Metrics        Churn rate, reactivation rate

Next lesson: Funnel Analysis