Theory
Lesson 10/17

Cohort Analysis

Analyzing customer behavior over time and by cohort

Customer Cohort Analysis

1. Introduction

What is Cohort Analysis?

Cohort Analysis groups users by a shared characteristic (typically their acquisition time) and tracks their behavior over time. It is a powerful tool for understanding retention, engagement, and customer lifetime value.

1.1 Cohort Types

Types of Cohorts

📅 Time-based Cohorts
  • Acquisition cohort: Month/week signed up
  • First purchase cohort: First order month
  • Activation cohort: Completed onboarding

📊 Behavior-based Cohorts
  • Power users vs casual users
  • High spenders vs low spenders
  • Channel cohorts (organic vs paid)

🎯 Segment-based Cohorts
  • Product category
  • Geographic region
  • Customer tier (premium vs free)
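
In practice, each of these cohort types is just a label you attach to every customer and then group by. Below is a minimal sketch of all three on a toy frame; the column names are illustrative assumptions, not the dataset built in the next section.

Python
import pandas as pd

toy = pd.DataFrame({
    'customer_id': [1, 2, 3],
    'signup_date': pd.to_datetime(['2024-01-15', '2024-02-03', '2024-02-20']),
    'total_spent': [500.0, 80.0, 1200.0],
    'channel': ['Organic', 'Paid Search', 'Organic'],
})

# Time-based: acquisition month
toy['acquisition_cohort'] = toy['signup_date'].dt.to_period('M')

# Behavior-based: spend tier
toy['spend_cohort'] = pd.cut(toy['total_spent'], bins=[0, 100, 1000, float('inf')],
                             labels=['Low', 'Mid', 'High'])

# Segment-based: the acquisition channel is already a cohort label
print(toy[['customer_id', 'acquisition_cohort', 'spend_cohort', 'channel']])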

2. Data Preparation

2.1 Create Sample Data

Python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Generate sample transaction data
np.random.seed(42)

n_customers = 5000
n_transactions = 50000

# Customer signup dates (spread over 12 months)
customers = pd.DataFrame({
    'customer_id': range(1, n_customers + 1),
    'signup_date': pd.to_datetime('2024-01-01') + pd.to_timedelta(
        np.random.randint(0, 365, n_customers), unit='D'
    )
})

# Generate transactions
transactions = pd.DataFrame({
    'transaction_id': range(1, n_transactions + 1),
    'customer_id': np.random.randint(1, n_customers + 1, n_transactions),
    'transaction_date': pd.to_datetime('2024-01-01') + pd.to_timedelta(
        np.random.randint(0, 400, n_transactions), unit='D'
    ),
    'amount': np.random.lognormal(4, 1, n_transactions)
})

# Merge to get signup date
transactions = transactions.merge(customers, on='customer_id')

# Only keep transactions after signup
transactions = transactions[
    transactions['transaction_date'] >= transactions['signup_date']
]

print(f"Customers: {len(customers)}")
print(f"Transactions: {len(transactions)}")
print(transactions.head())

2.2 Calculate Cohort Fields

Python
def assign_cohorts(df, signup_col, activity_col, freq='M'):
    """Assign cohort and period to each row"""
    df = df.copy()

    # Cohort: month of signup
    df['cohort'] = df[signup_col].dt.to_period(freq)

    # Activity period
    df['activity_period'] = df[activity_col].dt.to_period(freq)

    # Period number (0, 1, 2, ...): whole periods between cohort and activity
    # (Period subtraction returns an offset; .n is the number of periods)
    df['period_number'] = (
        df['activity_period'] - df['cohort']
    ).apply(lambda offset: offset.n)

    return df


transactions = assign_cohorts(
    transactions,
    'signup_date',
    'transaction_date',
    freq='M'
)

print(transactions[['customer_id', 'cohort', 'activity_period', 'period_number']].head(10))
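
As a quick sanity check on the period arithmetic (a small sketch using only the columns created above), any transaction made in the signup month should land in period 0:

Python
# Sanity check: transactions made in the signup month must have period_number == 0
same_month = transactions['activity_period'] == transactions['cohort']
assert (transactions.loc[same_month, 'period_number'] == 0).all()
print(f"Share of transactions in the signup month: {same_month.mean():.1%}")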

3. Retention Analysis

3.1 Build Retention Matrix

Python
def build_retention_matrix(df, value_col='customer_id', agg_func='nunique'):
    """Build cohort retention matrix"""

    # Aggregate by cohort and period
    cohort_data = df.groupby(['cohort', 'period_number']).agg({
        value_col: agg_func
    }).reset_index()

    # Pivot to matrix
    cohort_matrix = cohort_data.pivot(
        index='cohort',
        columns='period_number',
        values=value_col
    )

    return cohort_matrix


# Active users matrix
retention_counts = build_retention_matrix(transactions)
print("Retention Counts:")
print(retention_counts.iloc[:6, :7])

# Calculate retention rate (period 0 = cohort size)
cohort_sizes = retention_counts.iloc[:, 0]
retention_rate = retention_counts.divide(cohort_sizes, axis=0) * 100

print("\nRetention Rate (%):")
print(retention_rate.iloc[:6, :7].round(1))
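
A convenient one-line summary of this matrix is the column-wise mean, i.e. average retention per period across cohorts; keep in mind that later periods are observed for fewer cohorts, so the tail is noisier. A minimal sketch using the retention_rate frame from above:

Python
# Average retention per period across cohorts (later periods cover fewer cohorts)
avg_retention_by_period = retention_rate.mean()
print("Average retention by period (%):")
print(avg_retention_by_period.head(7).round(1))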

3.2 Visualize Retention Heatmap

Python
def plot_retention_heatmap(retention_matrix, title='Retention Rate (%)'):
    """Plot retention heatmap"""
    plt.figure(figsize=(14, 10))

    # String annotations with a '%' suffix (unobserved cells stay blank)
    annot = retention_matrix.round(1).astype(str) + '%'
    annot = annot.mask(retention_matrix.isna(), '')

    sns.heatmap(
        retention_matrix,
        annot=annot,
        fmt='',
        cmap='YlGnBu',
        vmin=0,
        vmax=100,
        linewidths=0.5,
        cbar_kws={'label': 'Retention Rate (%)'}
    )

    plt.title(title, fontsize=14)
    plt.xlabel('Months Since Signup')
    plt.ylabel('Cohort (Signup Month)')
    plt.tight_layout()
    plt.savefig('retention_heatmap.png', dpi=150)
    plt.show()


plot_retention_heatmap(retention_rate.iloc[:12, :12])

3.3 Retention Curves

Python
def plot_retention_curves(retention_matrix, top_n=6):
    """Plot retention curves for multiple cohorts"""
    plt.figure(figsize=(12, 6))

    for cohort in retention_matrix.index[:top_n]:
        values = retention_matrix.loc[cohort].dropna()
        plt.plot(values.index, values.values, marker='o', label=str(cohort))

    plt.xlabel('Months Since Acquisition')
    plt.ylabel('Retention Rate (%)')
    plt.title('Retention Curves by Cohort')
    plt.legend(title='Cohort')
    plt.grid(True, alpha=0.3)
    plt.ylim(0, 105)
    plt.savefig('retention_curves.png', dpi=150)
    plt.show()


plot_retention_curves(retention_rate)

4. Revenue Cohorts

4.1 Revenue per Cohort

Python
# Revenue matrix
revenue_matrix = build_retention_matrix(
    transactions,
    value_col='amount',
    agg_func='sum'
)

print("Total Revenue by Cohort and Period:")
print(revenue_matrix.iloc[:6, :7].round(0))

# Cumulative revenue
cumulative_revenue = revenue_matrix.cumsum(axis=1)
print("\nCumulative Revenue:")
print(cumulative_revenue.iloc[:6, :7].round(0))
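
Analogous to the user retention rate, you can normalize each cohort's revenue by its period-0 revenue to see how revenue retention decays. This is a small extra view (not part of the original walkthrough) built from revenue_matrix:

Python
# Revenue retention: each period's revenue as a % of the cohort's period-0 revenue
first_period_revenue = revenue_matrix.iloc[:, 0]
revenue_retention = revenue_matrix.divide(first_period_revenue, axis=0) * 100
print("Revenue Retention (%):")
print(revenue_retention.iloc[:6, :7].round(1))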

4.2 Average Revenue per User (ARPU)

Python
# ARPU = Total Revenue / Total Users in Cohort
cohort_users = transactions.groupby('cohort')['customer_id'].nunique()

arpu_matrix = revenue_matrix.divide(cohort_users, axis=0)
print("ARPU by Cohort and Period:")
print(arpu_matrix.iloc[:6, :7].round(2))

# Cumulative ARPU
cum_arpu = arpu_matrix.cumsum(axis=1)

# Plot ARPU trends
plt.figure(figsize=(12, 6))
for cohort in cum_arpu.index[:6]:
    values = cum_arpu.loc[cohort].dropna()
    plt.plot(values.index, values.values, marker='o', label=str(cohort))

plt.xlabel('Months Since Acquisition')
plt.ylabel('Cumulative ARPU ($)')
plt.title('Cumulative ARPU by Cohort')
plt.legend(title='Cohort')
plt.grid(True, alpha=0.3)
plt.show()

4.3 Customer Lifetime Value (LTV)

Python
def calculate_ltv_by_cohort(df, periods=12):
    """Calculate LTV for each cohort"""
    results = []

    for cohort in df['cohort'].unique():
        cohort_data = df[df['cohort'] == cohort]
        cohort_users = cohort_data['customer_id'].nunique()

        # Revenue for first N periods
        period_revenue = cohort_data[
            cohort_data['period_number'] < periods
        ].groupby('period_number')['amount'].sum()

        total_revenue = period_revenue.sum()
        ltv = total_revenue / cohort_users

        results.append({
            'cohort': cohort,
            'users': cohort_users,
            'total_revenue': total_revenue,
            'ltv': ltv
        })

    return pd.DataFrame(results)


ltv_df = calculate_ltv_by_cohort(transactions, periods=12)
print("LTV by Cohort:")
print(ltv_df.sort_values('cohort').head(10))

# Plot LTV trend (sorted so the x-axis is chronological)
ltv_sorted = ltv_df.sort_values('cohort')
plt.figure(figsize=(12, 5))
plt.bar(ltv_sorted['cohort'].astype(str), ltv_sorted['ltv'])
plt.xlabel('Cohort')
plt.ylabel('12-Month LTV ($)')
plt.title('Customer Lifetime Value by Acquisition Cohort')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
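
One caveat: cohorts acquired less than 12 months before the end of the data have not had a full 12 months to spend, so their 12-month LTV is understated. A hedged sketch that keeps only mature cohorts (the cutoff logic assumes monthly cohorts as above):

Python
# Keep only cohorts observed for at least 12 full months (month 0 through month 11)
last_period = transactions['activity_period'].max()
mature_ltv = ltv_df[ltv_df['cohort'] <= last_period - 11].sort_values('cohort')
print("12-month LTV, mature cohorts only:")
print(mature_ltv)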

5. Behavioral Cohorts

5.1 Purchase Frequency Cohorts

Python
# Calculate purchase frequency per customer
customer_freq = transactions.groupby('customer_id').agg({
    'transaction_id': 'count',
    'amount': 'sum',
    'signup_date': 'first'
}).rename(columns={'transaction_id': 'frequency', 'amount': 'total_spent'})

# Create frequency segments
customer_freq['freq_segment'] = pd.cut(
    customer_freq['frequency'],
    bins=[0, 1, 3, 10, float('inf')],
    labels=['One-time', 'Occasional', 'Regular', 'Power User']
)

print("Customer Segments:")
print(customer_freq['freq_segment'].value_counts())

# Analyze by segment
segment_stats = customer_freq.groupby('freq_segment').agg({
    'frequency': ['count', 'mean'],
    'total_spent': ['mean', 'sum']
})
print("\nSegment Statistics:")
print(segment_stats)
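
To see whether this segment mix shifts across acquisition cohorts, you can cross-tabulate the two. The sketch below reuses the signup_date column kept in customer_freq and is an illustrative follow-up rather than part of the original exercise:

Python
# Segment mix by acquisition cohort (row-normalized %)
customer_freq['cohort'] = customer_freq['signup_date'].dt.to_period('M')
segment_mix = pd.crosstab(
    customer_freq['cohort'], customer_freq['freq_segment'], normalize='index'
) * 100
print("Segment mix by cohort (%):")
print(segment_mix.round(1).head(6))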

5.2 RFM-based Cohorts

Python
# Calculate RFM metrics
reference_date = transactions['transaction_date'].max() + pd.Timedelta(days=1)

rfm = transactions.groupby('customer_id').agg({
    'transaction_date': lambda x: (reference_date - x.max()).days,  # Recency
    'transaction_id': 'count',                                      # Frequency
    'amount': 'sum'                                                 # Monetary
}).rename(columns={
    'transaction_date': 'recency',
    'transaction_id': 'frequency',
    'amount': 'monetary'
})

# Create RFM scores (1-5)
rfm['R_score'] = pd.qcut(rfm['recency'], 5, labels=[5, 4, 3, 2, 1])
rfm['F_score'] = pd.qcut(rfm['frequency'].rank(method='first'), 5, labels=[1, 2, 3, 4, 5])
rfm['M_score'] = pd.qcut(rfm['monetary'].rank(method='first'), 5, labels=[1, 2, 3, 4, 5])

# RFM segment
rfm['RFM_score'] = (rfm['R_score'].astype(str) +
                    rfm['F_score'].astype(str) +
                    rfm['M_score'].astype(str))

# Simplified segments
def rfm_segment(row):
    if row['R_score'] >= 4 and row['F_score'] >= 4:
        return 'Champions'
    elif row['R_score'] >= 4 and row['F_score'] < 3:
        return 'New Customers'
    elif row['R_score'] < 3 and row['F_score'] >= 4:
        return 'At Risk'
    elif row['R_score'] < 2:
        return 'Lost'
    else:
        return 'Regular'

rfm['segment'] = rfm.apply(rfm_segment, axis=1)

print("RFM Segments:")
print(rfm['segment'].value_counts())

5.3 Channel Cohorts

Python
# Add acquisition channel to customers
channels = ['Organic', 'Paid Search', 'Social', 'Email', 'Referral']
customers['channel'] = np.random.choice(channels, len(customers), p=[0.3, 0.25, 0.2, 0.15, 0.1])

# Merge channel to transactions
transactions = transactions.merge(
    customers[['customer_id', 'channel']],
    on='customer_id',
    how='left'
)

# Analyze by channel
channel_analysis = transactions.groupby('channel').agg({
    'customer_id': 'nunique',
    'transaction_id': 'count',
    'amount': ['sum', 'mean']
}).round(2)
channel_analysis.columns = ['customers', 'transactions', 'total_revenue', 'avg_order']

print("Channel Performance:")
print(channel_analysis)

# Retention by channel
def channel_retention(df, channel):
    """Calculate retention for a specific channel"""
    channel_data = df[df['channel'] == channel]
    return build_retention_matrix(channel_data)

# Compare channels
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for i, channel in enumerate(channels):
    channel_ret = channel_retention(transactions, channel)
    cohort_sizes = channel_ret.iloc[:, 0]
    channel_rate = channel_ret.divide(cohort_sizes, axis=0) * 100

    # Average retention for the first 7 periods
    avg_retention = channel_rate.mean().iloc[:7]
    axes[i].bar(range(len(avg_retention)), avg_retention.values)
    axes[i].set_title(f'{channel}')
    axes[i].set_xlabel('Period')
    axes[i].set_ylabel('Retention %')
    axes[i].set_ylim(0, 100)

# Only 5 channels: hide the unused sixth subplot
axes[5].axis('off')
plt.suptitle('Average Retention Rate by Channel', y=1.02)
plt.tight_layout()
plt.show()

6. Cohort Analysis SQL

6.1 Basic Retention Query

SQL
-- Step 1: Assign cohorts
WITH customer_cohorts AS (
    SELECT
        customer_id,
        DATE_TRUNC('month', MIN(order_date)) AS cohort_month
    FROM orders
    GROUP BY customer_id
),

-- Step 2: Join transactions with cohorts
transactions_with_cohort AS (
    SELECT
        o.customer_id,
        c.cohort_month,
        DATE_TRUNC('month', o.order_date) AS activity_month,
        -- Period number
        EXTRACT(YEAR FROM o.order_date) * 12 + EXTRACT(MONTH FROM o.order_date) -
        EXTRACT(YEAR FROM c.cohort_month) * 12 - EXTRACT(MONTH FROM c.cohort_month) AS period_number
    FROM orders o
    JOIN customer_cohorts c ON o.customer_id = c.customer_id
),

-- Step 3: Count active users per cohort and period
cohort_data AS (
    SELECT
        cohort_month,
        period_number,
        COUNT(DISTINCT customer_id) AS active_users
    FROM transactions_with_cohort
    GROUP BY cohort_month, period_number
),

-- Step 4: Get cohort sizes
cohort_sizes AS (
    SELECT
        cohort_month,
        active_users AS cohort_size
    FROM cohort_data
    WHERE period_number = 0
)

-- Step 5: Calculate retention rate
SELECT
    cd.cohort_month,
    cd.period_number,
    cd.active_users,
    cs.cohort_size,
    ROUND(cd.active_users * 100.0 / cs.cohort_size, 2) AS retention_rate
FROM cohort_data cd
JOIN cohort_sizes cs ON cd.cohort_month = cs.cohort_month
ORDER BY cd.cohort_month, cd.period_number;
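
On PostgreSQL, the same period_number can also be written with AGE(), which some find easier to read than the year/month arithmetic. A sketch assuming the same customer_cohorts CTE and month-truncated dates:

SQL
-- PostgreSQL alternative: month difference via AGE()
SELECT
    o.customer_id,
    c.cohort_month,
    EXTRACT(YEAR FROM AGE(DATE_TRUNC('month', o.order_date), c.cohort_month)) * 12 +
    EXTRACT(MONTH FROM AGE(DATE_TRUNC('month', o.order_date), c.cohort_month)) AS period_number
FROM orders o
JOIN customer_cohorts c ON o.customer_id = c.customer_id;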

6.2 Revenue Cohort Query

SQL
WITH customer_cohorts AS (
    SELECT
        customer_id,
        DATE_TRUNC('month', MIN(order_date)) AS cohort_month
    FROM orders
    GROUP BY customer_id
),

cohort_revenue AS (
    SELECT
        c.cohort_month,
        EXTRACT(YEAR FROM o.order_date) * 12 + EXTRACT(MONTH FROM o.order_date) -
        EXTRACT(YEAR FROM c.cohort_month) * 12 - EXTRACT(MONTH FROM c.cohort_month) AS period,
        COUNT(DISTINCT o.customer_id) AS customers,
        SUM(o.amount) AS revenue
    FROM orders o
    JOIN customer_cohorts c ON o.customer_id = c.customer_id
    GROUP BY c.cohort_month, period
)

SELECT
    cohort_month,
    period,
    customers,
    revenue,
    ROUND(revenue / customers, 2) AS arpu,
    SUM(revenue) OVER (
        PARTITION BY cohort_month
        ORDER BY period
    ) AS cumulative_revenue
FROM cohort_revenue
WHERE period <= 12
ORDER BY cohort_month, period;

6.3 Pivot Retention Matrix

SQL
-- PostgreSQL: Create pivot table
WITH retention_data AS (
    -- ... previous CTEs to get retention rates
    SELECT cohort_month, period_number, retention_rate
    FROM cohort_analysis
)
SELECT
    cohort_month,
    MAX(CASE WHEN period_number = 0 THEN retention_rate END) AS m0,
    MAX(CASE WHEN period_number = 1 THEN retention_rate END) AS m1,
    MAX(CASE WHEN period_number = 2 THEN retention_rate END) AS m2,
    MAX(CASE WHEN period_number = 3 THEN retention_rate END) AS m3,
    MAX(CASE WHEN period_number = 4 THEN retention_rate END) AS m4,
    MAX(CASE WHEN period_number = 5 THEN retention_rate END) AS m5,
    MAX(CASE WHEN period_number = 6 THEN retention_rate END) AS m6
FROM retention_data
GROUP BY cohort_month
ORDER BY cohort_month;
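
On PostgreSQL 9.4+ the same pivot can be written with the FILTER clause instead of CASE, which reads a bit more directly; a sketch assuming the same retention_data CTE:

SQL
-- Same pivot using FILTER instead of CASE
SELECT
    cohort_month,
    MAX(retention_rate) FILTER (WHERE period_number = 0) AS m0,
    MAX(retention_rate) FILTER (WHERE period_number = 1) AS m1,
    MAX(retention_rate) FILTER (WHERE period_number = 2) AS m2,
    MAX(retention_rate) FILTER (WHERE period_number = 3) AS m3
FROM retention_data
GROUP BY cohort_month
ORDER BY cohort_month;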

7. Advanced Metrics

7.1 Churn Analysis

Python
def calculate_churn(df, churn_days=90):
    """Calculate churn rate by cohort"""
    reference_date = df['transaction_date'].max()

    # Last activity per customer
    last_activity = df.groupby('customer_id').agg({
        'transaction_date': 'max',
        'cohort': 'first'
    })

    # Days since last activity
    last_activity['days_inactive'] = (reference_date - last_activity['transaction_date']).dt.days
    last_activity['is_churned'] = last_activity['days_inactive'] > churn_days

    # Churn rate by cohort
    churn_by_cohort = last_activity.groupby('cohort').agg({
        'is_churned': ['sum', 'count', 'mean']
    })
    churn_by_cohort.columns = ['churned', 'total', 'churn_rate']
    churn_by_cohort['churn_rate'] = (churn_by_cohort['churn_rate'] * 100).round(2)

    return churn_by_cohort


churn_df = calculate_churn(transactions, churn_days=60)
print("Churn Rate by Cohort:")
print(churn_df)
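
Note that cohorts acquired less than churn_days before the reference date cannot have exceeded the inactivity window yet, so their churn looks artificially low. A small sketch that drops those young cohorts before reporting (using the 60-day window from the call above):

Python
# Roughly exclude cohorts whose signup month is within the churn window
reference_date = transactions['transaction_date'].max()
mature_cutoff = (reference_date - pd.Timedelta(days=60)).to_period('M')
print(churn_df[churn_df.index <= mature_cutoff])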

7.2 Reactivation Rate

Python
def analyze_reactivation(df, dormant_days=60):
    """Analyze customer reactivation"""
    # Sort by customer and date
    df_sorted = df.sort_values(['customer_id', 'transaction_date'])

    # Calculate days between consecutive transactions per customer
    df_sorted['prev_transaction'] = df_sorted.groupby('customer_id')['transaction_date'].shift()
    df_sorted['days_gap'] = (df_sorted['transaction_date'] - df_sorted['prev_transaction']).dt.days

    # Identify reactivations (gap > dormant_days)
    df_sorted['is_reactivation'] = df_sorted['days_gap'] > dormant_days

    # Reactivation by month
    reactivated = df_sorted[df_sorted['is_reactivation']]
    reactivations = reactivated.groupby(
        reactivated['transaction_date'].dt.to_period('M')
    ).agg({
        'customer_id': 'nunique',
        'amount': 'sum'
    }).rename(columns={
        'customer_id': 'reactivated_customers',
        'amount': 'reactivation_revenue'
    })

    return reactivations


reactivation_df = analyze_reactivation(transactions)
print("Reactivation Analysis:")
print(reactivation_df.tail(10))

7.3 Cohort Comparison

Python
def compare_cohorts(df, cohort1, cohort2, metric='retention'):
    """Statistical comparison between two cohorts"""
    from scipy import stats

    c1_data = df[df['cohort'] == cohort1]
    c2_data = df[df['cohort'] == cohort2]

    if metric == 'retention':
        # Compare retention at period 3: per customer, 1 if active in period 3 else 0
        def retained_at_period(cohort_data, period=3):
            active = set(cohort_data.loc[cohort_data['period_number'] == period, 'customer_id'])
            all_customers = cohort_data['customer_id'].unique()
            return pd.Series([1 if c in active else 0 for c in all_customers])

        c1_vals = retained_at_period(c1_data)
        c2_vals = retained_at_period(c2_data)
        label = 'retention at month 3'
    else:  # metric == 'revenue'
        # Compare total spend per customer
        c1_vals = c1_data.groupby('customer_id')['amount'].sum()
        c2_vals = c2_data.groupby('customer_id')['amount'].sum()
        label = 'revenue per customer'

    # Two-sample t-test
    t_stat, p_value = stats.ttest_ind(c1_vals, c2_vals)

    print(f"\nCohort Comparison ({label}): {cohort1} vs {cohort2}")
    print(f"Mean - {cohort1}: {c1_vals.mean():.2f}")
    print(f"Mean - {cohort2}: {c2_vals.mean():.2f}")
    print(f"T-statistic: {t_stat:.4f}")
    print(f"P-value: {p_value:.4f}")

    if p_value < 0.05:
        print("✓ Statistically significant difference")
    else:
        print("✗ No significant difference")


# Compare two cohorts
compare_cohorts(transactions, '2024-01', '2024-06', metric='revenue')
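
Because per-customer revenue here is lognormal and heavily skewed, a rank-based test is a reasonable companion to the t-test. A minimal sketch with scipy's Mann-Whitney U on the same two cohorts:

Python
from scipy import stats

# Rank-based alternative to the t-test for heavily skewed revenue
c1 = transactions[transactions['cohort'] == '2024-01'].groupby('customer_id')['amount'].sum()
c2 = transactions[transactions['cohort'] == '2024-06'].groupby('customer_id')['amount'].sum()
u_stat, p_value = stats.mannwhitneyu(c1, c2, alternative='two-sided')
print(f"Mann-Whitney U: {u_stat:.1f}, p-value: {p_value:.4f}")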

8. Practice

Cohort Analysis Project

Exercise: Complete Cohort Analysis

Python
# Build a comprehensive cohort analysis:
# 1. Create retention matrix
# 2. Calculate revenue cohorts
# 3. Build LTV analysis
# 4. Create behavioral segments
# 5. Generate executive summary

# YOUR CODE HERE
💡 View solution
Python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

class CohortAnalyzer:
    def __init__(self, transactions_df, signup_col='signup_date',
                 activity_col='transaction_date', customer_col='customer_id'):
        self.df = transactions_df.copy()
        self.signup_col = signup_col
        self.activity_col = activity_col
        self.customer_col = customer_col
        self._prepare_data()

    def _prepare_data(self):
        """Prepare cohort fields"""
        self.df['cohort'] = self.df[self.signup_col].dt.to_period('M')
        self.df['activity_period'] = self.df[self.activity_col].dt.to_period('M')
        # Period subtraction returns an offset; .n is the number of whole months
        self.df['period_number'] = (
            self.df['activity_period'] - self.df['cohort']
        ).apply(lambda offset: offset.n)

    def retention_matrix(self):
        """Build retention matrix (% of each cohort active per period)"""
        matrix = self.df.groupby(['cohort', 'period_number'])[self.customer_col].nunique().unstack()
        cohort_sizes = matrix.iloc[:, 0]
        return matrix.divide(cohort_sizes, axis=0) * 100

    def revenue_matrix(self):
        """Build revenue matrix"""
        return self.df.groupby(['cohort', 'period_number'])['amount'].sum().unstack()

    def arpu_matrix(self):
        """Build ARPU matrix"""
        revenue = self.revenue_matrix()
        users = self.df.groupby(['cohort', 'period_number'])[self.customer_col].nunique().unstack()
        return revenue / users

    def ltv_by_cohort(self, periods=12):
        """Calculate LTV for each cohort"""
        results = []
        for cohort in self.df['cohort'].unique():
            cohort_data = self.df[self.df['cohort'] == cohort]
            cohort_users = cohort_data[self.customer_col].nunique()
            revenue = cohort_data[cohort_data['period_number'] < periods]['amount'].sum()
            results.append({
                'cohort': cohort,
                'users': cohort_users,
                'revenue': revenue,
                'ltv': revenue / cohort_users
            })
        return pd.DataFrame(results).sort_values('cohort')

    def segment_customers(self):
        """Create RFM segments"""
        ref_date = self.df[self.activity_col].max() + pd.Timedelta(days=1)

        rfm = self.df.groupby(self.customer_col).agg({
            self.activity_col: lambda x: (ref_date - x.max()).days,
            'transaction_id': 'count',
            'amount': 'sum'
        }).rename(columns={
            self.activity_col: 'recency',
            'transaction_id': 'frequency',
            'amount': 'monetary'
        })

        # Scores (quartiles, 4 = best)
        rfm['R'] = pd.qcut(rfm['recency'], 4, labels=[4, 3, 2, 1])
        rfm['F'] = pd.qcut(rfm['frequency'].rank(method='first'), 4, labels=[1, 2, 3, 4])
        rfm['M'] = pd.qcut(rfm['monetary'].rank(method='first'), 4, labels=[1, 2, 3, 4])

        # Segment
        def segment(row):
            r, f = int(row['R']), int(row['F'])
            if r >= 3 and f >= 3: return 'Champions'
            elif r >= 3 and f < 3: return 'New/Promising'
            elif r < 3 and f >= 3: return 'At Risk'
            elif r < 2: return 'Lost'
            else: return 'Regular'

        rfm['segment'] = rfm.apply(segment, axis=1)
        return rfm

    def plot_retention_heatmap(self, max_periods=12):
        """Plot retention heatmap"""
        retention = self.retention_matrix().iloc[:12, :max_periods]

        plt.figure(figsize=(14, 8))
        sns.heatmap(
            retention,
            annot=True,
            fmt='.1f',
            cmap='YlGnBu',
            vmin=0,
            vmax=100
        )
        plt.title('Retention Rate by Cohort (%)')
        plt.xlabel('Months Since Signup')
        plt.ylabel('Cohort')
        plt.tight_layout()
        plt.savefig('cohort_retention.png', dpi=150)
        plt.show()

    def generate_report(self):
        """Generate executive summary"""
        retention = self.retention_matrix()
        ltv = self.ltv_by_cohort()
        segments = self.segment_customers()

        print("=" * 60)
        print("COHORT ANALYSIS EXECUTIVE SUMMARY")
        print("=" * 60)

        # Overall metrics
        print("\n📊 OVERALL METRICS")
        print(f"Total Customers: {self.df[self.customer_col].nunique():,}")
        print(f"Total Transactions: {len(self.df):,}")
        print(f"Total Revenue: ${self.df['amount'].sum():,.2f}")

        # Retention summary
        print("\n📈 RETENTION SUMMARY")
        avg_m1 = retention[1].mean() if 1 in retention.columns else 0
        avg_m3 = retention[3].mean() if 3 in retention.columns else 0
        avg_m6 = retention[6].mean() if 6 in retention.columns else 0
        print(f"Month 1 Avg Retention: {avg_m1:.1f}%")
        print(f"Month 3 Avg Retention: {avg_m3:.1f}%")
        print(f"Month 6 Avg Retention: {avg_m6:.1f}%")

        # LTV summary
        print("\n💰 LTV SUMMARY")
        print(f"Average 12-Month LTV: ${ltv['ltv'].mean():.2f}")
        print(f"Best Cohort: {ltv.loc[ltv['ltv'].idxmax(), 'cohort']} (${ltv['ltv'].max():.2f})")
        print(f"Worst Cohort: {ltv.loc[ltv['ltv'].idxmin(), 'cohort']} (${ltv['ltv'].min():.2f})")

        # Segment summary
        print("\n👥 CUSTOMER SEGMENTS")
        segment_counts = segments['segment'].value_counts()
        for seg, count in segment_counts.items():
            pct = count / len(segments) * 100
            print(f"  {seg}: {count:,} ({pct:.1f}%)")

        print("\n" + "=" * 60)

# Generate test data
np.random.seed(42)
n_cust, n_trans = 3000, 30000

customers = pd.DataFrame({
    'customer_id': range(1, n_cust + 1),
    'signup_date': pd.to_datetime('2024-01-01') + pd.to_timedelta(
        np.random.randint(0, 300, n_cust), unit='D'
    )
})

transactions = pd.DataFrame({
    'transaction_id': range(1, n_trans + 1),
    'customer_id': np.random.randint(1, n_cust + 1, n_trans),
    'transaction_date': pd.to_datetime('2024-01-01') + pd.to_timedelta(
        np.random.randint(0, 365, n_trans), unit='D'
    ),
    'amount': np.random.lognormal(4, 1, n_trans)
})

transactions = transactions.merge(customers, on='customer_id')
transactions = transactions[transactions['transaction_date'] >= transactions['signup_date']]

# Run analysis
analyzer = CohortAnalyzer(transactions)

# Generate visualizations
analyzer.plot_retention_heatmap()

# Generate report
analyzer.generate_report()

9. Summary

Topic          Key Concepts
Cohort Types   Time-based, Behavioral, Segment-based
Retention      Retention matrix, retention curves
Revenue        ARPU, cumulative revenue, LTV
Segments       RFM analysis, behavioral segments
Metrics        Churn rate, reactivation rate

Next lesson: Funnel Analysis