Theory
Lesson 12/17

Predictive Analytics Basics

An introduction to forecasting, regression, and classification for Data Analysts


Predictive Analytics and Forecasting

1. Introduction

Predictive Analytics for Analysts

Predictive analytics uses historical data to forecast future outcomes. As a Data Analyst, you need to understand the fundamentals in order to collaborate with Data Scientists and interpret model results.

1.1 Types of Predictions

Text
┌─────────────────────────────────────────────────────────┐
│               Predictive Analytics Types                │
├─────────────────────────────────────────────────────────┤
│                                                         │
│  ┌───────────────────────────────────────────────────┐  │
│  │ 📈 Regression (Continuous)                        │  │
│  │   • Sales forecasting                             │  │
│  │   • Customer lifetime value                       │  │
│  │   • Demand prediction                             │  │
│  │   Output: Number (e.g., $1,234.56)                │  │
│  └───────────────────────────────────────────────────┘  │
│                                                         │
│  ┌───────────────────────────────────────────────────┐  │
│  │ 🏷️ Classification (Categorical)                   │  │
│  │   • Churn prediction (yes/no)                     │  │
│  │   • Fraud detection                               │  │
│  │   • Customer segmentation                         │  │
│  │   Output: Category (e.g., "High Risk")            │  │
│  └───────────────────────────────────────────────────┘  │
│                                                         │
│  ┌───────────────────────────────────────────────────┐  │
│  │ ⏰ Time Series Forecasting                        │  │
│  │   • Next month's revenue                          │  │
│  │   • Inventory needs                               │  │
│  │   • Trend extrapolation                           │  │
│  │   Output: Future values over time                 │  │
│  └───────────────────────────────────────────────────┘  │
│                                                         │
└─────────────────────────────────────────────────────────┘

2. Simple Linear Regression

2.1 Concept and Implementation

Python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Sample data: Marketing spend vs Sales
np.random.seed(42)
n = 100

marketing_spend = np.random.uniform(1000, 10000, n)
noise = np.random.normal(0, 500, n)
sales = 500 + 3.5 * marketing_spend + noise  # True relationship

df = pd.DataFrame({
    'marketing_spend': marketing_spend,
    'sales': sales
})

# Visualize
plt.figure(figsize=(10, 5))
plt.scatter(df['marketing_spend'], df['sales'], alpha=0.5)
plt.xlabel('Marketing Spend ($)')
plt.ylabel('Sales ($)')
plt.title('Marketing Spend vs Sales')
plt.show()

print(df.describe())

2.2 Build and Evaluate the Model

Python
# Prepare data
X = df[['marketing_spend']]  # Features (must be 2D)
y = df['sales']              # Target

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train model
model = LinearRegression()
model.fit(X_train, y_train)

# Coefficients
print(f"Intercept: ${model.intercept_:.2f}")
print(f"Coefficient: ${model.coef_[0]:.2f} per $1 marketing spend")

# Predictions
y_pred = model.predict(X_test)

# Evaluation
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("\nModel Performance:")
print(f"RMSE: ${rmse:.2f}")
print(f"R² Score: {r2:.4f}")
print(f" → Model explains {r2*100:.1f}% of variance")
2.3 Visualize Results

Python
# Plot actual vs predicted
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Regression line (a DataFrame keeps the feature name sklearn saw at fit time)
X_line = pd.DataFrame({
    'marketing_spend': np.linspace(X['marketing_spend'].min(),
                                   X['marketing_spend'].max(), 100)
})
y_line = model.predict(X_line)
axes[0].scatter(X, y, alpha=0.5, label='Actual')
axes[0].plot(X_line, y_line, 'r-', linewidth=2, label='Prediction')
axes[0].set_xlabel('Marketing Spend ($)')
axes[0].set_ylabel('Sales ($)')
axes[0].set_title('Linear Regression Fit')
axes[0].legend()

# Actual vs Predicted
axes[1].scatter(y_test, y_pred, alpha=0.5)
axes[1].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
axes[1].set_xlabel('Actual Sales ($)')
axes[1].set_ylabel('Predicted Sales ($)')
axes[1].set_title('Actual vs Predicted')

plt.tight_layout()
plt.show()

2.4 Make Predictions

Python
# Predict for new values
new_spend = pd.DataFrame({'marketing_spend': [5000, 7500, 10000]})
predictions = model.predict(new_spend)

print("Sales Predictions:")
for spend, pred in zip(new_spend['marketing_spend'], predictions):
    print(f" ${spend:,.0f} spend → ${pred:,.2f} sales (expected)")

3. Multiple Regression

3.1 Multiple Features

Python
# Generate multi-feature data
np.random.seed(42)
n = 200

df = pd.DataFrame({
    'marketing_spend': np.random.uniform(1000, 10000, n),
    'store_traffic': np.random.uniform(500, 5000, n),
    'avg_price': np.random.uniform(20, 100, n),
    'num_promotions': np.random.randint(0, 10, n),
    'is_weekend': np.random.choice([0, 1], n)
})

# Generate sales with all features
df['sales'] = (
    500 +
    2.5 * df['marketing_spend'] +
    1.2 * df['store_traffic'] -
    3.0 * df['avg_price'] +
    200 * df['num_promotions'] +
    1500 * df['is_weekend'] +
    np.random.normal(0, 500, n)
)

print("Features correlation with sales:")
print(df.corr()['sales'].sort_values(ascending=False))

3.2 Train Multi-Feature Model

Python
# Prepare data
features = ['marketing_spend', 'store_traffic', 'avg_price', 'num_promotions', 'is_weekend']
X = df[features]
y = df['sales']

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train
model = LinearRegression()
model.fit(X_train, y_train)

# Coefficients
print("Feature Coefficients:")
for feature, coef in zip(features, model.coef_):
    direction = "↑" if coef > 0 else "↓"
    print(f" {feature}: {coef:.2f} {direction}")

# Evaluate
y_pred = model.predict(X_test)
print(f"\nR² Score: {r2_score(y_test, y_pred):.4f}")
print(f"RMSE: ${np.sqrt(mean_squared_error(y_test, y_pred)):.2f}")

3.3 Feature Importance

Python
# Standardized coefficients for comparison
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)

model_scaled = LinearRegression()
model_scaled.fit(X_scaled, y_train)

# Plot feature importance
importance = pd.DataFrame({
    'feature': features,
    'importance': np.abs(model_scaled.coef_)
}).sort_values('importance', ascending=True)

plt.figure(figsize=(10, 5))
plt.barh(importance['feature'], importance['importance'])
plt.xlabel('Absolute Standardized Coefficient')
plt.title('Feature Importance')
plt.show()
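
Why standardize first? For ordinary least squares, rescaling a feature by its standard deviation rescales its coefficient by the same factor, so a standardized coefficient reads as "change in sales per one standard deviation of the feature", making features on different scales comparable. A quick sanity check against the two models above (note that StandardScaler divides by the population std, ddof=0):

Python
# For OLS, coef_standardized = coef_raw * std(feature)
check = model.coef_ * X_train.std(ddof=0).values
print(np.allclose(check, model_scaled.coef_))  # expected: True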

4. Classification Basics

4.1 Binary Classification (Churn Prediction)

Python
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Generate churn data
np.random.seed(42)
n = 1000

customers = pd.DataFrame({
    'tenure_months': np.random.randint(1, 72, n),
    'monthly_spend': np.random.uniform(20, 200, n),
    'support_tickets': np.random.poisson(2, n),
    'last_login_days': np.random.exponential(30, n),
    'contract_type': np.random.choice(['monthly', 'annual'], n)
})

# Build a raw churn score, then squash it into a probability
churn_score = (
    0.1 +
    -0.01 * customers['tenure_months'] +
    -0.002 * customers['monthly_spend'] +
    0.1 * customers['support_tickets'] +
    0.01 * customers['last_login_days'] +
    0.2 * (customers['contract_type'] == 'monthly')
)
churn_prob = 1 / (1 + np.exp(-churn_score))  # Sigmoid
customers['churned'] = (np.random.random(n) < churn_prob).astype(int)

print(f"Churn Rate: {customers['churned'].mean()*100:.1f}%")
print(customers.head())
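
The sigmoid step is what turns the unbounded score into a valid probability: it maps any real number into (0, 1), with a score of 0 landing at 0.5. A tiny illustration:

Python
scores = np.array([-2.0, 0.0, 2.0])
print(1 / (1 + np.exp(-scores)))  # ≈ [0.12, 0.5, 0.88]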

4.2 Train Classification Model

Python
# Prepare features
customers['is_monthly'] = (customers['contract_type'] == 'monthly').astype(int)
features = ['tenure_months', 'monthly_spend', 'support_tickets', 'last_login_days', 'is_monthly']

X = customers[features]
y = customers['churned']

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train logistic regression (extra iterations help convergence on unscaled features)
clf = LogisticRegression(max_iter=1000, random_state=42)
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]  # Probability of churn

# Evaluation
print("Classification Report:")
print(classification_report(y_test, y_pred))

print(f"\nAccuracy: {accuracy_score(y_test, y_pred)*100:.1f}%")

4.3 Confusion Matrix

Python
from sklearn.metrics import ConfusionMatrixDisplay

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
ConfusionMatrixDisplay(cm, display_labels=['Stay', 'Churn']).plot(ax=axes[0])
axes[0].set_title('Confusion Matrix')

# Probability distribution
axes[1].hist(y_prob[y_test == 0], bins=20, alpha=0.5, label='Stayed')
axes[1].hist(y_prob[y_test == 1], bins=20, alpha=0.5, label='Churned')
axes[1].set_xlabel('Predicted Churn Probability')
axes[1].set_ylabel('Count')
axes[1].set_title('Probability Distribution')
axes[1].legend()

plt.tight_layout()
plt.show()

# Interpretation
tn, fp, fn, tp = cm.ravel()
print("\nMatrix Interpretation:")
print(f" True Negatives (correctly predicted Stay): {tn}")
print(f" False Positives (wrongly predicted Churn): {fp}")
print(f" False Negatives (missed Churns): {fn}")
print(f" True Positives (correctly predicted Churn): {tp}")

4.4 Risk Scoring

Python
# Create risk scores
test_customers = X_test.copy()
test_customers['actual_churn'] = y_test.values
test_customers['churn_probability'] = y_prob
test_customers['risk_score'] = pd.cut(
    y_prob,
    bins=[0, 0.3, 0.6, 1.0],
    labels=['Low', 'Medium', 'High']
)

print("Risk Distribution:")
print(test_customers['risk_score'].value_counts())

# Churn rate by risk segment
risk_analysis = test_customers.groupby('risk_score').agg({
    'actual_churn': ['count', 'sum', 'mean']
}).round(3)
risk_analysis.columns = ['total', 'churned', 'churn_rate']
print("\nChurn Rate by Risk Segment:")
print(risk_analysis)

5. Decision Trees

5.1 Simple Decision Tree

Python
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Train decision tree
tree = DecisionTreeClassifier(max_depth=3, random_state=42)
tree.fit(X_train, y_train)

# Visualize
plt.figure(figsize=(20, 10))
plot_tree(
    tree,
    feature_names=features,
    class_names=['Stay', 'Churn'],
    filled=True,
    rounded=True,
    fontsize=10
)
plt.title('Decision Tree for Churn Prediction')
plt.tight_layout()
plt.savefig('decision_tree.png', dpi=150)
plt.show()

# Evaluate
y_pred_tree = tree.predict(X_test)
print(f"Decision Tree Accuracy: {accuracy_score(y_test, y_pred_tree)*100:.1f}%")

5.2 Feature Importance from Trees

Python
# Feature importance
importance = pd.DataFrame({
    'feature': features,
    'importance': tree.feature_importances_
}).sort_values('importance', ascending=False)

print("Feature Importance (Decision Tree):")
print(importance)

# Visualize
plt.figure(figsize=(10, 5))
plt.barh(importance['feature'], importance['importance'])
plt.xlabel('Importance')
plt.title('Feature Importance from Decision Tree')
plt.gca().invert_yaxis()
plt.show()

6. Simple Forecasting

6.1 Moving Average Forecast

Python
# Generate time series
dates = pd.date_range('2022-01-01', periods=365*2, freq='D')
np.random.seed(42)

trend = np.linspace(1000, 1500, len(dates))
seasonality = 200 * np.sin(2 * np.pi * np.arange(len(dates)) / 365)
noise = np.random.normal(0, 50, len(dates))

sales = trend + seasonality + noise

ts = pd.DataFrame({
    'date': dates,
    'sales': sales
}).set_index('date')

# Simple moving average forecast
ts['MA_7'] = ts['sales'].rolling(7).mean()
ts['MA_30'] = ts['sales'].rolling(30).mean()

# Forecast using last moving average
last_ma30 = ts['MA_30'].iloc[-1]
print(f"30-day MA Forecast: ${last_ma30:.2f}")

# Plot
plt.figure(figsize=(14, 5))
plt.plot(ts.index[-90:], ts['sales'].iloc[-90:], alpha=0.5, label='Actual')
plt.plot(ts.index[-90:], ts['MA_30'].iloc[-90:], label='30-day MA')
plt.axhline(last_ma30, color='r', linestyle='--', label=f'Forecast: ${last_ma30:.0f}')
plt.legend()
plt.title('Sales Forecast using Moving Average')
plt.show()

6.2 Trend Extrapolation

Python
from scipy import stats

# Fit linear trend
x = np.arange(len(ts))
y = ts['sales'].values

slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)

# Extrapolate 30 days
future_x = np.arange(len(ts), len(ts) + 30)
forecast = intercept + slope * future_x

print(f"Trend: ${slope:.2f} per day")
print(f"30-day Forecast Range: ${forecast[0]:.2f} to ${forecast[-1]:.2f}")

# Plot
plt.figure(figsize=(14, 5))
plt.plot(ts.index, ts['sales'], alpha=0.5, label='Historical')
future_dates = pd.date_range(ts.index[-1] + pd.Timedelta(days=1), periods=30)
plt.plot(future_dates, forecast, 'r--', linewidth=2, label='Forecast')
plt.legend()
plt.title('Sales Forecast using Trend Extrapolation')
plt.show()
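
linregress also returns std_err, the standard error of the slope, which gives a rough sense of how much the fitted trend itself could be off. A minimal sketch of a 95% band on the slope (this ignores seasonality and residual noise, so treat it as indicative only):

Python
# Approximate 95% interval on the daily trend
print(f"Slope: {slope:.2f} ± {1.96 * std_err:.2f} per day")

# Optimistic/pessimistic day-30 forecasts from the slope band alone
hi = intercept + (slope + 1.96 * std_err) * future_x[-1]
lo = intercept + (slope - 1.96 * std_err) * future_x[-1]
print(f"Day-30 trend range: ${lo:.2f} to ${hi:.2f}")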

6.3 Seasonal Adjustment

Python
def seasonal_forecast(ts, periods=30):
    """Simple seasonal forecast"""
    ts = ts.copy()  # avoid mutating the caller's DataFrame

    # Calculate monthly averages
    ts['month'] = ts.index.month
    monthly_avg = ts.groupby('month')['sales'].mean()

    # Overall trend
    recent = ts.iloc[-365:]  # Last year
    x = np.arange(len(recent))
    y = recent['sales'].values
    slope, intercept, _, _, _ = stats.linregress(x, y)

    # Generate forecast
    forecast_dates = pd.date_range(ts.index[-1] + pd.Timedelta(days=1), periods=periods)
    forecasts = []

    for i, date in enumerate(forecast_dates):
        # Base from trend
        base = intercept + slope * (len(recent) + i)

        # Seasonal adjustment
        seasonal_factor = monthly_avg[date.month] / ts['sales'].mean()

        forecasts.append({
            'date': date,
            'forecast': base * seasonal_factor
        })

    return pd.DataFrame(forecasts)

forecast_df = seasonal_forecast(ts, 30)
print("Seasonal Forecast:")
print(forecast_df.head(10))

7. Model Evaluation Best Practices

7.1 Cross-Validation

Python
from sklearn.model_selection import cross_val_score

# Regression cross-validation (multi-feature sales data from Section 3)
reg_features = ['marketing_spend', 'store_traffic', 'avg_price', 'num_promotions', 'is_weekend']
X = df[reg_features]
y = df['sales']

model = LinearRegression()
scores = cross_val_score(model, X, y, cv=5, scoring='r2')

print("Cross-Validation Results (R²):")
print(f" Scores: {scores.round(4)}")
print(f" Mean: {scores.mean():.4f}")
print(f" Std: {scores.std():.4f}")

# Classification cross-validation (churn data from Section 4)
clf_features = ['tenure_months', 'monthly_spend', 'support_tickets', 'last_login_days', 'is_monthly']
clf = LogisticRegression(max_iter=1000, random_state=42)
X_clf = customers[clf_features]
y_clf = customers['churned']

scores = cross_val_score(clf, X_clf, y_clf, cv=5, scoring='accuracy')
print(f"\nClassification Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")

7.2 Understanding Metrics

Python
def explain_metrics(y_true, y_pred, y_prob=None):
    """Explain classification metrics"""
    from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

    print("=" * 50)
    print("CLASSIFICATION METRICS EXPLAINED")
    print("=" * 50)

    accuracy = accuracy_score(y_true, y_pred)
    print(f"\n📊 Accuracy: {accuracy:.2%}")
    print(f" What it means: {accuracy:.0%} of all predictions are correct")

    precision = precision_score(y_true, y_pred)
    print(f"\n🎯 Precision: {precision:.2%}")
    print(f" What it means: When we predict Churn, we're right {precision:.0%} of the time")

    recall = recall_score(y_true, y_pred)
    print(f"\n🔍 Recall: {recall:.2%}")
    print(f" What it means: We catch {recall:.0%} of actual Churns")

    f1 = f1_score(y_true, y_pred)
    print(f"\n⚖️ F1 Score: {f1:.2%}")
    print(" What it means: Balanced measure of precision and recall")

    if y_prob is not None:
        auc = roc_auc_score(y_true, y_prob)
        print(f"\n📈 AUC-ROC: {auc:.2%}")
        print(" What it means: Model's ability to distinguish classes (0.5=random, 1.0=perfect)")

    print("\n" + "=" * 50)

explain_metrics(y_test, y_pred, y_prob)

8. Practice

Predictive Analytics Project

Exercise: Build Predictive Model

Python
# Build a churn prediction model:
# 1. Prepare and explore data
# 2. Train multiple models
# 3. Compare performance
# 4. Create actionable insights

# YOUR CODE HERE
💡 View solution
Python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve

# 1. Generate realistic customer data
np.random.seed(42)
n = 2000

customers = pd.DataFrame({
    'customer_id': range(1, n+1),
    'tenure_months': np.random.randint(1, 60, n),
    'monthly_charges': np.random.uniform(20, 150, n),
    'total_charges': np.random.uniform(100, 5000, n),
    'contract_type': np.random.choice(['Month-to-month', 'One year', 'Two year'], n, p=[0.5, 0.3, 0.2]),
    'payment_method': np.random.choice(['Credit card', 'Bank transfer', 'Electronic check'], n),
    'tech_support': np.random.choice([0, 1], n, p=[0.6, 0.4]),
    'online_security': np.random.choice([0, 1], n, p=[0.55, 0.45]),
    'num_services': np.random.randint(1, 6, n)
})

# Create churn
churn_score = (
    -0.03 * customers['tenure_months'] +
    0.01 * customers['monthly_charges'] -
    0.001 * customers['total_charges'] +
    0.8 * (customers['contract_type'] == 'Month-to-month') -
    0.3 * customers['tech_support'] -
    0.2 * customers['online_security'] -
    0.1 * customers['num_services']
)
customers['churned'] = (np.random.random(n) < 1/(1+np.exp(-churn_score))).astype(int)

print("="*50)
print("DATA OVERVIEW")
print("="*50)
print(f"Total customers: {len(customers)}")
print(f"Churn rate: {customers['churned'].mean()*100:.1f}%")

# 2. Feature engineering
customers['is_monthly'] = (customers['contract_type'] == 'Month-to-month').astype(int)
customers['avg_monthly'] = customers['total_charges'] / np.maximum(customers['tenure_months'], 1)

features = ['tenure_months', 'monthly_charges', 'total_charges',
            'is_monthly', 'tech_support', 'online_security', 'num_services', 'avg_monthly']

X = customers[features]
y = customers['churned']

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Train multiple models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(max_depth=5, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
}

print("\n" + "="*50)
print("MODEL COMPARISON")
print("="*50)

results = []
for name, model in models.items():
    # Train
    model.fit(X_train, y_train)

    # Predict
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    cv_scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')

    results.append({
        'Model': name,
        'Accuracy': acc,
        'AUC-ROC': auc,
        'CV AUC (mean)': cv_scores.mean(),
        'CV AUC (std)': cv_scores.std()
    })

    print(f"\n{name}:")
    print(f" Accuracy: {acc:.2%}")
    print(f" AUC-ROC: {auc:.4f}")
    print(f" CV AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

results_df = pd.DataFrame(results)

# 4. Best model analysis
best_model = models['Random Forest']
y_prob = best_model.predict_proba(X_test)[:, 1]

# Feature importance
importance = pd.DataFrame({
    'feature': features,
    'importance': best_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\n" + "="*50)
print("FEATURE IMPORTANCE (Random Forest)")
print("="*50)
for _, row in importance.iterrows():
    bar = "█" * int(row['importance'] * 50)
    print(f"{row['feature']:20s} {bar} {row['importance']:.3f}")

# 5. Actionable insights
print("\n" + "="*50)
print("ACTIONABLE INSIGHTS")
print("="*50)

# Risk segments
test_df = X_test.copy()
test_df['churn_prob'] = y_prob
test_df['actual'] = y_test.values
test_df['risk'] = pd.cut(y_prob, bins=[0, 0.3, 0.6, 1.0], labels=['Low', 'Medium', 'High'])

risk_summary = test_df.groupby('risk').agg({
    'actual': ['count', 'sum', 'mean'],
    'churn_prob': 'mean'
}).round(3)
risk_summary.columns = ['Count', 'Churned', 'Actual_Rate', 'Avg_Prob']

print("\nRisk Segments:")
print(risk_summary)

print("\n💡 RECOMMENDATIONS:")
print("1. Focus retention on HIGH risk segment (highest ROI)")
print("2. Promote annual contracts to monthly customers")
print("3. Encourage tech support adoption")
print("4. Bundle more services to increase stickiness")

# 6. Visualizations
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# ROC Curves
for name, model in models.items():
    y_prob = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    auc = roc_auc_score(y_test, y_prob)
    axes[0, 0].plot(fpr, tpr, label=f'{name} (AUC={auc:.3f})')
axes[0, 0].plot([0, 1], [0, 1], 'k--')
axes[0, 0].set_xlabel('False Positive Rate')
axes[0, 0].set_ylabel('True Positive Rate')
axes[0, 0].set_title('ROC Curves')
axes[0, 0].legend()

# Feature importance
axes[0, 1].barh(importance['feature'], importance['importance'])
axes[0, 1].set_xlabel('Importance')
axes[0, 1].set_title('Feature Importance')
axes[0, 1].invert_yaxis()

# Risk distribution (reindex so the colors line up with Low/Medium/High)
risk_counts = test_df['risk'].value_counts().reindex(['Low', 'Medium', 'High'])
axes[1, 0].bar(risk_counts.index, risk_counts.values, color=['green', 'orange', 'red'])
axes[1, 0].set_xlabel('Risk Level')
axes[1, 0].set_ylabel('Customers')
axes[1, 0].set_title('Risk Distribution')

# Churn by tenure
tenure_churn = customers.groupby(pd.cut(customers['tenure_months'], bins=5))['churned'].mean()
axes[1, 1].bar(range(len(tenure_churn)), tenure_churn.values)
axes[1, 1].set_xticks(range(len(tenure_churn)))
axes[1, 1].set_xticklabels([str(x) for x in tenure_churn.index], rotation=45, ha='right')
axes[1, 1].set_xlabel('Tenure (months)')
axes[1, 1].set_ylabel('Churn Rate')
axes[1, 1].set_title('Churn Rate by Tenure')

plt.tight_layout()
plt.savefig('churn_analysis.png', dpi=150)
plt.show()

print("\n✅ Analysis complete!")

9. Summary

Topic          | Key Concepts
---------------|----------------------------------------------------
Regression     | Predict continuous values, coefficients, R²
Classification | Predict categories, probability, precision/recall
Decision Trees | Visual rules, feature importance
Forecasting    | Moving average, trend extrapolation
Evaluation     | Cross-validation, confusion matrix, AUC

Next lesson: Data Storytelling