Exploratory Data Analysis (EDA)

1. EDA là gì?

Exploratory Data Analysis (EDA) là quá trình khám phá dữ liệu để:

Hiểu cấu trúc và đặc điểm dữ liệu
Phát hiện patterns và anomalies
Kiểm tra assumptions
Tìm insights cho modeling

Quy trình EDA

1. Load Data

2. Data Overview

3. Data Quality

4. Univariate Analysis

5. Bivariate Analysis

6. Multivariate Analysis

7. Insights & Conclusions

2. Data Overview

2.1 Load và xem dữ liệu

Python

1import pandas as pd
2import numpy as np
3import matplotlib.pyplot as plt
4import seaborn as sns
5
6# Load the dataset from "data.csv" (path is relative to the working directory) into a DataFrame
7df = pd.read_csv("data.csv")
8
9# Basic info: dimensions and column names
10print(f"Shape: {df.shape}")           # (rows, columns)
11print(f"Columns: {df.columns.tolist()}")
12
13# First/last rows; these bare expressions only render when evaluated interactively (e.g. one per notebook cell)
14df.head()
15df.tail()
16df.sample(5)  # Random 5 rows
17
18# Data types
19df.dtypes
20df.info()  # prints its report directly (returns None), so it shows up even in a plain script

2.2 Statistical Summary

Python

1# Numeric summary: by default describe() covers only the numeric columns of a mixed frame
2df.describe()
3
4# Include all columns, whatever their dtype
5df.describe(include='all')
6
7# Specific percentiles, extended to the tails (1% / 5% / 95% / 99%) alongside the quartiles
8df.describe(percentiles=[0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99])
9
# Custom summary function
def data_summary(df):
    """Build a per-column quality summary of a DataFrame.

    Args:
        df: The pandas DataFrame to summarise.

    Returns:
        A DataFrame indexed by the column names of ``df`` with the columns
        ``dtype``, ``count`` (non-null values), ``nunique`` (distinct
        non-null values), ``null_count`` and ``null_pct`` (share of nulls
        in percent, rounded to two decimals; 0.0 for a frame without rows).
    """
    n_rows = len(df)
    # Scan for nulls once and reuse the counts for the percentage.
    null_count = df.isnull().sum()
    if n_rows:
        null_pct = (null_count / n_rows * 100).round(2)
    else:
        # Nothing can be missing in an empty frame; avoid 0/0 -> NaN.
        null_pct = null_count.astype(float)

    return pd.DataFrame({
        'dtype': df.dtypes,
        'count': df.count(),
        'nunique': df.nunique(),
        'null_count': null_count,
        'null_pct': null_pct,
    })
20
21data_summary(df)  # one row per column of df: dtype, count, nunique, null_count, null_pct

3. Data Quality Check

3.1 Missing Values

Python

# Missing values summary: one row per column, worst columns first
missing = df.isnull().sum()
# Reuse the counts instead of scanning the frame a second time.
missing_pct = (missing / len(df) * 100).round(2)

missing_df = pd.DataFrame({
    'Missing Count': missing,
    'Missing %': missing_pct
}).sort_values('Missing %', ascending=False)

# Only show the columns that actually have gaps.
print(missing_df[missing_df['Missing Count'] > 0])

# Visualize missing
import missingno as msno

# msno.matrix opens its own figure when no `ax` is passed, so a preceding
# plt.figure(figsize=...) only leaves a blank figure behind; the size has to
# be passed through the `figsize` argument instead.
msno.matrix(df, figsize=(12, 6))
plt.show()

# Heatmap of the nullity correlation between columns
msno.heatmap(df)
plt.show()

3.2 Duplicates

Python

1# Check duplicates: count rows that repeat an earlier row (the first occurrence is not counted)
2print(f"Duplicate rows: {df.duplicated().sum()}")
3
4# View duplicates: keep=False flags every member of a duplicate group; sorting by all columns puts them side by side
5df[df.duplicated(keep=False)].sort_values(by=df.columns.tolist())
6
7# Duplicates by specific columns ('col1'/'col2' are presumably placeholders — replace with real column names)
8df.duplicated(subset=['col1', 'col2']).sum()

3.3 Data Types Check

Python

1# Check data types: print the dtype of every column
2for col in df.columns:
3    print(f"{col}: {df[col].dtype}")
4    
5# Identify potential issues
6# - Numeric columns stored as object?
7# - Dates stored as string?
8
9# Check object columns: print the most frequent values of each (value_counts().head()) to spot mis-typed data
10object_cols = df.select_dtypes(include=['object']).columns
11for col in object_cols:
12    print(f"\n{col}:")
13    print(df[col].value_counts().head())

4. Univariate Analysis

4.1 Numeric Variables

Python

def analyze_numeric(df, col):
    """Print summary statistics for one numeric column and plot its distribution.

    Prints mean, median, standard deviation, min, max, skewness and kurtosis,
    then shows a three-panel figure: histogram with KDE and mean/median
    markers, box plot, and normal Q-Q plot (nulls are dropped for the Q-Q
    plot).

    Args:
        df: DataFrame holding the data.
        col: Name of the numeric column to analyse.
    """
    series = df[col]

    print(f"=== {col} ===")
    print(f"Mean: {series.mean():.2f}")
    print(f"Median: {series.median():.2f}")
    print(f"Std: {series.std():.2f}")
    print(f"Min: {series.min()}")
    print(f"Max: {series.max()}")
    print(f"Skewness: {series.skew():.2f}")
    print(f"Kurtosis: {series.kurtosis():.2f}")

    fig, (hist_ax, box_ax, qq_ax) = plt.subplots(1, 3, figsize=(15, 4))

    # Panel 1: histogram with KDE plus dashed mean/median reference lines
    sns.histplot(series, kde=True, ax=hist_ax)
    hist_ax.set_title(f"Distribution of {col}")
    reference_lines = (
        (series.mean(), 'red', 'Mean'),
        (series.median(), 'green', 'Median'),
    )
    for position, colour, label in reference_lines:
        hist_ax.axvline(position, color=colour, linestyle='--', label=label)
    hist_ax.legend()

    # Panel 2: box plot
    sns.boxplot(y=series, ax=box_ax)
    box_ax.set_title(f"Box Plot of {col}")

    # Panel 3: Q-Q plot against the normal distribution
    from scipy import stats
    non_null = series.dropna()
    stats.probplot(non_null, dist="norm", plot=qq_ax)
    qq_ax.set_title(f"Q-Q Plot of {col}")

    plt.tight_layout()
    plt.show()
34
35# Usage: run analyze_numeric for every numeric (np.number) column in turn
36numeric_cols = df.select_dtypes(include=[np.number]).columns
37for col in numeric_cols:
38    analyze_numeric(df, col)

4.2 Categorical Variables

Python

def analyze_categorical(df, col, top_n=10):
    """Summarise one categorical column with counts, a bar chart and a pie chart.

    Prints the number of distinct values and the ``top_n`` most frequent
    ones. The left panel is a bar chart of those counts; the right panel is a
    pie chart when the column has at most 8 distinct values, otherwise a
    text note saying there are too many categories.

    Args:
        df: DataFrame holding the data.
        col: Name of the categorical column to analyse.
        top_n: How many of the most frequent values to list and plot.
    """
    column = df[col]
    n_unique = column.nunique()
    counts = column.value_counts()
    top_counts = counts.head(top_n)

    print(f"=== {col} ===")
    print(f"Unique values: {n_unique}")
    print("\nValue counts:")
    print(top_counts)

    fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(12, 4))

    # Left: counts on the x axis, category labels on the y axis
    sns.barplot(x=top_counts.values, y=top_counts.index, ax=bar_ax)
    bar_ax.set_title(f"Top {top_n} {col}")

    # Right: a pie chart is only drawn for a small number of categories
    if n_unique > 8:
        pie_ax.text(0.5, 0.5, f"Too many categories ({n_unique})",
                    ha='center', va='center')
    else:
        counts.plot(kind='pie', autopct='%1.1f%%', ax=pie_ax)
        pie_ax.set_title(f"Distribution of {col}")

    plt.tight_layout()
    plt.show()
27
28# Usage: run analyze_categorical (default top_n) for every object/category column
29cat_cols = df.select_dtypes(include=['object', 'category']).columns
30for col in cat_cols:
31    analyze_categorical(df, col)

5. Bivariate Analysis

5.1 Numeric vs Numeric

Python

def analyze_numeric_numeric(df, x, y):
    """Report the correlation between two numeric columns and plot them.

    Prints the correlation coefficient from ``Series.corr`` and shows two
    panels: a plain scatter plot titled with the coefficient, and a scatter
    plot with a fitted regression line.

    Args:
        df: DataFrame holding the data.
        x: Name of the column drawn on the x axis.
        y: Name of the column drawn on the y axis.
    """
    r_value = df[x].corr(df[y])
    print(f"Correlation ({x} vs {y}): {r_value:.3f}")

    fig, (scatter_ax, reg_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Left: raw scatter, semi-transparent so dense regions stay readable
    sns.scatterplot(data=df, x=x, y=y, ax=scatter_ax, alpha=0.5)
    scatter_ax.set_title(f"{x} vs {y} (r = {r_value:.3f})")

    # Right: same points with a regression line on top
    point_style = {'alpha': 0.5}
    sns.regplot(data=df, x=x, y=y, ax=reg_ax, scatter_kws=point_style)
    reg_ax.set_title(f"Regression: {x} vs {y}")

    plt.tight_layout()
    plt.show()
20
21# Correlation matrix of all numeric columns, drawn as an annotated heatmap
22plt.figure(figsize=(10, 8))
23numeric_df = df.select_dtypes(include=[np.number])
24corr_matrix = numeric_df.corr()
25sns.heatmap(corr_matrix, annot=True, cmap='RdBu_r', center=0, fmt='.2f')  # center=0 anchors the diverging colormap at zero; cells are labelled with 2 decimals
26plt.title("Correlation Matrix")
27plt.show()

5.2 Numeric vs Categorical

Python

def analyze_numeric_categorical(df, numeric_col, cat_col):
    """Compare a numeric column across the groups of a categorical column.

    Prints ``describe()`` of ``numeric_col`` per ``cat_col`` group and shows
    three panels: box plot, violin plot, and a bar plot of the group means
    with standard-deviation error bars. X tick labels are rotated 45 degrees.

    Args:
        df: DataFrame holding the data.
        numeric_col: Name of the numeric column.
        cat_col: Name of the categorical column used for grouping.
    """
    print(f"=== {numeric_col} by {cat_col} ===")
    per_group = df.groupby(cat_col)[numeric_col].describe()
    print(per_group)

    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    # (plot function, extra keyword arguments, panel title), left to right
    panels = (
        (sns.boxplot, {}, f"{numeric_col} by {cat_col}"),
        (sns.violinplot, {}, f"Violin: {numeric_col} by {cat_col}"),
        (sns.barplot, {'errorbar': 'sd'}, f"Mean {numeric_col} by {cat_col}"),
    )
    for ax, (draw, extra_kwargs, title) in zip(axes, panels):
        draw(data=df, x=cat_col, y=numeric_col, ax=ax, **extra_kwargs)
        ax.set_title(title)
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

5.3 Categorical vs Categorical

Python

def analyze_categorical_categorical(df, col1, col2):
    """Analyse the relationship between two categorical columns.

    Prints the contingency table and the chi-square statistic with its
    p-value, then shows a heatmap of the counts and a stacked bar chart of
    the row-wise percentages (each ``col1`` value sums to 100).

    Args:
        df: DataFrame holding the data.
        col1: Column used for the rows of the contingency table.
        col2: Column used for the columns of the contingency table.
    """
    contingency = pd.crosstab(df[col1], df[col2])
    print("Cross tabulation:")
    print(contingency)

    # Chi-square test of independence on the observed counts
    from scipy.stats import chi2_contingency
    chi2, p, _dof, _expected = chi2_contingency(contingency)
    print(f"\nChi-square: {chi2:.2f}, p-value: {p:.4f}")

    fig, (heat_ax, bar_ax) = plt.subplots(1, 2, figsize=(14, 5))

    # Left: raw counts
    sns.heatmap(contingency, annot=True, fmt='d', cmap='Blues', ax=heat_ax)
    heat_ax.set_title(f"{col1} vs {col2}")

    # Right: each row normalised to percentages of its own total
    row_totals = contingency.sum(axis=1)
    row_pct = contingency.div(row_totals, axis=0) * 100
    row_pct.plot(kind='bar', stacked=True, ax=bar_ax)
    bar_ax.set_title(f"{col1} vs {col2} (%)")
    bar_ax.legend(title=col2)

    plt.tight_layout()
    plt.show()

6. Multivariate Analysis

6.1 Pair Plot

Python

1# Pair plot - All pairwise relationships, coloured by 'target_column'; KDE on the diagonal, corner=True requests the corner layout (see seaborn pairplot docs)
2sns.pairplot(df, hue='target_column', diag_kind='kde', corner=True)
3plt.show()
4
5# Selected columns only (the column names here are presumably placeholders — replace with real ones)
6sns.pairplot(df[['col1', 'col2', 'col3', 'target']], hue='target')
7plt.show()

6.2 Grouped Analysis

Python

1# Group by multiple columns: mean, std and row count of 'numeric_col' per (col1, col2) combination
2df.groupby(['col1', 'col2'])['numeric_col'].agg(['mean', 'std', 'count'])
3
4# Pivot table: mean of 'value' with 'row_cat' as rows and 'col_cat' as columns
5pd.pivot_table(df, values='value', index='row_cat', 
6               columns='col_cat', aggfunc='mean')

6.3 Advanced Visualizations

Python

1# Facet Grid: one histogram of 'numeric_col' per (category2 row, category1 column) combination
2g = sns.FacetGrid(df, col='category1', row='category2', height=4)
3g.map(sns.histplot, 'numeric_col')
4plt.show()
5
6# Categorical plot with facets: box plots of 'numeric' by 'cat1', split by 'cat2', one panel per 'cat3' value
7sns.catplot(data=df, x='cat1', y='numeric', hue='cat2', 
8            col='cat3', kind='box', height=4)
9plt.show()

7. Target Variable Analysis

Python

def analyze_target(df, target):
    """Print and plot the distribution of the target column.

    A target with at most 10 distinct values is treated as a classification
    target (bar chart and pie chart); anything else as a regression target
    (histogram with KDE and box plot). Afterwards the correlation of every
    other numeric column with the target is printed. When the target itself
    is not numeric the correlation step is skipped with a note instead of
    raising ``KeyError``.

    Args:
        df: DataFrame holding the data.
        target: Name of the target column.
    """
    target_values = df[target]
    value_counts = target_values.value_counts()

    print(f"=== Target: {target} ===")
    print(f"Distribution:\n{value_counts}")
    print(f"\nClass balance:\n{target_values.value_counts(normalize=True) * 100}")

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    if target_values.nunique() <= 10:
        # Classification target
        value_counts.plot(kind='bar', ax=axes[0])
        axes[0].set_title("Target Distribution")

        value_counts.plot(kind='pie', autopct='%1.1f%%', ax=axes[1])
        axes[1].set_title("Target Proportion")
    else:
        # Regression target
        sns.histplot(target_values, kde=True, ax=axes[0])
        axes[0].set_title("Target Distribution")

        sns.boxplot(y=target_values, ax=axes[1])
        axes[1].set_title("Target Box Plot")

    plt.tight_layout()
    plt.show()

    # Feature vs Target
    print("\n=== Features vs Target ===")
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if target not in numeric_cols:
        # Index.drop(target) raises KeyError for a label that is absent, and
        # a correlation against a non-numeric column is undefined anyway.
        print(f"{target} is not numeric; skipping correlations.")
        return

    for col in numeric_cols.drop(target):
        corr = df[col].corr(target_values)
        print(f"{col}: {corr:.3f}")

8. Complete EDA Template

Python

def complete_eda(df, target=None):
    """Print a complete text-only EDA report for a DataFrame.

    Sections, in order: data overview, missing values, duplicate rows,
    numeric summary, categorical summary (``object`` and ``category``
    columns) and, when ``target`` is given, a target analysis with the
    correlations of the numeric columns against a numeric target.

    Args:
        df: DataFrame to analyse.
        target: Optional label of the target column. ``None`` disables the
            target section; any other label, including falsy ones such as
            ``0``, enables it.

    Raises:
        KeyError: If ``target`` is given but is not a column of ``df``.
    """
    # Fail before printing anything rather than halfway through the report.
    if target is not None and target not in df.columns:
        raise KeyError(f"Target column not found: {target!r}")

    rule = "=" * 50

    def print_header(title, leading_blank=True):
        """Print a section banner; all but the first start with a blank line."""
        print(("\n" if leading_blank else "") + rule)
        print(title)
        print(rule)

    print_header("1. DATA OVERVIEW", leading_blank=False)
    print(f"Shape: {df.shape}")
    print(f"\nData types:\n{df.dtypes}")
    print(f"\nFirst 5 rows:\n{df.head()}")

    print_header("2. MISSING VALUES")
    missing = df.isnull().sum()
    missing_pct = (missing / len(df) * 100).round(2)
    missing_df = pd.DataFrame({'count': missing, 'pct': missing_pct})
    print(missing_df[missing_df['count'] > 0].sort_values('pct', ascending=False))

    print_header("3. DUPLICATES")
    print(f"Duplicate rows: {df.duplicated().sum()}")

    print_header("4. NUMERIC SUMMARY")
    print(df.describe())

    print_header("5. CATEGORICAL SUMMARY")
    # Include 'category' as well as 'object' so categorical-dtype columns
    # are not silently left out of the summary.
    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    for col in cat_cols:
        print(f"\n{col}:")
        print(df[col].value_counts().head())

    # Compare against None so a falsy column label such as 0 still counts.
    if target is not None:
        print_header(f"6. TARGET ANALYSIS: {target}")
        print(df[target].value_counts())
        numeric_df = df.select_dtypes(include=[np.number])
        # Only announce correlations when there is something to show.
        if target in numeric_df.columns:
            print(f"\nCorrelations with {target}:")
            print(numeric_df.corr()[target].sort_values(ascending=False))
46
47# Usage ('target_column' is presumably a placeholder — replace with the real target column name)
48complete_eda(df, target='target_column')

Tổng Kết

Trong bài này, bạn đã học:

✅ Quy trình EDA chuẩn
✅ Data overview và quality check
✅ Univariate analysis (numeric & categorical)
✅ Bivariate analysis (các loại kết hợp)
✅ Multivariate analysis
✅ Target variable analysis
✅ EDA template hoàn chỉnh

Bài tiếp theo: Feature Engineering - Tạo features cho ML!