Lý thuyết
Bài 11/14

Exploratory Data Analysis (EDA)

Phân tích khám phá dữ liệu - Quy trình và kỹ thuật

Exploratory Data Analysis (EDA)

1. EDA là gì?

Exploratory Data Analysis (EDA) là quá trình khám phá dữ liệu để:

  • Hiểu cấu trúc và đặc điểm dữ liệu
  • Phát hiện patterns và anomalies
  • Kiểm tra assumptions
  • Tìm insights cho modeling

Quy trình EDA

1. Load Data
2. Data Overview
3. Data Quality
4. Univariate Analysis
5. Bivariate Analysis
6. Multivariate Analysis
7. Insights & Conclusions

2. Data Overview

2.1 Load và xem dữ liệu

Python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw dataset into a DataFrame.
df = pd.read_csv("data.csv")

# Basic info
print(f"Shape: {df.shape}")  # (rows, columns)
print(f"Columns: {df.columns.tolist()}")

# First / last / random rows.
# Each preview is wrapped in print(): a bare expression is only rendered when
# it is the last statement of a notebook cell, and never in a plain script.
print(df.head())
print(df.tail())
# Cap the sample size so frames with fewer than 5 rows do not raise.
print(df.sample(min(5, len(df))))

# Data types
print(df.dtypes)
df.info()  # info() prints its report itself

2.2 Statistical Summary

Python
# Numeric summary
# Results are printed explicitly so that all three tables are shown, not
# just the last expression of a notebook cell.
print(df.describe())

# Include all columns
print(df.describe(include='all'))

# Specific percentiles
print(df.describe(percentiles=[0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99]))
10# Custom summary function
def data_summary(df):
    """Build a one-row-per-column overview of ``df``.

    Returns a DataFrame indexed by the columns of ``df`` with these fields:
    ``dtype``, ``count`` (non-null values), ``nunique``, ``null_count`` and
    ``null_pct`` (null share of all rows, in percent, rounded to 2 decimals).
    """
    null_counts = df.isnull().sum()
    total_rows = len(df)

    overview = pd.DataFrame({
        'dtype': df.dtypes,
        'count': df.count(),
        'nunique': df.nunique(),
        'null_count': null_counts,
    })
    overview['null_pct'] = (null_counts / total_rows * 100).round(2)
    return overview
20
# Print the overview table; a bare call would discard the result in a script.
print(data_summary(df))

3. Data Quality Check

3.1 Missing Values

Python
import missingno as msno

# Missing values summary (null counts are computed once and reused)
missing = df.isnull().sum()
missing_pct = (missing / len(df) * 100).round(2)

missing_df = pd.DataFrame({
    'Missing Count': missing,
    'Missing %': missing_pct,
}).sort_values('Missing %', ascending=False)

# Show only the columns that actually contain missing values.
print(missing_df[missing_df['Missing Count'] > 0])

# Visualize missing values.
# msno.matrix() opens its own figure when no `ax` is given, so the size is
# passed through `figsize`; a preceding plt.figure() call would only leave an
# empty extra figure behind.
msno.matrix(df, figsize=(12, 6))
plt.show()

# Heatmap of missing correlations
msno.heatmap(df)
plt.show()

3.2 Duplicates

Python
# Check duplicates
print(f"Duplicate rows: {df.duplicated().sum()}")

# View duplicates: keep=False marks every member of a duplicate group, and
# sorting by all columns places identical rows next to each other.
duplicate_rows = df[df.duplicated(keep=False)].sort_values(by=df.columns.tolist())
print(duplicate_rows)

# Duplicates by specific columns ('col1' / 'col2' are placeholder names)
subset_duplicates = df.duplicated(subset=['col1', 'col2']).sum()
print(f"Duplicate rows by subset: {subset_duplicates}")

3.3 Data Types Check

Python
# Report the dtype of every column.
for column_name in df.columns:
    column_dtype = df[column_name].dtype
    print(f"{column_name}: {column_dtype}")

# Things to look for in the output:
# - numeric columns stored as object?
# - dates stored as strings?

# Peek at the most frequent values of each object column.
object_cols = df.select_dtypes(include=['object']).columns
for column_name in object_cols:
    print(f"\n{column_name}:")
    top_values = df[column_name].value_counts().head()
    print(top_values)

4. Univariate Analysis

4.1 Numeric Variables

Python
def analyze_numeric(df, col):
    """Analyze a single numeric column.

    Prints mean, median, standard deviation, min, max, skewness and kurtosis
    of ``df[col]``, then shows three panels side by side: a histogram with a
    KDE curve and mean/median markers, a box plot, and a normal Q-Q plot.
    """
    series = df[col]
    mean_value = series.mean()
    median_value = series.median()

    print(f"=== {col} ===")
    print(f"Mean: {mean_value:.2f}")
    print(f"Median: {median_value:.2f}")
    print(f"Std: {series.std():.2f}")
    print(f"Min: {series.min()}")
    print(f"Max: {series.max()}")
    print(f"Skewness: {series.skew():.2f}")
    print(f"Kurtosis: {series.kurtosis():.2f}")

    _, (hist_ax, box_ax, qq_ax) = plt.subplots(1, 3, figsize=(15, 4))

    # Histogram with mean / median reference lines
    sns.histplot(series, kde=True, ax=hist_ax)
    hist_ax.set_title(f"Distribution of {col}")
    hist_ax.axvline(mean_value, color='red', linestyle='--', label='Mean')
    hist_ax.axvline(median_value, color='green', linestyle='--', label='Median')
    hist_ax.legend()

    # Box plot
    sns.boxplot(y=series, ax=box_ax)
    box_ax.set_title(f"Box Plot of {col}")

    # Q-Q plot against the normal distribution; nulls are dropped first
    from scipy import stats
    stats.probplot(series.dropna(), dist="norm", plot=qq_ax)
    qq_ax.set_title(f"Q-Q Plot of {col}")

    plt.tight_layout()
    plt.show()
34
# Usage: run the univariate report for every numeric column.
numeric_cols = df.select_dtypes(include='number').columns
for col in numeric_cols:
    analyze_numeric(df, col)

4.2 Categorical Variables

Python
def analyze_categorical(df, col, top_n=10):
    """Analyze a single categorical column.

    Prints the number of unique values and the ``top_n`` most frequent
    values of ``df[col]``, then shows a bar plot of those top values next to
    a pie chart. The pie chart is only drawn when the column has at most
    8 unique values; otherwise a placeholder message is shown instead.
    """
    series = df[col]
    counts = series.value_counts()
    top_counts = counts.head(top_n)
    n_unique = series.nunique()

    print(f"=== {col} ===")
    print(f"Unique values: {n_unique}")
    print(f"\nValue counts:")
    print(top_counts)

    _, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(12, 4))

    # Horizontal bar plot of the most frequent values
    sns.barplot(x=top_counts.values, y=top_counts.index, ax=bar_ax)
    bar_ax.set_title(f"Top {top_n} {col}")

    # Pie chart, or a placeholder when there are too many slices to read
    if n_unique > 8:
        pie_ax.text(0.5, 0.5, f"Too many categories ({n_unique})",
                    ha='center', va='center')
    else:
        counts.plot(kind='pie', autopct='%1.1f%%', ax=pie_ax)
        pie_ax.set_title(f"Distribution of {col}")

    plt.tight_layout()
    plt.show()
27
# Usage: run the univariate report for every object/category column.
categorical_frame = df.select_dtypes(include=['object', 'category'])
cat_cols = categorical_frame.columns
for col in cat_cols:
    analyze_categorical(df, col)

5. Bivariate Analysis

5.1 Numeric vs Numeric

Python
def analyze_numeric_numeric(df, x, y):
    """Analyze the relationship between two numeric columns.

    Prints the correlation of ``df[x]`` and ``df[y]`` as computed by
    ``Series.corr``, then shows a scatter plot and a regression plot of the
    two columns side by side.
    """
    r_value = df[x].corr(df[y])
    print(f"Correlation ({x} vs {y}): {r_value:.3f}")

    _, (scatter_ax, reg_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Plain scatter plot, titled with the correlation
    sns.scatterplot(data=df, x=x, y=y, ax=scatter_ax, alpha=0.5)
    scatter_ax.set_title(f"{x} vs {y} (r = {r_value:.3f})")

    # Scatter plot with a fitted regression line
    sns.regplot(data=df, x=x, y=y, ax=reg_ax, scatter_kws={'alpha': 0.5})
    reg_ax.set_title(f"Regression: {x} vs {y}")

    plt.tight_layout()
    plt.show()
20
# Correlation matrix of all numeric columns, drawn as an annotated heatmap.
numeric_df = df.select_dtypes(include=[np.number])
corr_matrix = numeric_df.corr()

plt.figure(figsize=(10, 8))
heatmap_options = {'annot': True, 'cmap': 'RdBu_r', 'center': 0, 'fmt': '.2f'}
sns.heatmap(corr_matrix, **heatmap_options)
plt.title("Correlation Matrix")
plt.show()

5.2 Numeric vs Categorical

Python
def analyze_numeric_categorical(df, numeric_col, cat_col):
    """Analyze a numeric column broken down by a categorical column.

    Prints ``describe()`` of ``numeric_col`` per ``cat_col`` group, then
    shows three panels: a box plot, a violin plot, and a bar plot of the
    group means with standard-deviation error bars.
    """
    print(f"=== {numeric_col} by {cat_col} ===")
    grouped_stats = df.groupby(cat_col)[numeric_col].describe()
    print(grouped_stats)

    _, axes = plt.subplots(1, 3, figsize=(15, 5))

    # (plot function, extra keyword arguments, panel title) per panel
    panels = [
        (sns.boxplot, {}, f"{numeric_col} by {cat_col}"),
        (sns.violinplot, {}, f"Violin: {numeric_col} by {cat_col}"),
        (sns.barplot, {'errorbar': 'sd'}, f"Mean {numeric_col} by {cat_col}"),
    ]
    for ax, (plot_fn, extra_kwargs, title) in zip(axes, panels):
        plot_fn(data=df, x=cat_col, y=numeric_col, ax=ax, **extra_kwargs)
        ax.set_title(title)
        # Rotate category labels so long names do not overlap
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

5.3 Categorical vs Categorical

Python
def analyze_categorical_categorical(df, col1, col2):
    """Analyze the relationship between two categorical columns.

    Prints the cross tabulation of ``col1`` by ``col2`` and the chi-square
    statistic with its p-value, then shows a heatmap of the raw counts next
    to a stacked bar chart of row-wise percentages.
    """
    table = pd.crosstab(df[col1], df[col2])
    print("Cross tabulation:")
    print(table)

    # Chi-square test on the contingency table; only the statistic and
    # p-value are reported
    from scipy.stats import chi2_contingency
    chi2, p, *_ = chi2_contingency(table)
    print(f"\nChi-square: {chi2:.2f}, p-value: {p:.4f}")

    _, (heat_ax, bar_ax) = plt.subplots(1, 2, figsize=(14, 5))

    # Heatmap of raw counts
    sns.heatmap(table, annot=True, fmt='d', cmap='Blues', ax=heat_ax)
    heat_ax.set_title(f"{col1} vs {col2}")

    # Stacked bars: each row is divided by its own total to get percentages
    row_totals = table.sum(axis=1)
    row_pct = table.div(row_totals, axis=0) * 100
    row_pct.plot(kind='bar', stacked=True, ax=bar_ax)
    bar_ax.set_title(f"{col1} vs {col2} (%)")
    bar_ax.legend(title=col2)

    plt.tight_layout()
    plt.show()

6. Multivariate Analysis

6.1 Pair Plot

Python
# Pair plot of all pairwise relationships, colored by the target column.
# corner=True draws only the lower triangle of the grid.
sns.pairplot(data=df, hue='target_column', diag_kind='kde', corner=True)
plt.show()

# Same plot restricted to a handful of selected columns.
selected_columns = ['col1', 'col2', 'col3', 'target']
sns.pairplot(data=df[selected_columns], hue='target')
plt.show()

6.2 Grouped Analysis

Python
# Group by multiple columns and aggregate one numeric column.
# The result is stored and printed; a bare expression would be discarded.
grouped_stats = df.groupby(['col1', 'col2'])['numeric_col'].agg(['mean', 'std', 'count'])
print(grouped_stats)

# Pivot table: mean of 'value' for every (row_cat, col_cat) combination.
pivot = pd.pivot_table(df, values='value', index='row_cat',
                       columns='col_cat', aggfunc='mean')
print(pivot)

6.3 Advanced Visualizations

Python
# Facet grid: one histogram of 'numeric_col' per
# (category2 row, category1 column) cell.
g = sns.FacetGrid(data=df, col='category1', row='category2', height=4)
g.map(sns.histplot, 'numeric_col')
plt.show()

# Categorical box plots, split by hue and faceted into columns by 'cat3'.
catplot_options = {'x': 'cat1', 'y': 'numeric', 'hue': 'cat2',
                   'col': 'cat3', 'kind': 'box', 'height': 4}
sns.catplot(data=df, **catplot_options)
plt.show()

7. Target Variable Analysis

Python
def analyze_target(df, target):
    """Analyze the target variable and its correlation with numeric features.

    Prints the value counts and class balance (in percent) of ``df[target]``
    and plots its distribution. A target with at most 10 unique values is
    plotted as a classification target (bar + pie chart); any other target
    is plotted as a regression target (histogram + box plot). Finally, the
    correlation of every other numeric column with the target is printed.

    If the target column is not numeric, the correlation step is skipped
    with a message instead of raising: a non-numeric target is not among the
    numeric columns, so dropping it from them would raise KeyError, and
    ``Series.corr`` cannot be computed against it.
    """
    target_values = df[target]

    print(f"=== Target: {target} ===")
    print(f"Distribution:\n{target_values.value_counts()}")
    print(f"\nClass balance:\n{target_values.value_counts(normalize=True) * 100}")

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    if target_values.nunique() <= 10:
        # Classification target
        target_values.value_counts().plot(kind='bar', ax=axes[0])
        axes[0].set_title("Target Distribution")

        target_values.value_counts().plot(kind='pie', autopct='%1.1f%%', ax=axes[1])
        axes[1].set_title("Target Proportion")
    else:
        # Regression target
        sns.histplot(target_values, kde=True, ax=axes[0])
        axes[0].set_title("Target Distribution")

        sns.boxplot(y=target_values, ax=axes[1])
        axes[1].set_title("Target Box Plot")

    plt.tight_layout()
    plt.show()

    # Feature vs Target
    print("\n=== Features vs Target ===")
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if target not in numeric_cols:
        print(f"{target} is not numeric; skipping correlations")
        return

    for col in numeric_cols.drop(target):
        corr = df[col].corr(target_values)
        print(f"{col}: {corr:.3f}")

8. Complete EDA Template

Python
def complete_eda(df, target=None):
    """Print a complete text-only EDA report for ``df``.

    Sections: data overview, missing values, duplicates, numeric summary,
    categorical summary and, when ``target`` is given, a target analysis.

    Args:
        df: The DataFrame to analyze.
        target: Optional name of the target column. When falsy, the target
            section is skipped.

    Raises:
        KeyError: If ``target`` is given but is not a column of ``df``.
            The check runs before any output is printed.
    """
    # Fail fast instead of printing five sections and then crashing.
    if target and target not in df.columns:
        raise KeyError(f"Target column {target!r} not found in DataFrame")

    rule = "=" * 50

    def print_header(title, leading_blank=True):
        """Print a section title framed by two rules."""
        print(("\n" if leading_blank else "") + rule)
        print(title)
        print(rule)

    print_header("1. DATA OVERVIEW", leading_blank=False)
    print(f"Shape: {df.shape}")
    print(f"\nData types:\n{df.dtypes}")
    print(f"\nFirst 5 rows:\n{df.head()}")

    print_header("2. MISSING VALUES")
    missing = df.isnull().sum()
    missing_pct = (missing / len(df) * 100).round(2)
    missing_df = pd.DataFrame({'count': missing, 'pct': missing_pct})
    missing_df = missing_df[missing_df['count'] > 0]
    if missing_df.empty:
        # Clearer than printing an "Empty DataFrame" representation.
        print("No missing values")
    else:
        print(missing_df.sort_values('pct', ascending=False))

    print_header("3. DUPLICATES")
    print(f"Duplicate rows: {df.duplicated().sum()}")

    print_header("4. NUMERIC SUMMARY")
    print(df.describe())

    print_header("5. CATEGORICAL SUMMARY")
    # Include 'category' dtype as well, so that columns already converted to
    # categoricals are not silently left out of the report.
    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    for col in cat_cols:
        print(f"\n{col}:")
        print(df[col].value_counts().head())

    if target:
        print_header(f"6. TARGET ANALYSIS: {target}")
        print(df[target].value_counts())
        numeric_df = df.select_dtypes(include=[np.number])
        if target in numeric_df.columns:
            print(f"\nCorrelations with {target}:")
            print(numeric_df.corr()[target].sort_values(ascending=False))
        else:
            print(f"\n{target} is not numeric; correlations skipped")
46
# Usage: run the full report, naming the target column explicitly.
complete_eda(df=df, target='target_column')

Tổng Kết

Trong bài này, bạn đã học:

  • ✅ Quy trình EDA chuẩn
  • ✅ Data overview và quality check
  • ✅ Univariate analysis (numeric & categorical)
  • ✅ Bivariate analysis (các loại kết hợp)
  • ✅ Multivariate analysis
  • ✅ Target variable analysis
  • ✅ EDA template hoàn chỉnh

Bài tiếp theo: Feature Engineering - Tạo features cho ML!