Exploratory Data Analysis (EDA)
1. EDA là gì?
Exploratory Data Analysis (EDA) là quá trình khám phá dữ liệu để:
- Hiểu cấu trúc và đặc điểm dữ liệu
- Phát hiện patterns và anomalies
- Kiểm tra assumptions
- Tìm insights cho modeling
Quy trình EDA
1. Load Data
2. Data Overview
3. Data Quality
4. Univariate Analysis
5. Bivariate Analysis
6. Multivariate Analysis
7. Insights & Conclusions
2. Data Overview
2.1 Load và xem dữ liệu
Python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
df = pd.read_csv("data.csv")

# Basic info
print(f"Shape: {df.shape}")  # (rows, columns)
print(f"Columns: {df.columns.tolist()}")

# First/Last rows
# NOTE: in a plain script these bare expressions display nothing; wrap them
# in print() to see the result.
df.head()
df.tail()
# Cap the sample size so frames with fewer than 5 rows do not raise.
df.sample(min(5, len(df)))  # Random 5 rows

# Data types
df.dtypes
df.info()

# Next section: 2.2 Statistical Summary
Python
# Numeric summary
df.describe()

# Include all columns
df.describe(include='all')

# Specific percentiles
df.describe(percentiles=[0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99])


# Custom summary function
def data_summary(df):
    """Return a per-column overview table for ``df``.

    The result is a DataFrame indexed by column name with these columns:
    ``dtype``, ``count`` (non-null values), ``nunique``, ``null_count`` and
    ``null_pct`` (null share in percent, rounded to 2 decimals).
    """
    # Compute the null counts once and reuse them for the percentage.
    null_count = df.isnull().sum()
    summary = pd.DataFrame({
        'dtype': df.dtypes,
        'count': df.count(),
        'nunique': df.nunique(),
        'null_count': null_count,
        'null_pct': (null_count / len(df) * 100).round(2),
    })
    return summary


data_summary(df)

# Next section: 3. Data Quality Check
3.1 Missing Values
Python
# Missing values summary
missing = df.isnull().sum()
# Reuse the counts instead of scanning the frame a second time.
missing_pct = (missing / len(df) * 100).round(2)

missing_df = pd.DataFrame({
    'Missing Count': missing,
    'Missing %': missing_pct,
}).sort_values('Missing %', ascending=False)

print(missing_df[missing_df['Missing Count'] > 0])

# Visualize missing
import missingno as msno

# Pass the size to missingno directly: it creates its own figure, so a
# preceding plt.figure(...) would only leave an extra empty figure behind.
msno.matrix(df, figsize=(12, 6))
plt.show()

# Heatmap of missing correlations
msno.heatmap(df)
plt.show()

# Next section: 3.2 Duplicates
Python
# Check duplicates
print(f"Duplicate rows: {df.duplicated().sum()}")

# View duplicates
# keep=False marks every member of a duplicate group; sorting by all columns
# places identical rows next to each other.
df[df.duplicated(keep=False)].sort_values(by=df.columns.tolist())

# Duplicates by specific columns
df.duplicated(subset=['col1', 'col2']).sum()

# Next section: 3.3 Data Types Check
Python
# Check data types
# Iterating df.dtypes also works when column labels are duplicated, where
# df[col] would return a DataFrame that has no .dtype attribute.
for col, dtype in df.dtypes.items():
    print(f"{col}: {dtype}")

# Identify potential issues
# - Numeric columns stored as object?
# - Dates stored as string?

# Check object columns
object_cols = df.select_dtypes(include=['object']).columns
for col in object_cols:
    print(f"\n{col}:")
    print(df[col].value_counts().head())

# Next section: 4. Univariate Analysis
4.1 Numeric Variables
Python
def analyze_numeric(df, col):
    """Print summary statistics and plot the distribution of one numeric column.

    Prints mean, median, standard deviation, min, max, skewness and kurtosis
    of ``df[col]``, then shows a 1x3 figure: histogram with KDE (mean and
    median marked), box plot, and normal Q-Q plot.

    Args:
        df: DataFrame holding the data.
        col: Name of the numeric column to analyze.
    """
    from scipy import stats

    series = df[col]
    # Computed once; both the printout and the histogram markers use them.
    mean = series.mean()
    median = series.median()

    print(f"=== {col} ===")
    print(f"Mean: {mean:.2f}")
    print(f"Median: {median:.2f}")
    print(f"Std: {series.std():.2f}")
    print(f"Min: {series.min()}")
    print(f"Max: {series.max()}")
    print(f"Skewness: {series.skew():.2f}")
    print(f"Kurtosis: {series.kurtosis():.2f}")

    # Visualization
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))
    hist_ax, box_ax, qq_ax = axes

    # Histogram
    sns.histplot(series, kde=True, ax=hist_ax)
    hist_ax.set_title(f"Distribution of {col}")
    hist_ax.axvline(mean, color='red', linestyle='--', label='Mean')
    hist_ax.axvline(median, color='green', linestyle='--', label='Median')
    hist_ax.legend()

    # Box plot
    sns.boxplot(y=series, ax=box_ax)
    box_ax.set_title(f"Box Plot of {col}")

    # QQ plot (missing values are dropped before fitting)
    stats.probplot(series.dropna(), dist="norm", plot=qq_ax)
    qq_ax.set_title(f"Q-Q Plot of {col}")

    plt.tight_layout()
    plt.show()


# Usage
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    analyze_numeric(df, col)

# Next section: 4.2 Categorical Variables
Python
def analyze_categorical(df, col, top_n=10):
    """Print value counts and plot the distribution of one categorical column.

    Prints the number of unique values and the ``top_n`` most frequent
    values of ``df[col]``, then shows a 1x2 figure: a horizontal bar chart of
    those top values and, when the column has at most 8 unique values, a pie
    chart of all values (otherwise a placeholder message).

    Args:
        df: DataFrame holding the data.
        col: Name of the categorical column to analyze.
        top_n: How many of the most frequent values to print and plot.
    """
    # Compute the expensive aggregates once and reuse them below.
    counts = df[col].value_counts()
    top_counts = counts.head(top_n)
    n_unique = df[col].nunique()

    print(f"=== {col} ===")
    print(f"Unique values: {n_unique}")
    print("\nValue counts:")
    print(top_counts)

    # Visualization
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    bar_ax, pie_ax = axes

    # Bar plot
    sns.barplot(x=top_counts.values, y=top_counts.index, ax=bar_ax)
    bar_ax.set_title(f"Top {top_n} {col}")

    # Pie chart (only readable for a small number of categories)
    if n_unique <= 8:
        counts.plot(kind='pie', autopct='%1.1f%%', ax=pie_ax)
        pie_ax.set_title(f"Distribution of {col}")
    else:
        # Anchor the message in axes coordinates and hide the empty frame so
        # the placeholder stays centered and is not surrounded by ticks.
        pie_ax.text(0.5, 0.5, f"Too many categories ({n_unique})",
                    ha='center', va='center', transform=pie_ax.transAxes)
        pie_ax.set_axis_off()

    plt.tight_layout()
    plt.show()


# Usage
cat_cols = df.select_dtypes(include=['object', 'category']).columns
for col in cat_cols:
    analyze_categorical(df, col)

# Next section: 5. Bivariate Analysis
5.1 Numeric vs Numeric
Python
def analyze_numeric_numeric(df, x, y):
    """Print the correlation of two numeric columns and plot their relationship.

    Shows a 1x2 figure: a scatter plot titled with the correlation
    coefficient, and a regression plot of the same pair.

    Args:
        df: DataFrame holding the data.
        x: Name of the column drawn on the x-axis.
        y: Name of the column drawn on the y-axis.
    """
    # Correlation
    corr = df[x].corr(df[y])
    print(f"Correlation ({x} vs {y}): {corr:.3f}")

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    scatter_ax, reg_ax = axes

    # Scatter plot
    sns.scatterplot(data=df, x=x, y=y, ax=scatter_ax, alpha=0.5)
    scatter_ax.set_title(f"{x} vs {y} (r = {corr:.3f})")

    # Regression plot
    sns.regplot(data=df, x=x, y=y, ax=reg_ax, scatter_kws={'alpha': 0.5})
    reg_ax.set_title(f"Regression: {x} vs {y}")

    plt.tight_layout()
    plt.show()


# Correlation matrix
plt.figure(figsize=(10, 8))
numeric_df = df.select_dtypes(include=[np.number])
corr_matrix = numeric_df.corr()
sns.heatmap(corr_matrix, annot=True, cmap='RdBu_r', center=0, fmt='.2f')
plt.title("Correlation Matrix")
plt.show()

# Next section: 5.2 Numeric vs Categorical
Python
def analyze_numeric_categorical(df, numeric_col, cat_col):
    """Summarize and plot a numeric column broken down by a categorical column.

    Prints ``describe()`` of ``numeric_col`` per group of ``cat_col``, then
    shows a 1x3 figure: box plot, violin plot and a bar plot drawn with
    ``errorbar='sd'``.

    Args:
        df: DataFrame holding the data.
        numeric_col: Name of the numeric column.
        cat_col: Name of the categorical column used for grouping.
    """
    print(f"=== {numeric_col} by {cat_col} ===")
    print(df.groupby(cat_col)[numeric_col].describe())

    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    box_ax, violin_ax, bar_ax = axes

    # Box plot
    sns.boxplot(data=df, x=cat_col, y=numeric_col, ax=box_ax)
    box_ax.set_title(f"{numeric_col} by {cat_col}")

    # Violin plot
    sns.violinplot(data=df, x=cat_col, y=numeric_col, ax=violin_ax)
    violin_ax.set_title(f"Violin: {numeric_col} by {cat_col}")

    # Bar plot (mean)
    sns.barplot(data=df, x=cat_col, y=numeric_col, ax=bar_ax, errorbar='sd')
    bar_ax.set_title(f"Mean {numeric_col} by {cat_col}")

    # Rotate the category labels on every panel so long names stay legible.
    for ax in axes:
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

# Next section: 5.3 Categorical vs Categorical
Python
def analyze_categorical_categorical(df, col1, col2):
    """Cross-tabulate two categorical columns, test them, and plot the result.

    Prints the contingency table and the chi-square statistic with its
    p-value, then shows a 1x2 figure: a heatmap of the counts and a stacked
    bar chart of row-wise percentages.

    Args:
        df: DataFrame holding the data.
        col1: Column used for the rows of the contingency table.
        col2: Column used for the columns of the contingency table.
    """
    from scipy.stats import chi2_contingency

    # Cross tabulation
    crosstab = pd.crosstab(df[col1], df[col2])
    print("Cross tabulation:")
    print(crosstab)

    # Chi-square test (degrees of freedom and expected counts are not used)
    chi2, p, _dof, _expected = chi2_contingency(crosstab)
    print(f"\nChi-square: {chi2:.2f}, p-value: {p:.4f}")

    # Visualization
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    heat_ax, bar_ax = axes

    # Heatmap
    sns.heatmap(crosstab, annot=True, fmt='d', cmap='Blues', ax=heat_ax)
    heat_ax.set_title(f"{col1} vs {col2}")

    # Stacked bar: divide each row by its total to get percentages per col1
    crosstab_pct = crosstab.div(crosstab.sum(axis=1), axis=0) * 100
    crosstab_pct.plot(kind='bar', stacked=True, ax=bar_ax)
    bar_ax.set_title(f"{col1} vs {col2} (%)")
    bar_ax.legend(title=col2)

    plt.tight_layout()
    plt.show()

# Next section: 6. Multivariate Analysis
6.1 Pair Plot
Python
# Pair plot - All pairwise relationships
sns.pairplot(df, hue='target_column', diag_kind='kde', corner=True)
plt.show()

# Selected columns
sns.pairplot(df[['col1', 'col2', 'col3', 'target']], hue='target')
plt.show()

# Next section: 6.2 Grouped Analysis
Python
# Group by multiple columns
df.groupby(['col1', 'col2'])['numeric_col'].agg(['mean', 'std', 'count'])

# Pivot table
pd.pivot_table(df, values='value', index='row_cat',
               columns='col_cat', aggfunc='mean')

# Next section: 6.3 Advanced Visualizations
Python
# Facet Grid
g = sns.FacetGrid(df, col='category1', row='category2', height=4)
g.map(sns.histplot, 'numeric_col')
plt.show()

# Categorical plot with facets
sns.catplot(data=df, x='cat1', y='numeric', hue='cat2',
            col='cat3', kind='box', height=4)
plt.show()

# Next section: 7. Target Variable Analysis
Python
def analyze_target(df, target):
    """Describe and plot the target column, then list feature correlations.

    Prints the value counts and class balance (in percent) of ``df[target]``.
    A target with at most 10 unique values is plotted as a bar chart and a
    pie chart; any other target is plotted as a histogram with KDE and a
    box plot. Finally, when the target is numeric, the correlation of every
    other numeric column with it is printed.

    Args:
        df: DataFrame holding the data.
        target: Name of the target column.
    """
    series = df[target]
    counts = series.value_counts()

    print(f"=== Target: {target} ===")
    print(f"Distribution:\n{counts}")
    print(f"\nClass balance:\n{series.value_counts(normalize=True) * 100}")

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    if series.nunique() <= 10:
        # Classification target
        counts.plot(kind='bar', ax=axes[0])
        axes[0].set_title("Target Distribution")

        counts.plot(kind='pie', autopct='%1.1f%%', ax=axes[1])
        axes[1].set_title("Target Proportion")
    else:
        # Regression target
        sns.histplot(series, kde=True, ax=axes[0])
        axes[0].set_title("Target Distribution")

        sns.boxplot(y=series, ax=axes[1])
        axes[1].set_title("Target Box Plot")

    plt.tight_layout()
    plt.show()

    # Feature vs Target
    print("\n=== Features vs Target ===")
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if target not in numeric_cols:
        # A string/bool/categorical target is not among the numeric columns:
        # dropping it would raise KeyError and a correlation with it is not
        # defined, so report that and stop.
        print(f"Target '{target}' is not numeric; skipping correlations.")
        return

    for col in numeric_cols.drop(target):
        corr = df[col].corr(series)
        print(f"{col}: {corr:.3f}")

# Next section: 8. Complete EDA Template
Python
def complete_eda(df, target=None):
    """Print a complete text-only EDA report for ``df``.

    Sections printed in order: data overview, missing values, duplicates,
    numeric summary, categorical summary and, when ``target`` is given,
    a target analysis (value counts plus, for a numeric target, the
    correlations of all numeric columns with it).

    Args:
        df: DataFrame to analyze.
        target: Optional name of the target column. ``None`` skips the
            target section.
    """
    rule = "=" * 50

    def section(title, first=False):
        # Every section after the first is preceded by a blank line.
        print(rule if first else "\n" + rule)
        print(title)
        print(rule)

    section("1. DATA OVERVIEW", first=True)
    print(f"Shape: {df.shape}")
    print(f"\nData types:\n{df.dtypes}")
    print(f"\nFirst 5 rows:\n{df.head()}")

    section("2. MISSING VALUES")
    missing = df.isnull().sum()
    missing_pct = (missing / len(df) * 100).round(2)
    missing_df = pd.DataFrame({'count': missing, 'pct': missing_pct})
    print(missing_df[missing_df['count'] > 0].sort_values('pct', ascending=False))

    section("3. DUPLICATES")
    print(f"Duplicate rows: {df.duplicated().sum()}")

    section("4. NUMERIC SUMMARY")
    print(df.describe())

    section("5. CATEGORICAL SUMMARY")
    # Include 'category' dtype as well, matching the column selection used
    # for the categorical analysis in section 4.2.
    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    for col in cat_cols:
        print(f"\n{col}:")
        print(df[col].value_counts().head())

    # Compare against None so falsy-but-valid labels (e.g. column 0) work.
    if target is not None:
        section(f"6. TARGET ANALYSIS: {target}")
        print(df[target].value_counts())
        print(f"\nCorrelations with {target}:")
        numeric_df = df.select_dtypes(include=[np.number])
        if target in numeric_df.columns:
            print(numeric_df.corr()[target].sort_values(ascending=False))


# Usage
complete_eda(df, target='target_column')

# Next section: Tổng Kết (Summary)
Trong bài này, bạn đã học:
- ✅ Quy trình EDA chuẩn
- ✅ Data overview và quality check
- ✅ Univariate analysis (numeric & categorical)
- ✅ Bivariate analysis (các loại kết hợp)
- ✅ Multivariate analysis
- ✅ Target variable analysis
- ✅ EDA template hoàn chỉnh
Bài tiếp theo: Feature Engineering - Tạo features cho ML!
