Seaborn - Statistical Visualization
1. Giới thiệu Seaborn
Seaborn là thư viện visualization được xây dựng trên Matplotlib, cung cấp:
- Biểu đồ đẹp hơn với ít code
- Tích hợp tốt với Pandas DataFrame
- Built-in themes và color palettes
- Statistical plots chuyên dụng
Python
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Apply the "whitegrid" theme to the plots created from here on.
sns.set_theme(style="whitegrid")

# Load the "tips" sample dataset and preview its first rows.
tips = sns.load_dataset("tips")
print(tips.head())

# ---- 2. Distribution Plots ----
2.1 Histogram
Python
# Basic histogram
plt.figure(figsize=(10, 6))
sns.histplot(data=tips, x="total_bill", bins=30)
plt.title("Distribution of Total Bill")
plt.show()

# With a KDE (kernel density estimate) curve drawn over the bars.
# Each variant gets its own figure so the examples do not overdraw each other.
plt.figure(figsize=(10, 6))
sns.histplot(data=tips, x="total_bill", kde=True)
plt.show()

# Grouped by category: one histogram (and KDE curve) per value of "time"
plt.figure(figsize=(10, 6))
sns.histplot(data=tips, x="total_bill", hue="time", kde=True)
plt.show()

# ---- 2.2 KDE Plot ----
Python
# KDE plot - smooth estimate of the distribution, one curve per "time" value
plt.figure(figsize=(10, 6))
sns.kdeplot(data=tips, x="total_bill", hue="time", fill=True)
plt.title("Density Plot by Time")
plt.show()

# Multiple distributions: one semi-transparent curve per day.
# Drawn on a fresh figure so it does not overlap the plot above.
plt.figure(figsize=(10, 6))
sns.kdeplot(
    data=tips,
    x="total_bill",
    hue="day",
    palette="viridis",
    fill=True,
    alpha=0.5,
)
plt.show()

# ---- 2.3 Box Plot ----
Python
# Box plot - distribution summary and outliers
plt.figure(figsize=(10, 6))
sns.boxplot(data=tips, x="day", y="total_bill", hue="time")
plt.title("Total Bill by Day and Time")
plt.show()

# Horizontal orientation: numeric variable on x, category on y.
# Drawn on a fresh figure so it does not overlap the plot above.
plt.figure(figsize=(10, 6))
sns.boxplot(data=tips, x="total_bill", y="day", orient="h")
plt.show()

# ---- 2.4 Violin Plot ----
Python
# Violin plot - combines a box plot with a KDE.
# split=True draws the two "sex" levels as the two halves of each violin.
plt.figure(figsize=(10, 6))
sns.violinplot(data=tips, x="day", y="total_bill", hue="sex", split=True)
plt.title("Violin Plot")
plt.show()

# ---- 3. Categorical Plots ----
3.1 Bar Plot
Python
# Bar plot of the mean per group; errorbar="sd" draws standard-deviation
# error bars (not a confidence interval).
plt.figure(figsize=(10, 6))
sns.barplot(data=tips, x="day", y="total_bill", hue="sex", errorbar="sd")
plt.title("Average Total Bill by Day")
plt.show()

# Custom estimator: bar height is the median instead of the mean.
# Drawn on a fresh figure so it does not overlap the plot above.
plt.figure(figsize=(10, 6))
sns.barplot(data=tips, x="day", y="total_bill", estimator=np.median)
plt.show()

# ---- 3.2 Count Plot ----
Python
# Count plot - number of rows per category
plt.figure(figsize=(10, 6))
sns.countplot(data=tips, x="day", hue="time", palette="Set2")
plt.title("Count by Day")
plt.show()

# ---- 3.3 Strip Plot và Swarm Plot (Strip Plot and Swarm Plot) ----
Python
# Strip plot - scatter plot for a categorical axis
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

sns.stripplot(data=tips, x="day", y="total_bill", ax=axes[0], alpha=0.7)
axes[0].set_title("Strip Plot")

# Swarm plot - points are adjusted so they do not overlap
sns.swarmplot(data=tips, x="day", y="total_bill", ax=axes[1], hue="sex")
axes[1].set_title("Swarm Plot")

plt.tight_layout()
plt.show()

# ---- 4. Relational Plots ----
4.1 Scatter Plot
Python
# Scatter plot: colour encodes "time", marker size encodes "size",
# marker shape encodes "sex".
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=tips,
    x="total_bill",
    y="tip",
    hue="time",
    size="size",
    style="sex",
    palette="viridis",
    sizes=(20, 200),
)
plt.title("Total Bill vs Tip")
plt.show()

# ---- 4.2 Line Plot ----
Python
# Line plot with a confidence interval band
fmri = sns.load_dataset("fmri")

plt.figure(figsize=(10, 6))
sns.lineplot(
    data=fmri,
    x="timepoint",
    y="signal",
    hue="region",
    style="event",
    markers=True,
)
plt.title("FMRI Signal Over Time")
plt.show()

# ---- 4.3 Regression Plot ----
Python
# Regression plot with a confidence interval band
plt.figure(figsize=(10, 6))
sns.regplot(
    data=tips,
    x="total_bill",
    y="tip",
    scatter_kws={"alpha": 0.5},
    line_kws={"color": "red"},
)
plt.title("Regression: Total Bill vs Tip")
plt.show()

# lmplot - regression with facets (one column of plots per "time" value)
sns.lmplot(data=tips, x="total_bill", y="tip", hue="smoker", col="time")
plt.show()

# ---- 5. Matrix Plots ----
5.1 Heatmap
Python
# Correlation heatmap
plt.figure(figsize=(10, 8))

# Compute the correlation matrix from the numeric columns only
numeric_cols = tips.select_dtypes(include=[np.number])
corr = numeric_cols.corr()

sns.heatmap(
    corr,
    annot=True,
    cmap="coolwarm",
    center=0,
    fmt=".2f",
    linewidths=0.5,
    square=True,
)
plt.title("Correlation Heatmap")
plt.show()

# ---- 5.2 Clustermap ----
Python
# Clustered heatmap (reuses the `corr` matrix computed in the heatmap example)
sns.clustermap(
    corr,
    annot=True,
    cmap="viridis",
    figsize=(8, 8),
    linewidths=0.5,
)
plt.show()

# ---- 6. Pair Plot và Joint Plot (Pair Plot and Joint Plot) ----
6.1 Pair Plot
Python
# Pair plot - all pairwise relationships; corner=True keeps only the
# lower triangle of the grid.
sns.pairplot(
    tips,
    hue="time",
    diag_kind="kde",
    palette="husl",
    corner=True,
)
plt.show()

# Restrict the grid to specific variables
sns.pairplot(tips, vars=["total_bill", "tip", "size"], hue="time")
plt.show()

# ---- 6.2 Joint Plot ----
Python
# Joint plot - bivariate relationship plus the marginal distributions
sns.jointplot(data=tips, x="total_bill", y="tip", kind="scatter")
plt.show()

# Different kinds: scatter, hexbin and KDE.
# jointplot() is a figure-level function that builds its own figure on every
# call, so no plt.subplots() grid is created here - it would only add an
# empty, unused figure.
for kind in ("scatter", "hex", "kde"):
    sns.jointplot(data=tips, x="total_bill", y="tip", kind=kind)

plt.show()

# ---- 7. FacetGrid - Multi-Plot ----
Python
# FacetGrid - a grid of subplots, one per (smoker, time) combination
g = sns.FacetGrid(tips, col="time", row="smoker", height=4)
g.map(sns.histplot, "total_bill")
g.add_legend()
plt.show()

# catplot - categorical plots on a FacetGrid
sns.catplot(
    data=tips,
    x="day",
    y="total_bill",
    hue="sex",
    col="time",
    kind="box",
    height=5,
)
plt.show()

# relplot - relational plots on a FacetGrid
sns.relplot(
    data=tips,
    x="total_bill",
    y="tip",
    hue="size",
    col="time",
    row="smoker",
    kind="scatter",
    height=4,
)
plt.show()

# ---- 8. Styling và Customization (Styling and Customization) ----
8.1 Themes
Python
# Built-in themes
themes = ["darkgrid", "whitegrid", "dark", "white", "ticks"]

# An Axes picks up its style when it is created, so each subplot is created
# (and drawn) inside its own axes_style() context. Switching the theme after
# plt.subplots() has already built the axes would leave every panel looking
# the same, and would also change the global theme as a side effect.
fig = plt.figure(figsize=(20, 4))
for position, theme in enumerate(themes, start=1):
    with sns.axes_style(theme):
        ax = fig.add_subplot(1, len(themes), position)
        sns.histplot(tips["total_bill"], ax=ax)
        ax.set_title(theme)
fig.tight_layout()
plt.show()

# ---- 8.2 Color Palettes ----
Python
# Built-in palettes
palettes = ["deep", "muted", "pastel", "bright", "dark", "colorblind"]

for palette in palettes:
    sns.set_palette(palette)
    plt.figure(figsize=(8, 4))
    sns.barplot(data=tips, x="day", y="total_bill")
    plt.title(f"Palette: {palette}")
    plt.show()

# Custom palette built from explicit hex colours
custom_palette = ["#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4"]
sns.set_palette(custom_palette)

# Sequential / diverging palettes. color_palette() returns the colours rather
# than changing any setting, so the results are kept in variables for reuse.
blues_palette = sns.color_palette("Blues", n_colors=10)
coolwarm_palette = sns.color_palette("coolwarm", n_colors=10)

# ---- 8.3 Context ----
Python
# Contexts: paper, notebook, talk, poster
contexts = ["paper", "notebook", "talk", "poster"]

# NOTE: set_context() changes the global setting, so the last context in the
# list stays active for any plot created after this loop.
for context in contexts:
    sns.set_context(context)
    plt.figure(figsize=(8, 4))
    sns.lineplot(data=tips, x="size", y="total_bill")
    plt.title(f"Context: {context}")
    plt.show()

# ---- 9. Combining với Matplotlib (Combining with Matplotlib) ----
Python
# Combine Seaborn plots with Matplotlib customization on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Shared styling for the four panel titles
panel_title_style = {"fontsize": 14, "fontweight": "bold"}

# Plot 1: histogram with a dashed vertical line at the mean
hist_ax = axes[0, 0]
sns.histplot(tips["total_bill"], kde=True, ax=hist_ax, color="steelblue")
hist_ax.set_title("Distribution of Total Bill", **panel_title_style)
hist_ax.axvline(
    tips["total_bill"].mean(), color="red", linestyle="--", label="Mean"
)
hist_ax.legend()

# Plot 2: box plot
box_ax = axes[0, 1]
sns.boxplot(data=tips, x="day", y="total_bill", ax=box_ax, palette="Set3")
box_ax.set_title("Total Bill by Day", **panel_title_style)

# Plot 3: scatter with regression line
reg_ax = axes[1, 0]
sns.regplot(
    data=tips,
    x="total_bill",
    y="tip",
    ax=reg_ax,
    scatter_kws={"alpha": 0.6},
    color="coral",
)
reg_ax.set_title("Bill vs Tip (with regression)", **panel_title_style)

# Plot 4: correlation heatmap of the numeric columns
heat_ax = axes[1, 1]
numeric_cols = tips.select_dtypes(include=[np.number])
sns.heatmap(numeric_cols.corr(), annot=True, cmap="RdYlGn", ax=heat_ax)
heat_ax.set_title("Correlation Matrix", **panel_title_style)

# Figure-level title placed slightly above the grid (y=1.02);
# bbox_inches="tight" keeps it inside the saved image.
fig.suptitle("Tips Dataset Analysis", fontsize=16, fontweight="bold", y=1.02)
fig.tight_layout()
fig.savefig("tips_analysis.png", dpi=300, bbox_inches="tight")
plt.show()

# ---- 10. Common Patterns ----
Python
# Quick EDA template
def quick_eda(df, target_col, max_pairplot_cols=5):
    """Draw a quick exploratory overview of ``df`` centred on ``target_col``.

    A 2x2 overview figure is built with: a histogram (with KDE) of the target
    column, a box plot of the target column, a correlation heatmap of the
    numeric columns, and a text panel saying whether a pair plot was drawn.
    When the frame has between 1 and ``max_pairplot_cols`` numeric columns, a
    pair plot of those columns is drawn as an additional, separate figure.

    Args:
        df: DataFrame to explore.
        target_col: Name of the column whose distribution is plotted.
        max_pairplot_cols: Largest number of numeric columns for which the
            separate pair plot is still drawn. Defaults to 5.

    Returns:
        None. The figures are displayed with ``plt.show()``.
    """
    fig = plt.figure(figsize=(15, 10))

    # 1. Distribution of the target
    ax1 = fig.add_subplot(2, 2, 1)
    sns.histplot(df[target_col], kde=True, ax=ax1)
    ax1.set_title(f"Distribution of {target_col}")

    # 2. Box plot of the target
    ax2 = fig.add_subplot(2, 2, 2)
    sns.boxplot(y=df[target_col], ax=ax2)
    ax2.set_title(f"Box Plot of {target_col}")

    # 3. Correlation heatmap (only meaningful when numeric columns exist)
    ax3 = fig.add_subplot(2, 2, 3)
    numeric_df = df.select_dtypes(include=[np.number])
    n_numeric = len(numeric_df.columns)
    if n_numeric > 0:
        sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm", ax=ax3)
    else:
        ax3.text(0.5, 0.5, "No numeric columns", ha="center", va="center")
    ax3.set_title("Correlation Matrix")

    # 4. Text-only panel: hide the axes frame and say what happened to the
    #    pair plot, which cannot be embedded in a subplot.
    ax4 = fig.add_subplot(2, 2, 4)
    ax4.axis("off")
    draw_pairplot = 0 < n_numeric <= max_pairplot_cols
    note = "See separate pair plot" if draw_pairplot else "Pair plot skipped"
    ax4.text(0.5, 0.5, note, ha="center", va="center")

    # Lay out the overview figure explicitly and BEFORE calling pairplot():
    # pairplot() creates a new figure, after which plt.tight_layout() would
    # act on the pair-plot figure instead of this one.
    fig.tight_layout()

    if draw_pairplot:
        sns.pairplot(numeric_df)

    plt.show()


# Usage
quick_eda(tips, "total_bill")

# ---- Tổng Kết (Summary) ----
Trong bài này, bạn đã học:
- ✅ Distribution plots: hist, kde, box, violin
- ✅ Categorical plots: bar, count, strip, swarm
- ✅ Relational plots: scatter, line, regression
- ✅ Matrix plots: heatmap, clustermap
- ✅ Pair plot và Joint plot
- ✅ FacetGrid cho multi-plots
- ✅ Themes, palettes và customization
Bài tiếp theo: Plotly - Interactive Visualization!
