Lý thuyết
Bài 8/14

Seaborn - Statistical Visualization

Vẽ biểu đồ thống kê đẹp với Seaborn

Seaborn - Statistical Visualization

1. Giới thiệu Seaborn

Seaborn là thư viện visualization được xây dựng trên Matplotlib, cung cấp:

  • Biểu đồ đẹp hơn với ít code
  • Tích hợp tốt với Pandas DataFrame
  • Built-in themes và color palettes
  • Statistical plots chuyên dụng
Python
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Use the "whitegrid" seaborn theme for the figures drawn afterwards.
sns.set_theme(style="whitegrid")

# Load the "tips" example dataset and print its first rows.
tips = sns.load_dataset("tips")
print(tips.head())

2. Distribution Plots

2.1 Histogram

Python
# Basic histogram of the bill amounts, using 30 bins.
plt.figure(figsize=(10, 6))
sns.histplot(data=tips, x="total_bill", bins=30)
plt.title("Distribution of Total Bill")
plt.show()

# Histogram with a KDE (kernel density estimate) curve on top.
# Every variant gets its own figure and its own plt.show(); otherwise the
# remaining calls would draw over each other on one implicit axes.
plt.figure(figsize=(10, 6))
sns.histplot(data=tips, x="total_bill", kde=True)
plt.show()

# Histogram grouped by the "time" column (one colour per category).
plt.figure(figsize=(10, 6))
sns.histplot(data=tips, x="total_bill", hue="time", kde=True)
plt.show()

2.2 KDE Plot

Python
# KDE plot: smoothed density of the bill amount, one filled curve per "time".
fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(data=tips, x="total_bill", hue="time", fill=True, ax=ax)
ax.set_title("Density Plot by Time")
plt.show()

# Several semi-transparent densities in one call: one per "day".
day_kde_options = {"palette": "viridis", "fill": True, "alpha": 0.5}
sns.kdeplot(data=tips, x="total_bill", hue="day", **day_kde_options)

2.3 Box Plot

Python
# Box plot: distribution and outliers of the bill per day, split by "time".
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=tips, x="day", y="total_bill", hue="time", ax=ax)
ax.set_title("Total Bill by Day and Time")
plt.show()

# Horizontal boxes: bill on the x-axis, day on the y-axis.
sns.boxplot(data=tips, y="day", x="total_bill", orient="h")

2.4 Violin Plot

Python
# Violin plot: a box plot combined with a KDE. split=True is passed so the
# two "sex" groups share one violin per day.
fig, ax = plt.subplots(figsize=(10, 6))
sns.violinplot(
    data=tips,
    x="day",
    y="total_bill",
    hue="sex",
    split=True,
    ax=ax,
)
ax.set_title("Violin Plot")
plt.show()

3. Categorical Plots

3.1 Bar Plot

Python
# Bar plot of the bill per day and sex. errorbar="sd" requests the standard
# deviation as the error bar.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=tips,
    x="day",
    y="total_bill",
    hue="sex",
    errorbar="sd",
    ax=ax,
)
ax.set_title("Average Total Bill by Day")
plt.show()

# Custom estimator: aggregate each day with the median.
sns.barplot(data=tips, x="day", y="total_bill", estimator=np.median)

3.2 Count Plot

Python
# Count plot: number of rows per day, with one bar per "time" category.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=tips, x="day", hue="time", palette="Set2", ax=ax)
ax.set_title("Count by Day")
plt.show()

3.3 Strip Plot và Swarm Plot

Python
# Two categorical scatter plots side by side on one figure.
fig, (strip_ax, swarm_ax) = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))

# Left panel: strip plot with semi-transparent points.
sns.stripplot(data=tips, x="day", y="total_bill", alpha=0.7, ax=strip_ax)
strip_ax.set_title("Strip Plot")

# Right panel: swarm plot (points do not overlap), coloured by "sex".
sns.swarmplot(data=tips, x="day", y="total_bill", hue="sex", ax=swarm_ax)
swarm_ax.set_title("Swarm Plot")

fig.tight_layout()
plt.show()

4. Relational Plots

4.1 Scatter Plot

Python
# Scatter plot of tip against bill; colour, marker size and marker shape
# are mapped to the "time", "size" and "sex" columns respectively.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=tips,
    x="total_bill",
    y="tip",
    hue="time",
    size="size",
    style="sex",
    palette="viridis",
    sizes=(20, 200),
    ax=ax,
)
ax.set_title("Total Bill vs Tip")
plt.show()

4.2 Line Plot

Python
# Line plot of the "fmri" example dataset: signal over timepoint, coloured
# by region, with line style and markers varying by event.
fmri = sns.load_dataset("fmri")

fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(
    data=fmri,
    x="timepoint",
    y="signal",
    hue="region",
    style="event",
    markers=True,
    ax=ax,
)
ax.set_title("FMRI Signal Over Time")
plt.show()

4.3 Regression Plot

Python
# Regression plot: semi-transparent scatter points with a red fitted line.
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(
    data=tips,
    x="total_bill",
    y="tip",
    scatter_kws={"alpha": 0.5},
    line_kws={"color": "red"},
    ax=ax,
)
ax.set_title("Regression: Total Bill vs Tip")
plt.show()

# lmplot: the same regression faceted into one column per "time" value and
# coloured by "smoker".
sns.lmplot(data=tips, x="total_bill", y="tip", hue="smoker", col="time")
plt.show()

5. Matrix Plots

5.1 Heatmap

Python
# Correlation matrix computed from the numeric columns only.
numeric_cols = tips.select_dtypes(include=[np.number])
corr = numeric_cols.corr()

# Annotated correlation heatmap with the colour scale centred on 0.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    center=0,
    linewidths=0.5,
    square=True,
    ax=ax,
)
ax.set_title("Correlation Heatmap")
plt.show()

5.2 Clustermap

Python
# Clustered heatmap of the `corr` matrix computed for the heatmap example.
clustermap_options = {
    "annot": True,
    "cmap": "viridis",
    "figsize": (8, 8),
    "linewidths": 0.5,
}
sns.clustermap(corr, **clustermap_options)
plt.show()

6. Pair Plot và Joint Plot

6.1 Pair Plot

Python
# Pair plot: all pairwise relationships, coloured by "time", with KDE curves
# on the diagonal; corner=True is passed to draw only one triangle.
pairplot_options = {
    "hue": "time",
    "diag_kind": "kde",
    "palette": "husl",
    "corner": True,
}
sns.pairplot(tips, **pairplot_options)
plt.show()

# Restrict the grid to a chosen subset of variables.
sns.pairplot(tips, hue="time", vars=["total_bill", "tip", "size"])

6.2 Joint Plot

Python
# Joint plot: bivariate relationship plus the marginal distribution of each
# variable.
sns.jointplot(data=tips, x="total_bill", y="tip", kind="scatter")
plt.show()

# The same relationship drawn with different `kind` values.
# jointplot is a figure-level function that always builds its own figure, so
# no plt.subplots() grid is created beforehand: it would never be drawn on
# and would only appear as an extra empty figure.
for kind in ("scatter", "hex", "kde"):
    sns.jointplot(data=tips, x="total_bill", y="tip", kind=kind)

plt.show()

7. FacetGrid - Multi-Plot

Python
# FacetGrid: a grid of subplots, one histogram of total_bill per
# (smoker row, time column) combination.
grid = sns.FacetGrid(tips, row="smoker", col="time", height=4)
grid.map(sns.histplot, "total_bill")
grid.add_legend()
plt.show()

# catplot: categorical plot on a FacetGrid (box plots, one column per "time").
sns.catplot(
    data=tips,
    x="day",
    y="total_bill",
    hue="sex",
    col="time",
    kind="box",
    height=5,
)
plt.show()

# relplot: relational plot on a FacetGrid (scatter plots faceted by "smoker"
# rows and "time" columns).
sns.relplot(
    data=tips,
    x="total_bill",
    y="tip",
    hue="size",
    col="time",
    row="smoker",
    kind="scatter",
    height=4,
)
plt.show()

8. Styling và Customization

8.1 Themes

Python
# Built-in themes
themes = ["darkgrid", "whitegrid", "dark", "white", "ticks"]

fig = plt.figure(figsize=(20, 4))
for position, theme in enumerate(themes, start=1):
    # Axes style parameters take effect when an Axes is created, so each
    # subplot is created inside its own axes_style() context. Calling
    # sns.set_theme() on axes that already exist would not restyle them, and
    # it would also leave the global theme changed after the loop.
    with sns.axes_style(theme):
        ax = fig.add_subplot(1, len(themes), position)
        sns.histplot(tips["total_bill"], ax=ax)
        ax.set_title(theme)
fig.tight_layout()
plt.show()

8.2 Color Palettes

Python
# Names of built-in seaborn palettes to compare.
palettes = ["deep", "muted", "pastel", "bright", "dark", "colorblind"]

# Make each palette the default in turn and redraw the same bar plot.
for palette in palettes:
    sns.set_palette(palette)
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.barplot(data=tips, x="day", y="total_bill", ax=ax)
    ax.set_title(f"Palette: {palette}")
    plt.show()

# Custom palette given as a list of hex colours.
custom_palette = ["#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4"]
sns.set_palette(custom_palette)

# Ten-colour palettes built from named colormaps.
sns.color_palette("Blues", n_colors=10)
sns.color_palette("coolwarm", n_colors=10)

8.3 Context

Python
# Redraw the same line plot under each seaborn plotting context.
contexts = ["paper", "notebook", "talk", "poster"]

for context in contexts:
    sns.set_context(context)
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.lineplot(data=tips, x="size", y="total_bill", ax=ax)
    ax.set_title(f"Context: {context}")
    plt.show()

9. Combining với Matplotlib

Python
# Combine seaborn plots with Matplotlib customization on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
(hist_ax, box_ax), (reg_ax, heat_ax) = axes
title_style = {"fontsize": 14, "fontweight": "bold"}

# Top-left: histogram with KDE, plus a dashed vertical line at the mean.
sns.histplot(tips["total_bill"], kde=True, ax=hist_ax, color="steelblue")
hist_ax.set_title("Distribution of Total Bill", **title_style)
mean_bill = tips["total_bill"].mean()
hist_ax.axvline(mean_bill, color="red", linestyle="--", label="Mean")
hist_ax.legend()

# Top-right: box plot of the bill per day.
sns.boxplot(data=tips, x="day", y="total_bill", ax=box_ax, palette="Set3")
box_ax.set_title("Total Bill by Day", **title_style)

# Bottom-left: scatter plot with a fitted regression line.
sns.regplot(
    data=tips,
    x="total_bill",
    y="tip",
    ax=reg_ax,
    scatter_kws={"alpha": 0.6},
    color="coral",
)
reg_ax.set_title("Bill vs Tip (with regression)", **title_style)

# Bottom-right: correlation heatmap of the numeric columns.
numeric_cols = tips.select_dtypes(include=[np.number])
sns.heatmap(numeric_cols.corr(), annot=True, cmap="RdYlGn", ax=heat_ax)
heat_ax.set_title("Correlation Matrix", **title_style)

# Overall title, layout, then save to disk before showing the figure.
fig.suptitle("Tips Dataset Analysis", fontsize=16, fontweight="bold", y=1.02)
plt.tight_layout()
fig.savefig("tips_analysis.png", dpi=300, bbox_inches="tight")
plt.show()

10. Common Patterns

Python
# Quick EDA template
def quick_eda(df, target_col):
    """Draw a quick exploratory overview of ``df`` centred on ``target_col``.

    A 2x2 overview figure is produced with:

    1. a histogram (with KDE curve) of ``target_col``,
    2. a box plot of ``target_col``,
    3. a correlation heatmap of the numeric columns of ``df``,
    4. a text note saying whether a pair plot was drawn.

    When ``df`` has at most five numeric columns, a pair plot of those
    columns is drawn as an additional, separate figure.

    Args:
        df: pandas DataFrame to explore.
        target_col: Name of the column of ``df`` shown in panels 1 and 2.

    Returns:
        None. The figures are displayed with ``plt.show()``.
    """
    fig = plt.figure(figsize=(15, 10))

    # 1. Distribution of target
    ax1 = fig.add_subplot(2, 2, 1)
    sns.histplot(df[target_col], kde=True, ax=ax1)
    ax1.set_title(f"Distribution of {target_col}")

    # 2. Box plot
    ax2 = fig.add_subplot(2, 2, 2)
    sns.boxplot(y=df[target_col], ax=ax2)
    ax2.set_title(f"Box Plot of {target_col}")

    # 3. Correlation heatmap
    ax3 = fig.add_subplot(2, 2, 3)
    numeric_df = df.select_dtypes(include=[np.number])
    sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm", ax=ax3)
    ax3.set_title("Correlation Matrix")

    # 4. Note panel. pairplot always builds its own figure, so this panel
    # only carries a message; its frame and ticks are hidden so it never
    # shows up as an empty plot.
    ax4 = fig.add_subplot(2, 2, 4)
    ax4.axis("off")
    draw_pairplot = len(numeric_df.columns) <= 5
    if draw_pairplot:
        note = "See separate pair plot"
    else:
        note = "Pair plot skipped (more than 5 numeric columns)"
    ax4.text(0.5, 0.5, note, ha="center", va="center")

    # Lay out the overview figure explicitly and before the pair plot is
    # created: afterwards plt.tight_layout() would act on the pair-plot
    # figure (the new current figure) instead of this one.
    fig.tight_layout()

    if draw_pairplot:
        sns.pairplot(numeric_df)

    plt.show()


# Usage
quick_eda(tips, "total_bill")

Tổng Kết

Trong bài này, bạn đã học:

  • ✅ Distribution plots: hist, kde, box, violin
  • ✅ Categorical plots: bar, count, strip, swarm
  • ✅ Relational plots: scatter, line, regression
  • ✅ Matrix plots: heatmap, clustermap
  • ✅ Pair plot và Joint plot
  • ✅ FacetGrid cho multi-plots
  • ✅ Themes, palettes và customization

Bài tiếp theo: Plotly - Interactive Visualization!