Polars - DataFrame Siêu Nhanh

1. Tại sao Polars?

Polars là thư viện DataFrame mới, được viết bằng Rust, có thể nhanh hơn Pandas 10-100 lần tùy theo tác vụ và kích thước dữ liệu!

Tiêu chí	Pandas	Polars
Ngôn ngữ	Python/C	Rust
Tốc độ	Chuẩn	10-100x nhanh hơn
Memory	Cao	Thấp hơn nhiều
Parallel	Không	Tự động multi-core
Lazy eval	Không	Có
API	Dễ dùng	Dễ dùng

Python

1import polars as pl
2print(pl.__version__)  # 0.x.x

2. Tạo DataFrame

2.1 Từ Dictionary

Python

1import polars as pl
2
3# Từ dict
4df = pl.DataFrame({
5    "name": ["Alice", "Bob", "Charlie"],
6    "age": [25, 30, 35],
7    "salary": [50000, 60000, 70000]
8})
9
10print(df)
11# shape: (3, 3)
12# ┌─────────┬─────┬────────┐
13# │ name    ┆ age ┆ salary │
14# │ ---     ┆ --- ┆ ---    │
15# │ str     ┆ i64 ┆ i64    │
16# ╞═════════╪═════╪════════╡
17# │ Alice   ┆ 25  ┆ 50000  │
18# │ Bob     ┆ 30  ┆ 60000  │
19# │ Charlie ┆ 35  ┆ 70000  │
20# └─────────┴─────┴────────┘

2.2 Đọc File

Python

1# CSV
2df = pl.read_csv("data.csv")
3
4# Parquet (tối ưu nhất cho Polars)
5df = pl.read_parquet("data.parquet")
6
7# JSON
8df = pl.read_json("data.json")
9
10# Excel
11df = pl.read_excel("data.xlsx")
12
13# Lazy loading - Không load hết vào RAM
14df_lazy = pl.scan_csv("large_file.csv")
15df_lazy = pl.scan_parquet("large_file.parquet")

3. Selection với Expressions

3.1 Select Columns

Python

1# Select columns
2df.select("name", "age")
3df.select(pl.col("name"), pl.col("age"))
4
5# Select tất cả
6df.select(pl.all())
7
8# Select với pattern
9df.select(pl.col("^salary.*$"))  # Regex
10df.select(pl.col("^.*_id$"))   # Regex: các cột kết thúc bằng "_id" (pl.col chỉ hỗ trợ "*" và regex dạng ^...$)

3.2 Filter Rows

Python

1# Filter với expressions
2df.filter(pl.col("age") > 30)
3
4# Multiple conditions
5df.filter(
6    (pl.col("age") > 25) & (pl.col("salary") > 55000)
7)
8
9# isin
10df.filter(pl.col("name").is_in(["Alice", "Bob"]))
11
12# String contains
13df.filter(pl.col("name").str.contains("A"))

3.3 Expressions - Core của Polars

Python

1# Expressions rất powerful!
2df.select(
3    pl.col("name"),
4    pl.col("age"),
5    pl.col("salary"),
6    
7    # Tính toán
8    (pl.col("salary") / 12).alias("monthly_salary"),
9    
10    # Conditions
11    pl.when(pl.col("age") > 30)
12      .then(pl.lit("Senior"))
13      .otherwise(pl.lit("Junior"))
14      .alias("level"),
15    
16    # String operations
17    pl.col("name").str.to_uppercase().alias("NAME_UPPER"),
18    
19    # Aggregations (broadcast)
20    pl.col("salary").mean().alias("avg_salary"),
21)

4. Transformations

4.1 With Columns - Thêm/Sửa cột

Python

1df = df.with_columns(
2    # Cột mới từ tính toán
3    (pl.col("salary") * 1.1).alias("new_salary"),
4    
5    # Age group
6    pl.when(pl.col("age") < 30)
7      .then(pl.lit("Young"))
8      .when(pl.col("age") < 40)
9      .then(pl.lit("Middle"))
10      .otherwise(pl.lit("Senior"))
11      .alias("age_group"),
12    
13    # String processing
14    pl.col("name").str.len_chars().alias("name_length"),
15)

4.2 Sort

Python

1# Sort ascending
2df.sort("age")
3
4# Sort descending
5df.sort("age", descending=True)
6
7# Sort by multiple columns
8df.sort(["city", "age"], descending=[False, True])

4.3 Rename và Drop

Python

1# Rename
2df.rename({"old_name": "new_name"})
3
4# Drop columns
5df.drop("column_to_drop")
6df.drop(["col1", "col2"])

5. Group By và Aggregations

5.1 Basic Group By

Python

1df.group_by("city").agg(
2    pl.col("salary").sum().alias("total_salary"),
3    pl.col("salary").mean().alias("avg_salary"),
4    pl.col("age").max().alias("max_age"),
5    pl.len().alias("count")
6)

5.2 Nhiều Aggregations

Python

1df.group_by("city", "department").agg(
2    # Sum
3    pl.col("salary").sum().alias("total_salary"),
4    
5    # Mean
6    pl.col("age").mean().alias("avg_age"),
7    
8    # Count unique
9    pl.col("employee_id").n_unique().alias("unique_employees"),
10    
11    # First/Last
12    pl.col("hire_date").first().alias("first_hire"),
13    pl.col("hire_date").last().alias("last_hire"),
14    
15    # List aggregation
16    pl.col("name").alias("employees"),  # List of names
17    
18    # Quantiles
19    pl.col("salary").quantile(0.5).alias("median_salary"),
20)

5.3 Window Functions

Python

1df.with_columns(
2    # Running sum
3    pl.col("sales").cum_sum().over("city").alias("cumulative_sales"),
4    
5    # Rank
6    pl.col("salary").rank().over("department").alias("salary_rank"),
7    
8    # Percent of group
9    (pl.col("sales") / pl.col("sales").sum().over("city") * 100)
10        .alias("pct_of_city"),
11    
12    # Lead/Lag
13    pl.col("date").shift(1).over("customer_id").alias("prev_date"),
14    pl.col("date").shift(-1).over("customer_id").alias("next_date"),
15)

6. Lazy Evaluation

Lazy mode là sức mạnh của Polars - tối ưu query trước khi thực thi!

6.1 Lazy Frame

Python

1# Tạo LazyFrame
2df_lazy = pl.scan_csv("large_file.csv")
3
4# Hoặc từ DataFrame
5df_lazy = df.lazy()
6
7# Chain operations (chưa thực thi!)
8result_lazy = (
9    df_lazy
10    .filter(pl.col("age") > 25)
11    .group_by("city")
12    .agg(pl.col("salary").mean())
13    .sort("salary", descending=True)
14)
15
16# Thực thi và collect kết quả
17result = result_lazy.collect()
18
19# Xem query plan
20print(result_lazy.explain())

6.2 Streaming cho Big Data

Python

1# Streaming mode - xử lý file lớn hơn RAM
2result = (
3    pl.scan_csv("huge_file.csv")
4    .filter(pl.col("value") > 100)
5    .group_by("category")
6    .agg(pl.col("value").sum())
7    .collect(streaming=True)  # Streaming execution (các phiên bản Polars mới: dùng .collect(engine="streaming"))
8)

7. Joins

Python

1# Sample DataFrames
2orders = pl.DataFrame({
3    "order_id": [1, 2, 3],
4    "customer_id": [101, 102, 101],
5    "amount": [100, 200, 150]
6})
7
8customers = pl.DataFrame({
9    "customer_id": [101, 102, 103],
10    "name": ["Alice", "Bob", "Charlie"]
11})
12
13# Inner join
14orders.join(customers, on="customer_id", how="inner")
15
16# Left join
17orders.join(customers, on="customer_id", how="left")
18
19# Full outer join (các phiên bản Polars mới đổi tên thành how="full"; "outer" đã bị deprecated)
20orders.join(customers, on="customer_id", how="outer")
21
22# Join on different columns (giả sử bảng orders đặt tên cột khóa là "cust_id" thay vì "customer_id")
23orders.join(
24    customers, 
25    left_on="cust_id", 
26    right_on="customer_id"
27)

8. String Operations

Python

1df.with_columns(
2    # Upper/Lower
3    pl.col("name").str.to_uppercase().alias("upper"),
4    pl.col("name").str.to_lowercase().alias("lower"),
5    
6    # Contains
7    pl.col("name").str.contains("A").alias("has_A"),
8    
9    # Replace
10    pl.col("text").str.replace("old", "new"),
11    pl.col("text").str.replace_all(r"\d+", "X"),  # Regex
12    
13    # Split
14    pl.col("full_name").str.split(" ").alias("name_parts"),
15    
16    # Extract
17    pl.col("email").str.extract(r"@(\w+)\.").alias("domain"),
18    
19    # Length
20    pl.col("name").str.len_chars().alias("char_count"),
21)

9. Date/Time Operations

Python

1df = pl.DataFrame({
2    "date": ["2024-01-15", "2024-02-20", "2024-03-25"],
3}).with_columns(
4    pl.col("date").str.to_datetime().alias("datetime")
5)
6
7df.with_columns(
8    # Extract components
9    pl.col("datetime").dt.year().alias("year"),
10    pl.col("datetime").dt.month().alias("month"),
11    pl.col("datetime").dt.day().alias("day"),
12    pl.col("datetime").dt.weekday().alias("weekday"),
13    
14    # Date arithmetic
15    (pl.col("datetime") + pl.duration(days=7)).alias("plus_week"),
16    
17    # Date diff
18    (pl.col("datetime") - pl.col("datetime").shift(1))
19        .dt.total_days()
20        .alias("days_between"),
21)

10. Pandas ↔ Polars Conversion

Python

1import pandas as pd
2import polars as pl
3
4# Pandas → Polars
5pandas_df = pd.DataFrame({"a": [1, 2, 3]})
6polars_df = pl.from_pandas(pandas_df)
7
8# Polars → Pandas
9polars_df = pl.DataFrame({"a": [1, 2, 3]})
10pandas_df = polars_df.to_pandas()

11. So sánh Pandas vs Polars

Python

1# PANDAS
2import pandas as pd
3
4df_pd = pd.read_csv("data.csv")
5result_pd = (
6    df_pd[df_pd['age'] > 25]
7    .groupby('city')['salary']
8    .mean()
9    .reset_index()
10    .sort_values('salary', ascending=False)
11)
12
13# POLARS (dễ đọc hơn, nhanh hơn!)
14import polars as pl
15
16result_pl = (
17    pl.scan_csv("data.csv")  # Lazy
18    .filter(pl.col("age") > 25)
19    .group_by("city")
20    .agg(pl.col("salary").mean())
21    .sort("salary", descending=True)
22    .collect()  # Execute
23)

Tổng Kết

Trong bài này, bạn đã học:

✅ Polars là gì và tại sao nhanh hơn Pandas
✅ Expressions - core concept của Polars
✅ Selection, Filter, Transform
✅ Group By và Window Functions
✅ Lazy Evaluation và Streaming
✅ String và DateTime operations

Bài tiếp theo: Data Cleaning & Preprocessing - Làm sạch dữ liệu!