Lý thuyết
Bài 5/17

Window Functions Deep Dive

ROW_NUMBER, RANK, LEAD/LAG, và Partitioning trong SQL

Window Functions Deep Dive

SQL Window Functions Analytics

1. Introduction to Window Functions

Window Functions là gì?

Window Functions thực hiện calculations across rows liên quan đến current row, mà không collapse kết quả thành 1 row như GROUP BY. Đây là một trong những features mạnh nhất của SQL.

1.1 Window vs Aggregate Functions

SQL
1-- Aggregate Function: Collapse to 1 row per group
2SELECT department, AVG(salary) as avg_salary
3FROM employees
4GROUP BY department;
5
6-- Window Function: Keep all rows, add calculation
7SELECT
8 employee_id,
9 name,
10 department,
11 salary,
12 AVG(salary) OVER (PARTITION BY department) as dept_avg_salary
13FROM employees;

1.2 Window Function Syntax

Text
1┌──────────────────────────────────────────────────────────┐
2│ Window Function Syntax │
3├──────────────────────────────────────────────────────────┤
4│ │
5│ function_name(column) OVER ( │
6│ [PARTITION BY partition_columns] │
7│ [ORDER BY order_columns [ASC|DESC]] │
8│ [frame_clause] │
9│ ) │
10│ │
11│ PARTITION BY: Define groups (like GROUP BY) │
12│ ORDER BY: Define row order within partition │
13│ frame_clause: Define which rows to include │
14│ │
15└──────────────────────────────────────────────────────────┘

2. Ranking Functions

2.1 ROW_NUMBER()

SQL
1-- Unique sequential number for each row
2SELECT
3 employee_id,
4 name,
5 department,
6 salary,
7 ROW_NUMBER() OVER (ORDER BY salary DESC) as salary_rank
8FROM employees;
9
10-- Row number within each department
11SELECT
12 employee_id,
13 name,
14 department,
15 salary,
16 ROW_NUMBER() OVER (
17 PARTITION BY department
18 ORDER BY salary DESC
19 ) as dept_rank
20FROM employees;
21
22-- Use Case: Get top N per group
23WITH ranked AS (
24 SELECT
25 *,
26 ROW_NUMBER() OVER (
27 PARTITION BY department
28 ORDER BY salary DESC
29 ) as rn
30 FROM employees
31)
32SELECT * FROM ranked WHERE rn <= 3; -- Top 3 per department

2.2 RANK() và DENSE_RANK()

SQL
1-- RANK: Ties get same rank, skip numbers
2-- DENSE_RANK: Ties get same rank, no gaps
3SELECT
4 name,
5 department,
6 salary,
7 ROW_NUMBER() OVER (ORDER BY salary DESC) as row_num,
8 RANK() OVER (ORDER BY salary DESC) as rank,
9 DENSE_RANK() OVER (ORDER BY salary DESC) as dense_rank
10FROM employees;
11
12-- Example output:
13-- | name | salary | row_num | rank | dense_rank |
14-- |---------|--------|---------|------|------------|
15-- | Alice | 100000 | 1 | 1 | 1 |
16-- | Bob | 100000 | 2 | 1 | 1 | <- tie
17-- | Charlie | 90000 | 3 | 3 | 2 | <- rank skips to 3
18-- | Diana | 80000 | 4 | 4 | 3 |

2.3 NTILE()

SQL
1-- Divide rows into N buckets
2SELECT
3 name,
4 salary,
5 NTILE(4) OVER (ORDER BY salary DESC) as salary_quartile
6FROM employees;
7
8-- Use Case: Percentile groups
9SELECT
10 name,
11 salary,
12 CASE NTILE(100) OVER (ORDER BY salary)
13 WHEN 100 THEN 'Top 1%'
14 WHEN 99 THEN 'Top 1-2%'
15 WHEN 98 THEN 'Top 2-3%'
16 ELSE 'Bottom 97%'
17 END as percentile_group
18FROM employees;

2.4 PERCENT_RANK() và CUME_DIST()

SQL
1-- PERCENT_RANK: Relative rank (0-1)
2-- Formula: (rank - 1) / (total_rows - 1)
3
4-- CUME_DIST: Cumulative distribution
5-- Formula: count of rows <= current / total_rows
6
7SELECT
8 name,
9 salary,
10 RANK() OVER (ORDER BY salary) as rank,
11 PERCENT_RANK() OVER (ORDER BY salary) as pct_rank,
12 CUME_DIST() OVER (ORDER BY salary) as cume_dist
13FROM employees;

3. Value Functions

3.1 LAG() và LEAD()

SQL
1-- LAG: Access previous row
2-- LEAD: Access next row
3
4SELECT
5 date,
6 sales,
7 LAG(sales, 1) OVER (ORDER BY date) as prev_day_sales,
8 LEAD(sales, 1) OVER (ORDER BY date) as next_day_sales,
9 sales - LAG(sales, 1) OVER (ORDER BY date) as daily_change
10FROM daily_sales;
11
12-- Multiple periods
13SELECT
14 date,
15 sales,
16 LAG(sales, 1) OVER (ORDER BY date) as prev_1,
17 LAG(sales, 7) OVER (ORDER BY date) as prev_7, -- Week ago
18 LAG(sales, 30) OVER (ORDER BY date) as prev_30 -- Month ago
19FROM daily_sales;
20
21-- With default value for NULL
22SELECT
23 date,
24 sales,
25 LAG(sales, 1, 0) OVER (ORDER BY date) as prev_sales -- Default 0
26FROM daily_sales;

3.2 FIRST_VALUE() và LAST_VALUE()

SQL
1-- FIRST_VALUE: First value in window
2-- LAST_VALUE: Last value in window
3
4SELECT
5 employee_id,
6 department,
7 salary,
8 FIRST_VALUE(salary) OVER (
9 PARTITION BY department
10 ORDER BY salary DESC
11 ) as highest_in_dept,
12 LAST_VALUE(salary) OVER (
13 PARTITION BY department
14 ORDER BY salary DESC
15 ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
16 ) as lowest_in_dept
17FROM employees;
18
19-- Note: LAST_VALUE requires frame specification to work correctly!

3.3 NTH_VALUE()

SQL
1-- NTH_VALUE: Get the Nth row's value
2SELECT
3 employee_id,
4 department,
5 salary,
6 NTH_VALUE(salary, 1) OVER (
7 PARTITION BY department ORDER BY salary DESC
8 ) as top_1_salary,
9 NTH_VALUE(salary, 2) OVER (
10 PARTITION BY department ORDER BY salary DESC
11 ) as top_2_salary,
12 NTH_VALUE(salary, 3) OVER (
13 PARTITION BY department ORDER BY salary DESC
14 ) as top_3_salary
15FROM employees;

4. Aggregate Window Functions

4.1 Running Totals

SQL
1-- Cumulative sum
2SELECT
3 date,
4 amount,
5 SUM(amount) OVER (ORDER BY date) as running_total
6FROM transactions;
7
8-- Running total per category
9SELECT
10 date,
11 category,
12 amount,
13 SUM(amount) OVER (
14 PARTITION BY category
15 ORDER BY date
16 ) as category_running_total
17FROM transactions;
18
19-- Year-to-date (reset each year)
20SELECT
21 date,
22 amount,
23 SUM(amount) OVER (
24 PARTITION BY YEAR(date)
25 ORDER BY date
26 ) as ytd_total
27FROM transactions;

4.2 Moving Averages

SQL
1-- 7-day moving average
2SELECT
3 date,
4 sales,
5 AVG(sales) OVER (
6 ORDER BY date
7 ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
8 ) as ma_7
9FROM daily_sales;
10
11-- 30-day moving average
12SELECT
13 date,
14 sales,
15 AVG(sales) OVER (
16 ORDER BY date
17 ROWS BETWEEN 29 PRECEDING AND CURRENT ROW
18 ) as ma_30
19FROM daily_sales;
20
21-- Centered moving average
22SELECT
23 date,
24 sales,
25 AVG(sales) OVER (
26 ORDER BY date
27 ROWS BETWEEN 3 PRECEDING AND 3 FOLLOWING
28 ) as centered_ma_7
29FROM daily_sales;

4.3 Running Statistics

SQL
1SELECT
2 date,
3 sales,
4 SUM(sales) OVER w as cumulative_sales,
5 AVG(sales) OVER w as cumulative_avg,
6 MIN(sales) OVER w as cumulative_min,
7 MAX(sales) OVER w as cumulative_max,
8 COUNT(*) OVER w as row_count
9FROM daily_sales
10WINDOW w AS (ORDER BY date);
11
12-- Standard deviation
13SELECT
14 date,
15 sales,
16 STDDEV(sales) OVER (
17 ORDER BY date
18 ROWS BETWEEN 29 PRECEDING AND CURRENT ROW
19 ) as rolling_stddev
20FROM daily_sales;

5. Frame Clauses

Frame Specification

Frame clause xác định rows nào được include trong calculation. Default là RANGE UNBOUNDED PRECEDING khi có ORDER BY, cần explicit frame cho control tốt hơn.

5.1 ROWS vs RANGE

SQL
1-- ROWS: Physical row count
2-- RANGE: Logical value range
3
4-- ROWS: Exactly 3 previous rows
5SELECT
6 date, sales,
7 SUM(sales) OVER (
8 ORDER BY date
9 ROWS BETWEEN 3 PRECEDING AND CURRENT ROW
10 ) as sum_rows
11FROM daily_sales;
12
13-- RANGE: All rows with values in range
14-- (handles ties differently)
15SELECT
16 date, sales,
17 SUM(sales) OVER (
18 ORDER BY date
19 RANGE BETWEEN INTERVAL '3' DAY PRECEDING AND CURRENT ROW
20 ) as sum_range
21FROM daily_sales;

5.2 Frame Boundaries

SQL
1-- Frame boundary options:
2-- UNBOUNDED PRECEDING: From partition start
3-- n PRECEDING: n rows before current
4-- CURRENT ROW: Current row
5-- n FOLLOWING: n rows after current
6-- UNBOUNDED FOLLOWING: To partition end
7
8-- All rows from start to current
9ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
10
11-- Current row and all following
12ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING
13
14-- 5 rows before to 5 rows after (11 rows total)
15ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING
16
17-- Entire partition
18ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING

5.3 Named Windows

SQL
1-- Define window once, use multiple times
2SELECT
3 date,
4 sales,
5 SUM(sales) OVER sales_window as running_total,
6 AVG(sales) OVER sales_window as running_avg,
7 COUNT(*) OVER sales_window as row_count
8FROM daily_sales
9WINDOW sales_window AS (
10 ORDER BY date
11 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
12);
13
14-- Multiple named windows
15SELECT
16 date,
17 category,
18 sales,
19 SUM(sales) OVER total_window as running_total,
20 SUM(sales) OVER category_window as category_total
21FROM daily_sales
22WINDOW
23 total_window AS (ORDER BY date),
24 category_window AS (PARTITION BY category ORDER BY date);

6. Practical Applications

6.1 Year-over-Year Analysis

SQL
1-- YoY Growth Rate
2WITH monthly_sales AS (
3 SELECT
4 DATE_TRUNC('month', date) as month,
5 SUM(amount) as sales
6 FROM transactions
7 GROUP BY DATE_TRUNC('month', date)
8)
9SELECT
10 month,
11 sales,
12 LAG(sales, 12) OVER (ORDER BY month) as prev_year_sales,
13 ROUND(
14 (sales - LAG(sales, 12) OVER (ORDER BY month))
15 / LAG(sales, 12) OVER (ORDER BY month) * 100,
16 2
17 ) as yoy_growth_pct
18FROM monthly_sales
19ORDER BY month;

6.2 Finding Gaps and Islands

SQL
1-- Identify consecutive sequences
2WITH numbered AS (
3 SELECT
4 date,
5 status,
6 ROW_NUMBER() OVER (ORDER BY date) -
7 ROW_NUMBER() OVER (PARTITION BY status ORDER BY date) as grp
8 FROM events
9)
10SELECT
11 status,
12 MIN(date) as start_date,
13 MAX(date) as end_date,
14 COUNT(*) as consecutive_days
15FROM numbered
16GROUP BY status, grp
17ORDER BY start_date;

6.3 Percentile Analysis

SQL
1-- Calculate percentiles
2SELECT
3 department,
4 name,
5 salary,
6 PERCENT_RANK() OVER (ORDER BY salary) as overall_percentile,
7 PERCENT_RANK() OVER (
8 PARTITION BY department ORDER BY salary
9 ) as dept_percentile,
10 CASE
11 WHEN PERCENT_RANK() OVER (ORDER BY salary) >= 0.9 THEN 'Top 10%'
12 WHEN PERCENT_RANK() OVER (ORDER BY salary) >= 0.75 THEN 'Top 25%'
13 WHEN PERCENT_RANK() OVER (ORDER BY salary) >= 0.5 THEN 'Top 50%'
14 ELSE 'Bottom 50%'
15 END as percentile_group
16FROM employees;

6.4 Session Analysis

SQL
1-- Identify user sessions (30 min gap = new session)
2WITH activity_with_gaps AS (
3 SELECT
4 user_id,
5 timestamp,
6 LAG(timestamp) OVER (PARTITION BY user_id ORDER BY timestamp) as prev_timestamp,
7 CASE
8 WHEN timestamp - LAG(timestamp) OVER (PARTITION BY user_id ORDER BY timestamp)
9 > INTERVAL '30 minutes'
10 THEN 1
11 ELSE 0
12 END as new_session
13 FROM user_activity
14),
15sessions AS (
16 SELECT
17 *,
18 SUM(new_session) OVER (
19 PARTITION BY user_id
20 ORDER BY timestamp
21 ) + 1 as session_id
22 FROM activity_with_gaps
23)
24SELECT
25 user_id,
26 session_id,
27 MIN(timestamp) as session_start,
28 MAX(timestamp) as session_end,
29 COUNT(*) as events_in_session
30FROM sessions
31GROUP BY user_id, session_id;

6.5 Running Difference Analysis

SQL
1-- Analyze changes between consecutive records
2SELECT
3 date,
4 metric_value,
5 LAG(metric_value) OVER (ORDER BY date) as prev_value,
6 metric_value - LAG(metric_value) OVER (ORDER BY date) as absolute_change,
7 ROUND(
8 (metric_value - LAG(metric_value) OVER (ORDER BY date))
9 / LAG(metric_value) OVER (ORDER BY date) * 100,
10 2
11 ) as pct_change,
12 CASE
13 WHEN metric_value > LAG(metric_value) OVER (ORDER BY date) THEN 'Increase'
14 WHEN metric_value < LAG(metric_value) OVER (ORDER BY date) THEN 'Decrease'
15 ELSE 'No Change'
16 END as trend
17FROM metrics;

7. Performance Considerations

Window Function Performance

Window functions có thể expensive cho large datasets. Optimize với proper indexing và partition sizing.

7.1 Indexing

SQL
1-- Create index for window function optimization
2CREATE INDEX idx_sales_date ON sales(date);
3CREATE INDEX idx_sales_category_date ON sales(category, date);
4
5-- Composite index for partition + order
6CREATE INDEX idx_transactions_user_date ON transactions(user_id, date);

7.2 Limiting Partitions

SQL
1-- Filter before window function
2WITH filtered AS (
3 SELECT *
4 FROM transactions
5 WHERE date >= DATE_SUB(CURRENT_DATE, INTERVAL 1 YEAR)
6)
7SELECT
8 *,
9 SUM(amount) OVER (ORDER BY date) as running_total
10FROM filtered;

7.3 Materializing Intermediate Results

SQL
1-- For complex calculations, use temporary table
2CREATE TEMPORARY TABLE ranked_sales AS
3SELECT
4 *,
5 ROW_NUMBER() OVER (PARTITION BY category ORDER BY sales DESC) as rank
6FROM sales;
7
8-- Then query the temp table
9SELECT * FROM ranked_sales WHERE rank <= 10;

8. Thực hành

Practice Exercises

Exercise: Sales Analytics with Window Functions

SQL
1-- Sample table structure:
2-- orders(order_id, customer_id, order_date, amount, category)
3
4-- Tasks:
5-- 1. Rank customers by total spending (overall and by category)
6-- 2. Calculate 7-day moving average of daily sales
7-- 3. Find YoY growth by month
8-- 4. Identify top 3 orders per customer
9-- 5. Calculate percentage of category total for each order
10
11-- YOUR QUERIES HERE
💡 Xem đáp án
SQL
1-- 1. Rank customers by spending
2WITH customer_spending AS (
3 SELECT
4 customer_id,
5 category,
6 SUM(amount) as total_spent
7 FROM orders
8 GROUP BY customer_id, category
9)
10SELECT
11 customer_id,
12 category,
13 total_spent,
14 RANK() OVER (ORDER BY total_spent DESC) as overall_rank,
15 RANK() OVER (PARTITION BY category ORDER BY total_spent DESC) as category_rank
16FROM customer_spending;
17
18-- 2. 7-day moving average
19WITH daily_sales AS (
20 SELECT
21 DATE(order_date) as date,
22 SUM(amount) as sales
23 FROM orders
24 GROUP BY DATE(order_date)
25)
26SELECT
27 date,
28 sales,
29 ROUND(AVG(sales) OVER (
30 ORDER BY date
31 ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
32 ), 2) as ma_7
33FROM daily_sales
34ORDER BY date;
35
36-- 3. YoY growth
37WITH monthly_sales AS (
38 SELECT
39 DATE_TRUNC('month', order_date) as month,
40 SUM(amount) as sales
41 FROM orders
42 GROUP BY DATE_TRUNC('month', order_date)
43)
44SELECT
45 month,
46 sales,
47 LAG(sales, 12) OVER (ORDER BY month) as prev_year,
48 ROUND(
49 (sales - LAG(sales, 12) OVER (ORDER BY month))
50 / NULLIF(LAG(sales, 12) OVER (ORDER BY month), 0) * 100,
51 2
52 ) as yoy_growth_pct
53FROM monthly_sales
54ORDER BY month;
55
56-- 4. Top 3 orders per customer
57WITH ranked_orders AS (
58 SELECT
59 order_id,
60 customer_id,
61 order_date,
62 amount,
63 ROW_NUMBER() OVER (
64 PARTITION BY customer_id
65 ORDER BY amount DESC
66 ) as order_rank
67 FROM orders
68)
69SELECT *
70FROM ranked_orders
71WHERE order_rank <= 3
72ORDER BY customer_id, order_rank;
73
74-- 5. Percentage of category total
75SELECT
76 order_id,
77 customer_id,
78 category,
79 amount,
80 SUM(amount) OVER (PARTITION BY category) as category_total,
81 ROUND(
82 amount / SUM(amount) OVER (PARTITION BY category) * 100,
83 2
84 ) as pct_of_category
85FROM orders
86ORDER BY category, amount DESC;

9. Tổng kết

Function TypeFunctionsUse Case
RankingROW_NUMBER, RANK, DENSE_RANK, NTILERanking, Top N, Percentiles
ValueLAG, LEAD, FIRST_VALUE, LAST_VALUETime comparison, trends
AggregateSUM, AVG, COUNT, MIN, MAXRunning totals, moving averages

Key Concepts:

  1. ✅ PARTITION BY = GROUP BY without collapsing
  2. ✅ ORDER BY = Define calculation order
  3. ✅ Frame clause = Which rows to include
  4. ✅ Named windows = Reusable definitions

Bài tiếp theo: Advanced CTEs - Recursive queries và complex data transformations