Window Functions Deep Dive

SQL Window Functions Analytics

1. Introduction to Window Functions

Window Functions là gì?

Window Functions thực hiện calculations across rows liên quan đến current row, mà không collapse kết quả thành 1 row như GROUP BY. Đây là một trong những features mạnh nhất của SQL.

1.1 Window vs Aggregate Functions

SQL

1-- Aggregate Function: Collapse to 1 row per group
2SELECT department, AVG(salary) as avg_salary
3FROM employees
4GROUP BY department;
5
6-- Window Function: Keep all rows, add calculation
7SELECT 
8    employee_id,
9    name,
10    department,
11    salary,
12    AVG(salary) OVER (PARTITION BY department) as dept_avg_salary
13FROM employees;

1.2 Window Function Syntax

Text

1┌──────────────────────────────────────────────────────────┐
2│                Window Function Syntax                     │
3├──────────────────────────────────────────────────────────┤
4│                                                           │
5│   function_name(column) OVER (                           │
6│       [PARTITION BY partition_columns]                   │
7│       [ORDER BY order_columns [ASC|DESC]]                │
8│       [frame_clause]                                     │
9│   )                                                       │
10│                                                           │
11│   PARTITION BY: Define groups (like GROUP BY)            │
12│   ORDER BY: Define row order within partition            │
13│   frame_clause: Define which rows to include             │
14│                                                           │
15└──────────────────────────────────────────────────────────┘

2. Ranking Functions

2.1 ROW_NUMBER()

SQL

1-- Unique sequential number for each row
2SELECT 
3    employee_id,
4    name,
5    department,
6    salary,
7    ROW_NUMBER() OVER (ORDER BY salary DESC) as salary_rank
8FROM employees;
9
10-- Row number within each department
11SELECT 
12    employee_id,
13    name,
14    department,
15    salary,
16    ROW_NUMBER() OVER (
17        PARTITION BY department 
18        ORDER BY salary DESC
19    ) as dept_rank
20FROM employees;
21
22-- Use Case: Get top N per group
23WITH ranked AS (
24    SELECT 
25        *,
26        ROW_NUMBER() OVER (
27            PARTITION BY department 
28            ORDER BY salary DESC
29        ) as rn
30    FROM employees
31)
32SELECT * FROM ranked WHERE rn <= 3;  -- Top 3 per department

2.2 RANK() và DENSE_RANK()

SQL

1-- RANK: Ties get same rank, skip numbers
2-- DENSE_RANK: Ties get same rank, no gaps
3SELECT 
4    name,
5    department,
6    salary,
7    ROW_NUMBER() OVER (ORDER BY salary DESC) as row_num,
8    RANK() OVER (ORDER BY salary DESC) as rank,
9    DENSE_RANK() OVER (ORDER BY salary DESC) as dense_rank
10FROM employees;
11
12-- Example output:
13-- | name    | salary | row_num | rank | dense_rank |
14-- |---------|--------|---------|------|------------|
15-- | Alice   | 100000 |    1    |  1   |     1      |
16-- | Bob     | 100000 |    2    |  1   |     1      |  <- tie
17-- | Charlie |  90000 |    3    |  3   |     2      |  <- rank skips to 3
18-- | Diana   |  80000 |    4    |  4   |     3      |

2.3 NTILE()

SQL

1-- Divide rows into N buckets
2SELECT 
3    name,
4    salary,
5    NTILE(4) OVER (ORDER BY salary DESC) as salary_quartile
6FROM employees;
7
8-- Use Case: Percentile groups
9SELECT 
10    name,
11    salary,
12    CASE NTILE(100) OVER (ORDER BY salary)
13        WHEN 100 THEN 'Top 1%'
14        WHEN 99 THEN 'Top 1-2%'
15        WHEN 98 THEN 'Top 2-3%'
16        ELSE 'Bottom 97%'
17    END as percentile_group
18FROM employees;

2.4 PERCENT_RANK() và CUME_DIST()

SQL

1-- PERCENT_RANK: Relative rank (0-1)
2-- Formula: (rank - 1) / (total_rows - 1)
3
4-- CUME_DIST: Cumulative distribution
5-- Formula: count of rows <= current / total_rows
6
7SELECT 
8    name,
9    salary,
10    RANK() OVER (ORDER BY salary) as rank,
11    PERCENT_RANK() OVER (ORDER BY salary) as pct_rank,
12    CUME_DIST() OVER (ORDER BY salary) as cume_dist
13FROM employees;

3. Value Functions

3.1 LAG() và LEAD()

SQL

1-- LAG: Access previous row
2-- LEAD: Access next row
3
4SELECT 
5    date,
6    sales,
7    LAG(sales, 1) OVER (ORDER BY date) as prev_day_sales,
8    LEAD(sales, 1) OVER (ORDER BY date) as next_day_sales,
9    sales - LAG(sales, 1) OVER (ORDER BY date) as daily_change
10FROM daily_sales;
11
12-- Multiple periods
13SELECT 
14    date,
15    sales,
16    LAG(sales, 1) OVER (ORDER BY date) as prev_1,
17    LAG(sales, 7) OVER (ORDER BY date) as prev_7,  -- Week ago
18    LAG(sales, 30) OVER (ORDER BY date) as prev_30  -- Month ago
19FROM daily_sales;
20
21-- With default value for NULL
22SELECT 
23    date,
24    sales,
25    LAG(sales, 1, 0) OVER (ORDER BY date) as prev_sales  -- Default 0
26FROM daily_sales;

3.2 FIRST_VALUE() và LAST_VALUE()

SQL

1-- FIRST_VALUE: First value in window
2-- LAST_VALUE: Last value in window
3
4SELECT 
5    employee_id,
6    department,
7    salary,
8    FIRST_VALUE(salary) OVER (
9        PARTITION BY department 
10        ORDER BY salary DESC
11    ) as highest_in_dept,
12    LAST_VALUE(salary) OVER (
13        PARTITION BY department 
14        ORDER BY salary DESC
15        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
16    ) as lowest_in_dept
17FROM employees;
18
19-- Note: LAST_VALUE requires frame specification to work correctly!

3.3 NTH_VALUE()

SQL

1-- NTH_VALUE: Get the Nth row's value
2SELECT 
3    employee_id,
4    department,
5    salary,
6    NTH_VALUE(salary, 1) OVER (
7        PARTITION BY department ORDER BY salary DESC
8    ) as top_1_salary,
9    NTH_VALUE(salary, 2) OVER (
10        PARTITION BY department ORDER BY salary DESC
11    ) as top_2_salary,
12    NTH_VALUE(salary, 3) OVER (
13        PARTITION BY department ORDER BY salary DESC
14    ) as top_3_salary
15FROM employees;

4. Aggregate Window Functions

4.1 Running Totals

SQL

1-- Cumulative sum
2SELECT 
3    date,
4    amount,
5    SUM(amount) OVER (ORDER BY date) as running_total
6FROM transactions;
7
8-- Running total per category
9SELECT 
10    date,
11    category,
12    amount,
13    SUM(amount) OVER (
14        PARTITION BY category 
15        ORDER BY date
16    ) as category_running_total
17FROM transactions;
18
19-- Year-to-date (reset each year)
20SELECT 
21    date,
22    amount,
23    SUM(amount) OVER (
24        PARTITION BY YEAR(date) 
25        ORDER BY date
26    ) as ytd_total
27FROM transactions;

4.2 Moving Averages

SQL

1-- 7-day moving average
2SELECT 
3    date,
4    sales,
5    AVG(sales) OVER (
6        ORDER BY date 
7        ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
8    ) as ma_7
9FROM daily_sales;
10
11-- 30-day moving average
12SELECT 
13    date,
14    sales,
15    AVG(sales) OVER (
16        ORDER BY date 
17        ROWS BETWEEN 29 PRECEDING AND CURRENT ROW
18    ) as ma_30
19FROM daily_sales;
20
21-- Centered moving average
22SELECT 
23    date,
24    sales,
25    AVG(sales) OVER (
26        ORDER BY date 
27        ROWS BETWEEN 3 PRECEDING AND 3 FOLLOWING
28    ) as centered_ma_7
29FROM daily_sales;

4.3 Running Statistics

SQL

1SELECT 
2    date,
3    sales,
4    SUM(sales) OVER w as cumulative_sales,
5    AVG(sales) OVER w as cumulative_avg,
6    MIN(sales) OVER w as cumulative_min,
7    MAX(sales) OVER w as cumulative_max,
8    COUNT(*) OVER w as row_count
9FROM daily_sales
10WINDOW w AS (ORDER BY date);
11
12-- Standard deviation
13SELECT 
14    date,
15    sales,
16    STDDEV(sales) OVER (
17        ORDER BY date 
18        ROWS BETWEEN 29 PRECEDING AND CURRENT ROW
19    ) as rolling_stddev
20FROM daily_sales;

5. Frame Clauses

Frame Specification

Frame clause xác định rows nào được include trong calculation. Default là RANGE UNBOUNDED PRECEDING khi có ORDER BY, cần explicit frame cho control tốt hơn.

5.1 ROWS vs RANGE

SQL

1-- ROWS: Physical row count
2-- RANGE: Logical value range
3
4-- ROWS: Exactly 3 previous rows
5SELECT 
6    date, sales,
7    SUM(sales) OVER (
8        ORDER BY date 
9        ROWS BETWEEN 3 PRECEDING AND CURRENT ROW
10    ) as sum_rows
11FROM daily_sales;
12
13-- RANGE: All rows with values in range
14-- (handles ties differently)
15SELECT 
16    date, sales,
17    SUM(sales) OVER (
18        ORDER BY date 
19        RANGE BETWEEN INTERVAL '3' DAY PRECEDING AND CURRENT ROW
20    ) as sum_range
21FROM daily_sales;

5.2 Frame Boundaries

SQL

1-- Frame boundary options:
2-- UNBOUNDED PRECEDING: From partition start
3-- n PRECEDING: n rows before current
4-- CURRENT ROW: Current row
5-- n FOLLOWING: n rows after current  
6-- UNBOUNDED FOLLOWING: To partition end
7
8-- All rows from start to current
9ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
10
11-- Current row and all following
12ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING
13
14-- 5 rows before to 5 rows after (11 rows total)
15ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING
16
17-- Entire partition
18ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING

5.3 Named Windows

SQL

1-- Define window once, use multiple times
2SELECT 
3    date,
4    sales,
5    SUM(sales) OVER sales_window as running_total,
6    AVG(sales) OVER sales_window as running_avg,
7    COUNT(*) OVER sales_window as row_count
8FROM daily_sales
9WINDOW sales_window AS (
10    ORDER BY date 
11    ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
12);
13
14-- Multiple named windows
15SELECT 
16    date,
17    category,
18    sales,
19    SUM(sales) OVER total_window as running_total,
20    SUM(sales) OVER category_window as category_total
21FROM daily_sales
22WINDOW 
23    total_window AS (ORDER BY date),
24    category_window AS (PARTITION BY category ORDER BY date);

6. Practical Applications

6.1 Year-over-Year Analysis

SQL

1-- YoY Growth Rate
2WITH monthly_sales AS (
3    SELECT 
4        DATE_TRUNC('month', date) as month,
5        SUM(amount) as sales
6    FROM transactions
7    GROUP BY DATE_TRUNC('month', date)
8)
9SELECT 
10    month,
11    sales,
12    LAG(sales, 12) OVER (ORDER BY month) as prev_year_sales,
13    ROUND(
14        (sales - LAG(sales, 12) OVER (ORDER BY month)) 
15        / LAG(sales, 12) OVER (ORDER BY month) * 100, 
16        2
17    ) as yoy_growth_pct
18FROM monthly_sales
19ORDER BY month;

6.2 Finding Gaps and Islands

SQL

1-- Identify consecutive sequences
2WITH numbered AS (
3    SELECT 
4        date,
5        status,
6        ROW_NUMBER() OVER (ORDER BY date) -
7        ROW_NUMBER() OVER (PARTITION BY status ORDER BY date) as grp
8    FROM events
9)
10SELECT 
11    status,
12    MIN(date) as start_date,
13    MAX(date) as end_date,
14    COUNT(*) as consecutive_days
15FROM numbered
16GROUP BY status, grp
17ORDER BY start_date;

6.3 Percentile Analysis

SQL

1-- Calculate percentiles
2SELECT 
3    department,
4    name,
5    salary,
6    PERCENT_RANK() OVER (ORDER BY salary) as overall_percentile,
7    PERCENT_RANK() OVER (
8        PARTITION BY department ORDER BY salary
9    ) as dept_percentile,
10    CASE 
11        WHEN PERCENT_RANK() OVER (ORDER BY salary) >= 0.9 THEN 'Top 10%'
12        WHEN PERCENT_RANK() OVER (ORDER BY salary) >= 0.75 THEN 'Top 25%'
13        WHEN PERCENT_RANK() OVER (ORDER BY salary) >= 0.5 THEN 'Top 50%'
14        ELSE 'Bottom 50%'
15    END as percentile_group
16FROM employees;

6.4 Session Analysis

SQL

1-- Identify user sessions (30 min gap = new session)
2WITH activity_with_gaps AS (
3    SELECT 
4        user_id,
5        timestamp,
6        LAG(timestamp) OVER (PARTITION BY user_id ORDER BY timestamp) as prev_timestamp,
7        CASE 
8            WHEN timestamp - LAG(timestamp) OVER (PARTITION BY user_id ORDER BY timestamp) 
9                 > INTERVAL '30 minutes'
10            THEN 1 
11            ELSE 0 
12        END as new_session
13    FROM user_activity
14),
15sessions AS (
16    SELECT 
17        *,
18        SUM(new_session) OVER (
19            PARTITION BY user_id 
20            ORDER BY timestamp
21        ) + 1 as session_id
22    FROM activity_with_gaps
23)
24SELECT 
25    user_id,
26    session_id,
27    MIN(timestamp) as session_start,
28    MAX(timestamp) as session_end,
29    COUNT(*) as events_in_session
30FROM sessions
31GROUP BY user_id, session_id;

6.5 Running Difference Analysis

SQL

1-- Analyze changes between consecutive records
2SELECT 
3    date,
4    metric_value,
5    LAG(metric_value) OVER (ORDER BY date) as prev_value,
6    metric_value - LAG(metric_value) OVER (ORDER BY date) as absolute_change,
7    ROUND(
8        (metric_value - LAG(metric_value) OVER (ORDER BY date)) 
9        / LAG(metric_value) OVER (ORDER BY date) * 100,
10        2
11    ) as pct_change,
12    CASE 
13        WHEN metric_value > LAG(metric_value) OVER (ORDER BY date) THEN 'Increase'
14        WHEN metric_value < LAG(metric_value) OVER (ORDER BY date) THEN 'Decrease'
15        ELSE 'No Change'
16    END as trend
17FROM metrics;

7. Performance Considerations

Window Function Performance

Window functions có thể expensive cho large datasets. Optimize với proper indexing và partition sizing.

7.1 Indexing

SQL

1-- Create index for window function optimization
2CREATE INDEX idx_sales_date ON sales(date);
3CREATE INDEX idx_sales_category_date ON sales(category, date);
4
5-- Composite index for partition + order
6CREATE INDEX idx_transactions_user_date ON transactions(user_id, date);

7.2 Limiting Partitions

SQL

1-- Filter before window function
2WITH filtered AS (
3    SELECT * 
4    FROM transactions 
5    WHERE date >= DATE_SUB(CURRENT_DATE, INTERVAL 1 YEAR)
6)
7SELECT 
8    *,
9    SUM(amount) OVER (ORDER BY date) as running_total
10FROM filtered;

7.3 Materializing Intermediate Results

SQL

1-- For complex calculations, use temporary table
2CREATE TEMPORARY TABLE ranked_sales AS
3SELECT 
4    *,
5    ROW_NUMBER() OVER (PARTITION BY category ORDER BY sales DESC) as rank
6FROM sales;
7
8-- Then query the temp table
9SELECT * FROM ranked_sales WHERE rank <= 10;

8. Thực hành

Practice Exercises

Exercise: Sales Analytics with Window Functions

SQL

1-- Sample table structure:
2-- orders(order_id, customer_id, order_date, amount, category)
3
4-- Tasks:
5-- 1. Rank customers by total spending (overall and by category)
6-- 2. Calculate 7-day moving average of daily sales
7-- 3. Find YoY growth by month
8-- 4. Identify top 3 orders per customer
9-- 5. Calculate percentage of category total for each order
10
11-- YOUR QUERIES HERE

💡 Xem đáp án

SQL

1-- 1. Rank customers by spending
2WITH customer_spending AS (
3    SELECT 
4        customer_id,
5        category,
6        SUM(amount) as total_spent
7    FROM orders
8    GROUP BY customer_id, category
9)
10SELECT 
11    customer_id,
12    category,
13    total_spent,
14    RANK() OVER (ORDER BY total_spent DESC) as overall_rank,
15    RANK() OVER (PARTITION BY category ORDER BY total_spent DESC) as category_rank
16FROM customer_spending;
17
18-- 2. 7-day moving average
19WITH daily_sales AS (
20    SELECT 
21        DATE(order_date) as date,
22        SUM(amount) as sales
23    FROM orders
24    GROUP BY DATE(order_date)
25)
26SELECT 
27    date,
28    sales,
29    ROUND(AVG(sales) OVER (
30        ORDER BY date 
31        ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
32    ), 2) as ma_7
33FROM daily_sales
34ORDER BY date;
35
36-- 3. YoY growth
37WITH monthly_sales AS (
38    SELECT 
39        DATE_TRUNC('month', order_date) as month,
40        SUM(amount) as sales
41    FROM orders
42    GROUP BY DATE_TRUNC('month', order_date)
43)
44SELECT 
45    month,
46    sales,
47    LAG(sales, 12) OVER (ORDER BY month) as prev_year,
48    ROUND(
49        (sales - LAG(sales, 12) OVER (ORDER BY month)) 
50        / NULLIF(LAG(sales, 12) OVER (ORDER BY month), 0) * 100,
51        2
52    ) as yoy_growth_pct
53FROM monthly_sales
54ORDER BY month;
55
56-- 4. Top 3 orders per customer
57WITH ranked_orders AS (
58    SELECT 
59        order_id,
60        customer_id,
61        order_date,
62        amount,
63        ROW_NUMBER() OVER (
64            PARTITION BY customer_id 
65            ORDER BY amount DESC
66        ) as order_rank
67    FROM orders
68)
69SELECT * 
70FROM ranked_orders 
71WHERE order_rank <= 3
72ORDER BY customer_id, order_rank;
73
74-- 5. Percentage of category total
75SELECT 
76    order_id,
77    customer_id,
78    category,
79    amount,
80    SUM(amount) OVER (PARTITION BY category) as category_total,
81    ROUND(
82        amount / SUM(amount) OVER (PARTITION BY category) * 100,
83        2
84    ) as pct_of_category
85FROM orders
86ORDER BY category, amount DESC;

9. Tổng kết

Function Type	Functions	Use Case
Ranking	ROW_NUMBER, RANK, DENSE_RANK, NTILE	Ranking, Top N, Percentiles
Value	LAG, LEAD, FIRST_VALUE, LAST_VALUE	Time comparison, trends
Aggregate	SUM, AVG, COUNT, MIN, MAX	Running totals, moving averages

Key Concepts:

✅ PARTITION BY = GROUP BY without collapsing
✅ ORDER BY = Define calculation order
✅ Frame clause = Which rows to include
✅ Named windows = Reusable definitions

Bài tiếp theo: Advanced CTEs - Recursive queries và complex data transformations