Window Functions Deep Dive
1. Introduction to Window Functions
Window Functions là gì?
Window Functions thực hiện calculations across rows liên quan đến current row, mà không collapse kết quả thành 1 row như GROUP BY. Đây là một trong những features mạnh nhất của SQL.
1.1 Window vs Aggregate Functions
SQL
1-- Aggregate Function: Collapse to 1 row per group2SELECT department, AVG(salary) as avg_salary3FROM employees4GROUP BY department;56-- Window Function: Keep all rows, add calculation7SELECT 8 employee_id,9 name,10 department,11 salary,12 AVG(salary) OVER (PARTITION BY department) as dept_avg_salary13FROM employees;1.2 Window Function Syntax
Text
1┌──────────────────────────────────────────────────────────┐2│ Window Function Syntax │3├──────────────────────────────────────────────────────────┤4│ │5│ function_name(column) OVER ( │6│ [PARTITION BY partition_columns] │7│ [ORDER BY order_columns [ASC|DESC]] │8│ [frame_clause] │9│ ) │10│ │11│ PARTITION BY: Define groups (like GROUP BY) │12│ ORDER BY: Define row order within partition │13│ frame_clause: Define which rows to include │14│ │15└──────────────────────────────────────────────────────────┘2. Ranking Functions
2.1 ROW_NUMBER()
SQL
1-- Unique sequential number for each row2SELECT 3 employee_id,4 name,5 department,6 salary,7 ROW_NUMBER() OVER (ORDER BY salary DESC) as salary_rank8FROM employees;910-- Row number within each department11SELECT 12 employee_id,13 name,14 department,15 salary,16 ROW_NUMBER() OVER (17 PARTITION BY department 18 ORDER BY salary DESC19 ) as dept_rank20FROM employees;2122-- Use Case: Get top N per group23WITH ranked AS (24 SELECT 25 *,26 ROW_NUMBER() OVER (27 PARTITION BY department 28 ORDER BY salary DESC29 ) as rn30 FROM employees31)32SELECT * FROM ranked WHERE rn <= 3; -- Top 3 per department2.2 RANK() và DENSE_RANK()
SQL
1-- RANK: Ties get same rank, skip numbers2-- DENSE_RANK: Ties get same rank, no gaps3SELECT 4 name,5 department,6 salary,7 ROW_NUMBER() OVER (ORDER BY salary DESC) as row_num,8 RANK() OVER (ORDER BY salary DESC) as rank,9 DENSE_RANK() OVER (ORDER BY salary DESC) as dense_rank10FROM employees;1112-- Example output:13-- | name | salary | row_num | rank | dense_rank |14-- |---------|--------|---------|------|------------|15-- | Alice | 100000 | 1 | 1 | 1 |16-- | Bob | 100000 | 2 | 1 | 1 | <- tie17-- | Charlie | 90000 | 3 | 3 | 2 | <- rank skips to 318-- | Diana | 80000 | 4 | 4 | 3 |2.3 NTILE()
SQL
1-- Divide rows into N buckets2SELECT 3 name,4 salary,5 NTILE(4) OVER (ORDER BY salary DESC) as salary_quartile6FROM employees;78-- Use Case: Percentile groups9SELECT 10 name,11 salary,12 CASE NTILE(100) OVER (ORDER BY salary)13 WHEN 100 THEN 'Top 1%'14 WHEN 99 THEN 'Top 1-2%'15 WHEN 98 THEN 'Top 2-3%'16 ELSE 'Bottom 97%'17 END as percentile_group18FROM employees;2.4 PERCENT_RANK() và CUME_DIST()
SQL
1-- PERCENT_RANK: Relative rank (0-1)2-- Formula: (rank - 1) / (total_rows - 1)34-- CUME_DIST: Cumulative distribution5-- Formula: count of rows <= current / total_rows67SELECT 8 name,9 salary,10 RANK() OVER (ORDER BY salary) as rank,11 PERCENT_RANK() OVER (ORDER BY salary) as pct_rank,12 CUME_DIST() OVER (ORDER BY salary) as cume_dist13FROM employees;3. Value Functions
3.1 LAG() và LEAD()
SQL
1-- LAG: Access previous row2-- LEAD: Access next row34SELECT 5 date,6 sales,7 LAG(sales, 1) OVER (ORDER BY date) as prev_day_sales,8 LEAD(sales, 1) OVER (ORDER BY date) as next_day_sales,9 sales - LAG(sales, 1) OVER (ORDER BY date) as daily_change10FROM daily_sales;1112-- Multiple periods13SELECT 14 date,15 sales,16 LAG(sales, 1) OVER (ORDER BY date) as prev_1,17 LAG(sales, 7) OVER (ORDER BY date) as prev_7, -- Week ago18 LAG(sales, 30) OVER (ORDER BY date) as prev_30 -- Month ago19FROM daily_sales;2021-- With default value for NULL22SELECT 23 date,24 sales,25 LAG(sales, 1, 0) OVER (ORDER BY date) as prev_sales -- Default 026FROM daily_sales;3.2 FIRST_VALUE() và LAST_VALUE()
SQL
1-- FIRST_VALUE: First value in window2-- LAST_VALUE: Last value in window34SELECT 5 employee_id,6 department,7 salary,8 FIRST_VALUE(salary) OVER (9 PARTITION BY department 10 ORDER BY salary DESC11 ) as highest_in_dept,12 LAST_VALUE(salary) OVER (13 PARTITION BY department 14 ORDER BY salary DESC15 ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING16 ) as lowest_in_dept17FROM employees;1819-- Note: LAST_VALUE requires frame specification to work correctly!3.3 NTH_VALUE()
SQL
1-- NTH_VALUE: Get the Nth row's value2SELECT 3 employee_id,4 department,5 salary,6 NTH_VALUE(salary, 1) OVER (7 PARTITION BY department ORDER BY salary DESC8 ) as top_1_salary,9 NTH_VALUE(salary, 2) OVER (10 PARTITION BY department ORDER BY salary DESC11 ) as top_2_salary,12 NTH_VALUE(salary, 3) OVER (13 PARTITION BY department ORDER BY salary DESC14 ) as top_3_salary15FROM employees;4. Aggregate Window Functions
4.1 Running Totals
SQL
1-- Cumulative sum2SELECT 3 date,4 amount,5 SUM(amount) OVER (ORDER BY date) as running_total6FROM transactions;78-- Running total per category9SELECT 10 date,11 category,12 amount,13 SUM(amount) OVER (14 PARTITION BY category 15 ORDER BY date16 ) as category_running_total17FROM transactions;1819-- Year-to-date (reset each year)20SELECT 21 date,22 amount,23 SUM(amount) OVER (24 PARTITION BY YEAR(date) 25 ORDER BY date26 ) as ytd_total27FROM transactions;4.2 Moving Averages
SQL
1-- 7-day moving average2SELECT 3 date,4 sales,5 AVG(sales) OVER (6 ORDER BY date 7 ROWS BETWEEN 6 PRECEDING AND CURRENT ROW8 ) as ma_79FROM daily_sales;1011-- 30-day moving average12SELECT 13 date,14 sales,15 AVG(sales) OVER (16 ORDER BY date 17 ROWS BETWEEN 29 PRECEDING AND CURRENT ROW18 ) as ma_3019FROM daily_sales;2021-- Centered moving average22SELECT 23 date,24 sales,25 AVG(sales) OVER (26 ORDER BY date 27 ROWS BETWEEN 3 PRECEDING AND 3 FOLLOWING28 ) as centered_ma_729FROM daily_sales;4.3 Running Statistics
SQL
1SELECT 2 date,3 sales,4 SUM(sales) OVER w as cumulative_sales,5 AVG(sales) OVER w as cumulative_avg,6 MIN(sales) OVER w as cumulative_min,7 MAX(sales) OVER w as cumulative_max,8 COUNT(*) OVER w as row_count9FROM daily_sales10WINDOW w AS (ORDER BY date);1112-- Standard deviation13SELECT 14 date,15 sales,16 STDDEV(sales) OVER (17 ORDER BY date 18 ROWS BETWEEN 29 PRECEDING AND CURRENT ROW19 ) as rolling_stddev20FROM daily_sales;5. Frame Clauses
Frame Specification
Frame clause xác định rows nào được include trong calculation. Default là RANGE UNBOUNDED PRECEDING khi có ORDER BY, cần explicit frame cho control tốt hơn.
5.1 ROWS vs RANGE
SQL
1-- ROWS: Physical row count2-- RANGE: Logical value range34-- ROWS: Exactly 3 previous rows5SELECT 6 date, sales,7 SUM(sales) OVER (8 ORDER BY date 9 ROWS BETWEEN 3 PRECEDING AND CURRENT ROW10 ) as sum_rows11FROM daily_sales;1213-- RANGE: All rows with values in range14-- (handles ties differently)15SELECT 16 date, sales,17 SUM(sales) OVER (18 ORDER BY date 19 RANGE BETWEEN INTERVAL '3' DAY PRECEDING AND CURRENT ROW20 ) as sum_range21FROM daily_sales;5.2 Frame Boundaries
SQL
1-- Frame boundary options:2-- UNBOUNDED PRECEDING: From partition start3-- n PRECEDING: n rows before current4-- CURRENT ROW: Current row5-- n FOLLOWING: n rows after current 6-- UNBOUNDED FOLLOWING: To partition end78-- All rows from start to current9ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW1011-- Current row and all following12ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING1314-- 5 rows before to 5 rows after (11 rows total)15ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING1617-- Entire partition18ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING5.3 Named Windows
SQL
1-- Define window once, use multiple times2SELECT 3 date,4 sales,5 SUM(sales) OVER sales_window as running_total,6 AVG(sales) OVER sales_window as running_avg,7 COUNT(*) OVER sales_window as row_count8FROM daily_sales9WINDOW sales_window AS (10 ORDER BY date 11 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW12);1314-- Multiple named windows15SELECT 16 date,17 category,18 sales,19 SUM(sales) OVER total_window as running_total,20 SUM(sales) OVER category_window as category_total21FROM daily_sales22WINDOW 23 total_window AS (ORDER BY date),24 category_window AS (PARTITION BY category ORDER BY date);6. Practical Applications
6.1 Year-over-Year Analysis
SQL
1-- YoY Growth Rate2WITH monthly_sales AS (3 SELECT 4 DATE_TRUNC('month', date) as month,5 SUM(amount) as sales6 FROM transactions7 GROUP BY DATE_TRUNC('month', date)8)9SELECT 10 month,11 sales,12 LAG(sales, 12) OVER (ORDER BY month) as prev_year_sales,13 ROUND(14 (sales - LAG(sales, 12) OVER (ORDER BY month)) 15 / LAG(sales, 12) OVER (ORDER BY month) * 100, 16 217 ) as yoy_growth_pct18FROM monthly_sales19ORDER BY month;6.2 Finding Gaps and Islands
SQL
1-- Identify consecutive sequences2WITH numbered AS (3 SELECT 4 date,5 status,6 ROW_NUMBER() OVER (ORDER BY date) -7 ROW_NUMBER() OVER (PARTITION BY status ORDER BY date) as grp8 FROM events9)10SELECT 11 status,12 MIN(date) as start_date,13 MAX(date) as end_date,14 COUNT(*) as consecutive_days15FROM numbered16GROUP BY status, grp17ORDER BY start_date;6.3 Percentile Analysis
SQL
1-- Calculate percentiles2SELECT 3 department,4 name,5 salary,6 PERCENT_RANK() OVER (ORDER BY salary) as overall_percentile,7 PERCENT_RANK() OVER (8 PARTITION BY department ORDER BY salary9 ) as dept_percentile,10 CASE 11 WHEN PERCENT_RANK() OVER (ORDER BY salary) >= 0.9 THEN 'Top 10%'12 WHEN PERCENT_RANK() OVER (ORDER BY salary) >= 0.75 THEN 'Top 25%'13 WHEN PERCENT_RANK() OVER (ORDER BY salary) >= 0.5 THEN 'Top 50%'14 ELSE 'Bottom 50%'15 END as percentile_group16FROM employees;6.4 Session Analysis
SQL
1-- Identify user sessions (30 min gap = new session)2WITH activity_with_gaps AS (3 SELECT 4 user_id,5 timestamp,6 LAG(timestamp) OVER (PARTITION BY user_id ORDER BY timestamp) as prev_timestamp,7 CASE 8 WHEN timestamp - LAG(timestamp) OVER (PARTITION BY user_id ORDER BY timestamp) 9 > INTERVAL '30 minutes'10 THEN 1 11 ELSE 0 12 END as new_session13 FROM user_activity14),15sessions AS (16 SELECT 17 *,18 SUM(new_session) OVER (19 PARTITION BY user_id 20 ORDER BY timestamp21 ) + 1 as session_id22 FROM activity_with_gaps23)24SELECT 25 user_id,26 session_id,27 MIN(timestamp) as session_start,28 MAX(timestamp) as session_end,29 COUNT(*) as events_in_session30FROM sessions31GROUP BY user_id, session_id;6.5 Running Difference Analysis
SQL
1-- Analyze changes between consecutive records2SELECT 3 date,4 metric_value,5 LAG(metric_value) OVER (ORDER BY date) as prev_value,6 metric_value - LAG(metric_value) OVER (ORDER BY date) as absolute_change,7 ROUND(8 (metric_value - LAG(metric_value) OVER (ORDER BY date)) 9 / LAG(metric_value) OVER (ORDER BY date) * 100,10 211 ) as pct_change,12 CASE 13 WHEN metric_value > LAG(metric_value) OVER (ORDER BY date) THEN 'Increase'14 WHEN metric_value < LAG(metric_value) OVER (ORDER BY date) THEN 'Decrease'15 ELSE 'No Change'16 END as trend17FROM metrics;7. Performance Considerations
Window Function Performance
Window functions có thể expensive cho large datasets. Optimize với proper indexing và partition sizing.
7.1 Indexing
SQL
1-- Create index for window function optimization2CREATE INDEX idx_sales_date ON sales(date);3CREATE INDEX idx_sales_category_date ON sales(category, date);45-- Composite index for partition + order6CREATE INDEX idx_transactions_user_date ON transactions(user_id, date);7.2 Limiting Partitions
SQL
1-- Filter before window function2WITH filtered AS (3 SELECT * 4 FROM transactions 5 WHERE date >= DATE_SUB(CURRENT_DATE, INTERVAL 1 YEAR)6)7SELECT 8 *,9 SUM(amount) OVER (ORDER BY date) as running_total10FROM filtered;7.3 Materializing Intermediate Results
SQL
1-- For complex calculations, use temporary table2CREATE TEMPORARY TABLE ranked_sales AS3SELECT 4 *,5 ROW_NUMBER() OVER (PARTITION BY category ORDER BY sales DESC) as rank6FROM sales;78-- Then query the temp table9SELECT * FROM ranked_sales WHERE rank <= 10;8. Thực hành
Practice Exercises
Exercise: Sales Analytics with Window Functions
SQL
1-- Sample table structure:2-- orders(order_id, customer_id, order_date, amount, category)34-- Tasks:5-- 1. Rank customers by total spending (overall and by category)6-- 2. Calculate 7-day moving average of daily sales7-- 3. Find YoY growth by month8-- 4. Identify top 3 orders per customer9-- 5. Calculate percentage of category total for each order1011-- YOUR QUERIES HERE💡 Xem đáp án
SQL
1-- 1. Rank customers by spending2WITH customer_spending AS (3 SELECT 4 customer_id,5 category,6 SUM(amount) as total_spent7 FROM orders8 GROUP BY customer_id, category9)10SELECT 11 customer_id,12 category,13 total_spent,14 RANK() OVER (ORDER BY total_spent DESC) as overall_rank,15 RANK() OVER (PARTITION BY category ORDER BY total_spent DESC) as category_rank16FROM customer_spending;1718-- 2. 7-day moving average19WITH daily_sales AS (20 SELECT 21 DATE(order_date) as date,22 SUM(amount) as sales23 FROM orders24 GROUP BY DATE(order_date)25)26SELECT 27 date,28 sales,29 ROUND(AVG(sales) OVER (30 ORDER BY date 31 ROWS BETWEEN 6 PRECEDING AND CURRENT ROW32 ), 2) as ma_733FROM daily_sales34ORDER BY date;3536-- 3. YoY growth37WITH monthly_sales AS (38 SELECT 39 DATE_TRUNC('month', order_date) as month,40 SUM(amount) as sales41 FROM orders42 GROUP BY DATE_TRUNC('month', order_date)43)44SELECT 45 month,46 sales,47 LAG(sales, 12) OVER (ORDER BY month) as prev_year,48 ROUND(49 (sales - LAG(sales, 12) OVER (ORDER BY month)) 50 / NULLIF(LAG(sales, 12) OVER (ORDER BY month), 0) * 100,51 252 ) as yoy_growth_pct53FROM monthly_sales54ORDER BY month;5556-- 4. Top 3 orders per customer57WITH ranked_orders AS (58 SELECT 59 order_id,60 customer_id,61 order_date,62 amount,63 ROW_NUMBER() OVER (64 PARTITION BY customer_id 65 ORDER BY amount DESC66 ) as order_rank67 FROM orders68)69SELECT * 70FROM ranked_orders 71WHERE order_rank <= 372ORDER BY customer_id, order_rank;7374-- 5. Percentage of category total75SELECT 76 order_id,77 customer_id,78 category,79 amount,80 SUM(amount) OVER (PARTITION BY category) as category_total,81 ROUND(82 amount / SUM(amount) OVER (PARTITION BY category) * 100,83 284 ) as pct_of_category85FROM orders86ORDER BY category, amount DESC;9. Tổng kết
| Function Type | Functions | Use Case |
|---|---|---|
| Ranking | ROW_NUMBER, RANK, DENSE_RANK, NTILE | Ranking, Top N, Percentiles |
| Value | LAG, LEAD, FIRST_VALUE, LAST_VALUE | Time comparison, trends |
| Aggregate | SUM, AVG, COUNT, MIN, MAX | Running totals, moving averages |
Key Concepts:
- ✅ PARTITION BY = GROUP BY without collapsing
- ✅ ORDER BY = Define calculation order
- ✅ Frame clause = Which rows to include
- ✅ Named windows = Reusable definitions
Bài tiếp theo: Advanced CTEs - Recursive queries và complex data transformations
