Advanced CTEs
1. CTE Fundamentals Review
Common Table Expressions
CTEs (WITH clause) tạo temporary named result sets, giúp organize complex queries, improve readability, và enable recursive operations.
1.1 Basic CTE Syntax
SQL
1-- Single CTE2WITH sales_summary AS (3 SELECT 4 category,5 SUM(amount) as total_sales,6 COUNT(*) as order_count7 FROM orders8 GROUP BY category9)10SELECT * FROM sales_summary11WHERE total_sales > 10000;1213-- Multiple CTEs14WITH 15monthly_sales AS (16 SELECT 17 DATE_TRUNC('month', order_date) as month,18 SUM(amount) as sales19 FROM orders20 GROUP BY DATE_TRUNC('month', order_date)21),22monthly_targets AS (23 SELECT month, target_amount24 FROM targets25)26SELECT 27 m.month,28 m.sales,29 t.target_amount,30 m.sales - t.target_amount as variance31FROM monthly_sales m32JOIN monthly_targets t ON m.month = t.month;1.2 CTE vs Subquery
SQL
1-- Subquery (harder to read)2SELECT *3FROM (4 SELECT 5 customer_id,6 SUM(amount) as total7 FROM orders8 GROUP BY customer_id9) customer_totals10WHERE total > (11 SELECT AVG(total) FROM (12 SELECT customer_id, SUM(amount) as total13 FROM orders14 GROUP BY customer_id15 ) avg_calc16);1718-- CTE (cleaner, reusable)19WITH customer_totals AS (20 SELECT 21 customer_id,22 SUM(amount) as total23 FROM orders24 GROUP BY customer_id25),26avg_total AS (27 SELECT AVG(total) as avg_value FROM customer_totals28)29SELECT ct.*30FROM customer_totals ct, avg_total31WHERE ct.total > avg_total.avg_value;2. Recursive CTEs
Recursive CTEs
Recursive CTEs reference themselves để iterate qua hierarchical hoặc sequential data. Cần base case và recursive case.
2.1 Recursive CTE Structure
SQL
1WITH RECURSIVE cte_name AS (2 -- Base case (anchor member)3 SELECT ... 4 WHERE ... -- Starting condition5 6 UNION ALL7 8 -- Recursive case (recursive member)9 SELECT ...10 FROM cte_name11 WHERE ... -- Termination condition12)13SELECT * FROM cte_name;2.2 Hierarchy Traversal
SQL
1-- Employee-Manager hierarchy2-- Table: employees(id, name, manager_id)34WITH RECURSIVE org_chart AS (5 -- Base case: Top-level managers (no manager)6 SELECT 7 id,8 name,9 manager_id,10 1 as level,11 name as path12 FROM employees13 WHERE manager_id IS NULL14 15 UNION ALL16 17 -- Recursive case: Employees with managers18 SELECT 19 e.id,20 e.name,21 e.manager_id,22 oc.level + 1,23 oc.path || ' > ' || e.name24 FROM employees e25 JOIN org_chart oc ON e.manager_id = oc.id26)27SELECT * FROM org_chart28ORDER BY path;2930-- Result:31-- | id | name | level | path |32-- |----|---------|-------|----------------------------|33-- | 1 | CEO | 1 | CEO |34-- | 2 | VP Sales| 2 | CEO > VP Sales |35-- | 4 | Manager | 3 | CEO > VP Sales > Manager |2.3 Tree Structures
SQL
1-- Categories with subcategories2-- Table: categories(id, name, parent_id)34WITH RECURSIVE category_tree AS (5 SELECT 6 id,7 name,8 parent_id,9 0 as depth,10 ARRAY[id] as path_ids,11 name as full_path12 FROM categories13 WHERE parent_id IS NULL14 15 UNION ALL16 17 SELECT 18 c.id,19 c.name,20 c.parent_id,21 ct.depth + 1,22 ct.path_ids || c.id,23 ct.full_path || ' > ' || c.name24 FROM categories c25 JOIN category_tree ct ON c.parent_id = ct.id26)27SELECT 28 id,29 REPEAT(' ', depth) || name as indented_name,30 full_path31FROM category_tree32ORDER BY path_ids;2.4 Bill of Materials (BOM)
SQL
1-- Product components breakdown2-- Table: parts(part_id, part_name, parent_part_id, quantity)34WITH RECURSIVE bom AS (5 -- Base: Top-level product6 SELECT 7 part_id,8 part_name,9 parent_part_id,10 quantity,11 1 as level,12 quantity as total_quantity13 FROM parts14 WHERE part_id = 100 -- Starting product15 16 UNION ALL17 18 -- Recursive: Component parts19 SELECT 20 p.part_id,21 p.part_name,22 p.parent_part_id,23 p.quantity,24 b.level + 1,25 b.total_quantity * p.quantity26 FROM parts p27 JOIN bom b ON p.parent_part_id = b.part_id28)29SELECT 30 REPEAT('--', level - 1) || part_name as part_hierarchy,31 quantity,32 total_quantity33FROM bom34ORDER BY level, part_name;3. Number Series Generation
3.1 Generate Numbers
SQL
1-- Generate numbers 1 to 1002WITH RECURSIVE numbers AS (3 SELECT 1 as n4 5 UNION ALL6 7 SELECT n + 18 FROM numbers9 WHERE n < 10010)11SELECT n FROM numbers;1213-- Generate date range14WITH RECURSIVE dates AS (15 SELECT DATE '2024-01-01' as date16 17 UNION ALL18 19 SELECT date + INTERVAL '1 day'20 FROM dates21 WHERE date < DATE '2024-12-31'22)23SELECT date FROM dates;2425-- Use with GENERATE_SERIES if available (PostgreSQL)26SELECT generate_series(1, 100) as n;27SELECT generate_series('2024-01-01'::date, '2024-12-31'::date, '1 day'::interval) as date;3.2 Fill Missing Dates
SQL
1-- Fill gaps in time series data2WITH RECURSIVE all_dates AS (3 SELECT MIN(date) as date FROM sales4 5 UNION ALL6 7 SELECT date + INTERVAL '1 day'8 FROM all_dates9 WHERE date < (SELECT MAX(date) FROM sales)10),11daily_sales AS (12 SELECT date, SUM(amount) as sales13 FROM sales14 GROUP BY date15)16SELECT 17 ad.date,18 COALESCE(ds.sales, 0) as sales19FROM all_dates ad20LEFT JOIN daily_sales ds ON ad.date = ds.date21ORDER BY ad.date;4. Graph Traversal
4.1 Finding Paths
SQL
1-- Find all paths from A to B in a graph2-- Table: edges(from_node, to_node, weight)34WITH RECURSIVE paths AS (5 -- Start from source node6 SELECT 7 from_node,8 to_node,9 weight,10 ARRAY[from_node, to_node] as path,11 weight as total_weight12 FROM edges13 WHERE from_node = 'A'14 15 UNION ALL16 17 -- Extend paths18 SELECT 19 p.from_node,20 e.to_node,21 e.weight,22 p.path || e.to_node,23 p.total_weight + e.weight24 FROM paths p25 JOIN edges e ON p.to_node = e.from_node26 WHERE NOT e.to_node = ANY(p.path) -- Avoid cycles27)28SELECT * FROM paths29WHERE to_node = 'Z' -- Destination30ORDER BY total_weight;4.2 Shortest Path (Simplified)
SQL
1-- Find shortest path (simplified Dijkstra)2WITH RECURSIVE shortest_path AS (3 SELECT 4 to_node as node,5 weight as distance,6 ARRAY['A', to_node] as path7 FROM edges8 WHERE from_node = 'A'9 10 UNION ALL11 12 SELECT 13 e.to_node,14 sp.distance + e.weight,15 sp.path || e.to_node16 FROM shortest_path sp17 JOIN edges e ON sp.node = e.from_node18 WHERE NOT e.to_node = ANY(sp.path)19 AND sp.distance + e.weight < 1000 -- Prevent infinite loops20)21SELECT DISTINCT ON (node)22 node,23 distance,24 path25FROM shortest_path26ORDER BY node, distance;5. Complex Query Patterns
5.1 Running Totals with Reset
SQL
1-- Running total that resets on condition2WITH 3order_data AS (4 SELECT 5 order_date,6 amount,7 CASE WHEN is_refund THEN 1 ELSE 0 END as reset_flag8 FROM orders9),10groups AS (11 SELECT 12 *,13 SUM(reset_flag) OVER (ORDER BY order_date) as grp14 FROM order_data15)16SELECT 17 order_date,18 amount,19 SUM(amount) OVER (20 PARTITION BY grp 21 ORDER BY order_date22 ) as running_total23FROM groups24ORDER BY order_date;5.2 Gap and Island Detection
SQL
1-- Find consecutive sequences (islands)2WITH numbered AS (3 SELECT 4 date,5 status,6 ROW_NUMBER() OVER (ORDER BY date) as rn,7 ROW_NUMBER() OVER (PARTITION BY status ORDER BY date) as status_rn8 FROM events9),10grouped AS (11 SELECT 12 *,13 rn - status_rn as island_group14 FROM numbered15)16SELECT 17 status,18 MIN(date) as island_start,19 MAX(date) as island_end,20 COUNT(*) as days_count21FROM grouped22GROUP BY status, island_group23ORDER BY island_start;5.3 Data Validation
SQL
1-- Validate data consistency across related tables2WITH 3order_totals AS (4 SELECT order_id, SUM(item_total) as calc_total5 FROM order_items6 GROUP BY order_id7),8validation AS (9 SELECT 10 o.order_id,11 o.total_amount as stored_total,12 ot.calc_total,13 CASE 14 WHEN o.total_amount = ot.calc_total THEN 'Valid'15 WHEN ot.calc_total IS NULL THEN 'Missing Items'16 ELSE 'Mismatch'17 END as validation_status18 FROM orders o19 LEFT JOIN order_totals ot ON o.order_id = ot.order_id20)21SELECT * FROM validation22WHERE validation_status != 'Valid';5.4 Pivot with CTE
SQL
1-- Dynamic pivot using CTE2WITH 3sales_data AS (4 SELECT 5 product,6 EXTRACT(MONTH FROM date) as month,7 SUM(amount) as sales8 FROM sales9 WHERE EXTRACT(YEAR FROM date) = 202410 GROUP BY product, EXTRACT(MONTH FROM date)11)12SELECT 13 product,14 SUM(CASE WHEN month = 1 THEN sales END) as jan,15 SUM(CASE WHEN month = 2 THEN sales END) as feb,16 SUM(CASE WHEN month = 3 THEN sales END) as mar,17 SUM(CASE WHEN month = 4 THEN sales END) as apr,18 SUM(CASE WHEN month = 5 THEN sales END) as may,19 SUM(CASE WHEN month = 6 THEN sales END) as jun,20 SUM(CASE WHEN month = 7 THEN sales END) as jul,21 SUM(CASE WHEN month = 8 THEN sales END) as aug,22 SUM(CASE WHEN month = 9 THEN sales END) as sep,23 SUM(CASE WHEN month = 10 THEN sales END) as oct,24 SUM(CASE WHEN month = 11 THEN sales END) as nov,25 SUM(CASE WHEN month = 12 THEN sales END) as dec26FROM sales_data27GROUP BY product;6. CTE Materialization
6.1 Materialized vs Inline
SQL
1-- PostgreSQL: Control materialization2WITH sales_cte AS MATERIALIZED (3 -- Force materialization (computed once)4 SELECT * FROM sales WHERE date >= '2024-01-01'5)6SELECT * FROM sales_cte WHERE amount > 100;78WITH sales_cte AS NOT MATERIALIZED (9 -- Inline (may be optimized with main query)10 SELECT * FROM sales WHERE date >= '2024-01-01'11)12SELECT * FROM sales_cte WHERE amount > 100;6.2 CTE for Deduplication
SQL
1-- Efficient deduplication with CTE2WITH ranked AS (3 SELECT 4 *,5 ROW_NUMBER() OVER (6 PARTITION BY customer_id, product_id 7 ORDER BY order_date DESC8 ) as rn9 FROM orders10)11DELETE FROM orders12WHERE order_id IN (13 SELECT order_id FROM ranked WHERE rn > 114);7. Practical Examples
7.1 Customer Lifetime Value Analysis
SQL
1WITH 2first_purchase AS (3 SELECT 4 customer_id,5 MIN(order_date) as first_order_date6 FROM orders7 GROUP BY customer_id8),9customer_metrics AS (10 SELECT 11 o.customer_id,12 fp.first_order_date,13 COUNT(DISTINCT o.order_id) as total_orders,14 SUM(o.amount) as total_revenue,15 MAX(o.order_date) as last_order_date,16 COUNT(DISTINCT DATE_TRUNC('month', o.order_date)) as active_months17 FROM orders o18 JOIN first_purchase fp ON o.customer_id = fp.customer_id19 GROUP BY o.customer_id, fp.first_order_date20),21cohort_analysis AS (22 SELECT 23 customer_id,24 DATE_TRUNC('month', first_order_date) as cohort_month,25 total_orders,26 total_revenue,27 total_revenue / GREATEST(active_months, 1) as monthly_value,28 DATE_PART('day', last_order_date - first_order_date) as customer_lifetime_days29 FROM customer_metrics30)31SELECT 32 cohort_month,33 COUNT(*) as customers,34 ROUND(AVG(total_orders), 1) as avg_orders,35 ROUND(AVG(total_revenue), 2) as avg_revenue,36 ROUND(AVG(monthly_value), 2) as avg_monthly_value37FROM cohort_analysis38GROUP BY cohort_month39ORDER BY cohort_month;7.2 Funnel Analysis
SQL
1WITH 2funnel_stages AS (3 SELECT 4 user_id,5 session_id,6 MAX(CASE WHEN event = 'page_view' THEN 1 END) as viewed,7 MAX(CASE WHEN event = 'add_to_cart' THEN 1 END) as added_cart,8 MAX(CASE WHEN event = 'checkout_start' THEN 1 END) as started_checkout,9 MAX(CASE WHEN event = 'purchase' THEN 1 END) as purchased10 FROM events11 WHERE date >= '2024-01-01'12 GROUP BY user_id, session_id13),14funnel_counts AS (15 SELECT 16 SUM(viewed) as stage1_view,17 SUM(added_cart) as stage2_cart,18 SUM(started_checkout) as stage3_checkout,19 SUM(purchased) as stage4_purchase20 FROM funnel_stages21)22SELECT 23 'Page View' as stage, stage1_view as users, 100.0 as pct24FROM funnel_counts25UNION ALL26SELECT 27 'Add to Cart', stage2_cart, 28 ROUND(stage2_cart::numeric / stage1_view * 100, 1)29FROM funnel_counts30UNION ALL31SELECT 32 'Checkout', stage3_checkout,33 ROUND(stage3_checkout::numeric / stage1_view * 100, 1)34FROM funnel_counts35UNION ALL36SELECT 37 'Purchase', stage4_purchase,38 ROUND(stage4_purchase::numeric / stage1_view * 100, 1)39FROM funnel_counts;7.3 Employee Tenure Report
SQL
1WITH RECURSIVE 2tenure_brackets AS (3 -- Generate tenure brackets 0-1, 1-2, ..., 10+ years4 SELECT 0 as min_years, 1 as max_years, '0-1 years' as bracket5 UNION ALL6 SELECT min_years + 1, max_years + 1, 7 CASE WHEN min_years + 1 >= 10 THEN '10+ years'8 ELSE (min_years + 1)::text || '-' || (max_years + 1)::text || ' years'9 END10 FROM tenure_brackets11 WHERE min_years < 1012),13employee_tenure AS (14 SELECT 15 employee_id,16 name,17 department,18 hire_date,19 EXTRACT(YEAR FROM AGE(CURRENT_DATE, hire_date)) as years_of_service20 FROM employees21 WHERE termination_date IS NULL22)23SELECT 24 tb.bracket,25 COUNT(et.employee_id) as employee_count,26 ROUND(COUNT(et.employee_id)::numeric / SUM(COUNT(et.employee_id)) OVER () * 100, 1) as percentage27FROM tenure_brackets tb28LEFT JOIN employee_tenure et ON 29 et.years_of_service >= tb.min_years AND 30 et.years_of_service < tb.max_years31GROUP BY tb.bracket, tb.min_years32ORDER BY tb.min_years;8. Thực hành
CTE Practice
Exercise: Organizational Analytics
SQL
1-- Tables:2-- employees(id, name, department, manager_id, hire_date, salary)3-- projects(project_id, name, start_date, end_date, budget)4-- project_assignments(employee_id, project_id, role, hours)56-- Tasks:7-- 1. Build org chart using recursive CTE8-- 2. Find all employees under a specific manager (any level)9-- 3. Calculate total team cost for each manager10-- 4. Find longest management chain11-- 5. Analyze project participation by department1213-- YOUR QUERIES HERE💡 Xem đáp án
SQL
1-- 1. Org chart2WITH RECURSIVE org AS (3 SELECT 4 id, name, manager_id, department,5 0 as level,6 name as path7 FROM employees8 WHERE manager_id IS NULL9 10 UNION ALL11 12 SELECT 13 e.id, e.name, e.manager_id, e.department,14 o.level + 1,15 o.path || ' > ' || e.name16 FROM employees e17 JOIN org o ON e.manager_id = o.id18)19SELECT 20 REPEAT(' ', level) || name as employee,21 department,22 level23FROM org24ORDER BY path;2526-- 2. All reports under specific manager27WITH RECURSIVE reports AS (28 SELECT id, name, manager_id29 FROM employees30 WHERE manager_id = 5 -- Manager ID to search31 32 UNION ALL33 34 SELECT e.id, e.name, e.manager_id35 FROM employees e36 JOIN reports r ON e.manager_id = r.id37)38SELECT * FROM reports;3940-- 3. Total team cost per manager41WITH RECURSIVE team AS (42 SELECT 43 id as manager_id,44 id as employee_id,45 salary46 FROM employees47 48 UNION ALL49 50 SELECT 51 t.manager_id,52 e.id,53 e.salary54 FROM employees e55 JOIN team t ON e.manager_id = t.employee_id56 WHERE t.manager_id != e.id57)58SELECT 59 e.name as manager_name,60 COUNT(DISTINCT t.employee_id) - 1 as team_size, -- Exclude self61 SUM(t.salary) - e.salary as team_cost -- Exclude own salary62FROM team t63JOIN employees e ON t.manager_id = e.id64WHERE EXISTS (SELECT 1 FROM employees WHERE manager_id = e.id)65GROUP BY e.id, e.name, e.salary66ORDER BY team_cost DESC;6768-- 4. Longest management chain69WITH RECURSIVE chain AS (70 SELECT 71 id,72 name,73 1 as depth,74 ARRAY[id] as chain_path75 FROM employees76 WHERE manager_id IS NULL77 78 UNION ALL79 80 SELECT 81 e.id,82 e.name,83 c.depth + 1,84 c.chain_path || e.id85 FROM employees e86 JOIN chain c ON e.manager_id = c.id87)88SELECT 89 name,90 depth as chain_length,91 chain_path92FROM chain93ORDER BY depth DESC94LIMIT 5;9596-- 5. Project participation by department97WITH 98project_hours AS (99 SELECT 100 e.department,101 p.name as project_name,102 SUM(pa.hours) as total_hours,103 COUNT(DISTINCT pa.employee_id) as team_members104 FROM project_assignments pa105 JOIN employees e ON pa.employee_id = e.id106 JOIN projects p ON pa.project_id = p.project_id107 GROUP BY e.department, p.name108),109dept_summary AS (110 SELECT 111 department,112 COUNT(DISTINCT project_name) as projects_count,113 SUM(total_hours) as total_hours,114 ROUND(AVG(team_members), 1) as avg_team_size115 FROM project_hours116 GROUP BY department117)118SELECT 119 ds.*,120 ROUND(ds.total_hours::numeric / SUM(ds.total_hours) OVER () * 100, 1) as pct_of_total121FROM dept_summary ds122ORDER BY total_hours DESC;9. Tổng kết
| CTE Type | Use Case | Example |
|---|---|---|
| Basic CTE | Query organization | Complex joins, subquery replacement |
| Multiple CTEs | Multi-step analysis | Pipeline transformations |
| Recursive CTE | Hierarchical data | Org charts, BOM, graphs |
| Recursive + Numbers | Series generation | Date ranges, sequences |
Best Practices:
- ✅ Use CTEs for readability
- ✅ Name CTEs descriptively
- ✅ Always include termination condition in recursive CTEs
- ✅ Consider materialization for performance
Bài tiếp theo: Query Optimization - EXPLAIN, Indexes, Performance Tuning
