Lý thuyết
Bài 6/17

Advanced CTEs

Recursive CTEs, Complex Queries, và Query Organization

Advanced CTEs

Advanced SQL Queries

1. CTE Fundamentals Review

Common Table Expressions

CTEs (WITH clause) tạo temporary named result sets, giúp organize complex queries, improve readability, và enable recursive operations.

1.1 Basic CTE Syntax

SQL
1-- Single CTE
2WITH sales_summary AS (
3 SELECT
4 category,
5 SUM(amount) as total_sales,
6 COUNT(*) as order_count
7 FROM orders
8 GROUP BY category
9)
10SELECT * FROM sales_summary
11WHERE total_sales > 10000;
12
13-- Multiple CTEs
14WITH
15monthly_sales AS (
16 SELECT
17 DATE_TRUNC('month', order_date) as month,
18 SUM(amount) as sales
19 FROM orders
20 GROUP BY DATE_TRUNC('month', order_date)
21),
22monthly_targets AS (
23 SELECT month, target_amount
24 FROM targets
25)
26SELECT
27 m.month,
28 m.sales,
29 t.target_amount,
30 m.sales - t.target_amount as variance
31FROM monthly_sales m
32JOIN monthly_targets t ON m.month = t.month;

1.2 CTE vs Subquery

SQL
1-- Subquery (harder to read)
2SELECT *
3FROM (
4 SELECT
5 customer_id,
6 SUM(amount) as total
7 FROM orders
8 GROUP BY customer_id
9) customer_totals
10WHERE total > (
11 SELECT AVG(total) FROM (
12 SELECT customer_id, SUM(amount) as total
13 FROM orders
14 GROUP BY customer_id
15 ) avg_calc
16);
17
18-- CTE (cleaner, reusable)
19WITH customer_totals AS (
20 SELECT
21 customer_id,
22 SUM(amount) as total
23 FROM orders
24 GROUP BY customer_id
25),
26avg_total AS (
27 SELECT AVG(total) as avg_value FROM customer_totals
28)
29SELECT ct.*
30FROM customer_totals ct, avg_total
31WHERE ct.total > avg_total.avg_value;

2. Recursive CTEs

Recursive CTEs

Recursive CTEs reference themselves để iterate qua hierarchical hoặc sequential data. Cần base case và recursive case.

2.1 Recursive CTE Structure

SQL
1WITH RECURSIVE cte_name AS (
2 -- Base case (anchor member)
3 SELECT ...
4 WHERE ... -- Starting condition
5
6 UNION ALL
7
8 -- Recursive case (recursive member)
9 SELECT ...
10 FROM cte_name
11 WHERE ... -- Termination condition
12)
13SELECT * FROM cte_name;

2.2 Hierarchy Traversal

SQL
1-- Employee-Manager hierarchy
2-- Table: employees(id, name, manager_id)
3
4WITH RECURSIVE org_chart AS (
5 -- Base case: Top-level managers (no manager)
6 SELECT
7 id,
8 name,
9 manager_id,
10 1 as level,
11 name as path
12 FROM employees
13 WHERE manager_id IS NULL
14
15 UNION ALL
16
17 -- Recursive case: Employees with managers
18 SELECT
19 e.id,
20 e.name,
21 e.manager_id,
22 oc.level + 1,
23 oc.path || ' > ' || e.name
24 FROM employees e
25 JOIN org_chart oc ON e.manager_id = oc.id
26)
27SELECT * FROM org_chart
28ORDER BY path;
29
30-- Result:
31-- | id | name | level | path |
32-- |----|---------|-------|----------------------------|
33-- | 1 | CEO | 1 | CEO |
34-- | 2 | VP Sales| 2 | CEO > VP Sales |
35-- | 4 | Manager | 3 | CEO > VP Sales > Manager |

2.3 Tree Structures

SQL
1-- Categories with subcategories
2-- Table: categories(id, name, parent_id)
3
4WITH RECURSIVE category_tree AS (
5 SELECT
6 id,
7 name,
8 parent_id,
9 0 as depth,
10 ARRAY[id] as path_ids,
11 name as full_path
12 FROM categories
13 WHERE parent_id IS NULL
14
15 UNION ALL
16
17 SELECT
18 c.id,
19 c.name,
20 c.parent_id,
21 ct.depth + 1,
22 ct.path_ids || c.id,
23 ct.full_path || ' > ' || c.name
24 FROM categories c
25 JOIN category_tree ct ON c.parent_id = ct.id
26)
27SELECT
28 id,
29 REPEAT(' ', depth) || name as indented_name,
30 full_path
31FROM category_tree
32ORDER BY path_ids;

2.4 Bill of Materials (BOM)

SQL
1-- Product components breakdown
2-- Table: parts(part_id, part_name, parent_part_id, quantity)
3
4WITH RECURSIVE bom AS (
5 -- Base: Top-level product
6 SELECT
7 part_id,
8 part_name,
9 parent_part_id,
10 quantity,
11 1 as level,
12 quantity as total_quantity
13 FROM parts
14 WHERE part_id = 100 -- Starting product
15
16 UNION ALL
17
18 -- Recursive: Component parts
19 SELECT
20 p.part_id,
21 p.part_name,
22 p.parent_part_id,
23 p.quantity,
24 b.level + 1,
25 b.total_quantity * p.quantity
26 FROM parts p
27 JOIN bom b ON p.parent_part_id = b.part_id
28)
29SELECT
30 REPEAT('--', level - 1) || part_name as part_hierarchy,
31 quantity,
32 total_quantity
33FROM bom
34ORDER BY level, part_name;

3. Number Series Generation

3.1 Generate Numbers

SQL
1-- Generate numbers 1 to 100
2WITH RECURSIVE numbers AS (
3 SELECT 1 as n
4
5 UNION ALL
6
7 SELECT n + 1
8 FROM numbers
9 WHERE n < 100
10)
11SELECT n FROM numbers;
12
13-- Generate date range
14WITH RECURSIVE dates AS (
15 SELECT DATE '2024-01-01' as date
16
17 UNION ALL
18
19 SELECT date + INTERVAL '1 day'
20 FROM dates
21 WHERE date < DATE '2024-12-31'
22)
23SELECT date FROM dates;
24
25-- Use with GENERATE_SERIES if available (PostgreSQL)
26SELECT generate_series(1, 100) as n;
27SELECT generate_series('2024-01-01'::date, '2024-12-31'::date, '1 day'::interval) as date;

3.2 Fill Missing Dates

SQL
1-- Fill gaps in time series data
2WITH RECURSIVE all_dates AS (
3 SELECT MIN(date) as date FROM sales
4
5 UNION ALL
6
7 SELECT date + INTERVAL '1 day'
8 FROM all_dates
9 WHERE date < (SELECT MAX(date) FROM sales)
10),
11daily_sales AS (
12 SELECT date, SUM(amount) as sales
13 FROM sales
14 GROUP BY date
15)
16SELECT
17 ad.date,
18 COALESCE(ds.sales, 0) as sales
19FROM all_dates ad
20LEFT JOIN daily_sales ds ON ad.date = ds.date
21ORDER BY ad.date;

4. Graph Traversal

4.1 Finding Paths

SQL
1-- Find all paths from A to B in a graph
2-- Table: edges(from_node, to_node, weight)
3
4WITH RECURSIVE paths AS (
5 -- Start from source node
6 SELECT
7 from_node,
8 to_node,
9 weight,
10 ARRAY[from_node, to_node] as path,
11 weight as total_weight
12 FROM edges
13 WHERE from_node = 'A'
14
15 UNION ALL
16
17 -- Extend paths
18 SELECT
19 p.from_node,
20 e.to_node,
21 e.weight,
22 p.path || e.to_node,
23 p.total_weight + e.weight
24 FROM paths p
25 JOIN edges e ON p.to_node = e.from_node
26 WHERE NOT e.to_node = ANY(p.path) -- Avoid cycles
27)
28SELECT * FROM paths
29WHERE to_node = 'Z' -- Destination
30ORDER BY total_weight;

4.2 Shortest Path (Simplified)

SQL
1-- Find shortest path (simplified Dijkstra)
2WITH RECURSIVE shortest_path AS (
3 SELECT
4 to_node as node,
5 weight as distance,
6 ARRAY['A', to_node] as path
7 FROM edges
8 WHERE from_node = 'A'
9
10 UNION ALL
11
12 SELECT
13 e.to_node,
14 sp.distance + e.weight,
15 sp.path || e.to_node
16 FROM shortest_path sp
17 JOIN edges e ON sp.node = e.from_node
18 WHERE NOT e.to_node = ANY(sp.path)
19 AND sp.distance + e.weight < 1000 -- Prevent infinite loops
20)
21SELECT DISTINCT ON (node)
22 node,
23 distance,
24 path
25FROM shortest_path
26ORDER BY node, distance;

5. Complex Query Patterns

5.1 Running Totals with Reset

SQL
1-- Running total that resets on condition
2WITH
3order_data AS (
4 SELECT
5 order_date,
6 amount,
7 CASE WHEN is_refund THEN 1 ELSE 0 END as reset_flag
8 FROM orders
9),
10groups AS (
11 SELECT
12 *,
13 SUM(reset_flag) OVER (ORDER BY order_date) as grp
14 FROM order_data
15)
16SELECT
17 order_date,
18 amount,
19 SUM(amount) OVER (
20 PARTITION BY grp
21 ORDER BY order_date
22 ) as running_total
23FROM groups
24ORDER BY order_date;

5.2 Gap and Island Detection

SQL
1-- Find consecutive sequences (islands)
2WITH numbered AS (
3 SELECT
4 date,
5 status,
6 ROW_NUMBER() OVER (ORDER BY date) as rn,
7 ROW_NUMBER() OVER (PARTITION BY status ORDER BY date) as status_rn
8 FROM events
9),
10grouped AS (
11 SELECT
12 *,
13 rn - status_rn as island_group
14 FROM numbered
15)
16SELECT
17 status,
18 MIN(date) as island_start,
19 MAX(date) as island_end,
20 COUNT(*) as days_count
21FROM grouped
22GROUP BY status, island_group
23ORDER BY island_start;

5.3 Data Validation

SQL
1-- Validate data consistency across related tables
2WITH
3order_totals AS (
4 SELECT order_id, SUM(item_total) as calc_total
5 FROM order_items
6 GROUP BY order_id
7),
8validation AS (
9 SELECT
10 o.order_id,
11 o.total_amount as stored_total,
12 ot.calc_total,
13 CASE
14 WHEN o.total_amount = ot.calc_total THEN 'Valid'
15 WHEN ot.calc_total IS NULL THEN 'Missing Items'
16 ELSE 'Mismatch'
17 END as validation_status
18 FROM orders o
19 LEFT JOIN order_totals ot ON o.order_id = ot.order_id
20)
21SELECT * FROM validation
22WHERE validation_status != 'Valid';

5.4 Pivot with CTE

SQL
1-- Dynamic pivot using CTE
2WITH
3sales_data AS (
4 SELECT
5 product,
6 EXTRACT(MONTH FROM date) as month,
7 SUM(amount) as sales
8 FROM sales
9 WHERE EXTRACT(YEAR FROM date) = 2024
10 GROUP BY product, EXTRACT(MONTH FROM date)
11)
12SELECT
13 product,
14 SUM(CASE WHEN month = 1 THEN sales END) as jan,
15 SUM(CASE WHEN month = 2 THEN sales END) as feb,
16 SUM(CASE WHEN month = 3 THEN sales END) as mar,
17 SUM(CASE WHEN month = 4 THEN sales END) as apr,
18 SUM(CASE WHEN month = 5 THEN sales END) as may,
19 SUM(CASE WHEN month = 6 THEN sales END) as jun,
20 SUM(CASE WHEN month = 7 THEN sales END) as jul,
21 SUM(CASE WHEN month = 8 THEN sales END) as aug,
22 SUM(CASE WHEN month = 9 THEN sales END) as sep,
23 SUM(CASE WHEN month = 10 THEN sales END) as oct,
24 SUM(CASE WHEN month = 11 THEN sales END) as nov,
25 SUM(CASE WHEN month = 12 THEN sales END) as dec
26FROM sales_data
27GROUP BY product;

6. CTE Materialization

6.1 Materialized vs Inline

SQL
1-- PostgreSQL: Control materialization
2WITH sales_cte AS MATERIALIZED (
3 -- Force materialization (computed once)
4 SELECT * FROM sales WHERE date >= '2024-01-01'
5)
6SELECT * FROM sales_cte WHERE amount > 100;
7
8WITH sales_cte AS NOT MATERIALIZED (
9 -- Inline (may be optimized with main query)
10 SELECT * FROM sales WHERE date >= '2024-01-01'
11)
12SELECT * FROM sales_cte WHERE amount > 100;

6.2 CTE for Deduplication

SQL
1-- Efficient deduplication with CTE
2WITH ranked AS (
3 SELECT
4 *,
5 ROW_NUMBER() OVER (
6 PARTITION BY customer_id, product_id
7 ORDER BY order_date DESC
8 ) as rn
9 FROM orders
10)
11DELETE FROM orders
12WHERE order_id IN (
13 SELECT order_id FROM ranked WHERE rn > 1
14);

7. Practical Examples

7.1 Customer Lifetime Value Analysis

SQL
1WITH
2first_purchase AS (
3 SELECT
4 customer_id,
5 MIN(order_date) as first_order_date
6 FROM orders
7 GROUP BY customer_id
8),
9customer_metrics AS (
10 SELECT
11 o.customer_id,
12 fp.first_order_date,
13 COUNT(DISTINCT o.order_id) as total_orders,
14 SUM(o.amount) as total_revenue,
15 MAX(o.order_date) as last_order_date,
16 COUNT(DISTINCT DATE_TRUNC('month', o.order_date)) as active_months
17 FROM orders o
18 JOIN first_purchase fp ON o.customer_id = fp.customer_id
19 GROUP BY o.customer_id, fp.first_order_date
20),
21cohort_analysis AS (
22 SELECT
23 customer_id,
24 DATE_TRUNC('month', first_order_date) as cohort_month,
25 total_orders,
26 total_revenue,
27 total_revenue / GREATEST(active_months, 1) as monthly_value,
28 DATE_PART('day', last_order_date - first_order_date) as customer_lifetime_days
29 FROM customer_metrics
30)
31SELECT
32 cohort_month,
33 COUNT(*) as customers,
34 ROUND(AVG(total_orders), 1) as avg_orders,
35 ROUND(AVG(total_revenue), 2) as avg_revenue,
36 ROUND(AVG(monthly_value), 2) as avg_monthly_value
37FROM cohort_analysis
38GROUP BY cohort_month
39ORDER BY cohort_month;

7.2 Funnel Analysis

SQL
1WITH
2funnel_stages AS (
3 SELECT
4 user_id,
5 session_id,
6 MAX(CASE WHEN event = 'page_view' THEN 1 END) as viewed,
7 MAX(CASE WHEN event = 'add_to_cart' THEN 1 END) as added_cart,
8 MAX(CASE WHEN event = 'checkout_start' THEN 1 END) as started_checkout,
9 MAX(CASE WHEN event = 'purchase' THEN 1 END) as purchased
10 FROM events
11 WHERE date >= '2024-01-01'
12 GROUP BY user_id, session_id
13),
14funnel_counts AS (
15 SELECT
16 SUM(viewed) as stage1_view,
17 SUM(added_cart) as stage2_cart,
18 SUM(started_checkout) as stage3_checkout,
19 SUM(purchased) as stage4_purchase
20 FROM funnel_stages
21)
22SELECT
23 'Page View' as stage, stage1_view as users, 100.0 as pct
24FROM funnel_counts
25UNION ALL
26SELECT
27 'Add to Cart', stage2_cart,
28 ROUND(stage2_cart::numeric / stage1_view * 100, 1)
29FROM funnel_counts
30UNION ALL
31SELECT
32 'Checkout', stage3_checkout,
33 ROUND(stage3_checkout::numeric / stage1_view * 100, 1)
34FROM funnel_counts
35UNION ALL
36SELECT
37 'Purchase', stage4_purchase,
38 ROUND(stage4_purchase::numeric / stage1_view * 100, 1)
39FROM funnel_counts;

7.3 Employee Tenure Report

SQL
1WITH RECURSIVE
2tenure_brackets AS (
3 -- Generate tenure brackets 0-1, 1-2, ..., 10+ years
4 SELECT 0 as min_years, 1 as max_years, '0-1 years' as bracket
5 UNION ALL
6 SELECT min_years + 1, max_years + 1,
7 CASE WHEN min_years + 1 >= 10 THEN '10+ years'
8 ELSE (min_years + 1)::text || '-' || (max_years + 1)::text || ' years'
9 END
10 FROM tenure_brackets
11 WHERE min_years < 10
12),
13employee_tenure AS (
14 SELECT
15 employee_id,
16 name,
17 department,
18 hire_date,
19 EXTRACT(YEAR FROM AGE(CURRENT_DATE, hire_date)) as years_of_service
20 FROM employees
21 WHERE termination_date IS NULL
22)
23SELECT
24 tb.bracket,
25 COUNT(et.employee_id) as employee_count,
26 ROUND(COUNT(et.employee_id)::numeric / SUM(COUNT(et.employee_id)) OVER () * 100, 1) as percentage
27FROM tenure_brackets tb
28LEFT JOIN employee_tenure et ON
29 et.years_of_service >= tb.min_years AND
30 et.years_of_service < tb.max_years
31GROUP BY tb.bracket, tb.min_years
32ORDER BY tb.min_years;

8. Thực hành

CTE Practice

Exercise: Organizational Analytics

SQL
1-- Tables:
2-- employees(id, name, department, manager_id, hire_date, salary)
3-- projects(project_id, name, start_date, end_date, budget)
4-- project_assignments(employee_id, project_id, role, hours)
5
6-- Tasks:
7-- 1. Build org chart using recursive CTE
8-- 2. Find all employees under a specific manager (any level)
9-- 3. Calculate total team cost for each manager
10-- 4. Find longest management chain
11-- 5. Analyze project participation by department
12
13-- YOUR QUERIES HERE
💡 Xem đáp án
SQL
1-- 1. Org chart
2WITH RECURSIVE org AS (
3 SELECT
4 id, name, manager_id, department,
5 0 as level,
6 name as path
7 FROM employees
8 WHERE manager_id IS NULL
9
10 UNION ALL
11
12 SELECT
13 e.id, e.name, e.manager_id, e.department,
14 o.level + 1,
15 o.path || ' > ' || e.name
16 FROM employees e
17 JOIN org o ON e.manager_id = o.id
18)
19SELECT
20 REPEAT(' ', level) || name as employee,
21 department,
22 level
23FROM org
24ORDER BY path;
25
26-- 2. All reports under specific manager
27WITH RECURSIVE reports AS (
28 SELECT id, name, manager_id
29 FROM employees
30 WHERE manager_id = 5 -- Manager ID to search
31
32 UNION ALL
33
34 SELECT e.id, e.name, e.manager_id
35 FROM employees e
36 JOIN reports r ON e.manager_id = r.id
37)
38SELECT * FROM reports;
39
40-- 3. Total team cost per manager
41WITH RECURSIVE team AS (
42 SELECT
43 id as manager_id,
44 id as employee_id,
45 salary
46 FROM employees
47
48 UNION ALL
49
50 SELECT
51 t.manager_id,
52 e.id,
53 e.salary
54 FROM employees e
55 JOIN team t ON e.manager_id = t.employee_id
56 WHERE t.manager_id != e.id
57)
58SELECT
59 e.name as manager_name,
60 COUNT(DISTINCT t.employee_id) - 1 as team_size, -- Exclude self
61 SUM(t.salary) - e.salary as team_cost -- Exclude own salary
62FROM team t
63JOIN employees e ON t.manager_id = e.id
64WHERE EXISTS (SELECT 1 FROM employees WHERE manager_id = e.id)
65GROUP BY e.id, e.name, e.salary
66ORDER BY team_cost DESC;
67
68-- 4. Longest management chain
69WITH RECURSIVE chain AS (
70 SELECT
71 id,
72 name,
73 1 as depth,
74 ARRAY[id] as chain_path
75 FROM employees
76 WHERE manager_id IS NULL
77
78 UNION ALL
79
80 SELECT
81 e.id,
82 e.name,
83 c.depth + 1,
84 c.chain_path || e.id
85 FROM employees e
86 JOIN chain c ON e.manager_id = c.id
87)
88SELECT
89 name,
90 depth as chain_length,
91 chain_path
92FROM chain
93ORDER BY depth DESC
94LIMIT 5;
95
96-- 5. Project participation by department
97WITH
98project_hours AS (
99 SELECT
100 e.department,
101 p.name as project_name,
102 SUM(pa.hours) as total_hours,
103 COUNT(DISTINCT pa.employee_id) as team_members
104 FROM project_assignments pa
105 JOIN employees e ON pa.employee_id = e.id
106 JOIN projects p ON pa.project_id = p.project_id
107 GROUP BY e.department, p.name
108),
109dept_summary AS (
110 SELECT
111 department,
112 COUNT(DISTINCT project_name) as projects_count,
113 SUM(total_hours) as total_hours,
114 ROUND(AVG(team_members), 1) as avg_team_size
115 FROM project_hours
116 GROUP BY department
117)
118SELECT
119 ds.*,
120 ROUND(ds.total_hours::numeric / SUM(ds.total_hours) OVER () * 100, 1) as pct_of_total
121FROM dept_summary ds
122ORDER BY total_hours DESC;

9. Tổng kết

CTE TypeUse CaseExample
Basic CTEQuery organizationComplex joins, subquery replacement
Multiple CTEsMulti-step analysisPipeline transformations
Recursive CTEHierarchical dataOrg charts, BOM, graphs
Recursive + NumbersSeries generationDate ranges, sequences

Best Practices:

  1. ✅ Use CTEs for readability
  2. ✅ Name CTEs descriptively
  3. ✅ Always include termination condition in recursive CTEs
  4. ✅ Consider materialization for performance

Bài tiếp theo: Query Optimization - EXPLAIN, Indexes, Performance Tuning