Time Series Analysis
1. Introduction
Time Series trong Business
Time series analysis là foundation của business analytics - từ sales forecasting, inventory management đến demand planning. Master kỹ năng này để predict và plan hiệu quả.
1.1 Time Series Components
Text
1┌─────────────────────────────────────────────────────────┐2│ Time Series Decomposition │3├─────────────────────────────────────────────────────────┤4│ │5│ Original = Trend + Seasonality + Residual │6│ │7│ ┌────────────────────────────────────────────┐ │8│ │ Trend: Long-term direction │ │9│ │ ↗ Upward / ↘ Downward / → Flat │ │10│ └────────────────────────────────────────────┘ │11│ │12│ ┌────────────────────────────────────────────┐ │13│ │ Seasonality: Regular patterns │ │14│ │ 📅 Daily / Weekly / Monthly / Yearly │ │15│ └────────────────────────────────────────────┘ │16│ │17│ ┌────────────────────────────────────────────┐ │18│ │ Residual: Random noise │ │19│ │ ~ Unpredictable fluctuations │ │20│ └────────────────────────────────────────────┘ │21│ │22└─────────────────────────────────────────────────────────┘2. Time Series Data Preparation
2.1 Creating Time Series
Python
1import pandas as pd2import numpy as np3import matplotlib.pyplot as plt45# Create datetime index6dates = pd.date_range(start='2022-01-01', end='2024-12-31', freq='D')78# Generate sample data with trend + seasonality + noise9np.random.seed(42)10n = len(dates)11trend = np.linspace(100, 200, n) # Upward trend12seasonality = 30 * np.sin(2 * np.pi * np.arange(n) / 365) # Yearly cycle13noise = np.random.normal(0, 10, n)1415sales = trend + seasonality + noise1617df = pd.DataFrame({'date': dates, 'sales': sales})18df.set_index('date', inplace=True)1920print(df.head())21print(f"\nShape: {df.shape}")22print(f"Date range: {df.index.min()} to {df.index.max()}")2.2 Datetime Operations
Python
1# Date components2df['year'] = df.index.year3df['month'] = df.index.month4df['quarter'] = df.index.quarter5df['day_of_week'] = df.index.dayofweek6df['week_of_year'] = df.index.isocalendar().week7df['is_weekend'] = df.index.dayofweek >= 58df['is_month_start'] = df.index.is_month_start9df['is_month_end'] = df.index.is_month_end1011# Parse dates from strings12df['date'] = pd.to_datetime(df['date_string'], format='%Y-%m-%d')1314# Handle different formats15df['date'] = pd.to_datetime(df['date_col'], infer_datetime_format=True)1617# Set datetime index18df = df.set_index('date')19df = df.sort_index()2.3 Handling Missing Dates
Python
1# Create complete date range2full_range = pd.date_range(df.index.min(), df.index.max(), freq='D')34# Reindex to fill missing dates5df = df.reindex(full_range)67# Fill missing values8df['sales'] = df['sales'].fillna(method='ffill') # Forward fill9df['sales'] = df['sales'].fillna(method='bfill') # Backward fill10df['sales'] = df['sales'].interpolate(method='linear') # Interpolate11df['sales'] = df['sales'].fillna(df['sales'].mean()) # Mean1213# Check for gaps14missing_dates = full_range.difference(df.index)15print(f"Missing dates: {len(missing_dates)}")3. Time Series Visualization
3.1 Basic Plots
Python
1fig, axes = plt.subplots(3, 1, figsize=(14, 10))23# Line plot4axes[0].plot(df.index, df['sales'], linewidth=0.8)5axes[0].set_title('Daily Sales')6axes[0].set_ylabel('Sales')78# Monthly resampling9monthly = df['sales'].resample('M').mean()10axes[1].plot(monthly.index, monthly.values, marker='o')11axes[1].set_title('Monthly Average Sales')12axes[1].set_ylabel('Sales')1314# Year-over-year comparison15for year in df.index.year.unique():16 yearly_data = df[df.index.year == year]['sales']17 yearly_data.index = yearly_data.index.dayofyear18 axes[2].plot(yearly_data.index, yearly_data.values, label=str(year), alpha=0.7)19axes[2].set_title('Year-over-Year Comparison')20axes[2].set_xlabel('Day of Year')21axes[2].legend()2223plt.tight_layout()24plt.show()3.2 Seasonal Plot
Python
1def seasonal_plot(df, column, freq='month'):2 """Create seasonal subseries plot"""3 fig, axes = plt.subplots(4, 3, figsize=(14, 10))4 axes = axes.flatten()5 6 for i, month in enumerate(range(1, 13)):7 monthly_data = df[df.index.month == month][column]8 axes[i].plot(monthly_data.index.year, monthly_data.values, 'o-')9 axes[i].set_title(f'Month {month}')10 axes[i].axhline(monthly_data.mean(), color='r', linestyle='--', alpha=0.5)11 12 plt.suptitle('Seasonal Subseries Plot')13 plt.tight_layout()14 plt.show()1516seasonal_plot(df, 'sales')3.3 Heatmap
Python
1# Pivot for heatmap2heatmap_data = df.pivot_table(3 values='sales',4 index=df.index.year,5 columns=df.index.month,6 aggfunc='mean'7)89plt.figure(figsize=(12, 6))10plt.imshow(heatmap_data, cmap='YlOrRd', aspect='auto')11plt.colorbar(label='Average Sales')12plt.yticks(range(len(heatmap_data.index)), heatmap_data.index)13plt.xticks(range(12), ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 14 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])15plt.xlabel('Month')16plt.ylabel('Year')17plt.title('Sales Heatmap by Year and Month')18plt.show()4. Resampling và Aggregation
4.1 Downsampling (Higher to Lower Frequency)
Python
1# Daily to Weekly2weekly = df['sales'].resample('W').agg({3 'mean': 'mean',4 'sum': 'sum',5 'min': 'min',6 'max': 'max'7})89# Daily to Monthly10monthly = df['sales'].resample('M').agg(['mean', 'std', 'min', 'max'])1112# Daily to Quarterly13quarterly = df['sales'].resample('Q').sum()1415# Custom aggregation16monthly_stats = df.resample('M').agg({17 'sales': ['sum', 'mean', 'std'],18 'quantity': 'sum'19})2021# Rename columns22monthly_stats.columns = ['total_sales', 'avg_sales', 'std_sales', 'total_qty']4.2 Upsampling (Lower to Higher Frequency)
Python
1# Monthly to Daily2monthly_data = pd.DataFrame({3 'date': pd.date_range('2024-01-01', periods=12, freq='M'),4 'value': [100, 120, 115, 130, 145, 160, 155, 170, 165, 180, 190, 200]5})6monthly_data.set_index('date', inplace=True)78# Upsample with forward fill9daily = monthly_data.resample('D').ffill()1011# Upsample with interpolation12daily_interp = monthly_data.resample('D').interpolate(method='linear')1314# Spline interpolation (smoother)15daily_spline = monthly_data.resample('D').interpolate(method='spline', order=3)4.3 Custom Business Rules
Python
1# Business days only2business_days = df.asfreq('B')34# Week starting Monday5weekly_mon = df.resample('W-MON').sum()67# Month end8monthly_end = df.resample('M').last()910# Quarter end11quarterly = df.resample('Q-DEC').sum() # Fiscal year ending December5. Moving Statistics
5.1 Rolling Windows
Python
1# Simple moving average2df['SMA_7'] = df['sales'].rolling(window=7).mean()3df['SMA_30'] = df['sales'].rolling(window=30).mean()45# Exponential moving average6df['EMA_7'] = df['sales'].ewm(span=7, adjust=False).mean()7df['EMA_30'] = df['sales'].ewm(span=30, adjust=False).mean()89# Rolling statistics10df['rolling_std'] = df['sales'].rolling(window=30).std()11df['rolling_min'] = df['sales'].rolling(window=30).min()12df['rolling_max'] = df['sales'].rolling(window=30).max()1314# Plot15fig, ax = plt.subplots(figsize=(14, 6))16ax.plot(df.index, df['sales'], alpha=0.3, label='Actual')17ax.plot(df.index, df['SMA_30'], label='30-day SMA')18ax.plot(df.index, df['EMA_30'], label='30-day EMA')19ax.legend()20ax.set_title('Sales with Moving Averages')21plt.show()5.2 Rolling Window Functions
Python
1# Custom rolling function2def rolling_range(x):3 return x.max() - x.min()45df['rolling_range'] = df['sales'].rolling(window=30).apply(rolling_range)67# Rolling correlation8df['rolling_corr'] = df['sales'].rolling(window=30).corr(df['marketing_spend'])910# Rolling regression slope11from scipy import stats1213def rolling_slope(y):14 x = np.arange(len(y))15 slope, _, _, _, _ = stats.linregress(x, y)16 return slope1718df['trend_slope'] = df['sales'].rolling(window=30).apply(rolling_slope)5.3 Expanding Windows
Python
1# Cumulative statistics2df['cumsum'] = df['sales'].cumsum()3df['cummean'] = df['sales'].expanding().mean()4df['cummax'] = df['sales'].expanding().max()5df['cummin'] = df['sales'].expanding().min()67# Year-to-date8df['ytd_sum'] = df.groupby(df.index.year)['sales'].cumsum()6. Trend Analysis
6.1 Linear Trend
Python
1from scipy import stats23# Extract values4x = np.arange(len(df))5y = df['sales'].values67# Linear regression8slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)910# Calculate trend line11df['trend'] = intercept + slope * x1213print(f"Slope: {slope:.4f} per day")14print(f"R-squared: {r_value**2:.4f}")15print(f"P-value: {p_value:.4e}")1617# Plot18plt.figure(figsize=(12, 6))19plt.plot(df.index, df['sales'], alpha=0.5, label='Actual')20plt.plot(df.index, df['trend'], 'r-', linewidth=2, label='Trend')21plt.legend()22plt.title(f'Sales Trend (slope: {slope:.4f}/day)')23plt.show()6.2 Polynomial Trend
Python
1# Polynomial fit2x = np.arange(len(df))3y = df['sales'].values45# Degree 2 (quadratic)6coeffs = np.polyfit(x, y, 2)7poly = np.poly1d(coeffs)8df['poly_trend'] = poly(x)910# Compare trends11fig, ax = plt.subplots(figsize=(12, 6))12ax.plot(df.index, df['sales'], alpha=0.5, label='Actual')13ax.plot(df.index, df['trend'], label='Linear')14ax.plot(df.index, df['poly_trend'], label='Polynomial')15ax.legend()16plt.show()6.3 Detrending
Python
1# Remove linear trend2df['detrended'] = df['sales'] - df['trend']34# Using differencing5df['diff_1'] = df['sales'].diff(1) # First difference6df['diff_7'] = df['sales'].diff(7) # Weekly difference (remove weekly seasonality)78# Percentage change9df['pct_change'] = df['sales'].pct_change()10df['pct_change_7'] = df['sales'].pct_change(periods=7)7. Seasonality Analysis
7.1 Decomposition
Python
1from statsmodels.tsa.seasonal import seasonal_decompose23# Additive decomposition (Original = Trend + Seasonal + Residual)4result_add = seasonal_decompose(df['sales'], model='additive', period=365)56# Multiplicative decomposition (Original = Trend × Seasonal × Residual)7result_mult = seasonal_decompose(df['sales'], model='multiplicative', period=365)89# Plot components10fig = result_add.plot()11fig.set_size_inches(14, 10)12plt.tight_layout()13plt.show()1415# Extract components16trend = result_add.trend17seasonal = result_add.seasonal18residual = result_add.resid7.2 STL Decomposition (More Robust)
Python
1from statsmodels.tsa.seasonal import STL23# STL decomposition4stl = STL(df['sales'], period=365, robust=True)5result = stl.fit()67# Plot8fig = result.plot()9fig.set_size_inches(14, 10)10plt.show()1112# Get seasonally adjusted data13df['seasonally_adjusted'] = df['sales'] - result.seasonal7.3 Seasonal Indices
Python
1def calculate_seasonal_indices(df, column, freq='month'):2 """Calculate seasonal indices"""3 if freq == 'month':4 grouped = df.groupby(df.index.month)[column].mean()5 elif freq == 'dayofweek':6 grouped = df.groupby(df.index.dayofweek)[column].mean()7 elif freq == 'quarter':8 grouped = df.groupby(df.index.quarter)[column].mean()9 10 # Calculate index (ratio to overall mean)11 overall_mean = df[column].mean()12 seasonal_index = grouped / overall_mean13 14 return seasonal_index1516monthly_index = calculate_seasonal_indices(df, 'sales', 'month')17print("Monthly Seasonal Indices:")18print(monthly_index)1920# Visualize21plt.figure(figsize=(10, 5))22monthly_index.plot(kind='bar')23plt.axhline(y=1, color='r', linestyle='--')24plt.title('Monthly Seasonal Indices')25plt.xlabel('Month')26plt.ylabel('Index')27plt.show()8. Forecasting Basics
8.1 Simple Methods
Python
1# Naive forecast (last value)2df['forecast_naive'] = df['sales'].shift(1)34# Seasonal naive (same day last year)5df['forecast_seasonal_naive'] = df['sales'].shift(365)67# Moving average forecast8df['forecast_ma'] = df['sales'].rolling(window=30).mean().shift(1)910# Exponential smoothing11df['forecast_ewm'] = df['sales'].ewm(span=30).mean().shift(1)8.2 Simple Exponential Smoothing
Python
1from statsmodels.tsa.holtwinters import SimpleExpSmoothing23# Fit model4model = SimpleExpSmoothing(df['sales']).fit(smoothing_level=0.2)56# In-sample fitted values7df['ses_fitted'] = model.fittedvalues89# Forecast10forecast = model.forecast(30) # Next 30 days1112print(f"Smoothing level (alpha): {model.params['smoothing_level']:.4f}")8.3 Holt-Winters (Triple Exponential Smoothing)
Python
1from statsmodels.tsa.holtwinters import ExponentialSmoothing23# Fit model with trend and seasonality4model = ExponentialSmoothing(5 df['sales'],6 trend='add', # Additive trend7 seasonal='add', # Additive seasonality8 seasonal_periods=365 # Yearly seasonality9).fit()1011# In-sample fitted12df['hw_fitted'] = model.fittedvalues1314# Forecast15forecast = model.forecast(90) # Next 90 days1617# Plot18fig, ax = plt.subplots(figsize=(14, 6))19ax.plot(df.index[-365:], df['sales'][-365:], label='Actual')20ax.plot(forecast.index, forecast.values, 'r--', label='Forecast')21ax.legend()22ax.set_title('Holt-Winters Forecast')23plt.show()8.4 Forecast Evaluation
Python
1from sklearn.metrics import mean_absolute_error, mean_squared_error23def evaluate_forecast(actual, predicted, model_name='Model'):4 """Calculate forecast metrics"""5 # Remove NaN6 mask = ~(np.isnan(actual) | np.isnan(predicted))7 actual = actual[mask]8 predicted = predicted[mask]9 10 mae = mean_absolute_error(actual, predicted)11 rmse = np.sqrt(mean_squared_error(actual, predicted))12 mape = np.mean(np.abs((actual - predicted) / actual)) * 10013 14 print(f"\n{model_name} Performance:")15 print(f" MAE: {mae:.2f}")16 print(f" RMSE: {rmse:.2f}")17 print(f" MAPE: {mape:.2f}%")18 19 return {'mae': mae, 'rmse': rmse, 'mape': mape}2021# Compare methods22evaluate_forecast(df['sales'], df['forecast_naive'], 'Naive')23evaluate_forecast(df['sales'], df['forecast_ma'], 'Moving Average')24evaluate_forecast(df['sales'], df['hw_fitted'], 'Holt-Winters')9. Time Series SQL
9.1 Date Functions
SQL
1-- PostgreSQL date operations2SELECT3 order_date,4 EXTRACT(YEAR FROM order_date) as year,5 EXTRACT(MONTH FROM order_date) as month,6 EXTRACT(DOW FROM order_date) as day_of_week,7 DATE_TRUNC('month', order_date) as month_start,8 order_date - INTERVAL '7 days' as week_ago,9 order_date + INTERVAL '30 days' as month_ahead10FROM orders;1112-- Generate date series13SELECT generate_series(14 '2024-01-01'::date,15 '2024-12-31'::date,16 '1 day'::interval17)::date as date;9.2 Time Series Aggregation
SQL
1-- Daily to monthly2SELECT3 DATE_TRUNC('month', order_date) as month,4 COUNT(*) as orders,5 SUM(amount) as revenue,6 AVG(amount) as avg_order7FROM orders8GROUP BY DATE_TRUNC('month', order_date)9ORDER BY month;1011-- Moving average with window functions12SELECT13 order_date,14 amount,15 AVG(amount) OVER (16 ORDER BY order_date17 ROWS BETWEEN 6 PRECEDING AND CURRENT ROW18 ) as moving_avg_7d19FROM orders;2021-- Year-over-year comparison22WITH monthly_sales AS (23 SELECT24 DATE_TRUNC('month', order_date) as month,25 SUM(amount) as revenue26 FROM orders27 GROUP BY DATE_TRUNC('month', order_date)28)29SELECT30 month,31 revenue,32 LAG(revenue, 12) OVER (ORDER BY month) as revenue_last_year,33 ROUND((revenue - LAG(revenue, 12) OVER (ORDER BY month)) / 34 LAG(revenue, 12) OVER (ORDER BY month) * 100, 2) as yoy_growth35FROM monthly_sales36ORDER BY month;9.3 Fill Missing Dates
SQL
1-- Fill missing dates with zero2WITH date_series AS (3 SELECT generate_series(4 (SELECT MIN(order_date) FROM orders),5 (SELECT MAX(order_date) FROM orders),6 '1 day'::interval7 )::date as date8),9daily_sales AS (10 SELECT11 order_date::date as date,12 SUM(amount) as revenue13 FROM orders14 GROUP BY order_date::date15)16SELECT17 ds.date,18 COALESCE(d.revenue, 0) as revenue19FROM date_series ds20LEFT JOIN daily_sales d ON ds.date = d.date21ORDER BY ds.date;10. Thực hành
Time Series Project
Exercise: Complete Time Series Analysis
Python
1# Analyze retail sales data:2# 1. Load and prepare data3# 2. Visualize trends and seasonality4# 3. Decompose time series5# 4. Calculate seasonal indices6# 5. Build simple forecast7# 6. Evaluate forecast accuracy89# YOUR CODE HERE💡 Xem đáp án
Python
1import pandas as pd2import numpy as np3import matplotlib.pyplot as plt4from statsmodels.tsa.seasonal import STL5from statsmodels.tsa.holtwinters import ExponentialSmoothing6from sklearn.metrics import mean_absolute_error, mean_squared_error78# 1. Generate sample data9np.random.seed(42)10dates = pd.date_range('2021-01-01', '2024-12-31', freq='D')11n = len(dates)1213# Components14trend = np.linspace(1000, 1500, n)15yearly_season = 200 * np.sin(2 * np.pi * np.arange(n) / 365)16weekly_season = 50 * np.sin(2 * np.pi * np.arange(n) / 7)17noise = np.random.normal(0, 50, n)1819sales = trend + yearly_season + weekly_season + noise20sales = np.maximum(sales, 0) # No negative sales2122df = pd.DataFrame({'date': dates, 'sales': sales})23df.set_index('date', inplace=True)2425print("Data Summary:")26print(df.describe())2728# 2. Visualizations29fig, axes = plt.subplots(2, 2, figsize=(14, 10))3031# Daily sales32axes[0, 0].plot(df.index, df['sales'], alpha=0.5)33axes[0, 0].set_title('Daily Sales (4 Years)')3435# Monthly aggregation36monthly = df['sales'].resample('M').mean()37axes[0, 1].plot(monthly.index, monthly.values, marker='o')38axes[0, 1].set_title('Monthly Average Sales')3940# Day of week pattern41dow_avg = df.groupby(df.index.dayofweek)['sales'].mean()42axes[1, 0].bar(range(7), dow_avg.values)43axes[1, 0].set_xticks(range(7))44axes[1, 0].set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])45axes[1, 0].set_title('Day of Week Pattern')4647# Monthly pattern48month_avg = df.groupby(df.index.month)['sales'].mean()49axes[1, 1].bar(range(1, 13), month_avg.values)50axes[1, 1].set_title('Monthly Pattern')51axes[1, 1].set_xlabel('Month')5253plt.tight_layout()54plt.savefig('time_series_eda.png', dpi=150)55plt.show()5657# 3. STL Decomposition58stl = STL(df['sales'], period=365, robust=True)59result = stl.fit()6061fig = result.plot()62fig.set_size_inches(14, 10)63plt.savefig('stl_decomposition.png', dpi=150)64plt.show()6566# 4. Seasonal Indices67monthly_indices = df.groupby(df.index.month)['sales'].mean() / df['sales'].mean()68print("\nMonthly Seasonal Indices:")69for month, idx in enumerate(monthly_indices, 1):70 direction = "↑" if idx > 1 else "↓"71 print(f" Month {month:2d}: {idx:.3f} {direction}")7273# 5. Train/Test Split74train = df['sales'][:'2024-06-30']75test = df['sales']['2024-07-01':]7677print(f"\nTrain: {len(train)} days")78print(f"Test: {len(test)} days")7980# 6. Holt-Winters Forecast81model = ExponentialSmoothing(82 train,83 trend='add',84 seasonal='add',85 seasonal_periods=36586).fit()8788forecast = model.forecast(len(test))8990# 7. Evaluate91mae = mean_absolute_error(test, forecast)92rmse = np.sqrt(mean_squared_error(test, forecast))93mape = np.mean(np.abs((test.values - forecast.values) / test.values)) * 1009495print(f"\nForecast Performance:")96print(f" MAE: {mae:.2f}")97print(f" RMSE: {rmse:.2f}")98print(f" MAPE: {mape:.2f}%")99100# 8. Plot Forecast101fig, ax = plt.subplots(figsize=(14, 6))102103# Last year of training + test104ax.plot(train[-365:].index, train[-365:].values, label='Training', alpha=0.7)105ax.plot(test.index, test.values, label='Actual', alpha=0.7)106ax.plot(forecast.index, forecast.values, 'r--', label='Forecast', linewidth=2)107108ax.fill_between(109 forecast.index,110 forecast - 2*rmse,111 forecast + 2*rmse,112 alpha=0.2,113 color='red',114 label='95% CI'115)116117ax.legend()118ax.set_title(f'Sales Forecast (MAPE: {mape:.2f}%)')119ax.set_ylabel('Sales')120plt.savefig('forecast_result.png', dpi=150)121plt.show()122123# 9. Summary Report124print("\n" + "="*50)125print("TIME SERIES ANALYSIS SUMMARY")126print("="*50)127print(f"Data Period: {df.index.min().date()} to {df.index.max().date()}")128print(f"Total Days: {len(df)}")129print(f"\nTrend: Upward (from ~{trend[0]:.0f} to ~{trend[-1]:.0f})")130print(f"Seasonality: Strong yearly + weekly patterns")131print(f"\nForecast Model: Holt-Winters (Additive)")132print(f"Forecast Accuracy: MAPE = {mape:.2f}%")11. Tổng kết
| Topic | Key Concepts |
|---|---|
| Components | Trend, Seasonality, Residual |
| Resampling | resample(), downsampling, upsampling |
| Moving Stats | rolling(), ewm(), expanding() |
| Decomposition | seasonal_decompose(), STL |
| Forecasting | Naive, EMA, Holt-Winters |
| Evaluation | MAE, RMSE, MAPE |
Bài tiếp theo: Cohort Analysis
