Facebook Prophet¶
Limitations¶
- Basically glorified curve-fitting to time variable
- Tends to overfit
- Does not extrapolate well
Improve .fit()
¶
import numpy as np
import pandas as pd
import time
import datetime
from prophet import Prophet
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from batch_elastic_net import BatchElasticNetRegression
def make_sine_wave(length: int, n_cycles: int):
"""
Makes a sine wave given some length and the number of cycles it should go through in that period
"""
samples = np.linspace(0, length, length)
return np.sin(2 * np.pi * n_cycles * samples)
def generate_dataset(n_items):
"""
Generates a time series dataset with weekly frequency for two years. Randomly assigns the yearly, monthly and
trend values for each item
"""
year_in_weeks = 104
yealy_s = make_sine_wave(year_in_weeks, 2)
monthly_s = make_sine_wave(year_in_weeks, year_in_weeks / 24)
trend = np.arange(year_in_weeks) / year_in_weeks
all_ys = []
for i in range(n_items):
d = (np.stack([yealy_s, monthly_s, trend], axis=1) * np.random.rand(3)).sum(axis=1) + np.random.rand(year_in_weeks)
all_ys.append(d + (np.random.rand(len(d))-0.45).cumsum())
return pd.DataFrame(zip(*all_ys), index = pd.date_range(datetime.datetime(2020, 1, 1), freq='w', periods=len(d)))
def get_changepoint_idx(length, n_changepoints, changepoint_range=0.8):
"""
Finds the indices of slope change-points using Prophet's logic: assign them uniformly of the first changepoint_range
percentage of the data
"""
hist_size = int(np.floor(length * changepoint_range))
return np.linspace(0, hist_size - 1, n_changepoints+1).round().astype(int)[1:]
def make_changepoint_features(n, changes_idx):
"""
Creates initial slope and slope change-points features given a length of data and locations of indices.
The features are 0s for the first elements until their idx is reached, and then they move linearly upwards.
These features can be used to model a time series with an initial slope and the deltas of change-points.
"""
linear = np.arange(n).reshape(-1,1)
feats = [linear]
for i in changes_idx:
slope_feat = np.zeros(n)
slope_feat[i:] = np.arange(0, n-i)
slope_feat = slope_feat.reshape(-1,1)
feats.append(slope_feat)
feat = np.concatenate(feats, axis=1)
return feat
def run_prophet():
t = time.time()
all_prophets_datasets_forecasts = {}
for name, dataset in data_sets.items():
all_p_forecast = []
for i in range(dataset.shape[1]):
ds = dataset.iloc[:, i].reset_index()
ds.columns = ['ds', 'y']
# if uncertainty samples is not None it will take way more time
m = Prophet(n_changepoints=n_changepoints, changepoint_prior_scale=change_prior, growth='linear',
uncertainty_samples=None,
yearly_seasonality=True, weekly_seasonality=False, seasonality_prior_scale=seasonality_prior)
m.fit(ds)
forecast = m.predict(ds)
all_p_forecast.append(forecast.yhat)
all_prophets_datasets_forecasts[name] = pd.DataFrame(zip(*all_p_forecast), index=ds.ds)
return all_prophets_datasets_forecasts, time.time() - t
def run_batch_linear():
big_num = 20. # used as std of prior when it should be uninformative
p = Prophet()
t = time.time()
all_BatchLinear_datasets_forecasts = {}
for name, dataset in data_sets.items():
dates = pd.Series(dataset.index)
dataset_length = len(dataset)
idx = get_changepoint_idx(dataset_length, n_changepoints)
seasonal_feat = p.make_seasonality_features(dates, 365.25, 10, 'yearly_sine')
changepoint_feat = make_changepoint_features(dataset_length, idx) / dataset_length
feat = np.concatenate([changepoint_feat, seasonal_feat], axis=1)
n_changepoint_feat = changepoint_feat.shape[1] - 1
# laplace prior only on changepoints (seasonals get big_num, to avoid l1 regularization on it)
l1_priors = np.array([big_num] + [change_prior] * n_changepoint_feat + [big_num] * seasonal_feat.shape[1])
# normal prior on initial slope and on seasonals, and a big_num on changepoints to avoid l2 regularization
l2_priors = np.array([5] + [big_num] * n_changepoint_feat + [seasonality_prior] * seasonal_feat.shape[
1]) # normal prior only on seasonal
# this is how Prophet scales the data before fitting - divide by max of each item
scale = dataset.max()
scaled_y = dataset / scale
blr = BatchElasticNetRegression()
blr.fit(feat, scaled_y, l1_reg_params=l1_priors, l2_reg_params=l2_priors, as_bayesian_prior=True, verbose=True,
iterations=1500)
# get the predictions for the train
all_BatchLinear_datasets_forecasts[name] = pd.DataFrame(blr.predict(feat) * scale.values, index=dates)
return all_BatchLinear_datasets_forecasts, time.time() - t
if __name__ == '__main__':
data_files_names = ['d1', 'd2', 'M5_sample']
data_sets = {name: pd.read_csv(f'data_files/{name}.csv', index_col=0, parse_dates=True) for name in data_files_names}
data_sets['randomly_generated'] = generate_dataset(500)
# can play with these params for both predictors
change_prior = 0.5
# the seasonality_prior is an uninformative prior (hardly any regularization), which is the default for Prophet and usually does not require changing
seasonality_prior = 10
n_changepoints = 15
all_prophets_datasets_forecasts, prophet_time = run_prophet()
all_BatchLinear_datasets_forecasts, batch_time = run_batch_linear()
print(f'total number of items: {sum([x.shape[1] for x in data_sets.values()])}')
print(f'Prophet time: {round(prophet_time, 2)}; batch time: {round(batch_time, 2)}')
# plot examples from datasets (copy to notebook and repeat for different items and datasets)
name = 'd1'
batch_preds = all_BatchLinear_datasets_forecasts[name]
prophet_preds = all_prophets_datasets_forecasts[name]
orig_data = data_sets[name]
i = np.random.randint(0, orig_data.shape[1])
orig_data.iloc[:, i].plot(label='target')
batch_pred = batch_preds.iloc[:, i]
prophet_pred = prophet_preds.iloc[:, i]
prophet_pred.plot(label='prophet')
batch_pred.plot(label='my_batch')
plt.title(f'Pearson {round(pearsonr(batch_pred, prophet_pred)[0], 3)}')
plt.legend()
plt.show()
# mean pearson
all_corrs = {}
for name in data_sets.keys():
batch_preds = all_BatchLinear_datasets_forecasts[name]
prophet_preds = all_prophets_datasets_forecasts[name]
corrs = []
for i in range(prophet_preds.shape[1]):
corrs.append(pearsonr(batch_preds.iloc[:, i], prophet_preds.iloc[:, i])[0])
all_corrs[name] = np.mean(corrs)
print(all_corrs)
IDK
import numpy as np
import torch
import torch.optim as optim
from typing import Optional, Union
BIG_STD = 20. # used as std of prior when it should be uninformative (when we do not wish to regularize at all)
def to_tensor(x):
return torch.from_numpy(np.array(x)).float()
class BatchElasticNetRegression(object):
"""
Elastic net for the case where we have multiple targets (y), all to be fitted with the same features (X).
Learning all items in parallel, in a single "network" is more efficient then iteratively fitting a regression for
each target.
Allows to set different l1 and l2 regularization params for each of the features.
Can also be used to estimate the MAP of a Bayesian regression with Laplace or Normal priors instead of L1 and L2.
"""
def __init__(self):
self.coefs = None
self.intercepts = None
def fit(self,
X, y,
l1_reg_params: Optional[Union[np.array, float]] = None,
l2_reg_params: Optional[Union[np.array, float]] = None,
as_bayesian_prior=False, iterations=500, verbose=True, lr_rate=0.1):
"""
Fits multiple regressions. Both X and y are 2d matrices, where X is the common features for all the targets,
and y contains all the concatenated targets.
If as_bayesian_prior==False then the l1 and l2 reg params are regularization params
If as_bayesian_prior==True then l1 is treated as the std of the laplace prior and l2 as the std for the normal
prior.
The reg params / std of priors can either be a single value for all features, or set a different regularization
or prior for each feature separately. e.g. if we have 3 features, l1_reg_params can be [0.5, 1.2, 0] to set
regularization for each.
TODO:
Add normalization before fitting
Requires more work on the optimizer to be faster
"""
n_items = y.shape[1]
k_features = X.shape[1]
n_samples = X.shape[0]
# TODO: if l1_reg_params is None just don't calculate this part of the loss, instead of multiplying by 0
if l1_reg_params is None:
l1_reg_params = BIG_STD if as_bayesian_prior else 0.
if type(l1_reg_params) == float:
l1_reg_params = [l1_reg_params] * k_features
if l2_reg_params is None:
l2_reg_params = BIG_STD if as_bayesian_prior else 0.
if type(l2_reg_params) == float:
l2_reg_params = [l2_reg_params] * k_features
assert len(l1_reg_params) == len(l2_reg_params) == k_features, 'Regularization values must match X.shape[1]'
if as_bayesian_prior:
assert 0 not in l1_reg_params and 0 not in l2_reg_params, 'Cannot have 0 prior'
# convert to tensors and set initial params
t_features = to_tensor(X)
t_target = to_tensor(y)
learned_coefs = torch.rand(k_features, n_items, requires_grad=True)
learned_intercepts = torch.rand(n_items, requires_grad=True)
# TODO: or auto-estimate initial sigma based on data std?
est_sigma = torch.ones(n_items)
if as_bayesian_prior:
# If the params are priors then they must become a matrix, not a simple vector - because the conversion
# depends on the sigma of errors for each target y. The actual regularization params will be different
# for each item based on its sigma.
t_l1_reg_params = to_tensor(np.stack([l1_reg_params] * n_items, axis=1))
l1_alpha = self.calc_l1_alpha_from_prior(est_sigma, t_l1_reg_params, n_samples)
t_l2_reg_params = to_tensor(np.stack([l2_reg_params] * n_items, axis=1))
l2_alpha = self.calc_l2_alpha_from_prior(est_sigma, t_l2_reg_params, n_samples)
else:
l1_alpha = to_tensor(l1_reg_params)
l2_alpha = to_tensor(l2_reg_params)
# TODO: add scheduler for learning rate
optimizer = optim.Adam([learned_coefs, learned_intercepts], lr_rate)
for i in range(iterations):
optimizer.zero_grad(set_to_none=True)
res = torch.matmul(t_features, learned_coefs) + learned_intercepts
diff_loss = (1 / (2 * n_samples)) * ((res - t_target) ** 2).sum(axis=0)
if as_bayesian_prior:
reg_loss = (l1_alpha * learned_coefs.abs()).sum(axis=0) + (l2_alpha * learned_coefs ** 2).sum(axis=0)
else:
reg_loss = torch.matmul(l1_alpha, learned_coefs.abs()) + torch.matmul(l2_alpha, learned_coefs ** 2)
loss = (diff_loss + reg_loss).sum()
loss.backward()
optimizer.step()
if as_bayesian_prior and i % 50 == 0:
# if the params are the priors - we must convert them to the equivalent l1/l2 loss params.
# This conversion depends on the final sigma of errors of the forecast, which is unknown until we
# have a forecast using those same params... We iteratively improve our estimate of sigma and
# re-compute the corresponding regularization params based on those sigmas.
# The sigma is per target in y, therefore the l1/l2 params are per item.
est_sigma = (res - t_target).std(axis=0).detach()
l1_alpha = self.calc_l1_alpha_from_prior(est_sigma, t_l1_reg_params, n_samples)
l2_alpha = self.calc_l2_alpha_from_prior(est_sigma, t_l2_reg_params, n_samples)
if i % 50 == 0 and verbose:
print(loss)
# TODO: early stopping if converges
self.coefs = learned_coefs.detach().numpy()
self.intercepts = learned_intercepts.detach().numpy()
def predict(self, X):
return X @ self.coefs + self.intercepts
@staticmethod
def calc_l1_alpha_from_prior(est_sigma, b_prior, n_samples):
"""
Converts from the std of a Laplace prior to the equivalent L1 regularization param.
The conversion formula is divided by 2*n_samples since we divided the diff_loss by 2*n_samples as well,
to match sklearn's implementation of Lasso.
"""
return est_sigma ** 2 / (b_prior * n_samples)
@staticmethod
def calc_l2_alpha_from_prior(est_sigma, b_prior, n_samples):
return est_sigma ** 2 / (b_prior ** 2 * 2 * n_samples)
Improve .predict()
¶
01/25/23 edit: FB engineers integrated the solution in this post into the package in version release 1.1.2. Due to some implementation differences it is still slightly slower than the solution in the end of this post, but not significantly.
import numpy as np
import pandas as pd
from fbprophet import Prophet
def _make_historical_mat_time(deltas, changepoints_t, t_time, n_row=1):
"""
Creates a matrix of slope-deltas where these changes occured in training data according to the trained prophet obj
"""
diff = np.diff(t_time).mean()
prev_time = np.arange(0, 1 + diff, diff)
idxs = []
for changepoint in changepoints_t:
idxs.append(np.where(prev_time > changepoint)[0][0])
prev_deltas = np.zeros(len(prev_time))
prev_deltas[idxs] = deltas
prev_deltas = np.repeat(prev_deltas.reshape(1, -1), n_row, axis=0)
return prev_deltas, prev_time
def prophet_logistic_uncertainty(
mat: np.ndarray,
deltas: np.ndarray,
prophet_obj: Prophet,
cap_scaled: np.ndarray,
t_time: np.ndarray,
):
"""
Vectorizes prophet's logistic growth uncertainty by creating a matrix of future possible trends.
"""
def ffill(arr):
mask = arr == 0
idx = np.where(~mask, np.arange(mask.shape[1]), 0)
np.maximum.accumulate(idx, axis=1, out=idx)
return arr[np.arange(idx.shape[0])[:, None], idx]
k = prophet_obj.params["k"][0]
m = prophet_obj.params["m"][0]
n_length = len(t_time)
# for logistic growth we need to evaluate the trend all the way from the start of the train item
historical_mat, historical_time = _make_historical_mat_time(deltas, prophet_obj.changepoints_t, t_time, len(mat))
mat = np.concatenate([historical_mat, mat], axis=1)
full_t_time = np.concatenate([historical_time, t_time])
# apply logistic growth logic on the slope changes
k_cum = np.concatenate((np.ones((mat.shape[0], 1)) * k, np.where(mat, np.cumsum(mat, axis=1) + k, 0)), axis=1)
k_cum_b = ffill(k_cum)
gammas = np.zeros_like(mat)
for i in range(mat.shape[1]):
x = full_t_time[i] - m - np.sum(gammas[:, :i], axis=1)
ks = 1 - k_cum_b[:, i] / k_cum_b[:, i + 1]
gammas[:, i] = x * ks
# the data before the -n_length is the historical values, which are not needed, so cut the last n_length
k_t = (mat.cumsum(axis=1) + k)[:, -n_length:]
m_t = (gammas.cumsum(axis=1) + m)[:, -n_length:]
sample_trends = cap_scaled / (1 + np.exp(-k_t * (t_time - m_t)))
# remove the mean because we only need width of the uncertainty centered around 0
# we will add the width to the main forecast - yhat (which is the mean) - later
sample_trends = sample_trends - sample_trends.mean(axis=0)
return sample_trends
def _make_trend_shift_matrix(mean_delta: float, likelihood: float, future_length: float, k: int = 10000) -> np.ndarray:
"""
Creates a matrix of random trend shifts based on historical likelihood and size of shifts.
Can be used for either linear or logistic trend shifts.
Each row represents a different sample of a possible future, and each column is a time step into the future.
"""
# create a bool matrix of where these trend shifts should go
bool_slope_change = np.random.uniform(size=(k, future_length)) < likelihood
shift_values = np.random.laplace(0, mean_delta, size=bool_slope_change.shape)
mat = shift_values * bool_slope_change
n_mat = np.hstack([np.zeros((len(mat), 1)), mat])[:, :-1]
mat = (n_mat + mat) / 2
return mat
def add_prophet_uncertainty(
prophet_obj: Prophet,
forecast_df: pd.DataFrame,
using_train_df: bool = False,
):
"""
Adds yhat_upper and yhat_lower to the forecast_df used by fbprophet, based on the params of a trained prophet_obj
and the interval_width.
Use using_train_df=True if the forecast_df is not for a future time but for the training data.
"""
assert prophet_obj.history is not None, "Model has not been fit"
assert "yhat" in forecast_df.columns, "Must have the mean yhat forecast to build uncertainty on"
interval_width = prophet_obj.interval_width
if using_train_df: # there is no trend-based uncertainty if we're only looking on the past where trend is known
sample_trends = np.zeros(10000, len(forecast_df))
else: # create samples of possible future trends
future_time_series = ((forecast_df["ds"] - prophet_obj.start) / prophet_obj.t_scale).values
single_diff = np.diff(future_time_series).mean()
change_likelihood = len(prophet_obj.changepoints_t) * single_diff
deltas = prophet_obj.params["delta"][0]
n_length = len(forecast_df)
mean_delta = np.mean(np.abs(deltas)) + 1e-8
if prophet_obj.growth == "linear":
mat = _make_trend_shift_matrix(mean_delta, change_likelihood, n_length, k=10000)
sample_trends = mat.cumsum(axis=1).cumsum(axis=1) # from slope changes to actual values
sample_trends = sample_trends * single_diff # scaled by the actual meaning of the slope
elif prophet_obj.growth == "logistic":
mat = _make_trend_shift_matrix(mean_delta, change_likelihood, n_length, k=1000)
cap_scaled = (forecast_df["cap"] / prophet_obj.y_scale).values
sample_trends = prophet_logistic_uncertainty(mat, deltas, prophet_obj, cap_scaled, future_time_series)
else:
raise NotImplementedError
# add gaussian noise based on historical levels
sigma = prophet_obj.params["sigma_obs"][0]
historical_variance = np.random.normal(scale=sigma, size=sample_trends.shape)
full_samples = sample_trends + historical_variance
# get quantiles and scale back (prophet scales the data before fitting, so sigma and deltas are scaled)
width_split = (1 - interval_width) / 2
quantiles = np.array([width_split, 1 - width_split]) * 100 # get quantiles from width
quantiles = np.percentile(full_samples, quantiles, axis=0)
# Prophet scales all the data before fitting and predicting, y_scale re-scales it to original values
quantiles = quantiles * prophet_obj.y_scale
forecast_df["yhat_lower"] = forecast_df.yhat + quantiles[0]
forecast_df["yhat_upper"] = forecast_df.yhat + quantiles[1]
p = Prophet(uncertainty_samples=None) # tell Prophet not to create the interval by itself
p = p.fit(training_df)
# set to your number of periods and freq
forecast_df = p.make_future_dataframe(periods=10, freq='W', include_history=False)
training_df = p.predict(training_df)
forecast_df = p.predict(forecast_df)
add_prophet_uncertainty(p, training_df, using_train_df=True)
add_prophet_uncertainty(p, forecast_df)