In [None]:
### The bike sharing forecasting problem (river tutorial)

In [None]:
!pip install river

In [None]:
from river import datasets
from pprint import pprint
from river import compose
from river import linear_model
from river import metrics
from river import evaluate
from river import preprocessing
from river import optim
from river import feature_extraction
from river import stats
from river import ensemble
from river import neighbors
import datetime as dt

In [None]:
## Look at the first entry of the dataset

X_y = datasets.Bikes()  # Handle on the bikes data stream

# Pull only the first (features, target) pair from the stream and inspect it.
x, y = next(iter(X_y))
pprint(x)
print(f'Number of available bikes: {y}')  # <- the target we want to predict
print(f'Number of features: {len(x)}')

{'clouds': 75,
 'description': 'light rain',
 'humidity': 81,
 'moment': datetime.datetime(2016, 4, 1, 0, 0, 7),
 'pressure': 1017.0,
 'station': 'metro-canal-du-midi',
 'temperature': 6.54,
 'wind': 9.3}
Number of available bikes: 1
Number of features: 8


In [None]:
# Baseline pipeline, written as one chain:
#   1. keep only the numeric features,
#   2. scale them with StandardScaler,
#   3. fit a linear regression optimised with SGD (learning rate 1e-3).
model = (
    compose.Select('clouds', 'humidity', 'pressure', 'temperature', 'wind')
    | preprocessing.StandardScaler()
    | linear_model.LinearRegression(optimizer=optim.SGD(0.001))
)

# Mean absolute error is the score tracked during evaluation.
metric = metrics.MAE()

In [None]:
# Bare last expression: the notebook displays the baseline pipeline's representation.
model

In [None]:
# Score the model over the stream, printing the running metric every 20k data points.
evaluate.progressive_val_score(
    dataset=X_y,
    model=model,
    metric=metric,
    print_every=20_000,
)

[20,000] MAE: 4.912727
[40,000] MAE: 5.333554
[60,000] MAE: 5.330948
[80,000] MAE: 5.392313
[100,000] MAE: 5.423059
[120,000] MAE: 5.541223
[140,000] MAE: 5.613023
[160,000] MAE: 5.622428
[180,000] MAE: 5.567824


MAE: 5.563893

In [None]:
# This was a large mean absolute error...
# Let's add more information. What about the hour of the day?
def get_hour(x):
    """Add an ``hour`` entry to ``x``, read from ``x['moment'].hour``.

    The mapping is updated in place and the same object is returned.
    """
    moment = x['moment']
    x['hour'] = moment.hour
    return x

# Two feature branches are unioned with `+`:
#   * the numeric features selected as before, and
#   * get_hour followed by a running mean of the target per (station, hour).
# The union is then scaled and fed to the same SGD linear regression.
model = (
    (
        compose.Select('clouds', 'humidity', 'pressure', 'temperature', 'wind')
        + (
            get_hour
            | feature_extraction.TargetAgg(by=['station', 'hour'], how=stats.Mean())
        )
    )
    | preprocessing.StandardScaler()
    | linear_model.LinearRegression(optimizer=optim.SGD(0.001))
)

# Fresh metric so the score does not carry over from the previous run.
metric = metrics.MAE()

In [None]:
# Bare last expression: the notebook displays the pipeline with the hourly target aggregate.
model

In [None]:
# Same evaluation as before, now for the pipeline with the hourly aggregate.
evaluate.progressive_val_score(
    dataset=X_y,
    model=model,
    metric=metric,
    print_every=20_000,
)

[20,000] MAE: 3.721246
[40,000] MAE: 3.829972
[60,000] MAE: 3.845068
[80,000] MAE: 3.910259
[100,000] MAE: 3.888652
[120,000] MAE: 3.923727
[140,000] MAE: 3.980953
[160,000] MAE: 3.950034
[180,000] MAE: 3.934545


MAE: 3.933498

In [None]:
# Lower error. As expected, the number of available bikes depends on the hour of the day.
# What about the day of the week?

def get_hour_and_weekday(x):
    """Add ``weekday`` and ``hour`` entries derived from ``x['moment']``.

    ``weekday`` is a boolean: True for Monday through Friday, False for
    Saturday and Sunday. ``hour`` is ``x['moment'].hour``.

    The mapping ``x`` is updated in place and the same object is returned.
    """
    moment = x['moment']
    # datetime.weekday() numbers the days Mon=0, Tue=1, ..., Sat=5, Sun=6,
    # so the working days are exactly the values below 5.
    x['weekday'] = moment.weekday() < 5
    x['hour'] = moment.hour
    return x

# Same layout as the hourly pipeline, but the target mean is now grouped by
# (station, hour, weekday) using the flags added by get_hour_and_weekday.
model = (
    (
        compose.Select('clouds', 'humidity', 'pressure', 'temperature', 'wind')
        + (
            get_hour_and_weekday
            | feature_extraction.TargetAgg(by=['station', 'hour', 'weekday'], how=stats.Mean())
        )
    )
    | preprocessing.StandardScaler()
    | linear_model.LinearRegression(optimizer=optim.SGD(0.001))
)

# Fresh metric so the score does not carry over from the previous run.
metric = metrics.MAE()

In [None]:
# Bare last expression: the notebook displays the pipeline with the hour/weekday aggregate.
model

In [None]:
# Same evaluation again, now for the pipeline with the hour/weekday aggregate.
evaluate.progressive_val_score(
    dataset=X_y,
    model=model,
    metric=metric,
    print_every=20_000,
)

[20,000] MAE: 3.323605
[40,000] MAE: 3.545601
[60,000] MAE: 3.609249
[80,000] MAE: 3.697535
[100,000] MAE: 3.703275
[120,000] MAE: 3.726729
[140,000] MAE: 3.789677
[160,000] MAE: 3.753734
[180,000] MAE: 3.734026


MAE: 3.733385

In [None]:
## Exercise: experiment with different optimizers, e.g. optim.RMSProp(), optim.Adam(), optim.NesterovMomentum().

In [None]:
## Let's try an ensemble. 

# Feature union with three branches:
#   * the numeric features,
#   * the per-(station, hour, weekday) running mean of the target,
#   * an exponentially weighted mean of the target per station (EWMean(0.5)).
# After scaling, an EWARegressor combines three linear regressions that differ
# only in their optimizer, weighting each one by its individual performance.
model = (
    (
        compose.Select('clouds', 'humidity', 'pressure', 'temperature', 'wind')
        + (
            get_hour_and_weekday
            | feature_extraction.TargetAgg(by=['station', 'hour', 'weekday'], how=stats.Mean())
        )
        + feature_extraction.TargetAgg(by='station', how=stats.EWMean(0.5))
    )
    | preprocessing.StandardScaler()
    | ensemble.EWARegressor([
        linear_model.LinearRegression(optimizer=optim.SGD()),
        linear_model.LinearRegression(optimizer=optim.RMSProp()),
        linear_model.LinearRegression(optimizer=optim.Adam()),
    ])
)

# Fresh metric so the score does not carry over from the previous run.
metric = metrics.MAE()

In [None]:
# Bare last expression: the notebook displays the ensemble pipeline's representation.
model

In [None]:
## The evaluation protocol changes here as well.
## The model forecasts 30 minutes ahead and is only updated once the true
## value has become available. progressive_val_score supports this through
## its `moment` and `delay` parameters.
evaluate.progressive_val_score(
    dataset=datasets.Bikes(),
    model=model,
    metric=metric,
    print_every=20_000,
    moment='moment',
    delay=dt.timedelta(minutes=30),
)

[20,000] MAE: 2.249336
[40,000] MAE: 2.238924
[60,000] MAE: 2.268627
[80,000] MAE: 2.283885
[100,000] MAE: 2.291669
[120,000] MAE: 2.272123
[140,000] MAE: 2.257576
[160,000] MAE: 2.28196
[180,000] MAE: 2.285042


MAE: 2.288666

In [None]:
# Same feature union and scaling as the previous ensemble; the EWARegressor
# now holds a fourth member, a KNN regressor with window_size=50, next to the
# three linear regressions.
model = (
    (
        compose.Select('clouds', 'humidity', 'pressure', 'temperature', 'wind')
        + (
            get_hour_and_weekday
            | feature_extraction.TargetAgg(by=['station', 'hour', 'weekday'], how=stats.Mean())
        )
        + feature_extraction.TargetAgg(by='station', how=stats.EWMean(0.5))
    )
    | preprocessing.StandardScaler()
    | ensemble.EWARegressor([
        linear_model.LinearRegression(optimizer=optim.SGD()),
        linear_model.LinearRegression(optimizer=optim.RMSProp()),
        linear_model.LinearRegression(optimizer=optim.Adam()),
        neighbors.KNNRegressor(window_size=50),
    ])
)

# Fresh metric so the score does not carry over from the previous run.
metric = metrics.MAE()

In [None]:
# Bare last expression: the notebook displays the ensemble-with-KNN pipeline's representation.
model

In [None]:
# Delayed progressive validation on a fresh Bikes stream: timestamps are read
# from the 'moment' field and a delay of 30 minutes is applied.
evaluate.progressive_val_score(
    dataset=datasets.Bikes(),
    model=model,
    metric=metric,
    print_every=20_000,
    moment='moment',
    delay=dt.timedelta(minutes=30),
)

[20,000] MAE: 2.24487
[40,000] MAE: 2.236691
[60,000] MAE: 2.267139
[80,000] MAE: 2.282768
[100,000] MAE: 2.290776
[120,000] MAE: 2.271379
[140,000] MAE: 2.256938
[160,000] MAE: 2.281402
[180,000] MAE: 2.284546


MAE: 2.288177