# Import 3rd party libraries
import warnings
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Configure Notebook
%matplotlib inline
plt.style.use('fivethirtyeight')
sns.set_context("notebook")
warnings.filterwarnings('ignore')


# Load cleaned data from pickle file
trips_data = pd.read_pickle('trips_data_final.pkl')


trips_data = trips_data.dropna(axis=0, how='any', subset=['Temp (°C)', 'Rel Hum (%)', 'Wind Spd (km/h)'])

trips_data = trips_data[~(trips_data['Start Time'].dt.year == 2016)]


# Split data for training, validating, and testing
from sklearn.model_selection import train_test_split

train, test = train_test_split(trips_data, train_size=0.7, 
                               test_size=0.3, random_state=0)
val, test = train_test_split(test, train_size=0.5,
                             test_size=0.5, random_state=0)

print('Train:', train.shape, 'Val:', val.shape, 'Test:', test.shape)

Train: (5585256, 43) Val: (1196841, 43) Test: (1196841, 43)


import holidays

# Determine if the day is a workday
def is_workday(times):
    return 1 if any(x in times.dt.weekday.values for x in range(5)) else 0

# Determine if the day is a holiday
def is_holiday(times):
    can_holidays = holidays.Canada(years=[2017, 2018, 2019, 2020], prov='ON')

    return 1 if any(x in times.dt.date.values for x in can_holidays) else 0

# Determine if the day had precipitation
def is_precipitation(conditions):
    return 1 if all(conditions.str.contains('Clear')) else 0

# Group data into hourly rides
def group_rides(df):
    """
    Returns a dataframe with rides grouped into hours. 
    Duration will be averaged, weather will be kept as hourly. 
    Annual/Casual members are counted.

    Parameters
    ----------
    df : DataFrame
        The dataframe to be grouped

    Returns
    ----------
    DataFrame
        DataFrame containing number of rides for each hour with aggregated data
    """
    
    # Use named aggregation to aggregate the columns
    grouped = df.groupby(df['Start Time'].dt.round('H')).agg(
        rides=('User Type', 'count'),
        annual_members=('User Type', lambda x: x[x == 'Annual Member'].count()),
        casual_members=('User Type', lambda x: x[x == 'Casual Member'].count()),
        weekday=('Start Time', is_workday),
        holiday=('Start Time', is_holiday),
        duration=('Trip Duration', np.mean),
        year=('Year', lambda x: x.iloc[0]),
        month=('Month', lambda x: x.iloc[0]),
        dayofweek=('Start Time', lambda x: x.iloc[0].dayofweek),
        hour=('Time', lambda x: int(x.iloc[0][:2])),
        temp=('Temp (°C)', lambda x: x.iloc[0]),
        humidity=('Rel Hum (%)', lambda x: x.iloc[0]),
        wind_speed=('Wind Spd (km/h)', np.mean),
        weather=('Weather', is_precipitation)
    )

    # Add a column for period/time of day
    # 0000-0600, 0600-1800, 1800-2400
    grouped.loc[:, 'period'] = np.digitize(grouped['hour'], np.array([0, 6, 18, 24]))

    # Create cyclical time features using sin/cos functions
    grouped['month_sin'] = np.sin((grouped.month-1)*(2.*np.pi/12))
    grouped['month_cos'] = np.cos((grouped.month-1)*(2.*np.pi/12))
    grouped['dayofweek_sin'] = np.sin(grouped.dayofweek*(2.*np.pi/7))
    grouped['dayofweek_cos'] = np.cos(grouped.dayofweek*(2.*np.pi/7))
    grouped['hr_sin'] = np.sin(grouped.hour*(2.*np.pi/24))
    grouped['hr_cos'] = np.cos(grouped.hour*(2.*np.pi/24))

    return grouped


train = group_rides(train)
val = group_rides(val)
test = group_rides(test)


def rmse(errors):
    """Return the root mean squared error."""
    return np.sqrt(np.mean(errors ** 2))

rmse_scores = {}
    
# Calculate constant_rmse
rmse_scores['constant_rmse'] = rmse(val['rides'] - np.mean(train['rides']))

print('Constant model validation RMSE: {} rides'.format(rmse_scores['constant_rmse']))

Constant model validation RMSE: 138.25019044883243 rides


from sklearn.preprocessing import StandardScaler

num_features = ['temp', 'humidity', 'wind_speed']
cat_features = ['month', 'dayofweek', 'hour', 'holiday', 'weekday']

# Fit scaler on data
scaler = StandardScaler()
scaler.fit(train[num_features])

# Create features ready for modelling
def create_features(df):
    """
    Returns a dataframe with scaled and dummy encoded features

    Parameters
    ----------
    df : DataFrame
        The dataframe to create features from

    Returns
    ----------
    DataFrame
        DataFrame containing scaled numerical features and dummy encoded
        categorical features ready for modelling
    """
    scaled = df[num_features].copy()

    # Scale the numerical features
    scaled.iloc[:, :] = scaler.transform(scaled)

    # Dummy encode categorical features
    cats = [pd.get_dummies(df[s], prefix=s, drop_first=True) for s in cat_features]

    return pd.concat([scaled] + cats, axis=1).reset_index().drop('Start Time', axis=1)

from sklearn.linear_model import LinearRegression

def lin_model_rmse(model, train, val):
    """
    Returns the rmse for a linear model using the train and val datasets
    passed as arguments. 
    """
    X_train = create_features(train)
    y_train = train.loc[:, 'rides']

    X_val = create_features(val)
    y_val = val.loc[:, 'rides']

    linear_model.fit(X_train, y_train)

    return rmse(y_val - linear_model.predict(X_val))


# Base case, assume fit_intercept=True
linear_model = LinearRegression(fit_intercept=True)

rmse_scores['base_linear_rmse'] = lin_model_rmse(linear_model, train, val)

# Print score
print('Base case, multiple linear regression model validation RMSE: {} rides'.format(rmse_scores['base_linear_rmse']))

Base case, multiple linear regression model validation RMSE: 188.70727111457393 rides


# Base case, modified, assume fit_intercept=False
linear_model = LinearRegression(fit_intercept=False)

rmse_scores['base_no_intercept_rmse'] = lin_model_rmse(linear_model, train, val)

# Print score
print('Base case (no intercept) validation RMSE: {} rides'.format(rmse_scores['base_no_intercept_rmse']))

Base case (no intercept) validation RMSE: 188.7005860956224 rides


num_features = ['temp', 'humidity', 'wind_speed', 'month_sin', 'month_cos', 
                'dayofweek_sin', 'dayofweek_cos', 'hr_sin', 'hr_cos']
cat_features = ['holiday']

# Fit scaler on data
scaler = StandardScaler()
scaler.fit(train[num_features])

# Cyclical time, assume fit_intercept=True
linear_model = LinearRegression(fit_intercept=True)

rmse_scores['cyclical_rmse'] = lin_model_rmse(linear_model, train, val)

# Print score
print('Linear regression model with cyclical time features validation RMSE: {} rides'.format(rmse_scores['cyclical_rmse']))

# Cyclical time, modified, assume fit_intercept=False
linear_model = LinearRegression(fit_intercept=False)

rmse_scores['cyclical_rmse_no_int'] = lin_model_rmse(linear_model, train, val)

# Print score
print('Linear regression model with cyclical time features (no intercept) validation RMSE: {} rides'.format(rmse_scores['cyclical_rmse_no_int']))

Linear regression model with cyclical time features validation RMSE: 181.98381785596519 rides
Linear regression model with cyclical time features (no intercept) validation RMSE: 119.58830107370308 rides


# No intercept
model = LinearRegression(fit_intercept=False)
val_errors = []

for year in np.unique(train['year']):
    
    # Filter to year
    year_train = train[train['year'] == year]
    year_val = val[val['year'] == year]
    
    # Fit model
    model.fit(create_features(year_train), year_train.loc[:, 'rides'])
    
    # Compute year errors
    year_errors = year_val.loc[:, 'rides'] - model.predict(create_features(year_val))

    # Collect errors
    val_errors.extend(year_errors)

# Compute val score
rmse_scores['year_cyclical_rmse'] = rmse(np.array(val_errors))

# Print score
print('Linear regression model, by year, with cyclical time, validation RMSE: {} rides'.format(rmse_scores['year_cyclical_rmse']))

Linear regression model, by year, with cyclical time, validation RMSE: 130.2079362697383 rides


# Convert dictionary to DataFrame to sort values
rmse_plot = pd.DataFrame.from_dict(rmse_scores, 
            orient='index', 
            columns=['RMSE Scores']).sort_values(by='RMSE Scores')

# Plot figure
fig, ax = plt.subplots(figsize=(8,5))
sns.barplot(x=rmse_plot['RMSE Scores'], y=rmse_plot.index)
ax.set_title('RMSE Scores for various Linear Models', fontsize=16)
ax.set_xlabel('RMSE Scores (Lower Is Better)')

# Label bars
for p in ax.patches:
    width = p.get_width()
    ax.text(width + 1,
            p.get_y() + p.get_height() / 2,
            '{:1.2f}'.format(width),
            ha = 'left',
            va = 'center'
    )

plt.show()


num_features = ['temp', 'humidity', 'wind_speed']
cat_features = ['month', 'dayofweek', 'hour', 'holiday']

# Fit scaler on data
scaler = StandardScaler()
scaler.fit(train[num_features])

# Create train and val datasets with selected features
X_train = create_features(train)
y_train = train.loc[:, 'rides']

X_val = create_features(val)
y_val = val.loc[:, 'rides']


from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# Fit default lasso model, alpha=1.0, fit_intercept=True, normalize=False
lasso = Lasso().fit(X_train, y_train)

# Define parameters to tune
param_grid = {
    'alpha': (0, 0.001, 0.01, 0.1, 1, 10, 100, 1000),
    'fit_intercept': (True, False),
}

# Print the RMSE score for default Lasso model
print('Pre-tuning Lasso RMSE Score: ', rmse(y_val - lasso.predict(X_val)))

# Perform GridSearchCV to find optimal hyper parameters
grid = GridSearchCV(estimator=lasso, param_grid=param_grid, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)

# Print best score and parameters
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

# Compute val score
rmse_scores['lasso'] = rmse(y_val - grid_result.predict(X_val))

# Print score
print('Lasso Regression model validation RMSE: {} rides'.format(rmse_scores['lasso']))

Pre-tuning Lasso RMSE Score:  178.9269911669568
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best Score:  -22161.722769261403
Best Params:  {'alpha': 0.01, 'fit_intercept': True}
Lasso Regression model validation RMSE: 188.5637463154891 rides


# Fit lasso model with the tuned parameters
lasso = Lasso(alpha=0.1, fit_intercept=False).fit(X_train, y_train)

for coef, col in enumerate(X_train.columns):
    print(f'{col}:  {lasso.coef_[coef]}')

temp:  64.8175720669415
humidity:  -26.056506306133777
wind_speed:  -6.738488181392903
month_2:  1.404762635985059
month_3:  -2.5381648999450723
month_4:  -15.901954171583423
month_5:  15.153739147537653
month_6:  52.804809354456395
month_7:  81.12441442712297
month_8:  107.4221144009904
month_9:  108.11289032058062
month_10:  68.28426186992758
month_11:  27.551410596529653
month_12:  14.515564364211333
dayofweek_1:  17.510704105618142
dayofweek_2:  30.789758870747917
dayofweek_3:  19.26817760824033
dayofweek_4:  21.91937997238254
dayofweek_5:  10.043400065952136
dayofweek_6:  -4.06108094995895
hour_1:  -5.279341522993009
hour_2:  -14.18081443992992
hour_3:  -23.35645523497643
hour_4:  -29.130075571167453
hour_5:  -24.838863534326048
hour_6:  -4.700027933942394
hour_7:  35.73258148022806
hour_8:  168.72353256647926
hour_9:  243.61202618284642
hour_10:  119.1829405829983
hour_11:  90.72731072653433
hour_12:  136.464915873793
hour_13:  149.4446152778285
hour_14:  151.7527557917272
hour_15:  156.20135840803755
hour_16:  214.00023068724283
hour_17:  333.4123223075147
hour_18:  351.4553251655975
hour_19:  251.36124080121513
hour_20:  186.9533991888724
hour_21:  120.5771389640059
hour_22:  77.90784178063377
hour_23:  39.71025862076748
holiday_1:  -32.92432627315458


from sklearn.linear_model import Ridge

# Fit default Ridge model, alpha=1.0, fit_intercept=True
ridge = Ridge().fit(X_train, y_train)

# Define parameters to tune
param_grid = {
    'alpha': (0, 0.001, 0.01, 0.1, 1, 10, 100, 1000),
    'fit_intercept': (True, False),
}

# Print the RMSE score for default Ridge model
print('Pre-tuning Ridge RMSE Score: ', rmse(y_val - ridge.predict(X_val)))

# Perform GridSearchCV to find optimal hyper parameters
grid = GridSearchCV(estimator=ridge, param_grid=param_grid, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)

# Print best score and parameters
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

# Compute val score
rmse_scores['ridge'] = rmse(y_val - grid_result.predict(X_val))

# Print score
print('Ridge Regression model validation RMSE: {} rides'.format(rmse_scores['ridge']))

Pre-tuning Ridge RMSE Score:  188.63411460007836
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best Score:  -22155.836110732933
Best Params:  {'alpha': 10, 'fit_intercept': True}
Ridge Regression model validation RMSE: 188.06530352098702 rides


# Fit ridge model with the tuned parameters
ridge = Ridge(alpha=100, fit_intercept=False).fit(X_train, y_train)

for coef, col in enumerate(X_train.columns):
    print(f'{col}:  {ridge.coef_[coef]}')

temp:  62.65397598213387
humidity:  -26.891652166518448
wind_speed:  -5.516214369572646
month_2:  20.028285673509156
month_3:  14.612312816800792
month_4:  2.734452370604046
month_5:  36.5210560381922
month_6:  74.76392264729095
month_7:  103.24585390629863
month_8:  128.21400175595858
month_9:  128.14462911399588
month_10:  88.01452172124456
month_11:  46.47562340700942
month_12:  32.914310800064214
dayofweek_1:  37.327576098260785
dayofweek_2:  50.22632356803149
dayofweek_3:  38.86825925741152
dayofweek_4:  41.31833303847929
dayofweek_5:  29.612088449039753
dayofweek_6:  14.466585247868759
hour_1:  -39.64255675787584
hour_2:  -47.94738371262298
hour_3:  -56.58096952983365
hour_4:  -61.97566668593611
hour_5:  -57.92716896895159
hour_6:  -39.138132232398775
hour_7:  2.9584031877274217
hour_8:  126.99865868435685
hour_9:  196.78512887386344
hour_10:  80.58770652441841
hour_11:  53.96557116841847
hour_12:  96.58438435355264
hour_13:  108.67376529441296
hour_14:  110.8082804093221
hour_15:  114.96770734770149
hour_16:  168.92847364099893
hour_17:  280.3595510410494
hour_18:  297.28420130959336
hour_19:  204.01871120985538
hour_20:  144.0027180945713
hour_21:  82.11910972661846
hour_22:  42.3643747857892
hour_23:  6.762179534300408
holiday_1:  -22.150118290450724


from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

poly_2 = make_pipeline(PolynomialFeatures(2), LinearRegression())

poly_2.fit(X_train, y_train)

# Compute rmse score
rmse_scores['poly_2_deg'] = rmse(y_val - poly_2.predict(X_val))

# Print rmse score
print('Polynomial (degree 2) Regression model validation RMSE: {} rides'.format(rmse_scores['poly_2_deg']))

Polynomial (degree 2) Regression model validation RMSE: 201.36833376076885 rides


# Convert dictionary to DataFrame to sort values
rmse_plot = pd.DataFrame.from_dict(rmse_scores, 
            orient='index', 
            columns=['RMSE Scores']).sort_values(by='RMSE Scores')

# Plot figure
fig, ax = plt.subplots(figsize=(8,5))
sns.barplot(x=rmse_plot['RMSE Scores'], y=rmse_plot.index)
ax.set_title('RMSE Scores for various models', fontsize=16)
ax.set_xlabel('RMSE Scores (Lower Is Better)')

# Label bars
for p in ax.patches:
    width = p.get_width()
    ax.text(width + 1,
            p.get_y() + p.get_height() / 2,
            '{:1.2f}'.format(width),
            ha = 'left',
            va = 'center'
    )

plt.show()


# Recreate the Linear Model with lowest RMSE
num_features = ['temp', 'humidity', 'wind_speed', 'month_sin', 'month_cos', 
                'dayofweek_sin', 'dayofweek_cos', 'hr_sin', 'hr_cos']
cat_features = []

# Fit scaler on data
scaler = StandardScaler()
scaler.fit(train[num_features])

# Recreate training data
X_train = create_features(train)
y_train = train.loc[:, 'rides']

# Split the test dataset into X and y components
X_test = create_features(test)
y_test = test.loc[:, 'rides']

# Create and fit model
linear_model = LinearRegression(fit_intercept=False)
linear_model.fit(X_train, y_train)

# Calculate RMSE of the test dataset
test_rmse = rmse(y_test - linear_model.predict(X_test))

print(f'Test dataset RMSE: {test_rmse}')

Test dataset RMSE: 117.7263678173935


# Calculate RMSE if all negative predictions were 0
test_rmse_above_0 = rmse(y_test - np.maximum(0, linear_model.predict(X_test)))

print(f'Test RMSE if negative predictions were taken as 0: {test_rmse_above_0}')

Test RMSE if negative predictions were taken as 0: 68.15684052218472


# Calculate residuals
residuals = y_test - np.maximum(0, linear_model.predict(X_test))
#residuals = y_test - linear_model.predict(X_test)

# Plot residuals
fig, ax = plt.subplots(figsize=(8, 6))
ax = sns.regplot(y_test, residuals, scatter_kws={'s':1, 'alpha':0.1})
ax.set_xlabel('Hourly Rides (Test Data)')
ax.set_ylabel('Residuals (Actual Hourly Rides - Predicted')
ax.set_title('Residuals vs Hourly Rides on Test Data');


def forecast(date, wind_spd, temp, humidity):
    df = pd.DataFrame([pd.Timestamp(date, tz='EST')], columns=['Start Time'])
    df = df.set_index(df['Start Time'])
    df['month'] = df['Start Time'].dt.month
    df['dayofweek'] = df['Start Time'].dt.dayofweek
    df['hour'] = df['Start Time'].dt.hour
    
    df['wind_speed'] = wind_spd
    df['temp'] = temp
    df['humidity'] = humidity
    
    df['month_sin'] = np.sin((df.month-1)*(2.*np.pi/12))
    df['month_cos'] = np.cos((df.month-1)*(2.*np.pi/12))
    df['dayofweek_sin'] = np.sin(df.dayofweek*(2.*np.pi/7))
    df['dayofweek_cos'] = np.cos(df.dayofweek*(2.*np.pi/7))
    df['hr_sin'] = np.sin(df.hour*(2.*np.pi/24))
    df['hr_cos'] = np.cos(df.hour*(2.*np.pi/24))

    return linear_model.predict(create_features(df))

print('Sunday early morning, cold weather: ', forecast(date='2021-04-11 0400', wind_spd=50, temp=-15, humidity=5))
print('Wenesday end of work day, warm weather: ', forecast(date='2021-04-14 1700', wind_spd=5, temp=25, humidity=60))

Sunday early morning, cold weather:  [-270.32905166]
Wenesday end of work day, warm weather:  [241.94885331]

CIV1498 - Introduction to Data Science¶

Team https://xkcd.com/1838/¶

Feature Engineering¶

Processing¶

Model Selection¶

Multiple Linear Regression Model, Base Case¶

Cyclical Time¶

Period/Yearly Patterns¶

Other Models¶

Lasso Regression¶

Ridge Regression¶

Polynomial Regression¶

Testing the Model¶

Conclusions¶