This is a mini post that is part of this project. (Originally posted here.)

Take a quick look at the time-of-day distribution

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
datadir = '/opt/data'
localdir = '/opt/program'
tripsdf = pd.read_csv(f'{datadir}/2013-07 - Citi Bike trip data.csv')
stationsdf = pd.read_csv(f'{localdir}/datas/stations/stations-2018-12-04-c.csv',
                        index_col=0)
tripsdf.iloc[0]
tripduration                               634
starttime                  2013-07-01 00:00:00
stoptime                   2013-07-01 00:10:34
start station id                           164
start station name             E 47 St & 2 Ave
start station latitude                 40.7532
start station longitude               -73.9703
end station id                             504
end station name               1 Ave & E 15 St
end station latitude                   40.7322
end station longitude                 -73.9817
bikeid                                   16950
usertype                              Customer
birth year                                  \N
gender                                       0
Name: 0, dtype: object
tripsdf['starttime'].map(lambda x: x[11:13]).iloc[:10]
0    00
1    00
2    00
3    00
4    00
5    00
6    00
7    00
8    00
9    00
Name: starttime, dtype: object
tripsdf['hour'] = tripsdf['starttime'].map(lambda x: x[11:13])
# Across all days, it looks like there may be two peaks

fig = plt.figure(figsize=(6, 6))
fig.patch.set_facecolor('xkcd:mint green')
ax = fig.add_subplot(111, )

ax.hist(tripsdf.hour.tolist(), bins=24)
plt.grid(True)
# Perhaps weekdays look different?
import fresh.utils as fu

fu.prepare_weekday_feature(tripsdf)
fig = plt.figure(figsize=(6, 6))
fig.patch.set_facecolor('xkcd:mint green')
ax = fig.add_subplot(111, )
ax.hist(tripsdf[tripsdf.weekday == True].hour.tolist(), bins=24)
ax.set_title('Weekday hour histogram')
plt.grid(True)
# Weekends: wow, a big difference.
fig = plt.figure(figsize=(6, 6))
fig.patch.set_facecolor('xkcd:mint green')
ax = fig.add_subplot(111, )
ax.hist(tripsdf[tripsdf.weekday == False].hour.tolist(), bins=24)
ax.set_title('Weekend hour histogram')
plt.grid(True)
# Ok based on the above, going to create a version 2...
# time_of_day_v2_peaky
# Not sure how to account for the different peaks on weekends
#     # 0: 6-10, 1: 11-15, 2: 16-20, 3: 21-00, 00-5

Age

  • Somehow I did not include age this time around but I should.
tripsdf[['usertype', 'birth year']].iloc[:5]
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

# Ok should basically fill these as np.nan for 'Customer'
dict(tripsdf[tripsdf.usertype=='Customer']['birth year'].value_counts())
{'\\N': 174887, '1995': 28}
# Luckily no nulls
tripsdf[(tripsdf.usertype=='Subscriber')& (tripsdf['birth year'].isnull())].shape
(0, 18)
def make_xtick_labels(x, step=5):
    '''Thin out axis labels by keeping only every <step>th entry of x.

    Returns a pair (positions, labels): the kept positions (0, step,
    2*step, ...) and the entries of x found at those positions.
    '''
    tick_positions = []
    tick_labels = []
    for position in range(len(x)):
        if position % step:
            # Not a multiple of step: this label is skipped.
            continue
        tick_positions.append(position)
        tick_labels.append(x[position])
    return tick_positions, tick_labels
import numpy as np
# Parse 'birth year' into numbers. The raw CSV marks a missing year with the
# literal string '\N'; blank those out first, then let pandas do the numeric
# conversion. Unlike calling int() on every element, this also accepts values
# that are already numeric or already NaN, so re-running this cell on the
# converted column no longer fails with "cannot convert float NaN to integer".
# Any other unparseable value still raises (pd.to_numeric default).
raw_birth_year = tripsdf['birth year']
tripsdf['birth year'] = pd.to_numeric(raw_birth_year.mask(raw_birth_year == '\\N'))
# TODO: replace birth years below 1913 with np.nan.
tripsdf[(tripsdf.usertype=='Subscriber')& (tripsdf['birth year'] < 1913)].shape
(226, 18)
# Birth years of subscribers only, keeping years after 1913
# (NaN > 1913 is False, so missing values drop out too).
is_subscriber = tripsdf.usertype == 'Subscriber'
subscriber_birth_years = tripsdf[is_subscriber]['birth year'].tolist()
X = [year for year in subscriber_birth_years if year > 1913]
print(len(X))

fig = plt.figure(figsize=(12, 6))
fig.patch.set_facecolor('xkcd:mint green')
ax = fig.add_subplot(111)

# Only the first 10000 values are drawn in this histogram.
plotted_years = X[:10000]
ax.hist(plotted_years, bins=70)

# x_ticks, x_labels = make_xtick_labels(x[:1000], step=20)
# ax.set_xticks(x_ticks)
# ax.set_xticklabels(x_labels, rotation=-45)

ax.set_title('Birth year binned (2013-07)')
plt.grid(True)
fig.show()
668262
# OK, this looks roughly normal, so we might as well split it somewhat arbitrarily or evenly.

def get_quantiles(unsorted):
    '''Return the five-number summary of a collection of numbers.

    unsorted: any iterable of numbers; the order does not matter.

    Returns the list [minimum, Q1, median, Q3, maximum], where Q1 and Q3
    are the 25th and 75th percentiles computed with the 'midpoint' rule
    (the mean of the two neighbouring data points when the percentile
    falls between them).

    Raises IndexError if the input is empty.
    '''
    data = sorted(unsorted)
    minimum, maximum = data[0], data[-1]
    try:
        # NumPy >= 1.22 names this option `method`; the older
        # `interpolation` keyword is deprecated there.
        Q1, Q3 = np.percentile(data, [25, 75], method='midpoint')
    except TypeError:
        # Older NumPy only knows the `interpolation` keyword.
        Q1, Q3 = np.percentile(data, [25, 75], interpolation='midpoint')
    median = np.median(data)
    return [minimum, Q1, median, Q3, maximum]

def show_da_stats(bundle):
    '''Draw a histogram outline together with its quartile markers.

    bundle is a dict with three entries:
      'hist'      -- a (counts, bin_edges) pair, e.g. the result of
                     np.histogram
      'quantiles' -- five values; all five are scattered at y=1 and the
                     ones at index 1, 2 and 3 become the 25%/50%/75%
                     vertical lines
      'title'     -- title text for the axes
    '''
    counts, edges = bundle['hist']
    five_numbers = bundle['quantiles']

    fig = plt.figure(figsize=(12, 6))
    fig.patch.set_facecolor('xkcd:mint green')
    ax = fig.add_subplot(111)

    ax.scatter(five_numbers, [1] * 5)
    for index, tag in ((1, 'q:25%'), (2, 'q:50%'), (3, 'q:75%')):
        ax.axvline(five_numbers[index], label=tag)

    ax.set_title(bundle['title'])

    # There is one more bin edge than there are counts, so the first count
    # is repeated to give the step line a y value for every edge.
    step_heights = np.insert(counts, 0, counts[0])
    ax.plot(edges, step_heights, drawstyle='steps', color='green')

    plt.grid(True)
    fig.legend()
    fig.show()
    
#
hist = np.histogram(X, bins=100, range=None)

quantiles = get_quantiles(X)
bundle = {'hist': hist, 'quantiles': quantiles,
               'title': 'Birth year binned (2013-07)'}
print('quantiles', quantiles)
show_da_stats(bundle)
quantiles [1920.0, 1969.0, 1978.0, 1984.0, 1997.0]
# And the age quantiles, to make this a year-independent feature.
2013 - np.array([1920.0, 1969.0, 1978.0, 1984.0, 1997.0])
array([93., 44., 35., 29., 16.])
import datetime; import pytz
x = dict(tripsdf.iloc[2])
x

(x['start_dt'] - 
     datetime.datetime(int(x['birth year']), 1, 1, tzinfo=pytz.timezone('US/Eastern'))
).days/365. , (x['start_dt'] - np.nan)
---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-102-ee3bddc50852> in <module>
      5 (x['start_dt'] - 
      6      datetime.datetime(int(x['birth year']), 1, 1, tzinfo=pytz.timezone('US/Eastern'))
----> 7 ).days/365. , (x['start_dt'] - np.nan)


TypeError: unsupported operand type(s) for -: 'Timestamp' and 'float'
x['start_dt'] - pd.NaT
NaT
# Start again from the raw CSV: the 'birth year' column of the in-memory
# frame was overwritten earlier in this notebook.
tripsdf = pd.read_csv(f'{datadir}/2013-07 - Citi Bike trip data.csv')

# `reload` is not a builtin in Python 3, so go through importlib explicitly
# to re-execute fresh.utils without restarting the kernel.
import importlib
importlib.reload(fu)

# Try the feature helpers on a small copy so the full frame stays untouched.
minidf = tripsdf.iloc[:1000].copy()
fu.prepare_weekday_feature(minidf)
fu.age_feature(minidf)
list(reversed([93., 44., 35., 29., 16.]))
[16.0, 29.0, 35.0, 44.0, 93.0]
# Bucket age into the quartile ranges found above (edges at ages 16/29/35/44/93).
# pd.cut intervals are open on the left by default, which would turn riders
# aged exactly 16 (the lowest edge) into NaN; include_lowest=True closes the
# first interval so they land in bin 0. Ages outside 16-93 still become NaN.
minidf['birth_bin'] = pd.cut(minidf['age'], bins=[16.0, 29.0, 35.0, 44.0, 93.0],
                            labels=[0, 1, 2, 3], include_lowest=True)
minidf[['age', 'birth', 'birth_bin']].iloc[:10]
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

Prepare to build the v3 proc bundle

  • I have a nice reference for testing
  • And for building, I need to do some reverse engineering to maintain the train/test split from earlier.
import fresh.predict_utils as fpu

bundle = fpu.load_bundle_in_docker()
Loading from bundle_loc /opt/ml/model/all_bundle_with_stationsdf.joblib
print('original proc bundle notebook', bundle['proc_bundle']['bundle']['notebook'])
bundle['proc_bundle']['bundle']['proc_bundle'].keys()
print('this is the bundle glue notebook, cool', bundle['notebook'])
print('model notebook', bundle['model_bundle']['bundle']['notebook'])
print('train', bundle['model_bundle']['bundle']['train'])
print('test', bundle['model_bundle']['bundle']['validation_metrics']['test'])
original proc bundle notebook 2020-07-03-aws.ipynb
this is the bundle glue notebook, cool 2020-08-18-glue.ipynb
model notebook 2020-07-10-aws.ipynb
train /home/ec2-user/SageMaker/learn-citibike/artifacts/2020-07-08T143732Z/train.libsvm
test /home/ec2-user/SageMaker/learn-citibike/artifacts/2020-07-08T143732Z/test.libsvm
import fresh.preproc.v3 as pv3
!pwd