#!/usr/bin/env python3
# In this solution, 2 linear regression
# models are combined using second stage
# linear regression.
import pandas as pd
from sklearn.linear_model import LinearRegression
def preprocess(raw):
    """Feature engineering for the mortality-rate model.

    Adds to a copy of *raw*:
      - one-hot indicator columns for 'region' (one column per region code),
      - 'year' and 'year^2' (trend and change of trend),
      - 'doy' (day of year) up to the 4th power (seasonal pattern),
      - 'T2M_rolling': 30-day rolling mean of T2M within each region.

    Expects columns 'region', 'date' (datetime-like) and 'T2M'.
    Returns a new DataFrame with rows containing NaN dropped; *raw* is
    not modified.
    """
    data = raw.copy()
    # Region indicator columns (e.g. E12000001 ... E12000008).
    one_hot_region = pd.get_dummies(data['region'])
    data = data.join(one_hot_region)
    # Year and year squared.
    data['year'] = data['date'].map(lambda x: x.year)
    data['year^2'] = data['year']**2
    # Day of year up to 4th power.
    data['doy'] = data['date'].map(lambda x: x.dayofyear)
    data['doy^2'] = data['doy']**2
    data['doy^3'] = data['doy']**3
    data['doy^4'] = data['doy']**4
    # 30-day rolling average of T2M, computed per region.
    # BUGFIX: pd.rolling_mean was deprecated in pandas 0.18 and later
    # removed; use the Series.rolling API. transform (rather than apply)
    # guarantees the result is aligned with the original row index.
    data['T2M_rolling'] = data.groupby('region')['T2M'].transform(
        lambda s: s.rolling(window=30, min_periods=1).mean())
    return data.dropna()
def _load_raw(path):
    # Parse 'date' as datetime and use the Id column as the index.
    return pd.read_csv(path, parse_dates=['date'], index_col='Id')

train_data_raw = _load_raw('train.csv')
test_data_raw = _load_raw('test.csv')
preprocess(train_data_raw).to_csv('train_preprocessed.csv')
preprocess(test_data_raw).to_csv('test_preprocessed.csv')
# Round-trip the engineered features through CSV so the pipeline stays
# bit-identical with the original solution, where preprocessing ran as
# a separate script.
train_data = pd.read_csv('train_preprocessed.csv', index_col='Id')
test_data = pd.read_csv('test_preprocessed.csv', index_col='Id')
# Naming scheme used below:
#   X0 - training predictors, y0 - training target
#   X1 - test predictors,     y1 - predictions
y0 = train_data['mortality_rate']
# Model A
#
# Like the benchmark solution it does not use region as a predictor.
# The main additions over the benchmark are:
#   - "day of year" up to the 4th power, to capture the seasonal pattern
#   - "year", to capture the trend
#   - "year" squared, to capture the apparent change in trend at the
#     end of the training period
# Dropping "PM10" and "PM25" gives less noisy predictions, which seem
# to perform better on the LB.
a_predictors = ['NO2', 'year', 'year^2', 'doy', 'doy^2', 'doy^3', 'doy^4']
a_X0 = train_data[a_predictors]
# fit() returns the estimator itself, so training and binding are one step.
a = LinearRegression().fit(a_X0, y0)
# Model B
#
# Uses the region indicators as predictors. CV improved over model A,
# but the LB score got worse. Later NO2 was moved to model A and the
# 30-day rolling average of T2M replaced raw T2M (less noise in the
# predictions).
b_predictors = [
    'T2M_rolling', 'year', 'year^2', 'doy', 'doy^2', 'doy^3', 'doy^4',
    'E12000001', 'E12000002', 'E12000003', 'E12000004',
    'E12000005', 'E12000006', 'E12000007', 'E12000008',
]
b_X0 = train_data[b_predictors]
b = LinearRegression().fit(b_X0, y0)
# Models that use region as a predictor performed well in CV but scored
# badly on the LB. Maybe damping the regional effect — without removing
# it completely — could help?
# Around this point CV was abandoned and LB feedback was used to tune
# the model instead.
# Model C: auxiliary model that estimates the mean mortality rate per
# region (region indicators only).
c_predictors = [
    'E12000001', 'E12000002', 'E12000003', 'E12000004',
    'E12000005', 'E12000006', 'E12000007', 'E12000008',
]
c_X0 = train_data[c_predictors]
c = LinearRegression().fit(c_X0, y0)
# Subtract 67% of C from B to damp the regional effect in B. The value
# 0.67 was settled on after trying a couple of alternatives on the LB.
region_effect_reduction = 0.67
# Model AB
#
# Produces the final solution: a second-stage linear regression whose
# two features are the predictions of A and of "reduced B"
# (B minus the damped regional mean from C).
reduced_b_train = b.predict(b_X0) - region_effect_reduction * c.predict(c_X0)
ab_X0 = pd.DataFrame({
    'A': a.predict(a_X0),
    'reduced B': reduced_b_train,
})
ab = LinearRegression().fit(ab_X0, y0)
# Apply the same two-stage pipeline to the test set.
a_X1 = test_data[a_predictors]
b_X1 = test_data[b_predictors]
c_X1 = test_data[c_predictors]
reduced_b_test = b.predict(b_X1) - region_effect_reduction * c.predict(c_X1)
ab_X1 = pd.DataFrame({
    'A': a.predict(a_X1),
    'reduced B': reduced_b_test,
})
ab_y1 = ab.predict(ab_X1)
# Save the predictions for submission, keyed by the test-set Id index.
predictions = pd.DataFrame(index=test_data.index)
predictions['mortality_rate'] = ab_y1
predictions.to_csv('predictions.csv')