LGBM Regression Template
時系列データの予測 (Kaggle Electric Price Prediction)
#data analysis libraries
import numpy as np
import pandas as pd
import math
#最大表示列数の指定(ここでは50列を指定)
pd.set_option('display.max_columns', 50)
#visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#machinelearning libraries
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import lightgbm as lgb
import category_encoders as ce
from sklearn.decomposition import PCA
import tqdm
from sklearn import preprocessing
import warnings
warnings.resetwarnings()
warnings.simplefilter('ignore', FutureWarning)
warnings.simplefilter('ignore', UserWarning)
warnings.simplefilter('ignore', DeprecationWarning)
0. Load_Data
# Requires azureml-core >= 1.0.72 and azureml-dataprep[pandas] >= 1.1.34
from azureml.core import Workspace, Dataset

# NOTE(review): workspace identifiers are hard-coded; consider moving them
# to configuration / environment variables.
subscription_id = '7c8a3124-bfd6-4ad6-9036-0ac6bc860830'
resource_group = 'Dept-PEN-ALG-D-RG'
workspace_name = 'algworkspace'
workspace = Workspace(subscription_id, resource_group, workspace_name)


def _fetch(name):
    """Load a registered Azure ML tabular dataset as a pandas DataFrame."""
    return Dataset.get_by_name(workspace, name=name).to_pandas_dataframe()


train = _fetch('sp-act-01')
test = _fetch('sp-act-02').reset_index(drop=True)
weather = _fetch('sp-act-03').reset_index(drop=True)
sample_sub = _fetch('sp-act-04').set_index("id")
1. Data Confirmation & Pre-Treatment
# Quick overview of every loaded frame: first rows, then (rows, cols) shape.
# Order: train, test, weather, sample submission.
for _df in (train, test, weather, sample_sub):
    display(_df.head())
    display(_df.shape)
1.1 Energy Dataset
# Summary statistics for every column (numeric and non-numeric).
train.describe(include="all")

# Columns that are entirely NaN/zero in this dataset, plus the provider's
# forecast columns: we want to predict from actual measurements only.
drop_columns = [
    'id',
    'generation fossil coal-derived gas',
    'generation fossil oil shale',
    'generation fossil peat',
    'generation geothermal',
    'generation hydro pumped storage aggregated',
    'generation marine',
    'generation wind offshore',
    'forecast wind offshore eday ahead',
    'total load forecast',
    'forecast solar day ahead',
    'forecast wind onshore day ahead',
]
train = train.drop(drop_columns, axis=1)
test = test.drop(drop_columns, axis=1)
#Confirm null data rate
def get_null_df(df: pd.DataFrame) -> pd.DataFrame:
    """Summarize missing data per column.

    Args:
        df (pd.DataFrame): dataframe to inspect
    Returns:
        pd.DataFrame: one row per column containing NaN, with the column
        name, a coarse type label, the NaN count, and the NaN percentage.
    """
    n_rows = len(df)
    # Collect rows in a list and build the frame once:
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
    records = []
    for col in df.columns[df.isna().any()]:
        # Any numeric dtype (int/float of any width) counts as "Numerical";
        # the original only recognized int64/float64.
        kind = "Numerical" if pd.api.types.is_numeric_dtype(df[col]) else "Categorical"
        nulls = int(df[col].isna().sum())
        records.append({'Column': col,
                        'Type': kind,
                        'Total NaN': nulls,
                        '%': (nulls / n_rows) * 100})
    # Explicit column list keeps the (possibly empty) frame shape stable.
    return pd.DataFrame(records, columns=['Column', 'Type', 'Total NaN', '%'])
print("Train_data")
display(get_null_df(train))
print("Test_data")
display(get_null_df(test))
# Fill remaining gaps by linear interpolation in the forward direction.
# (The original comment said "ffill", but the code actually interpolates.)
# NOTE(review): iloc[:, 2:] skips the first TWO columns — 'time' is
# re-inserted below, but the second column is dropped entirely. Confirm this
# is intended and not an off-by-one for iloc[:, 1:].
time_train=train["time"]
time_test=test["time"]
train=train.iloc[:,2:].interpolate(method='linear', limit_direction='forward', axis=0)
test=test.iloc[:,2:].interpolate(method='linear', limit_direction='forward', axis=0)
train.insert(0,"time",time_train)
test.insert(0,"time",time_test)
1.2 Weather Data
# Summary statistics for the raw weather frame (all dtypes).
weather.describe(include="all")
def get_unique_df(df: pd.DataFrame) -> pd.DataFrame:
    """List the unique values of every column.

    Args:
        df (pd.DataFrame): dataframe to inspect
    Returns:
        pd.DataFrame: one row per column, with the array of unique values
        and how many there are.
    """
    # Collect rows in a list and build the frame once:
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
    records = []
    for col in df.columns:
        uniques = df[col].unique()
        records.append({'Feature': col,
                        'Unique': uniques,
                        'Count': len(uniques)})
    return pd.DataFrame(records, columns=['Feature', 'Unique', 'Count'])
get_unique_df(weather)

# 'year' carries no information the timestamp does not already have.
drop_columns = ['year']
weather = weather.drop(drop_columns, axis=1)

#Confirm null data rate
print("Weather_data")
display(get_null_df(weather))

# Some rows carry a leading space in the city name; normalize it.
weather = weather.replace(" Barcelona", "Barcelona")

# Row count per city — the smallest one bounds the merged frame.
city_name = ["Valencia", "Madrid", "Bilbao", "Barcelona", "Seville"]
for city in city_name:
    n_rows = len(weather[weather.city_name == city].reset_index(drop=True))
    print(f"{city}:{n_rows}")
#make all city weather data frame
#adjust time frame to Valencia(Min data number)
def make_weather_dataset(df, city_name=("Madrid", "Bilbao", "Barcelona", "Seville", "Valencia")):
    """Pivot per-city weather rows into one wide frame, one column set per city.

    Args:
        df (pd.DataFrame): weather data with 'dt_iso' (assumed to be the
            first column — TODO confirm) and 'city_name' columns.
        city_name (sequence): cities to include. The LAST city's timestamps
            become the 'time' column (Valencia has the fewest rows, hence
            it is last in the default).
    Returns:
        pd.DataFrame: city-suffixed weather columns plus a deduplicated
        'time' column; rows align by position across cities.
    """
    df_concat = pd.DataFrame()
    for city in city_name:
        # BUG FIX: the original read the global `weather` here instead of
        # the `df` parameter, making the argument dead.
        city_df = df[df.city_name == city].reset_index(drop=True)
        dt_iso = city_df["dt_iso"]
        city_df = city_df.drop(columns="city_name")
        city_df.columns = [str(col) + '_' + city for col in city_df.columns]
        # Skip the first (dt_iso_<city>) column; only weather values remain.
        df_concat = pd.concat([df_concat, city_df.iloc[:, 1:]], axis=1)
    # dt_iso intentionally holds the LAST city's timestamps after the loop.
    df_concat.insert(0, "time", dt_iso)
    df_concat = df_concat[~df_concat["time"].isnull()]
    df_concat = df_concat.drop_duplicates(subset=["time"]).reset_index(drop=True)
    return df_concat
weather_concat=make_weather_dataset(weather)
1.3 Merge Two Datasets
# Convert timestamps so the merge key dtypes match.
train["time"] = pd.to_datetime(train["time"])
weather_concat["time"] = pd.to_datetime(weather_concat["time"])
# NOTE(review): test["time"] is NOT converted here — presumably it is
# already datetime; verify, otherwise the left join below yields all-NaN.

# Left-join the weather columns onto the energy data by timestamp.
train_all = pd.merge(train, weather_concat, on="time", how="left")
test_all = pd.merge(test, weather_concat, on="time", how="left")

# Sanity check: row counts must be unchanged, only columns were added.
display(train.shape)
display(train_all.shape)
display(test.shape)
display(test_all.shape)
2. Visualization and Time Series Analysis
#Confirm Distribution of train & test dataset
def get_numeric_features_plot(train: pd.DataFrame, test: pd.DataFrame, cont_features: list, height, figsize, hspace=.3):
    """Plot train vs. test distributions of numeric features side by side.

    Args:
        train (pd.DataFrame): train_df
        test (pd.DataFrame): test_df
        cont_features (list): numeric features to plot
        height: per-subplot height in inches
        figsize: font size used for labels, ticks, legend, title
        hspace (float, optional): vertical space between rows. Defaults to .3.
    """
    ncols = 2
    nrows = int(math.ceil(len(cont_features) / 2))
    # Creates the figure; individual axes are selected via plt.subplot below,
    # so the returned (fig, axs) pair is not needed.
    plt.subplots(ncols=ncols, nrows=nrows, figsize=(height * 2, height * nrows))
    plt.subplots_adjust(right=1.5, hspace=hspace)
    for i, feature in enumerate(cont_features):
        plt.subplot(nrows, ncols, i + 1)
        # Overlay train/test densities for a quick covariate-shift check.
        # NOTE(review): sns.distplot is deprecated (removed in seaborn >= 0.14);
        # migrate to sns.histplot(..., kde=True, stat="density") when upgrading.
        sns.distplot(train[feature], label='Train',
                     hist=True, color='#e74c3c')
        sns.distplot(test[feature], label='Test',
                     hist=True, color='#2ecc71')
        plt.xlabel('{}'.format(feature), size=figsize, labelpad=15)
        plt.ylabel('Density', size=figsize, labelpad=15)
        plt.tick_params(axis='x', labelsize=figsize)
        plt.tick_params(axis='y', labelsize=figsize)
        # Fix: the original called plt.legend twice with identical arguments.
        plt.legend(loc='upper right', prop={'size': figsize})
        plt.title('Distribution of {} Feature'.format(
            feature), size=figsize, y=1.05)
    plt.show()
#Get only float and int columns
# BUG FIX: `include='float' or 'int'` evaluates to just 'float' (Python `or`
# on non-empty strings returns the first), so integer columns were silently
# excluded. Pass the list of both dtypes instead.
cont_features = list(train_all.select_dtypes(include=['float', 'int']).columns)
cont_features.remove("price actual")
get_numeric_features_plot(train_all, test_all, cont_features=cont_features, height=4, figsize=10, hspace=.7)
# Categorical Features
def get_categorical_features_plot(train: pd.DataFrame, test: pd.DataFrame, cat_features: list, height, figsize, hspace=.3):
    """Plot train vs. test category counts for categorical features.

    Args:
        train (pd.DataFrame): train_df
        test (pd.DataFrame): test_df
        cat_features (list): categorical features to plot
        height: per-subplot height in inches
        figsize: font size used for labels, ticks, legend, title
        hspace (float, optional): vertical space between rows. Defaults to .3.
    """
    ncols = 2
    nrows = int(math.ceil(len(cat_features) / 2))
    # BUG FIX: work on copies — the original added a "type" column to the
    # caller's dataframes as a side effect, polluting downstream features.
    train = train.copy()
    test = test.copy()
    train["type"] = "train"
    test["type"] = "test"
    whole_df = pd.concat([train, test], axis=0).reset_index(drop=True)
    # Creates the figure; axes are selected via plt.subplot below.
    plt.subplots(ncols=ncols, nrows=nrows, figsize=(height * 2, height * nrows))
    plt.subplots_adjust(right=1.5, hspace=hspace)
    for i, feature in enumerate(cat_features):
        plt.subplot(nrows, ncols, i + 1)
        # Per-category counts, hue-split by train/test origin.
        ax = sns.countplot(data=whole_df, x=feature, hue="type")
        ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
        plt.xlabel('{}'.format(feature), size=figsize, labelpad=15)
        plt.ylabel('Density', size=figsize, labelpad=15)
        plt.tick_params(axis='x', labelsize=figsize)
        plt.tick_params(axis='y', labelsize=figsize)
        # Fix: the original called plt.legend twice with identical arguments.
        plt.legend(loc='upper right', prop={'size': figsize})
        plt.title('Count of {} Feature'.format(feature), size=figsize, y=1.05)
    plt.show()
#Get only object columns
cat_features=list(train_all.select_dtypes(include='object').columns)
# Compare categorical value counts between train and test.
get_categorical_features_plot(train_all,test_all,cat_features=cat_features,height=8,figsize=10, hspace=1.3)
# Autocorrelation over the full training period (3 years, 2015-2017):
# weekly rolling mean on top, then ACF and PACF up to 50 lags.
weekly_mean = train_all['price actual'].rolling(24 * 7, center=True).mean()
fig, (ax1, ax2, ax3) = plt.subplots(nrows=3, figsize=(15, 8))
ax1.plot(weekly_mean, linestyle='-', linewidth=2, label='Weekly rolling mean')
ax1.set_title("price actual")
plot_acf(train_all['price actual'], lags=50, ax=ax2)
plot_pacf(train_all['price actual'], lags=50, ax=ax3)
plt.tight_layout()
plt.show()
# Same autocorrelation view restricted to the first year
# (8760 hourly rows = 2015).
train_2015 = train_all.iloc[:8760, :]
weekly_mean_2015 = train_2015['price actual'].rolling(24 * 7, center=True).mean()
fig, (ax1, ax2, ax3) = plt.subplots(nrows=3, figsize=(15, 8))
ax1.plot(weekly_mean_2015, linestyle='-', linewidth=2, label='Weekly rolling mean')
ax1.set_title("price actual")
plot_acf(train_2015['price actual'], lags=50, ax=ax2)
plot_pacf(train_2015['price actual'], lags=50, ax=ax3)
plt.tight_layout()
plt.show()
# Find features strongly correlated (|r| > 0.7) with the electricity price
# and with each other; `highly_correlated` is reused by make_lag below.
# NOTE(review): pandas >= 2.0 requires numeric_only=True here if the frame
# still contains datetime/object columns.
correlations = train_all.corr(method='pearson')
highly_correlated = correlations[correlations > 0.7].abs()
# Drop the trivial 1.0 self-correlations before printing the pairs.
print(highly_correlated[highly_correlated < 1.0].stack())
3. Feature Engineering
Parameters
class CONFIG:
    """Experiment-wide hyperparameters shared across the notebook."""
    exp = "exp11"   # experiment tag, used in the submission filename
    lag = 168       # number of hourly lag features (168 h = 1 week)
    fold = 3        # TimeSeriesSplit fold count
    depth = 3       # LightGBM max_depth
3.1 Time Features
## Make Time features by sin/cos
def make_time(input_df):
    """Build cyclic calendar features from the 'time' column.

    Each period (month position within 31 days, yearly/quarterly/half-yearly/
    two-monthly position within 12 months) is encoded as a sin/cos pair so
    the cycle wraps around smoothly.

    Args:
        input_df (pd.DataFrame): frame with a datetime64 'time' column
    Returns:
        pd.DataFrame: calendar + cyclic features; no 'time' column.
    """
    t = input_df["time"]
    output_df = pd.DataFrame()
    # Vectorized .dt accessors replace the original row-wise .apply lambdas.
    output_df["month"] = t.dt.month
    output_df["day"] = t.dt.day
    output_df["year"] = t.dt.year
    two_pi = 2 * np.pi
    # Monthly trend by day-of-month (31-day cycle)
    output_df["month_sin"] = np.sin(output_df["day"] / 31 * two_pi)
    output_df["month_cos"] = np.cos(output_df["day"] / 31 * two_pi)
    # Yearly trend by month
    output_df["year_sin"] = np.sin(output_df["month"] / 12 * two_pi)
    output_df["year_cos"] = np.cos(output_df["month"] / 12 * two_pi)
    # Quarterly trend by month
    # (column names keep the original "quater" spelling: downstream feature
    # lists refer to them generically, but renaming would change saved models)
    output_df["quater_sin"] = np.sin(output_df["month"] / 4 * two_pi)
    output_df["quater_cos"] = np.cos(output_df["month"] / 4 * two_pi)
    # Half-yearly trend by month
    output_df["half_sin"] = np.sin(output_df["month"] / 6 * two_pi)
    output_df["half_cos"] = np.cos(output_df["month"] / 6 * two_pi)
    # Two-month trend by month
    output_df["2month_sin"] = np.sin(output_df["month"] / 2 * two_pi)
    output_df["2month_cos"] = np.cos(output_df["month"] / 2 * two_pi)
    return output_df
3.2 Make Lag Feature
def make_lag(input_df, targets=None, lag=None):
    """Create lag features for the strongly price-correlated columns.

    Args:
        input_df (pd.DataFrame): source frame
        targets (list, optional): columns to lag. Defaults to the columns
            highly correlated with 'price actual' (global `highly_correlated`),
            excluding the target itself.
        lag (int, optional): number of hourly lags. Defaults to CONFIG.lag.
    Returns:
        pd.DataFrame: only the lag columns (<col>_lag1 .. <col>_lag<lag>),
        with the NaNs introduced by shifting filled with 0.
    """
    if targets is None:
        # Select only highly correlated columns for lag features
        targets = list(highly_correlated[highly_correlated < 1.0].dropna(how="all").index)
        targets.remove("price actual")
    if lag is None:
        lag = CONFIG.lag
    # Build every shifted series in one dict and concat once: the original
    # inserted thousands of columns one by one, fragmenting the frame.
    lag_cols = {
        f"{target}_lag{i}": input_df[target].shift(i)
        for target in targets
        for i in range(1, lag + 1)
    }
    return pd.concat(lag_cols, axis=1).fillna(0)
3.3 Encoding
def label_encoding(input_df):
    """Ordinal-encode every object column.

    Returns only the LE_-prefixed columns, cast to str so they are later
    picked up as categorical features.
    """
    target_cols = list(input_df.select_dtypes(include='object').columns)
    values = input_df[target_cols].values
    encoded = pd.DataFrame(ce.OrdinalEncoder().fit(values).transform(values))
    encoded.columns = target_cols
    encoded = encoded.add_prefix("LE_").astype(str)
    # Concat then slice off the tail, mirroring the other encoders' shape.
    combined = pd.concat([input_df, encoded], axis=1)
    return combined.iloc[:, -len(target_cols):]
def count_encoding(input_df):
    """Frequency-encode every object column; returns only the CE_* columns."""
    target_cols = list(input_df.select_dtypes(include='object').columns)
    values = input_df[target_cols].values
    encoded = pd.DataFrame(ce.CountEncoder().fit(values).transform(values))
    encoded.columns = target_cols
    encoded = encoded.add_prefix("CE_")
    # Concat then slice off the tail, mirroring the other encoders' shape.
    combined = pd.concat([input_df, encoded], axis=1)
    return combined.iloc[:, -len(target_cols):]
def target_encoding(input_df):
    """Target-encode object columns against 'price actual'; returns TE_* columns.

    NOTE(review): fitting on the full frame (train+test concatenated upstream)
    leaks target statistics into the features; consider fold-wise fitting.
    """
    target_cols = list(input_df.select_dtypes(include='object').columns)
    values = input_df[target_cols].values
    y = input_df["price actual"]
    encoded = pd.DataFrame(ce.TargetEncoder().fit(values, y).transform(values))
    encoded.columns = target_cols
    encoded = encoded.add_prefix("TE_")
    # Concat then slice off the tail, mirroring the other encoders' shape.
    combined = pd.concat([input_df, encoded], axis=1)
    return combined.iloc[:, -len(target_cols):]
3.4 Apply Feature Engineering to Dataset
def preprocess(train, test):
    """Run all feature-engineering steps and split back into train/test.

    Train and test are concatenated first so every transform sees a
    consistent column space, then re-split by the original train length.
    """
    input_df = pd.concat([train, test]).reset_index(drop=True)
    funcs = [make_time,
             label_encoding,
             count_encoding,
             target_encoding,
             make_lag]
    feature_frames = [func(input_df) for func in tqdm.tqdm(funcs)]
    output = pd.concat([input_df] + feature_frames, axis=1)
    # Raw object columns are dropped; their encoded versions remain.
    drop_cols = list(input_df.select_dtypes(include='object').columns)
    output = output.drop(columns=drop_cols)
    train_x = output.iloc[:len(train)]
    test_x = output.iloc[len(train):].reset_index(drop=True)
    # The test split has no ground truth; remove the (all-NaN) target column.
    test_x = test_x.drop(columns=["price actual"])
    return train_x, test_x
train_x, test_x=preprocess(train_all,test_all)
4. Model Development
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26305 entries, 0 to 26304
Columns: 3162 entries, time to temp_max_Valencia_lag168
dtypes: datetime64[ns](1), float64(3120), int64(25), object(16)
memory usage: 634.6+ MB
#Get Categorical features
# Label-encoded columns were stored as strings; LightGBM needs them as int.
cat_col = list(train_x.select_dtypes(include='object').columns)
train_x[cat_col] = train_x[cat_col].astype(int)
test_x[cat_col] = test_x[cat_col].astype(int)

#Get Columns for Predict
# Every column except the target and the timestamp is a model feature.
pred_col = [c for c in train_x.columns if c not in ("price actual", "time")]
# ====================================================
# Model Preparation
# ====================================================
class SingleLgb:
    """Thin wrapper around lgb.train with fixed regression parameters."""

    def __init__(self, cat_col, seed=71, dry_run=False):
        self.train_param = self.get_param(seed)
        # Dry runs cap the boosting rounds to keep iteration fast.
        self.num_rounds = 2000 if dry_run else 10000
        self.cat_col = cat_col

    def do_train_direct(self, x_train, x_test, y_train, y_test):
        """Train on (x_train, y_train), early-stopping on (x_test, y_test)."""
        train_set = lgb.Dataset(x_train, y_train)
        valid_set = lgb.Dataset(x_test, y_test)
        print('Start training...')
        # NOTE(review): verbose_eval / early_stopping_rounds are keyword args
        # of lightgbm < 4; lightgbm >= 4 moved them to callbacks
        # (log_evaluation / early_stopping).
        model = lgb.train(self.train_param,
                          train_set,
                          valid_sets=[valid_set],
                          verbose_eval=100,
                          num_boost_round=self.num_rounds,
                          early_stopping_rounds=100,
                          categorical_feature=self.cat_col
                          )
        print('End training...')
        return model

    @staticmethod
    def show_feature_importance(model, filename=None):
        """Print split/gain feature importances, sorted by gain."""
        importance_df = pd.DataFrame({
            "name": model.feature_name(),
            "importance_split": model.feature_importance(importance_type="split").astype(int),
            "importance_gain": model.feature_importance(importance_type="gain").astype(int),
        }).sort_values(by="importance_gain", ascending=False)
        print(importance_df)

    @staticmethod
    def get_param(seed=71):
        """LightGBM regression parameters (tree depth comes from CONFIG)."""
        return {
            'num_leaves': 1023,
            'min_data_in_leaf': 50,
            'objective': 'regression',
            'metric': 'rmse',
            'max_depth': CONFIG.depth,
            'learning_rate': 0.05,
            "boosting": "gbdt",
            "feature_fraction": 0.9,
            "verbosity": -1,
            "random_state": seed,
        }
class SingleTrainer:
    """Cross-validated LightGBM training for the electricity-price task."""

    def __init__(self, pred_col, cat_col, dry_run=False):
        self.pred_col = pred_col
        self.target_col = 'price actual'
        self.dry_run = dry_run
        self.cat_col = cat_col

    def adjust_price(self, x):
        """Snap every predicted price to the nearest multiple of 25."""
        adjusted = []
        for value in x:
            remainder = value % 25
            # Round up from the midpoint, otherwise round down.
            if remainder >= 12.5:
                adjusted.append(value - remainder + 25)
            else:
                adjusted.append(value - remainder)
        return adjusted

    def train_model(self, df):
        """Train one model per TimeSeriesSplit fold.

        Returns:
            tuple: (list of trained boosters, mean RMSE across folds)
        """
        X = df[self.pred_col]
        y = df[self.target_col]
        splitter = TimeSeriesSplit(n_splits=CONFIG.fold)
        models = []
        scores = []
        for fold, (train_idx, test_idx) in enumerate(splitter.split(X)):
            print("---------")
            print("fold=", fold)
            X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]
            print(X_train.shape, X_val.shape)
            lgbm = SingleLgb(seed=71, dry_run=self.dry_run, cat_col=self.cat_col)
            model = lgbm.do_train_direct(X_train, X_val, y_train, y_val)
            y_pred = self.adjust_price(model.predict(X_val))
            score = np.sqrt(mean_squared_error(y_val, y_pred))
            if fold == 0:
                # Importances are representative enough; print them once.
                lgbm.show_feature_importance(model)
            models.append(model)
            scores.append(score)
            print(f'fold= {fold} RMSE Score=' + str(score))
        return models, np.mean(scores)
5. Training
trainer = SingleTrainer(pred_col, cat_col=cat_col, dry_run=False)
models, score = trainer.train_model(train_x)
print(score)

# Average the per-fold models' predictions on the training data.
preds_train = [m.predict(train_x[pred_col]) for m in models]
pred_train = np.mean(preds_train, axis=0)

# Compare predicted vs. actual price distributions.
fig, ax = plt.subplots(figsize=(8, 8))
sns.histplot(train_x["price actual"], label='Actual', ax=ax, color='black')
sns.histplot(pred_train, label='Predict', ax=ax, color='C1')
ax.legend()
ax.grid()
#yy-plot
def yy_plot(y_obs, y_pred):
    """Scatter observed vs. predicted values with the y=x reference line."""
    combined = np.concatenate([y_obs, y_pred])
    vmin, vmax, vrange = np.amin(combined), np.amax(combined), np.ptp(combined)
    plt.figure(figsize=(8, 8))
    plt.scatter(y_obs, y_pred, alpha=.6)
    # Reference diagonal, padded by 1% of the value range on both ends.
    lo = vmin - vrange * 0.01
    hi = vmax + vrange * 0.01
    plt.plot([lo, hi], [lo, hi])
    plt.xlabel('y_observed', fontsize=24)
    plt.ylabel('y_predicted', fontsize=24)
    plt.title('Observed-Predicted Plot', fontsize=24)
    plt.tick_params(labelsize=16)

yy_plot(y_obs=train_x["price actual"].values, y_pred=pred_train)
6. Inference
# Average the fold models' predictions on the test set.
preds = [m.predict(test_x[pred_col]) for m in models]
pred = np.mean(preds, axis=0)

# Attach the prediction to a copy of the sample submission frame.
sub_df = sample_sub.copy()
sub_df["predicted"] = pred
7. Make Submit File
# Write the submission CSV (index is the sample-submission id column).
sub_df.to_csv(f"submit_{CONFIG.exp}.csv")