LGBM Classification Template
分類(2値分類)
必要モジュールの読み込み
import numpy as np
import pandas as pd
import glob
import time
from tqdm import tqdm
from sklearn import model_selection, metrics
import lightgbm as lgb
データの確認
# Load the exercise-1 data: training set, test set, and submission format.
train_df = pd.read_csv("../input/uec2021-exercise-1/train.csv")
test_df = pd.read_csv("../input/uec2021-exercise-1/test.csv")
sample_submission_df = pd.read_csv("../input/uec2021-exercise-1/sample_submission.csv")
# Notebook-style display expressions (no effect when run as a plain script).
train_df.shape, test_df.shape, sample_submission_df.shape
train_df.head(2),  # NOTE(review): trailing comma wraps this in a 1-tuple — likely a typo
test_df.head(2)
sample_submission_df.head(2)
モデルの作成
class SingleLgb:
    """Thin wrapper around LightGBM training for binary classification."""

    def __init__(self, seed=99, dry_run=False):
        """Build the training configuration.

        seed: random seed forwarded to LightGBM.
        dry_run: when True, use a small round budget for quick smoke tests.
        """
        self.train_param = self.get_param(seed)
        self.num_rounds = 100 if dry_run else 4000

    def do_train_direct(self, x_train, x_test, y_train, y_test):
        """Train one booster on (x_train, y_train), early-stopping on the
        (x_test, y_test) split, and return the fitted model."""
        lgb_train = lgb.Dataset(x_train, y_train)
        lgb_eval = lgb.Dataset(x_test, y_test)
        # The `verbose_eval` / `early_stopping_rounds` keyword arguments were
        # removed from lgb.train in LightGBM 4.0; the callbacks below are the
        # supported equivalent (same 400-round logging / patience as before).
        model = lgb.train(
            self.train_param,
            lgb_train,
            valid_sets=[lgb_eval],
            num_boost_round=self.num_rounds,
            callbacks=[
                lgb.early_stopping(stopping_rounds=400),
                lgb.log_evaluation(period=400),
            ],
            # categorical_feature=[] could be passed explicitly if needed.
        )
        return model

    @staticmethod
    def show_feature_importance(model, filename=None):
        """Print split/gain feature importances, highest gain first.

        filename is accepted for interface compatibility but unused here.
        """
        fi = pd.DataFrame({
            "name": model.feature_name(),
            "importance_split": model.feature_importance(importance_type="split").astype(int),
            "importance_gain": model.feature_importance(importance_type="gain").astype(int),
        })
        fi = fi.sort_values(by="importance_gain", ascending=False)
        print(fi)

    @staticmethod
    def get_param(seed=99):
        """Return the LightGBM parameter dict for binary AUC training."""
        return {
            'num_leaves': 1023,
            'min_data_in_leaf': 50,
            'objective': 'binary',
            'metric': 'AUC',
            'max_depth': -1,
            'learning_rate': 0.05,
            "boosting": "gbdt",
            "feature_fraction": 0.9,
            "verbosity": -1,
            "random_state": seed,
        }
class SingleTrainer:
    """Cross-validated trainer: fits one SingleLgb model per KFold split."""

    def __init__(self, pred_col, dry_run=False):
        # Feature column names used as predictors.
        self.pred_col = pred_col
        # Binary target column name.
        self.target_col = "is_canceled"
        self.dry_run = dry_run
        # Kept for interface compatibility; not read by train_model.
        self.val_size = 1000 * 1000

    def train_model(self, df):
        """Train one model per 4-fold split; return (models, mean AUC)."""
        features = df[self.pred_col]
        target = df[self.target_col]
        splitter = model_selection.KFold(n_splits=4)
        models = []
        scores = []
        for fold, (tr_idx, va_idx) in enumerate(splitter.split(features)):
            print("---------")
            print("fold=", fold)
            X_train = features.iloc[tr_idx]
            X_val = features.iloc[va_idx]
            y_train = target.iloc[tr_idx]
            y_val = target.iloc[va_idx]
            print(X_train.shape, X_val.shape)
            runner = SingleLgb(seed=99, dry_run=self.dry_run)
            model = runner.do_train_direct(X_train, X_val, y_train, y_val)
            score = model.best_score["valid_0"]["auc"]
            if fold == 0:
                # Only the first fold's importances are shown to limit noise.
                runner.show_feature_importance(model)
            models.append(model)
            scores.append(score)
        return models, np.mean(scores)
学習の実行
# Predictor columns used as model inputs.
pred_col = [
    'hotel', 'lead_time', 'stays_in_weekend_nights',
    'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
    'country', 'market_segment', 'distribution_channel',
    'is_repeated_guest', 'previous_cancellations',
    'previous_bookings_not_canceled', 'reserved_room_type',
    'assigned_room_type', 'booking_changes', 'deposit_type',
    'agent',
    'company', 'days_in_waiting_list', 'customer_type', 'adr',
    'required_car_parking_spaces', 'total_of_special_requests'
]
# Work on copies so the raw DataFrames stay untouched.
temp_df = train_df.copy()
temp_test = test_df.copy()
# String-valued columns cast to 'category' so LightGBM handles them natively.
cat_col = [
    "hotel", "meal", "country", "market_segment",
    "distribution_channel", "reserved_room_type",
    "assigned_room_type", "deposit_type", "customer_type"
]
for c in cat_col:
    temp_df[c] = temp_df[c].astype("category")
    temp_test[c] = temp_test[c].astype("category")
# Run 4-fold CV training on the prepared training frame.
trainer = SingleTrainer(pred_col, dry_run=False)
models, score = trainer.train_model(temp_df)
推論
# Predict with each fold model on the test features.
preds = []
for m in models:
    preds.append(m.predict(temp_test[pred_col]))
# Build the submission frame: mean of the fold models' predictions per row.
sub_df = pd.DataFrame({
    "id": test_df["id"],
    "pred": np.mean(preds, axis=0)
})
train_df["is_canceled"].value_counts()  # display training class balance (notebook-style)
# Write the submission file.
sub_df.to_csv("sub_cv_models.csv", index=False)
分類(多値分類)
必要モジュールの読み込み
import numpy as np
import pandas as pd
import glob
import time
from tqdm import tqdm
from sklearn import model_selection, metrics
import lightgbm as lgb
データ読み込み/確認
# Load the exercise-2 tables.
train = pd.read_csv("/kaggle/input/uec2021-exercise-2/train_student_info.csv")
test = pd.read_csv("/kaggle/input/uec2021-exercise-2/test_student_info.csv")
train.shape, test.shape  # notebook-style display
courses = pd.read_csv("/kaggle/input/uec2021-exercise-2/courses.csv")
print(courses.shape)  # course details; mostly expressible through the ID, so ignored for now
registration = pd.read_csv("/kaggle/input/uec2021-exercise-2/student_registration.csv")
print(registration.shape)
student_assess = pd.read_csv("/kaggle/input/uec2021-exercise-2/student_assessment.csv")
print(student_assess.shape)
assess = pd.read_csv("/kaggle/input/uec2021-exercise-2/assessments.csv")
print(assess.shape)
前処理関数
def make_sa_df(student_df=None, assess_df=None):
    """Aggregate per-student assessment features.

    student_df: student_assessment rows; defaults to the module-level
        `student_assess` DataFrame for backward compatibility.
    assess_df: assessments metadata; defaults to the module-level `assess`.

    Returns a DataFrame with one row per `id_student` and the max
    assessment `date` as `date_max`.
    """
    if student_df is None:
        student_df = student_assess
    if assess_df is None:
        assess_df = assess
    sa = pd.merge(student_df, assess_df, on="id_assessment", how="left")
    aggs = {
        "date": ["max"],
    }
    key = "id_student"
    ret_df = sa.groupby(key).agg(aggs).reset_index()
    # Flatten the groupby MultiIndex columns to "<col>_<agg>".
    ret_df.columns = [key] + [k + "_" + agg for k, v in aggs.items() for agg in v]
    print(ret_df.shape)
    return ret_df
def make_reg_df(reg_df=None):
    """Aggregate per-student registration features.

    reg_df: student_registration rows; defaults to the module-level
        `registration` DataFrame for backward compatibility.

    Returns a DataFrame with one row per `id_student` and the mean
    `date_registration` as `date_registration_mean`.
    """
    if reg_df is None:
        reg_df = registration
    # Single key assignment (the original assigned it twice).
    key = "id_student"
    aggs = {
        "date_registration": ["mean"],
    }
    ret_df = reg_df.groupby(key).agg(aggs).reset_index()
    # Flatten the groupby MultiIndex columns to "<col>_<agg>".
    ret_df.columns = [key] + [k + "_" + agg for k, v in aggs.items() for agg in v]
    print(ret_df.shape)
    return ret_df
def merge_df(main_df, sa_df, reg_df):
    """Left-join the registration then assessment features onto main_df by id_student."""
    key = "id_student"
    # Registration features first, then assessment features — same order as before.
    merged = main_df.merge(reg_df, on=key, how="left").merge(sa_df, on=key, how="left")
    print(merged.shape)
    return merged
def make_cat_cols(df):
    """Cast the known categorical columns of df to pandas 'category' dtype (in place) and return df."""
    for col in (
        'code_module', 'code_presentation', 'gender',
        'region', 'highest_education', 'imd_band', 'age_band',
        'studied_credits', 'disability',
    ):
        df[col] = df[col].astype("category")
    return df
# Build the aggregate features and attach them to the training table.
sa_df = make_sa_df()
reg_df = make_reg_df()
train_df = merge_df(train, sa_df, reg_df)
train_df = make_cat_cols(train_df)
学習モデルの作成
class SingleLgb:
    """Thin wrapper around LightGBM training for 4-class classification."""

    def __init__(self, seed=99, dry_run=False):
        """Build the training configuration.

        seed: random seed forwarded to LightGBM.
        dry_run: when True, use a small round budget for quick smoke tests.
        """
        self.train_param = self.get_param(seed)
        self.num_rounds = 100 if dry_run else 1000

    def do_train_direct(self, x_train, x_test, y_train, y_test):
        """Train one booster on (x_train, y_train), early-stopping on the
        (x_test, y_test) split, and return the fitted model."""
        lgb_train = lgb.Dataset(x_train, y_train)
        lgb_eval = lgb.Dataset(x_test, y_test)
        # The `verbose_eval` / `early_stopping_rounds` keyword arguments were
        # removed from lgb.train in LightGBM 4.0; the callbacks below are the
        # supported equivalent (same 100-round logging / patience as before).
        model = lgb.train(
            self.train_param,
            lgb_train,
            valid_sets=[lgb_eval],
            num_boost_round=self.num_rounds,
            callbacks=[
                lgb.early_stopping(stopping_rounds=100),
                lgb.log_evaluation(period=100),
            ],
            # categorical_feature=[] could be passed explicitly if needed.
        )
        return model

    @staticmethod
    def show_feature_importance(model, filename=None):
        """Print split/gain feature importances, highest gain first.

        filename is accepted for interface compatibility but unused here.
        """
        fi = pd.DataFrame({
            "name": model.feature_name(),
            "importance_split": model.feature_importance(importance_type="split").astype(int),
            "importance_gain": model.feature_importance(importance_type="gain").astype(int),
        })
        fi = fi.sort_values(by="importance_gain", ascending=False)
        print(fi)

    @staticmethod
    def get_param(seed=99):
        """Return the LightGBM parameter dict for multiclass logloss training."""
        return {
            'num_leaves': 31,
            'min_data_in_leaf': 50,
            'objective': 'multiclass',
            "num_class": 4,
            'metric': 'multi_logloss',
            'max_depth': -1,
            'learning_rate': 0.05,
            "boosting": "gbdt",
            "feature_fraction": 0.9,
            "verbosity": -1,
            "random_state": seed,
        }
class SingleTrainer:
    """Cross-validated trainer for the multiclass SingleLgb model."""

    def __init__(self, pred_col, dry_run=False):
        # Feature column names used as predictors.
        self.pred_col = pred_col
        # Integer-encoded target column name.
        self.target_col = "final_result_int"
        self.dry_run = dry_run
        # Kept for interface compatibility; not read by train_model.
        self.val_size = 1000 * 1000

    def train_model(self, df):
        """Train one model per 4-fold split; return (models, mean multi_logloss)."""
        features = df[self.pred_col]
        target = df[self.target_col]
        splitter = model_selection.KFold(n_splits=4)
        models = []
        scores = []
        for fold, (tr_idx, va_idx) in enumerate(splitter.split(features)):
            print("---------")
            print("fold=", fold)
            X_tr, X_va = features.iloc[tr_idx], features.iloc[va_idx]
            y_tr, y_va = target.iloc[tr_idx], target.iloc[va_idx]
            print(X_tr.shape, X_va.shape)
            runner = SingleLgb(seed=99, dry_run=self.dry_run)
            model = runner.do_train_direct(X_tr, X_va, y_tr, y_va)
            score = model.best_score["valid_0"]["multi_logloss"]
            if fold == 0:
                # Only the first fold gets the detailed diagnostic output.
                runner.show_feature_importance(model)
                self.eval_model(model, X_va, y_va)
            models.append(model)
            scores.append(score)
        return models, np.mean(scores)

    def eval_model(self, model, X_val, y_val):
        """Print predicted-class counts and accuracy on the validation split."""
        probs = model.predict(X_val)
        labels = np.argmax(probs, axis=1)
        acc_score = (labels == y_val).mean()
        print(pd.DataFrame(labels).value_counts())
        print("acc_score=", acc_score)
学習の実行
# Predictor columns (raw student info plus the aggregated features).
pred_col = [
    'code_module', 'code_presentation', 'gender',
    'region', 'highest_education', 'imd_band', 'age_band',
    'num_of_prev_attempts', 'studied_credits', 'disability',
    'date_registration_mean', 'date_max',
]
temp_df = train_df.copy()
# Encode the string target as integers for LightGBM multiclass training.
target_map = {
    "Distinction": 3,
    "Pass": 2,
    "Withdrawn": 0,
    "Fail": 1,
}
temp_df["final_result_int"] = temp_df["final_result"].map(target_map).astype(int)
# Run 4-fold CV training and report the mean validation score.
trainer = SingleTrainer(pred_col, dry_run=False)
models, score = trainer.train_model(temp_df)
print(score)
推論
# Apply the same feature pipeline to the test rows.
test_df = merge_df(test, sa_df, reg_df)
test_df = make_cat_cols(test_df)
test_df.shape
# Predict class probabilities with each fold model.
preds = []
for m in models:
    preds.append(m.predict(test_df[pred_col]))
pred = np.mean(preds, axis=0)  # average the fold models' probabilities
print(pd.DataFrame(pred).describe())
pred = np.argmax(pred, axis=1)  # take the most probable class as the prediction
pred.shape, test_df.shape  # notebook-style sanity check
sub_df = pd.DataFrame({
    "id": test_df["id"],
    "pred": pred
})
sub_df["pred"].value_counts()  # display predicted class balance
temp_df["final_result_int"].value_counts()  # compare with the training balance
# Map the integer classes back to the original label strings.
target_map = {
    3: "Distinction",
    2: "Pass",
    0: "Withdrawn",
    1: "Fail",
}
sub_df["pred"] = sub_df["pred"].map(target_map)
# Write the submission file.
sub_df.to_csv("base_model.csv", index=False)