
LGBM Classification Template

Classification (binary)

Loading the required modules

import numpy as np
import pandas as pd
import glob
import time
from tqdm import tqdm

from sklearn import model_selection, metrics

import lightgbm as lgb

Checking the data

# Load the data
train_df = pd.read_csv("../input/uec2021-exercise-1/train.csv")
test_df = pd.read_csv("../input/uec2021-exercise-1/test.csv")
sample_submission_df = pd.read_csv("../input/uec2021-exercise-1/sample_submission.csv")
train_df.shape, test_df.shape, sample_submission_df.shape
train_df.head(2)
test_df.head(2)
train_df.columns
sample_submission_df.head(2)
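
As an optional sanity check before modeling, it can also help to look at the column dtypes and missing-value counts. A minimal sketch, assuming the train_df loaded above:

# Optional: inspect dtypes and the most incomplete columns (assumes train_df from above)
train_df.info()
train_df.isnull().sum().sort_values(ascending=False).head(10)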

Building the model

class SingleLgb:
    def __init__(self, seed=99, dry_run=False):
        self.train_param = self.get_param(seed)
        if dry_run:
            self.num_rounds = 100
        else:
            self.num_rounds = 4000

    def do_train_direct(self, x_train, x_test, y_train, y_test):
        lgb_train = lgb.Dataset(x_train, y_train)
        lgb_eval = lgb.Dataset(x_test, y_test)

        # print('Start training...')
        model = lgb.train(self.train_param,
                          lgb_train,
                          valid_sets=[lgb_eval],
                          verbose_eval=400,
                          num_boost_round=self.num_rounds,
                          early_stopping_rounds=400,
                          #categorical_feature=[]
                         )
        # print('End training...')
        return model

    @staticmethod
    def show_feature_importance(model, filename=None):
        fi = pd.DataFrame({
            "name": model.feature_name(),
            "importance_split": model.feature_importance(importance_type="split").astype(int),
            "importance_gain": model.feature_importance(importance_type="gain").astype(int),
        })
        fi = fi.sort_values(by="importance_gain", ascending=False)
        print(fi)

    @staticmethod
    def get_param(seed=99):
        return {
            'num_leaves': 1023,
            'min_data_in_leaf': 50,
            'objective': 'binary',
            'metric': 'AUC',
            'max_depth': -1,
            'learning_rate': 0.05,
            "boosting": "gbdt",
            "feature_fraction": 0.9,
            "verbosity": -1,
            "random_state": seed,
        }
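
Note that verbose_eval and early_stopping_rounds were still keyword arguments of lgb.train when this template was written; in LightGBM 4.0 and later they are passed as callbacks instead. A minimal sketch of the equivalent call, assuming the same train_param, datasets, and round counts built inside do_train_direct (the same applies to the multiclass version further below):

# Equivalent training call for LightGBM >= 4.0 (callback-based API).
# train_param, lgb_train, lgb_eval and num_rounds are assumed to be the same
# objects as in do_train_direct above.
model = lgb.train(
    train_param,
    lgb_train,
    valid_sets=[lgb_eval],
    num_boost_round=num_rounds,
    callbacks=[
        lgb.early_stopping(stopping_rounds=400),  # stop after 400 rounds without AUC improvement
        lgb.log_evaluation(period=400),           # log the eval metric every 400 rounds
    ],
)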

class SingleTrainer:
    def __init__(self, pred_col, dry_run=False):
        self.pred_col = pred_col
        self.target_col = "is_canceled"
        self.dry_run = dry_run
        self.val_size = 1000*1000

    def train_model(self, df):
        X = df[self.pred_col]
        y = df[self.target_col]
        kf = model_selection.KFold(n_splits=4)

        models, scores = list(), list()
        for fold, (train_idx, test_idx) in enumerate(kf.split(X)):
            print("---------")
            print("fold=", fold)
            X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]
            print(X_train.shape, X_val.shape)

            lgbm = SingleLgb(seed=99, dry_run=self.dry_run)
            model = lgbm.do_train_direct(X_train, X_val, y_train, y_val)
            score = model.best_score["valid_0"]["auc"]
            if fold == 0:                
                lgbm.show_feature_importance(model)
            models.append(model)
            scores.append(score)
            #break
        return models, np.mean(scores)

Running the training

pred_col = [
    'hotel', 'lead_time', 'stays_in_weekend_nights',
    'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
    'country', 'market_segment', 'distribution_channel',
    'is_repeated_guest', 'previous_cancellations',
    'previous_bookings_not_canceled', 'reserved_room_type',
    'assigned_room_type', 'booking_changes', 'deposit_type',
    'agent', 'company', 'days_in_waiting_list', 'customer_type', 'adr',
    'required_car_parking_spaces', 'total_of_special_requests'
]
temp_df = train_df.copy()
temp_test = test_df.copy()
cat_col = [
    "hotel", "meal", "country", "market_segment",
    "distribution_channel", "reserved_room_type",
    "assigned_room_type", "deposit_type", "customer_type"
]
for c in cat_col:
    temp_df[c] = temp_df[c].astype("category")
    temp_test[c] = temp_test[c].astype("category")
trainer = SingleTrainer(pred_col, dry_run=False)
models, score = trainer.train_model(temp_df)
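
The metrics module imported above is otherwise unused; as a hedged cross-check, the last fold's AUC can be recomputed with metrics.roc_auc_score. Because KFold(n_splits=4) without shuffling uses the final quarter of rows as the last fold's validation split, models[-1] has not been trained on those rows:

# Optional sketch: recompute the final fold's AUC outside LightGBM
n_val = len(temp_df) // 4  # size of the last (unshuffled) KFold validation split
X_hold = temp_df[pred_col].iloc[-n_val:]
y_hold = temp_df["is_canceled"].iloc[-n_val:]
print(metrics.roc_auc_score(y_hold, models[-1].predict(X_hold)))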

Inference

preds = []
for m in models:
    preds.append(m.predict(temp_test[pred_col]))

# Build the submission DataFrame
sub_df = pd.DataFrame({
    "id": test_df["id"], 
    "pred": np.mean(preds, axis=0)
})
sub_df.head(2)
sub_df["pred"].hist()
train_df["is_canceled"].value_counts()
# Write out the submission file
sub_df.to_csv("sub_cv_models.csv", index=False)

Classification (multiclass)

Loading the required modules

import numpy as np
import pandas as pd
import glob
import time
from tqdm import tqdm

from sklearn import model_selection, metrics

import lightgbm as lgb

Loading and checking the data

train = pd.read_csv("/kaggle/input/uec2021-exercise-2/train_student_info.csv")
test = pd.read_csv("/kaggle/input/uec2021-exercise-2/test_student_info.csv")
train.shape, test.shape
courses = pd.read_csv("/kaggle/input/uec2021-exercise-2/courses.csv")
print(courses.shape) # Course details; mostly captured by the course ID, so ignore them for now
registration = pd.read_csv("/kaggle/input/uec2021-exercise-2/student_registration.csv")
print(registration.shape)
student_assess = pd.read_csv("/kaggle/input/uec2021-exercise-2/student_assessment.csv")
print(student_assess.shape)
assess = pd.read_csv("/kaggle/input/uec2021-exercise-2/assessments.csv")
print(assess.shape)
assess.head(2)
student_assess.head(2)
train.head(2)

Preprocessing functions

def make_sa_df():
    sa = pd.merge(student_assess, assess, on="id_assessment", how="left")
    aggs = {
        "date": ["max"],
    }
    key = "id_student"
    ret_df = sa.groupby(key).agg(aggs).reset_index()
    ret_df.columns = [key] + [k+"_"+agg for k, v in aggs.items() for agg in v]
    print(ret_df.shape)
    return ret_df

def make_reg_df():
    key = "id_student"
    aggs = {
        "date_registration": ["mean"],
    }
    ret_df = registration.groupby(key).agg(aggs).reset_index()
    ret_df.columns = [key] + [k+"_"+agg for k, v in aggs.items() for agg in v]
    print(ret_df.shape)
    return ret_df

def merge_df(main_df, sa_df, reg_df):
    key = "id_student"
    ret_df = pd.merge(main_df, reg_df, on=key, how="left")
    ret_df = pd.merge(ret_df, sa_df, on=key, how="left")
    print(ret_df.shape)
    return ret_df

def make_cat_cols(df):
    cat_cols = [
        'code_module', 'code_presentation', 'gender',
        'region', 'highest_education', 'imd_band', 'age_band',
        'studied_credits', 'disability',
    ]
    for c in cat_cols:
        df[c] = df[c].astype("category")
    return df

sa_df = make_sa_df()
reg_df = make_reg_df()
train_df = merge_df(train, sa_df, reg_df)
train_df = make_cat_cols(train_df)
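
Both merges are left joins, so students that never appear in student_registration or student_assessment end up with NaN in the aggregated columns. LightGBM handles missing values natively, but a quick check such as the sketch below confirms how many rows are affected:

# Fraction of rows left NaN by the left joins (assumes train_df from merge_df above)
print(train_df[["date_registration_mean", "date_max"]].isna().mean())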

Building the training model

class SingleLgb:
    def __init__(self, seed=99, dry_run=False):
        self.train_param = self.get_param(seed)
        if dry_run:
            self.num_rounds = 100
        else:
            self.num_rounds = 1000

    def do_train_direct(self, x_train, x_test, y_train, y_test):
        lgb_train = lgb.Dataset(x_train, y_train)
        lgb_eval = lgb.Dataset(x_test, y_test)

        # print('Start training...')
        model = lgb.train(self.train_param,
                          lgb_train,
                          valid_sets=[lgb_eval],
                          verbose_eval=100,
                          num_boost_round=self.num_rounds,
                          early_stopping_rounds=100,
                          #categorical_feature=[]
                         )
        # print('End training...')
        return model

    @staticmethod
    def show_feature_importance(model, filename=None):
        fi = pd.DataFrame({
            "name": model.feature_name(),
            "importance_split": model.feature_importance(importance_type="split").astype(int),
            "importance_gain": model.feature_importance(importance_type="gain").astype(int),
        })
        fi = fi.sort_values(by="importance_gain", ascending=False)
        print(fi)

    @staticmethod
    def get_param(seed=99):
        return {
            'num_leaves': 31,
            'min_data_in_leaf': 50,
            'objective': 'multiclass',
            "num_class": 4,
            'metric': 'multi_logloss',
            'max_depth': -1,
            'learning_rate': 0.05,
            "boosting": "gbdt",
            "feature_fraction": 0.9,
            "verbosity": -1,
            "random_state": seed,
        }

class SingleTrainer:
    def __init__(self, pred_col, dry_run=False):
        self.pred_col = pred_col
        self.target_col = "final_result_int"
        self.dry_run = dry_run
        self.val_size = 1000*1000

    def train_model(self, df):
        X = df[self.pred_col]
        y = df[self.target_col]
        kf = model_selection.KFold(n_splits=4)

        models, scores = list(), list()
        for fold, (train_idx, test_idx) in enumerate(kf.split(X)):
            print("---------")
            print("fold=", fold)
            X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]
            print(X_train.shape, X_val.shape)

            lgbm = SingleLgb(seed=99, dry_run=self.dry_run)
            model = lgbm.do_train_direct(X_train, X_val, y_train, y_val)
            score = model.best_score["valid_0"]["multi_logloss"]
            if fold == 0:                
                lgbm.show_feature_importance(model)
                self.eval_model(model, X_val, y_val)
            models.append(model)
            scores.append(score)
            #break
        return models, np.mean(scores)

    def eval_model(self, model, X_val, y_val):
        temp_pred = model.predict(X_val)
        temp_pred = np.argmax(temp_pred, axis=1)
        acc_score = (temp_pred == y_val).mean()
        print(pd.DataFrame(temp_pred).value_counts())
        print("acc_score=", acc_score)

Running the training

pred_col = [
    'code_module', 'code_presentation', 'gender',
    'region', 'highest_education', 'imd_band', 'age_band',
    'num_of_prev_attempts', 'studied_credits', 'disability', 
    'date_registration_mean', 'date_max',
]
temp_df = train_df.copy()
# Encode the target labels as integers (0..3)
target_map = {
    "Distinction": 3,
    "Pass": 2,
    "Withdrawn": 0,
    "Fail": 1,
}
temp_df["final_result_int"] = temp_df["final_result"].map(target_map).astype(int)
trainer = SingleTrainer(pred_col, dry_run=False)
models, score = trainer.train_model(temp_df)
print(score)

Inference

test_df = merge_df(test, sa_df, reg_df)
test_df = make_cat_cols(test_df)
test_df.shape
preds = []
for m in models:
    preds.append(m.predict(test_df[pred_col]))
pred = np.mean(preds, axis=0) # average the fold models' predictions
print(pd.DataFrame(pred).describe())
pred = np.argmax(pred, axis=1) # take the class with the highest predicted probability
pred.shape, test_df.shape
sub_df = pd.DataFrame({
    "id": test_df["id"], 
    "pred": pred
})

sub_df["pred"].value_counts()
temp_df["final_result_int"].value_counts()

# Map the integer predictions back to the original labels
target_map = {
    3: "Distinction",
    2: "Pass",
    0: "Withdrawn",
    1: "Fail",
}
sub_df["pred"] = sub_df["pred"].map(target_map)
sub_df.head()
# Write out the submission file
sub_df.to_csv("base_model.csv", index=False)