NN Model

Module imports

import numpy as np
import pandas as pd

from tqdm import tqdm
from matplotlib import pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score, log_loss

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

Data loading

INPUT_DIR = "../input/uec2021-exercise-2/"
OUTPUT_DIR = "./"
df = pd.read_csv(INPUT_DIR + "train_student_info.csv")
df_test = pd.read_csv(INPUT_DIR + "test_student_info.csv")
df_all = pd.concat([df, df_test], sort=False, ignore_index=True)

df_registration = pd.read_csv(INPUT_DIR + "student_registration.csv")
df_all = pd.merge(df_all, df_registration, how="left")
df_all
df_assessment = pd.read_csv(INPUT_DIR + "student_assessment.csv")
df_assessment_meta = pd.read_csv(INPUT_DIR + "assessments.csv")
df_assessment = pd.merge(df_assessment, df_assessment_meta, how="left")
df_assessment

Table processing

# is_banked (sum, mean) group by (module, presentation, student)
keys = ["code_module", "code_presentation", "id_student"]
df_agg = df_assessment.groupby(keys)["is_banked"].agg(["sum", "mean"]).reset_index()
df_agg.columns = keys + ["is_banked_sum", "is_banked_mean"]
df_all = pd.merge(df_all, df_agg, how="left")

# is_banked (sum, mean) group by (student)
keys = ["id_student"]
df_agg = df_assessment.groupby(keys)["is_banked"].agg(["sum", "mean"]).reset_index()
df_agg.columns = keys + ["is_banked_sum_student", "is_banked_mean_student"]
df_all = pd.merge(df_all, df_agg, how="left")
# how many days before the deadline each assessment was submitted
df_assessment["date_submitted_before"] = df_assessment["date"] - df_assessment["date_submitted"]

# difference from each assessment's mean submission date
df_assessment["date_submitted_mean"] = df_assessment.groupby("id_assessment")["date_submitted"].transform("mean")
df_assessment["date_submitted_diff"] = df_assessment["date_submitted"] - df_assessment["date_submitted_mean"]
# group by (module, presentation, student)
keys = ["code_module", "code_presentation", "id_student"]
df_agg = df_assessment.groupby(keys, as_index=False)[["date_submitted_before", "date_submitted_diff"]].mean()
df_agg.columns = keys + ["date_submitted_before_mean", "date_submitted_diff_mean"]
df_all = pd.merge(df_all, df_agg, how="left")

# group by (student)
keys = ["id_student"]
df_agg = df_assessment.groupby(keys, as_index=False)[["date_submitted_before", "date_submitted_diff"]].mean()
df_agg.columns = keys + ["date_submitted_before_mean_student", "date_submitted_diff_mean_student"]
df_all = pd.merge(df_all, df_agg, how="left")
# num of assessments group by (module, presentation, student)
keys = ["code_module", "code_presentation", "id_student"]
df_agg = df_assessment.groupby(keys, as_index=False).size()
df_agg.columns = keys + ["n_assessment"]

# difference from the mean number of assessments in the same (module, presentation)
df_agg["n_assessment_diff"] = df_agg["n_assessment"] - df_agg.groupby(["code_module", "code_presentation"])["n_assessment"].transform("mean")

df_all = pd.merge(df_all, df_agg, how="left")
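
Every join above is a left join on per-group keys, so the row count of df_all should stay at train + test. A minimal sanity check, assuming the aggregation keys are unique per group:

# left joins on unique group keys must not duplicate rows
assert len(df_all) == len(df) + len(df_test)
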
df_vle = pd.read_csv(INPUT_DIR + "student_vle.csv")
df_vle_meta = pd.read_csv(INPUT_DIR + "vle.csv")
df_vle = pd.merge(df_vle, df_vle_meta, how="left")
df_vle
keys = ["code_module", "code_presentation", "id_student"]

# num of unique site, num of day, sum of click group by (module, presentation, student)
df_agg = df_vle.groupby(keys)[["id_site", "sum_click"]].agg({"id_site": "nunique", "sum_click": ["count", "sum"]}).reset_index()
df_agg.columns = keys + ["n_site", "n_day", "sum_click"]

df_agg["click_per_day"] = df_agg["sum_click"] / df_agg["n_day"]
df_agg["click_per_site"] = df_agg["sum_click"] / df_agg["n_site"]
df_agg["day_per_site"] = df_agg["n_day"] / df_agg["n_site"]

# log transformation for NN
for col in ["n_site", "n_day", "sum_click", "click_per_day", "click_per_site", "day_per_site"]:
    df_agg[col] = np.log1p(df_agg[col])
df_all = pd.merge(df_all, df_agg, how="left")
for t in tqdm(df_vle["activity_type"].unique()):
    keys = ["code_module", "code_presentation", "id_student"]

    # num of unique site, num of day, sum of click group by (module, presentation, student)
    df_agg = df_vle.query("activity_type == @t").groupby(keys)[["id_site", "sum_click"]].agg({"id_site": "nunique", "sum_click": ["count", "sum"]}).reset_index()
    df_agg.columns = keys + ["n_site", "n_day", "sum_click"]

    df_agg["click_per_day"] = df_agg["sum_click"] / df_agg["n_day"]
    df_agg["click_per_site"] = df_agg["sum_click"] / df_agg["n_site"]
    df_agg["day_per_site"] = df_agg["n_day"] / df_agg["n_site"]

    # log transformation for NN
    for col in ["n_site", "n_day", "sum_click", "click_per_day", "click_per_site", "day_per_site"]:
        df_agg[col] = np.log1p(df_agg[col])
    # suffix the new columns with the activity type to keep them distinct
    df_all = pd.merge(df_all, df_agg, on=keys, how="left", suffixes=["", f"_{t}"])
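
Each pass of the loop adds suffixed copies of the six VLE features for one activity type. A quick check that the suffixed columns were created (a minimal sketch):

# list the per-activity-type click features created by the suffixed merges
print([c for c in df_all.columns if c.startswith("sum_click_")])
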

Preprocessing

df_all["gender"] = (df_all["gender"] == "M").astype(int)
df_all["disability"] = (df_all["disability"] == "Y").astype(int)

df_all["imd_band"] = df_all["imd_band"].str[0].astype(float)
df_all["age_band"] = df_all["age_band"].str[0].astype(float)
cat_vars = ["code_module", "code_presentation", "region", "highest_education"]

enc = OrdinalEncoder(dtype=int)
df_all[cat_vars] = enc.fit_transform(df_all[cat_vars].fillna(""))
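
To see which integer each category was mapped to, the fitted OrdinalEncoder exposes categories_ (the index in each array is the encoded value). A small inspection sketch:

# print the category -> code mapping learned for each categorical column
for col, cats in zip(cat_vars, enc.categories_):
    print(col, dict(zip(cats, range(len(cats)))))
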
fig, axes = plt.subplots(figsize=(15, 80), nrows=35, ncols=4)

for i, col in enumerate(df_all.columns.drop(["id", "id_student", "final_result"] + cat_vars)):
    ax = axes[i // 4, i % 4]
    ax.hist(df_all[col], bins=100)
    ax.set_title(col)

fig.tight_layout()
plt.show()
df = df_all.iloc[:len(df), :].reset_index(drop=True)
df_test = df_all.iloc[len(df):, :].reset_index(drop=True)
result_map = {"Withdrawn": 0, "Fail": 1, "Pass": 2, "Distinction": 3}
result_inv_map = {0: "Withdrawn", 1: "Fail", 2: "Pass", 3: "Distinction"}

Modeling

class MyDataset(Dataset):
    def __init__(self, X_num, X_cat, y=None):
        self.X_num = torch.FloatTensor(X_num)
        self.X_cat = torch.LongTensor(X_cat)
        if y is not None:
            self.y = torch.LongTensor(y)

    def __len__(self):
        return self.X_num.shape[0]

    def __getitem__(self, idx):
        if "y" in dir(self):
            return (self.X_num[idx, :], self.X_cat[idx, :], self.y[idx])
        else:
            return (self.X_num[idx, :], self.X_cat[idx, :])
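
A minimal usage sketch of MyDataset with dummy arrays, just to confirm the tensor dtypes and the branch on y:

# dummy example: 4 rows, 3 numeric features, 2 categorical features
_ds = MyDataset(np.zeros((4, 3)), np.zeros((4, 2), dtype=int), y=np.array([0, 1, 2, 3]))
x_num, x_cat, y_ = _ds[0]
print(x_num.dtype, x_cat.dtype, y_.dtype)  # torch.float32 torch.int64 torch.int64
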
class NNModel(nn.Module):
    def __init__(self, input_size_num, output_size, n_categories, emb_size, hidden_sizes, dropout):
        super().__init__()

        self.embs = nn.ModuleList()
        for i in range(len(n_categories)):
            self.embs.append(nn.Embedding(n_categories[i], emb_size))

        input_size = input_size_num + sum(emb.embedding_dim for emb in self.embs)
        self.mlp = nn.Sequential(
            nn.Linear(input_size, hidden_sizes[0]),
            nn.BatchNorm1d(hidden_sizes[0]),
            nn.Dropout(dropout),
            nn.ReLU(),
            nn.Linear(hidden_sizes[0], hidden_sizes[1]),
            nn.BatchNorm1d(hidden_sizes[1]),
            nn.Dropout(dropout),
            nn.ReLU(),
            nn.Linear(hidden_sizes[1], output_size),
        )

    def forward(self, x_num, x_cat):
        x_cat = [emb(x_cat[:, i]) for i, emb in enumerate(self.embs)]
        x = torch.cat([x_num] + x_cat, dim=1)
        x = self.mlp(x)
        return x
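
A shape sanity check for NNModel with random inputs (the sizes here are arbitrary):

# forward a random batch through an untrained model and verify the output shape
_model = NNModel(input_size_num=3, output_size=4, n_categories=[5, 7],
                 emb_size=10, hidden_sizes=(128, 64), dropout=0.5)
_out = _model(torch.randn(8, 3), torch.randint(0, 5, (8, 2)))
print(_out.shape)  # torch.Size([8, 4])
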
def train(model, data_loader, optimizer, criterion, device):
    model.train()

    for batch in data_loader:
        X_num = batch[0].to(device)
        X_cat = batch[1].to(device)
        y = batch[2].to(device)

        preds = model(X_num, X_cat)
        loss = criterion(preds, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


def evaluate(model, data_loader, criterion, device):
    model.eval()

    n = 0
    total_loss = 0.0
    for batch in data_loader:
        X_num = batch[0].to(device)
        X_cat = batch[1].to(device)
        y = batch[2].to(device)

        with torch.no_grad():
            preds = model(X_num, X_cat)

        loss = criterion(preds, y)
        total_loss += loss.item()
        n += X_num.shape[0]

    avg_loss = total_loss / n  # per-sample mean: the criterion uses reduction="sum"

    return avg_loss


def predict(model, data_loader, device):
    model.eval()

    preds_all = []
    for batch in data_loader:
        X_num = batch[0].to(device)
        X_cat = batch[1].to(device)

        with torch.no_grad():
            preds = model(X_num, X_cat)
        preds = torch.softmax(preds, dim=1)
        preds = preds.cpu().numpy()
        preds_all.append(preds)

    preds_all = np.concatenate(preds_all)

    return preds_all
n_categories = (df_all[cat_vars].max(axis=0) + 1).tolist()  # vocab size per categorical column (codes are 0..max)
%%time

X = df.drop(["id", "id_student", "final_result"], axis=1)
y = df["final_result"].map(result_map)
groups = df["id_student"].values

n_splits = 5
batch_size = 256

preds_valid = np.zeros((len(df), 4))
preds_test = np.zeros((len(df_test), 4))

gkf = GroupKFold(n_splits=n_splits)
for i, (idx_train, idx_valid) in enumerate(gkf.split(X, y, groups=groups)):   
    print(f"fold: {i}")
    X_train_num = X.iloc[idx_train, :].drop(cat_vars, axis=1).fillna(0)
    X_train_cat = X.iloc[idx_train, :][cat_vars]
    y_train = y.iloc[idx_train]

    X_valid_num = X.iloc[idx_valid, :].drop(cat_vars, axis=1).fillna(0)
    X_valid_cat = X.iloc[idx_valid, :][cat_vars]
    y_valid = y.iloc[idx_valid]

    X_test_num = df_test[X.columns].drop(cat_vars, axis=1).fillna(0)
    X_test_cat = df_test[cat_vars]

    # standardization
    scaler = StandardScaler()
    scaler.fit(X_train_num)
    X_train_num = scaler.transform(X_train_num)
    X_valid_num = scaler.transform(X_valid_num)
    X_test_num = scaler.transform(X_test_num)

    # dataset
    ds_train = MyDataset(X_train_num, X_train_cat.values, y_train.values)
    ds_valid = MyDataset(X_valid_num, X_valid_cat.values, y_valid.values)
    ds_test = MyDataset(X_test_num, X_test_cat.values)

    # dataloader
    dl_train = DataLoader(ds_train, batch_size=batch_size, shuffle=True, 
                          num_workers=0, pin_memory=True, drop_last=True)
    dl_valid = DataLoader(ds_valid, batch_size=batch_size, shuffle=False, 
                          num_workers=0, pin_memory=True, drop_last=False)
    dl_test = DataLoader(ds_test, batch_size=batch_size, shuffle=False, 
                          num_workers=0, pin_memory=True, drop_last=False)

    # build model
    torch.manual_seed(0)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = NNModel(input_size_num=X_train_num.shape[1],
                    output_size=4,
                    n_categories=n_categories,
                    emb_size=10,
                    hidden_sizes=(128, 64),
                    dropout=0.5)
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=5e-4)
    criterion = nn.CrossEntropyLoss(reduction="sum")

    best_loss = np.inf
    for epoch in range(50):
        train(model, dl_train, optimizer, criterion, device)
        loss = evaluate(model, dl_valid, criterion, device)
        if loss < best_loss:
            best_loss = loss
            torch.save(model.state_dict(), "model.pth")
            print(f"epoch: {epoch}\tvalid-loss: {loss}\tbest!")
        else:
            print(f"epoch: {epoch}\tvalid-loss: {loss}")

    with torch.no_grad():
        model.load_state_dict(torch.load("model.pth"))
        preds_valid[idx_valid] = predict(model, dl_valid, device)    
        preds_test += predict(model, dl_test, device) / n_splits
    print()

logloss = log_loss(y, preds_valid)
acc = accuracy_score(y, np.argmax(preds_valid, axis=1))
print(f"logloss: {logloss:.5f}\tacc: {acc:.5f}")

Inference

submission = pd.read_csv(INPUT_DIR + "sample_submission.csv")
submission["pred"] = pd.Series(np.argmax(preds_test, axis=1)).map(result_inv_map)
submission.to_csv(OUTPUT_DIR + "nn.csv", index=False)
submission["pred"].value_counts()