NLP
BERT (Signate)
Folder paths
input_dir = "/content/drive/MyDrive/signate/SystematicReviewWorkShop-PeerSupportGroup/data/"
output_dir = "/content/drive/MyDrive/signate/SystematicReviewWorkShop-PeerSupportGroup/log/"
submission_dir = "/content/drive/MyDrive/signate/SystematicReviewWorkShop-PeerSupportGroup/submission/"
model_dir = "/content/drive/MyDrive/signate/SystematicReviewWorkShop-PeerSupportGroup/model_bin/"
pred_dir = "/content/drive/MyDrive/signate/SystematicReviewWorkShop-PeerSupportGroup/pred/"
Import required modules
import os
import math
import random
import pandas as pd
import numpy as np
from glob import glob
import gc
gc.enable()
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.optimizer import Optimizer
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import Dataset, DataLoader, SequentialSampler, RandomSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import fbeta_score
from transformers import BertConfig, RobertaConfig
from transformers import (get_cosine_schedule_with_warmup, get_cosine_with_hard_restarts_schedule_with_warmup)
from transformers import BertTokenizer, RobertaTokenizer
from transformers import BertModel, RobertaModel
from transformers import AutoConfig
from transformers import BertForSequenceClassification, RobertaForSequenceClassification
from torch import cuda
import time
from transformers import AdamW
from transformers import AutoTokenizer
from transformers import AutoModel, AutoModelForSequenceClassification
from transformers import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
from IPython.display import clear_output
from tqdm import tqdm, trange
Config settings
class CFG:
exp = "exp002"
seed = 71
fold = 5
max_len = 71
epochs = 3
train_batch_size = 16
valid_batch_size = 32
model_name = "bert-base-uncased"
CONFIG = CFG()
Create output directories
os.makedirs(model_dir+CONFIG.exp+"/", exist_ok=True)
os.makedirs(pred_dir+CONFIG.exp+"/", exist_ok=True)
os.makedirs(output_dir+CONFIG.exp+"/", exist_ok=True)
Fix the random seed
def set_random_seed(random_seed):
random.seed(random_seed)
np.random.seed(random_seed)
os.environ["PYTHONHASHSEED"] = str(random_seed)
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
torch.cuda.manual_seed_all(random_seed)
torch.backends.cudnn.deterministic = True
set_random_seed(CONFIG.seed)
Logger setup
def init_logger(log_file=output_dir + CONFIG.exp+ f"/{CONFIG.exp}_train.log"):
from logging import INFO, FileHandler, Formatter, StreamHandler, getLogger
logger = getLogger(__name__)
logger.setLevel(INFO)
handler1 = StreamHandler()
handler1.setFormatter(Formatter("%(message)s"))
handler2 = FileHandler(filename=log_file)
handler2.setFormatter(Formatter("%(message)s"))
logger.addHandler(handler1)
logger.addHandler(handler2)
return logger
LOGGER = init_logger()
Device selection (CPU/GPU)
device = 'cuda' if cuda.is_available() else 'cpu'
print(f'{device} is used')
Data splitting
def get_train_data(train):
    # Assign cross-validation fold numbers.
Fold = StratifiedKFold(n_splits=CONFIG.fold, shuffle=True, random_state=CONFIG.seed)
for n, (train_index, val_index) in enumerate(Fold.split(train, train["judgement"])):
train.loc[val_index, "fold"] = int(n)
train["fold"] = train["fold"].astype(np.uint8)
return train
def get_test_data(test):
return test
Preprocessing
import re
import unicodedata
def clean_text(text):
    replaced_text = text.lower()
    replaced_text = re.sub(r'[【】]', ' ', replaced_text)    # remove 【】 brackets
    replaced_text = re.sub(r'[()（）]', ' ', replaced_text)  # remove parentheses (ASCII and full-width)
    replaced_text = re.sub(r'[\[\]［］]', ' ', replaced_text)  # remove square brackets (ASCII and full-width)
    replaced_text = re.sub(r'[@＠]\w+', '', replaced_text)   # remove mentions
    replaced_text = re.sub(r'https?:\/\/.*?[\r\n ]', '', replaced_text)  # remove URLs
    replaced_text = re.sub('\u3000', ' ', replaced_text)     # replace full-width spaces with ASCII spaces
    return replaced_text
def normalize(text):
normalized_text = normalize_unicode(text)
normalized_text = normalize_number(normalized_text)
normalized_text = lower_text(normalized_text)
return normalized_text
def lower_text(text):
return text.lower()
def normalize_unicode(text, form='NFKC'):
normalized_text = unicodedata.normalize(form, text)
return normalized_text
def normalize_number(text):
replaced_text = re.sub(r'\d+', '0', text)
return replaced_text
def text_cleaning(text):
text = clean_text(text)
text = normalize(text)
text = lower_text(text)
text = normalize_unicode(text)
return text
def data_cleaning(data):
return [text_cleaning(text) for text in data]
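A quick illustration of the pipeline on a made-up sentence (hypothetical input, shown only to make the behavior concrete):

# Hypothetical example: brackets and URLs are stripped, digits collapse to "0",
# and the text is lowercased and NFKC-normalized.
sample = "Efficacy of Peer Support [RCT] (n=120) https://example.com in CHILDREN"
print(text_cleaning(sample))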
Load data
pd.set_option("display.max_colwidth", 50)
train = pd.read_csv(input_dir + "train.csv")
test = pd.read_csv(input_dir + "test.csv")
sub = pd.read_csv(input_dir + "sample_submit.csv", header=None)
sub.columns = ["id", "judgement"]
# Would it help to filter out mojibake rows? Characters such as α seem to cause the garbling.
train["title"] = data_cleaning(train["title"])
train = get_train_data(train)
train.head()
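As a sanity check on the split (a small sketch; run it after the cell above), the stratified folds should show near-equal sizes and positive rates:

# StratifiedKFold should keep the positive rate of "judgement" nearly constant across folds.
print(train.groupby("fold")["judgement"].agg(["size", "mean"]))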
Data loader
class SRWSDataset(Dataset):
def __init__(self, df, model_name, include_labels=True):
tokenizer = BertTokenizer.from_pretrained(model_name)
self.df = df
self.include_labels = include_labels
self.title = df["title"].tolist()
self.encoded = tokenizer.batch_encode_plus(
self.title,
padding ="max_length",
max_length = CONFIG.max_len,
truncation = True,
return_attention_mask = True
)
if self.include_labels:
self.labels = df["judgement"].values
def __len__(self):
return len(self.df)
def __getitem__(self, idx):
input_ids = torch.tensor(self.encoded["input_ids"][idx])
attention_mask = torch.tensor(self.encoded["attention_mask"][idx])
if self.include_labels:
label = torch.tensor(self.labels[idx]).float()
return input_ids, attention_mask, label
return input_ids, attention_mask
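A minimal smoke test of the dataset (a sketch; assumes `train` from the cells above): each batch should yield two `(batch, max_len)` tensors plus a `(batch,)` label tensor.

# Hypothetical smoke test: inspect one batch of tokenized titles.
ds = SRWSDataset(train.head(64), CONFIG.model_name)
dl = DataLoader(ds, batch_size=8, shuffle=False)
input_ids, attention_mask, labels = next(iter(dl))
print(input_ids.shape, attention_mask.shape, labels.shape)  # (8, max_len), (8, max_len), (8,)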
BERT model
class SRWSModel(nn.Module):
def __init__(self, model_name):
super().__init__()
self.model = BertForSequenceClassification.from_pretrained(model_name, num_labels=1)
self.sigmoid = nn.Sigmoid()
    def forward(self, input_ids, attention_mask):
        out = self.model(input_ids=input_ids, attention_mask=attention_mask)
        out = self.sigmoid(out.logits).squeeze(-1)  # squeeze only the label dim so a batch of 1 still yields a 1-D tensor
        return out
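Because the model applies the sigmoid itself, the training loop below pairs it with `nn.BCELoss`. A numerically safer alternative (a sketch, not what this notebook uses) returns raw logits and uses `nn.BCEWithLogitsLoss`, which fuses the sigmoid into the loss:

# Alternative sketch: return logits and let BCEWithLogitsLoss apply the sigmoid.
class SRWSLogitModel(nn.Module):
    def __init__(self, model_name):
        super().__init__()
        self.model = BertForSequenceClassification.from_pretrained(model_name, num_labels=1)

    def forward(self, input_ids, attention_mask):
        out = self.model(input_ids=input_ids, attention_mask=attention_mask)
        return out.logits.squeeze(-1)  # raw logits; apply torch.sigmoid() only at prediction time

# criterion = nn.BCEWithLogitsLoss()  # would replace nn.BCELoss in train_loop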
Utilities
class AverageMeter(object):
"""Computes and stores the average and current value"""
def __init__(self):
self.reset()
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
def asMinutes(s):
m = math.floor(s / 60)
s -= m * 60
return "%dm %ds" % (m, s)
def timeSince(since, percent):
now = time.time()
s = now - since
es = s / (percent)
rs = es - s
return "%s (remain %s)" % (asMinutes(s), asMinutes(rs))
Training and inference functions
def train_fn(train_loader, model, criterion, optimizer, epoch, device):
start = end = time.time()
losses = AverageMeter()
# switch to train mode
model.train()
for step, (input_ids, attention_mask, labels) in enumerate(train_loader):
optimizer.zero_grad()
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
labels = labels.to(device)
batch_size = labels.size(0)
y_preds = model(input_ids, attention_mask)
loss = criterion(y_preds, labels)
# record loss
losses.update(loss.item(), batch_size)
loss.backward()
optimizer.step()
if step % 100 == 0 or step == (len(train_loader) - 1):
print(
f"Epoch: [{epoch + 1}][{step}/{len(train_loader)}] "
f"Elapsed {timeSince(start, float(step + 1) / len(train_loader)):s} "
f"Loss: {losses.avg:.4f} "
)
return losses.avg
def valid_fn(valid_loader, model, criterion, device):
start = end = time.time()
losses = AverageMeter()
# switch to evaluation mode
model.eval()
preds = []
for step, (input_ids, attention_mask, labels) in enumerate(valid_loader):
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
labels = labels.to(device)
batch_size = labels.size(0)
# compute loss
with torch.no_grad():
y_preds = model(input_ids, attention_mask)
loss = criterion(y_preds, labels)
losses.update(loss.item(), batch_size)
# record score
preds.append(y_preds.to("cpu").numpy())
if step % 100 == 0 or step == (len(valid_loader) - 1):
print(
f"EVAL: [{step}/{len(valid_loader)}] "
f"Elapsed {timeSince(start, float(step + 1) / len(valid_loader)):s} "
f"Loss: {losses.avg:.4f} "
)
predictions = np.concatenate(preds)
return losses.avg, predictions
def inference():
predictions = []
test_dataset = SRWSDataset(test, CONFIG.model_name, include_labels=False)
test_loader = DataLoader(
test_dataset, batch_size=16, shuffle=False, num_workers=4, pin_memory=True
)
for fold in range(CONFIG.fold):
LOGGER.info(f"========== model: {CONFIG.model_name} fold: {fold} inference ==========")
model = SRWSModel(CONFIG.model_name)
model.to(device)
model.load_state_dict(torch.load(model_dir +CONFIG.exp + "/"+ f"{CONFIG.model_name}_fold{fold}_best.pth")["model"])
model.eval()
preds = []
for i, (input_ids, attention_mask) in tqdm(enumerate(test_loader), total=len(test_loader)):
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
with torch.no_grad():
y_preds = model(input_ids, attention_mask)
preds.append(y_preds.to("cpu").numpy())
preds = np.concatenate(preds)
predictions.append(preds)
predictions = np.mean(predictions, axis=0)
return predictions
Training loop
def train_loop(train, fold):
LOGGER.info(f"========== fold: {fold} training ==========")
# ====================================================
# Data Loader
# ====================================================
trn_idx = train[train["fold"] != fold].index
val_idx = train[train["fold"] == fold].index
train_folds = train.loc[trn_idx].reset_index(drop=True)
valid_folds = train.loc[val_idx].reset_index(drop=True)
train_dataset = SRWSDataset(train_folds, CONFIG.model_name)
valid_dataset = SRWSDataset(valid_folds, CONFIG.model_name)
train_loader = DataLoader(
train_dataset,
batch_size=CONFIG.train_batch_size,
shuffle=True,
num_workers=4,
pin_memory=True,
drop_last=True,
)
valid_loader = DataLoader(
valid_dataset,
batch_size=CONFIG.valid_batch_size,
shuffle=False,
num_workers=4,
pin_memory=True,
drop_last=False,
)
# ====================================================
# Model
# ====================================================
model = SRWSModel(CONFIG.model_name)
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = nn.BCELoss()
# ====================================================
# Loop
# ====================================================
best_score = -1
best_loss = np.inf
for epoch in range(CONFIG.epochs):
start_time = time.time()
# train
avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, device)
# eval
avg_val_loss, preds = valid_fn(valid_loader, model, criterion, device)
valid_labels = valid_folds["judgement"].values
# scoring
score = fbeta_score(valid_labels, np.where(preds < border, 0, 1), beta=7.0)
elapsed = time.time() - start_time
LOGGER.info(
f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} time: {elapsed:.0f}s"
)
LOGGER.info(f"Epoch {epoch+1} - Score: {score}")
if score > best_score:
best_score = score
LOGGER.info(f"Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model")
torch.save(
{"model": model.state_dict(), "preds": preds}, model_dir +CONFIG.exp + "/"+ f"{CONFIG.model_name}_fold{fold}_best.pth"
)
check_point = torch.load(model_dir +CONFIG.exp + "/"+ f"{CONFIG.model_name}_fold{fold}_best.pth")
valid_folds["preds"] = check_point["preds"]
return valid_folds
def get_result(result_df):
preds = result_df["preds"].values
labels = result_df["judgement"].values
score = fbeta_score(labels, np.where(preds < border, 0, 1), beta=7.0)
LOGGER.info(f"Score: {score:<.5f}")
Run training
# Training
border = len(train[train["judgement"] == 1]) / len(train["judgement"])
oof_df = pd.DataFrame()
for fold in range(CONFIG.fold):
_oof_df = train_loop(train, fold)
oof_df = pd.concat([oof_df, _oof_df])
LOGGER.info(f"========== fold: {fold} result ==========")
get_result(_oof_df)
# CV result
LOGGER.info(f"========== CV ==========")
get_result(oof_df)
# Save OOF result
oof_df.to_csv(pred_dir +CONFIG.exp + "/oof_df.csv", index=False)
# Inference
predictions = inference()
predictions = np.where(predictions < border, 0, 1)
# submission
sub["judgement"] = predictions
sub.to_csv(submission_dir +CONFIG.exp+ "_submission.csv", index=False, header=False)
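`border` is simply the positive rate of the training labels. Since F-beta with beta=7 weights recall heavily, tuning the threshold on the OOF predictions may score better; a hedged sketch using the `oof_df` saved above:

# Hypothetical follow-up: search the decision threshold on OOF predictions.
best_t, best_f = border, -1.0
for t in np.linspace(0.01, 0.5, 50):
    f = fbeta_score(oof_df["judgement"], (oof_df["preds"] >= t).astype(int), beta=7.0)
    if f > best_f:
        best_t, best_f = t, f
print(f"best threshold={best_t:.3f} fbeta={best_f:.4f}")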
LightGBM (Kiva)
Import required modules
import logging
import datetime
import warnings
import joblib
import pandas as pd
pd.set_option("display.max_colwidth", 50)
import numpy as np
import matplotlib.pyplot as plt
from currency_converter import CurrencyConverter
import seaborn as sns
import os
import re
from tqdm import tqdm
from sklearn.metrics import fbeta_score
from sklearn.utils import class_weight
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from scipy.optimize import minimize, minimize_scalar
from contextlib import contextmanager
from sklearn import model_selection, metrics
import category_encoders as ce
import torch
import lightgbm as lgb
from time import time
import texthero as hero
import transformers
import pycld2 as cld2
# Load custom helper modules
import sys,os
sys.path.append('/home/azureuser/cloudfiles/code/Users/hirahara.kazuki/module')
import visualize_module
import preprocess_module
import importlib
importlib.reload(visualize_module)
importlib.reload(preprocess_module)
vs=visualize_module.Feature_Confirmation()
sc=preprocess_module.ScalerModule()
en=preprocess_module.Encoding_Module()
CONFIG
# ====================================================
# CFG
# ====================================================
class CONFIG:
exp='baseline'
bert_model_name= 'bert-base-uncased'
fold = 5
seed=71
Folder paths and directory creation
# ====================================================
# Folders
# ====================================================
input_dir = "../data/"
interm_dir = "../interm/"
submission_dir = "../submission/"
model_dir = "../model/"
pred_dir = "../pred/"
os.makedirs(model_dir+CONFIG.exp+"/", exist_ok=True)
os.makedirs(pred_dir+CONFIG.exp+"/", exist_ok=True)
os.makedirs(interm_dir+CONFIG.exp+"/", exist_ok=True)
os.makedirs(submission_dir+CONFIG.exp+"/", exist_ok=True)
Load data
# ====================================================
# Read Data
# ====================================================
train = pd.read_csv(input_dir + "train.csv")
test = pd.read_csv(input_dir + "test.csv")
sub = pd.read_csv(input_dir + "sample_submission.csv")
#Train_Dataset
display(train.head())
display(train.shape)
#Test_Dataset
display(test.head())
display(test.shape)
Drop unneeded columns
# ====================================================
# Drop Unneeded Columns
# ====================================================
# Columns we don't use
drop_columns=[#'LOAN_ID',
              'ORIGINAL_LANGUAGE',
              'DESCRIPTION',  # trust the provided English translation for now
              'IMAGE_ID',
              'COUNTRY_CODE']
train = train.drop(drop_columns, axis=1)
test = test.drop(drop_columns, axis=1)
Preprocessing
# ====================================================
# Basic Preprocess of Words
# ====================================================
# Preprocessing with texthero
def cleansing_hero_only_text(input_df, text_col):
    ## get text only
    custom_pipeline = [
        hero.preprocessing.fillna,
        hero.preprocessing.lowercase,            # lowercase
        #hero.preprocessing.remove_digits,
        #hero.preprocessing.remove_punctuation,  # remove punctuation
        hero.preprocessing.remove_diacritics,    # remove diacritics (e.g. à, é)
        hero.preprocessing.remove_stopwords,     # remove stopwords
        hero.preprocessing.remove_whitespace,    # normalize whitespace
        #hero.preprocessing.stem
    ]
texts = hero.clean(input_df[text_col], custom_pipeline)
return texts
# Remove punctuation characters
def clean_puncts(x):
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•', '~', '@', '£',
'·', '_', '{', '}', '©', '^', '®', '`', '<', '→', '°', '€', '™', '›', '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…',
'“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─',
'▒', ':', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '«',
'∙', ')', '↓', '、', '│', '(', '»', ',', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', '(', ')', '~',
'➡', '%', '⇒', '▶', '「', '➄', '➆', '➊', '➋', '➌', '➍', '⓪', '①', '②', '③', '④', '⑤', '⑰', '❶', '❷', '❸', '❹', '❺', '❻', '❼', '❽',
'=', '※', '㈱', '、', '△', '℮', 'ⅼ', '‐', '」', '┝', '↳', '◉', '/', '+', '○',
'【', '】', '✅', '☑', '➤', '゙', '↳', '〶', '☛', '「', '⁺', '『', '≫',
'©', '<sub>','Aホイ', 'ホイ', "テゥ"
]
for punct in puncts:
x = x.replace(punct, '')
return x
# Remove the "br" tokens left over from stripped <br> tags
def remove_br(text):
    text = re.sub(r'\bbr\b', ' ', text)  # word boundary, so words containing "br" (e.g. "bread") survive
    # collapse repeated spaces
    text = text.replace("  ", " ")
    text = text.replace("  ", " ")
    return text
# Extract monetary amounts
def get_price(x):
    # # Convert to USD where the currency can be converted
    # def change_usd(x):
    #     rate_unit = x[-3:].upper()
    #     price = re.sub("[a-z\s]", "", x)
    #     c = CurrencyConverter()
    #     try:
    #         x = c.convert(price, rate_unit, 'USD')
    #     except:
    #         x = price
    #     return x
    re_text=("\d+\sphp|\d+\scop|\d+\shnl|\d+\skes|\d+\susd|\d+\srwf|"
             "\d+\spyg|\d+\spen|\d+\svnd|\d+\smga|\d+\sinr|\d+\smzn|"
             "\d+\spkr|\d+\sjod|\d+\sxof|\d+\slrd|\d+\sugx|\d+\sgtq|"
             "\d+\smxn|\d+\skhr|\d+\stjs|\d+\shtg|\d+\segp|\d+\sidr|"
             "\d+\snio|\d+\sghs|\d+\sfjd|\d+\sbob|\d+\ssbd|\d+\swst|"
             "\d+\sbrl|\d+\sngn|\d+\sxaf|\d+\seur|\d+\skgs|\d+\smwk|"
             "\d+\sgel|\d+\sall|\d+\szmw|\d+\scrc|\d+\stop|\d+\smdl|"
             "\d+\slsl|\d+\sdop|\d+\ssll|\d+\stry|\d+\snpr|\d+\sthb|"
             "\d+\spgk|\d+\sils|\d+\samd")
    results = re.findall(re_text, x)
    if len(results) == 0:
        return 0
    else:
        price_list = []
        for result in results:
            price_v = re.sub("[a-z\s]", "", result)
            price_list.append(price_v)
        price_list = [s.strip() for s in price_list]
        # Take the largest amount found; compare numerically, not lexicographically
        price_max = max(int(s) for s in price_list)
        return price_max
# Extract the age with regular expressions
def get_age(x):
    result_1 = re.findall('\d{2}\syears', x)
    result_2 = re.findall('\d{2}year', x)
    result = result_1 + result_2
    if len(result) == 0:
        return 0
    else:
        result = [s.strip('years') for s in result]
        result = [s.strip('year') for s in result]
        result = [s.strip() for s in result]  # strip() also removes full-width spaces
        # Assume the first match is the applicant's age
        return result[0]
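A toy example of both extractors on a made-up, already-lowercased sentence (hypothetical data; `basic_preprocess` later casts both results to int):

# Hypothetical example sentence, lowercased as in the real pipeline:
s = "maria, 34 years old, requested a loan of 20000 php to buy fertilizer"
print(get_age(s))    # '34'   (first two-digit match before "years")
print(get_price(s))  # 20000  (largest amount followed by a known currency code)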
def basic_preprocess(input_df):
output_df=input_df.copy()
output_df["DESCRIPTION_TRANSLATED"] = cleansing_hero_only_text(output_df,"DESCRIPTION_TRANSLATED")
output_df["DESCRIPTION_TRANSLATED"] = output_df["DESCRIPTION_TRANSLATED"].apply(lambda x: clean_puncts(x))
output_df["DESCRIPTION_TRANSLATED"] = output_df["DESCRIPTION_TRANSLATED"].apply(lambda x: remove_br(x))
output_df["age"] = output_df["DESCRIPTION_TRANSLATED"].apply(lambda x: get_age(x))
output_df["price"] = output_df["DESCRIPTION_TRANSLATED"].apply(lambda x: get_price(x))
output_df["age"] =output_df["age"].astype("int")
output_df["price"] =output_df["price"].astype("int")
return output_df
Feature extraction
# ====================================================
# Feature Engineering of WORDS
# ====================================================
# Basic text statistics
def basic_text_features_transforme(input_df, text_columns, name=""):
    """Basic text features."""
    def _get_features(dataframe, column):
        _df = pd.DataFrame()
        _df[column + name + '_num_chars'] = dataframe[column].apply(len)
        _df[column + name + '_num_exclamation_marks'] = dataframe[column].apply(lambda x: x.count('!'))
        _df[column + name + '_num_question_marks'] = dataframe[column].apply(lambda x: x.count('?'))
        _df[column + name + '_num_punctuation'] = dataframe[column].apply(lambda x: sum(x.count(w) for w in '.,;:'))
        _df[column + name + '_num_symbols'] = dataframe[column].apply(lambda x: sum(x.count(w) for w in '*&$%'))
        _df[column + name + '_num_words'] = dataframe[column].apply(lambda x: len(x.split()))
        _df[column + name + '_num_unique_words'] = dataframe[column].apply(lambda x: len(set(w for w in x.split())))
        _df[column + name + '_words_vs_unique'] = _df[column + name + '_num_unique_words'] / _df[column + name + '_num_words']
        _df[column + name + '_words_vs_chars'] = _df[column + name + '_num_words'] / _df[column + name + '_num_chars']
        return _df
    # Apply the helper to every text column and return the concatenated features
    return pd.concat([_get_features(input_df, c) for c in text_columns], axis=1)
def vectorize_text(input_df,
text_columns,
vectorizer=CountVectorizer(),
transformer=TruncatedSVD(n_components=128),
name='html_count_svd'):
output_df = pd.DataFrame()
output_df[text_columns] = input_df[text_columns].fillna('missing').astype(str)
features = []
for c in text_columns:
sentence = vectorizer.fit_transform(output_df[c])
feature = transformer.fit_transform(sentence)
num_p = feature.shape[1]
feature = pd.DataFrame(feature, columns=[name+str(c)+str(num_p) + f'={i:03}' for i in range(num_p)])
features.append(feature)
output_df = pd.concat(features, axis=1)
return output_df
class BERT_Vectorizer:
"""
事前学習済み BERT モデルを使ったテキスト特徴抽出
https://www.guruguru.science/competitions/16/discussions/fb792c87-6bad-445d-aa34-b4118fc378c1/
"""
def __init__(self, model_name='bert-base-uncased', max_len=128):
self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
self.model_name = model_name
self.tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_name)
self.model = transformers.AutoModel.from_pretrained(self.model_name)
self.model = self.model.to(self.device)
self.max_len = max_len
def vectorize(self, sentence : str) -> np.array:
inp = self.tokenizer.encode(sentence)
len_inp = len(inp)
if len_inp >= self.max_len:
inputs = inp[:self.max_len]
masks = [1] * self.max_len
else:
inputs = inp + [0] * (self.max_len - len_inp)
masks = [1] * len_inp + [0] * (self.max_len - len_inp)
        inputs_tensor = torch.tensor([inputs], dtype=torch.long).to(self.device)
        masks_tensor = torch.tensor([masks], dtype=torch.long).to(self.device)
        with torch.no_grad():  # feature extraction only; no autograd graph needed
            output = self.model(inputs_tensor, masks_tensor)
        seq_out = output['last_hidden_state']
        # position 0 is the [CLS] token: a 768-dim sentence embedding
        return seq_out[0][0].detach().cpu().numpy()
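A tiny usage sketch (hypothetical sentences; the first call downloads the pretrained weights):

# Hypothetical usage: embed two sentences into 768-dim [CLS] vectors.
bv = BERT_Vectorizer(model_name=CONFIG.bert_model_name, max_len=32)
vecs = np.stack([bv.vectorize(s) for s in ["a small loan for seeds", "buy a sewing machine"]])
print(vecs.shape)  # (2, 768)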
def get_basic_text_features(input_df):
output_df = basic_text_features_transforme(input_df,
text_columns=["DESCRIPTION_TRANSLATED", "LOAN_USE"])
return output_df
def get_tfidf_features__svd64(input_df):
output_df = vectorize_text(input_df,
text_columns=["DESCRIPTION_TRANSLATED", "LOAN_USE"],
vectorizer=TfidfVectorizer(min_df=0.001, max_df=0.99),
transformer=TruncatedSVD(n_components=64),
name="tfidf_svd_")
return output_df
def get_count_features__svd64(input_df):
output_df = vectorize_text(input_df,
text_columns=["DESCRIPTION_TRANSLATED", "LOAN_USE"],
vectorizer=CountVectorizer(min_df=0.001, max_df=0.99),
transformer=TruncatedSVD(n_components=64),
name="count_svd_")
return output_df
def get_bert_feature_description(input_df):
vectorizer = BERT_Vectorizer(model_name=CONFIG.bert_model_name)
texts = input_df['DESCRIPTION_TRANSLATED'].fillna('')
text_vecs = np.array([vectorizer.vectorize(x) for x in texts])
pca = PCA(n_components=32)
text_vecs = pca.fit_transform(text_vecs)
output_df = pd.DataFrame(text_vecs, columns=[f'bert_description_vecs={i:03}' for i in range(text_vecs.shape[1])])
output_df.index = input_df.index
return output_df
def get_bert_feature_loan(input_df):
vectorizer = BERT_Vectorizer(model_name=CONFIG.bert_model_name)
texts = input_df['LOAN_USE'].fillna('')
text_vecs = np.array([vectorizer.vectorize(x) for x in texts])
pca = PCA(n_components=32)
text_vecs = pca.fit_transform(text_vecs)
output_df = pd.DataFrame(text_vecs, columns=[f'bert_loan_vecs={i:03}' for i in range(text_vecs.shape[1])])
output_df.index = input_df.index
return output_df
# ====================================================
# Feature Engineering of Category
# ====================================================
def target_encoding(input_df):
target_cols=['ACTIVITY_NAME',
'SECTOR_NAME',
'COUNTRY_NAME',
'TOWN_NAME',
'CURRENCY']
target=input_df["LOAN_AMOUNT"]
features = input_df[target_cols]
encoder = ce.TargetEncoder().fit(features.values, target)
output_df = pd.DataFrame(encoder.transform(features.values))
output_df.columns = target_cols
output_df = output_df.add_prefix("TE_")
output_df = pd.concat([input_df, output_df], axis=1)
output_df=output_df.iloc[:,-len(target_cols):]
return output_df
def label_encoding(input_df):
target_cols=['ACTIVITY_NAME',
'SECTOR_NAME',
'COUNTRY_NAME',
'TOWN_NAME',
'CURRENCY_POLICY',
'CURRENCY',
'REPAYMENT_INTERVAL',
'DISTRIBUTION_MODEL']
features = input_df[target_cols]
encoder = ce.OrdinalEncoder().fit(features.values)
output_df = pd.DataFrame(encoder.transform(features.values))
output_df.columns = target_cols
output_df = output_df.add_prefix("LE_")
output_df = pd.concat([input_df, output_df], axis=1)
output_df=output_df.iloc[:,-len(target_cols):]
return output_df
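As a reminder of what target encoding produces, a toy example with made-up data (each category is replaced by a smoothed mean of the target):

# Toy illustration of ce.TargetEncoder (hypothetical data):
toy = pd.DataFrame({"COUNTRY_NAME": ["kenya", "kenya", "peru", "peru", "peru"],
                    "LOAN_AMOUNT": [100, 200, 500, 700, 600]})
te = ce.TargetEncoder()
print(te.fit_transform(toy[["COUNTRY_NAME"]], toy["LOAN_AMOUNT"]))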
Run preprocessing
def preprocess(train, test):
"""前処理の実行関数"""
input_df = pd.concat([train, test]).reset_index(drop=True)
funcs = [basic_preprocess,
get_basic_text_features,
get_tfidf_features__svd64,
get_count_features__svd64,
#get_bert_feature_description,
#get_bert_feature_loan,
target_encoding,
label_encoding]
output = []
for func in funcs:
_df = func(input_df)
output.append(_df)
output = pd.concat(output, axis=1)
train_x = output.iloc[:len(train)]
test_x = output.iloc[len(train):].reset_index(drop=True)
return train_x, test_x
train_x, test_x=preprocess(train,test)
Training
# ====================================================
# Model Preparation
# ====================================================
class SingleLgb:
    def __init__(self, cat_col, seed=CONFIG.seed, dry_run=False):
        self.train_param = self.get_param(seed)
        if dry_run:
            self.num_rounds = 10000  # NOTE: identical to the full run, so dry_run is currently a no-op
        else:
            self.num_rounds = 10000
        self.cat_col = cat_col
def do_train_direct(self, x_train, x_test, y_train, y_test):
lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test)
print('Start training...')
model = lgb.train(self.train_param,
lgb_train,
valid_sets=[lgb_eval],
verbose_eval=100,
num_boost_round=self.num_rounds,
early_stopping_rounds=100,
categorical_feature=self.cat_col
)
print('End training...')
return model
@staticmethod
def show_feature_importance(model, filename=None):
fi = pd.DataFrame({
"name": model.feature_name(),
"importance_split": model.feature_importance(importance_type="split").astype(int),
"importance_gain": model.feature_importance(importance_type="gain").astype(int),
})
fi = fi.sort_values(by="importance_gain", ascending=False)
print(fi)
@staticmethod
def get_param(seed=CONFIG.seed):
return {
'num_leaves': 1023,
'min_data_in_leaf': 50,
'objective': 'regression',
'metric': 'mae',
'max_depth': -1,
'learning_rate': 0.05,
"boosting": "gbdt",
"feature_fraction": 0.9,
"verbosity": -1,
"random_state": seed,
}
class SingleTrainer:
def __init__(self, pred_col, cat_col,dry_run=False):
self.pred_col = pred_col
self.target_col = 'LOAN_AMOUNT'
self.dry_run = dry_run
self.val_size = 1000*1000
self.cat_col=cat_col
def train_model(self, df):
X = df[self.pred_col]
y = df[self.target_col]
kf = model_selection.KFold(n_splits=CONFIG.fold)
models, scores = list(), list()
for fold, (train_idx, test_idx) in enumerate(kf.split(X)):
print("---------")
print("fold=", fold)
X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]
print(X_train.shape, X_val.shape)
lgbm = SingleLgb(seed=CONFIG.seed, dry_run=self.dry_run, cat_col=self.cat_col)
model = lgbm.do_train_direct(X_train, X_val, y_train, y_val)
            score = model.best_score["valid_0"]["l1"]  # LightGBM records the "mae" alias under its canonical name "l1"
if fold == 0:
lgbm.show_feature_importance(model)
models.append(model)
scores.append(score)
print(f'fold= {fold} MAE Score')
self.eval_model(model, X_val, y_val)
return models, np.mean(scores)
    def eval_model(self, model, X_val, y_val):
        temp_pred = model.predict(X_val)
        # MAE in log space; the target is log-transformed before training
        mae_score = np.abs(temp_pred - y_val).mean()
        print("Score=", np.exp(mae_score))  # exp(log-MAE): the typical multiplicative error factor
# ====================================================
# Make Model Input
# ====================================================
# Drop columns that are not model inputs
train_input=train_x.drop(columns=["TAGS",
"LOAN_ID",
'DESCRIPTION_TRANSLATED',
'LOAN_USE',
'ACTIVITY_NAME',
'SECTOR_NAME',
'COUNTRY_NAME',
'TOWN_NAME',
'CURRENCY_POLICY',
'CURRENCY',
'REPAYMENT_INTERVAL',
'DISTRIBUTION_MODEL'
])
display(train_input.shape)
test_input=test_x.drop(columns=["TAGS",
"LOAN_ID",
'DESCRIPTION_TRANSLATED',
"LOAN_AMOUNT",
'LOAN_USE',
'ACTIVITY_NAME',
'SECTOR_NAME',
'COUNTRY_NAME',
'TOWN_NAME',
'CURRENCY_POLICY',
'CURRENCY',
'REPAYMENT_INTERVAL',
'DISTRIBUTION_MODEL'
])
display(test_input.shape)
# Log-transform the target
train_input=train_input.copy()
train_input["LOAN_AMOUNT"] = np.log(train_input["LOAN_AMOUNT"])
#train_x.select_dtypes(include='object').columns
cat_col=['LE_ACTIVITY_NAME',
'LE_SECTOR_NAME',
'LE_COUNTRY_NAME',
'LE_TOWN_NAME',
'LE_CURRENCY_POLICY',
'LE_CURRENCY',
'LE_REPAYMENT_INTERVAL',
'LE_DISTRIBUTION_MODEL']
pred_col = train_input.columns.drop("LOAN_AMOUNT")  # exclude the target from the feature list
len(train_input.columns.unique())
len(test_input.columns.unique())
# ====================================================
# Training
# ====================================================
trainer = SingleTrainer(pred_col, cat_col=cat_col,dry_run=True)
models, score = trainer.train_model(train_input)
print("MAE_SCORE="+np.exp(score))
Inference
# ====================================================
# Prediction
# ====================================================
preds = []
for m in models:
preds.append(m.predict(test_input[pred_col]))
pred = np.exp(np.mean(preds, axis=0))
sub_predict = sub.copy()
sub_predict["LOAN_AMOUNT"] = pred
sub_predict.to_csv(submission_dir + CONFIG.exp + "/" + "submission.csv", index=False)