Scikit-Learn Basic¶

サンプルデータのロード¶

# -*- coding: utf-8 -*-

# Import scikit-learn's bundled datasets module
from sklearn import datasets

# Load the digits dataset
digits = datasets.load_digits()

# Pull out the feature matrix and the target vector in one step
features, target = digits.data, digits.target

# Display the first observation
features[0]

シミュレーションによるサンプルデータの作成¶

# -*- coding: utf-8 -*-

# Load the regression-data generator
from sklearn.datasets import make_regression

# Generate a feature matrix, a target vector, and the true coefficients
# that were used to generate them
features, target, coefficients = make_regression(
    n_samples=100,
    n_features=3,
    n_informative=3,
    n_targets=1,
    noise=0.0,
    coef=True,
    random_state=1,
)

# Show the first three rows of the feature matrix and target vector
print('特徴量行列\n', features[:3])
print('ターゲットベクトル\n', target[:3])

##########

# Load the classification-data generator
from sklearn.datasets import make_classification

# Generate a feature matrix and a target vector
features, target = make_classification(
    n_samples=100,
    n_features=3,
    n_informative=3,
    n_redundant=0,
    n_classes=2,
    weights=[.25, .75],
    random_state=1,
)

# Show the first three rows of the feature matrix and target vector
print('特徴量行列\n', features[:3])
print('ターゲットベクトル\n', target[:3])

##########

# Load the clustering-data generator
from sklearn.datasets import make_blobs

# Generate a feature matrix and a target vector
features, target = make_blobs(
    n_samples=100,
    n_features=2,
    centers=3,
    cluster_std=0.5,
    shuffle=True,
    random_state=1,
)

# Show the first three rows of the feature matrix and target vector
print('特徴量行列\n', features[:3])
print('ターゲットベクトル\n', target[:3])

##########

# Load the plotting library
import matplotlib.pyplot as plt

# Scatter-plot the two features, coloured by target value
plt.scatter(features[:, 0], features[:, 1], c=target)
plt.show()

特徴量のスケール変換¶

# -*- coding: utf-8 -*-

# Load libraries
import numpy as np
from sklearn import preprocessing

# Create a single-column feature
feature = np.array([[-500.5], [-100.1], [0], [100.1], [900.9]])

# Create a scaler that maps values into the range [0, 1]
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))

# Fit the scaler and rescale the feature
scaled_feature = minmax_scale.fit_transform(feature)

# Display the rescaled feature
scaled_feature

特徴量の標準化¶

# -*- coding: utf-8 -*-

# Load libraries
import numpy as np
from sklearn import preprocessing

# Create a single-column feature
x = np.array([[-1000.1], [-200.2], [500.5], [600.6], [9000.9]])

# Create the standardizing scaler
scaler = preprocessing.StandardScaler()

# Fit the scaler and transform the feature
standardized = scaler.fit_transform(x)

# Display the standardized feature
standardized

##########

# Print the (rounded) mean and the standard deviation of the result
print("平均:", round(standardized.mean()))
print("標準偏差:", standardized.std())

##########

# Create a robust scaler: an alternative when the data contains many
# outliers (it scales using the median and quantiles)
robust_scaler = preprocessing.RobustScaler()

# Fit the robust scaler and transform the feature
robust_scaler.fit_transform(x)

# Put the standardized values back into a DataFrame.
# NOTE(review): `X` and `X_columns` are not defined in this section — this
# snippet presumably expects the caller's own feature table and its column
# names; confirm before running.
import pandas as pd

scaler = preprocessing.StandardScaler()
standardized = scaler.fit_transform(X)
X = pd.DataFrame(columns=X_columns, data=standardized)

特徴量の正規化¶

# -*- coding: utf-8 -*-

# Load libraries
import numpy as np
from sklearn.preprocessing import Normalizer

# Create the feature matrix
features = np.array([
    [0.5, 0.5],
    [1.1, 3.4],
    [1.5, 20.2],
    [1.63, 34.4],
    [10.9, 3.3],
])

# Create an L2 normalizer
normalizer = Normalizer(norm="l2")

# Transform the feature matrix
normalizer.transform(features)

##########

# Transform the feature matrix with the L2 norm
features_l2_norm = Normalizer(norm="l2").transform(features)

# Display the result
features_l2_norm

##########

# Transform the feature matrix with the L1 norm
features_l1_norm = Normalizer(norm="l1").transform(features)

# Display the result
features_l1_norm

##########

# Print the sum of the two values of the first observation
print(
    "最初の観測値の値の総計:",
    features_l1_norm[0, 0] + features_l1_norm[0, 1],
)

多項式特徴量と交互作用特徴量の生成¶

# -*- coding: utf-8 -*-

# Load libraries
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

# Create a feature matrix of three identical rows
features = np.array([[2, 3]] * 3)

# Create a degree-2 PolynomialFeatures object without the bias column
polynomial_interaction = PolynomialFeatures(degree=2, include_bias=False)

# Generate the polynomial features
polynomial_interaction.fit_transform(features)

##########

# Same degree, but restricted to interaction terms only
interaction = PolynomialFeatures(
    degree=2,
    interaction_only=True,
    include_bias=False,
)
interaction.fit_transform(features)

独自の特徴量の変換器を作る¶

# -*- coding: utf-8 -*-

# Load libraries
import numpy as np
from sklearn.preprocessing import FunctionTransformer

# Create a feature matrix of three identical rows
features = np.array([[2, 3]] * 3)

# Define a simple function to use as a custom transformation
def add_ten(x):
    """Return ``x`` plus 10."""
    shifted = x + 10
    return shifted

# Wrap the function in a transformer
ten_transformer = FunctionTransformer(func=add_ten)

# Transform the feature matrix
ten_transformer.transform(features)

##########

# Load pandas
import pandas as pd

# Build a DataFrame from the feature matrix
df = pd.DataFrame(features, columns=["feature_1", "feature_2"])

# Apply the same function through pandas
df.apply(add_ten)

外れ値の検出と削除¶

# -*- coding: utf-8 -*-

# Load libraries
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

# Generate a small synthetic dataset (two features, one centre)
features, _ = make_blobs(
    n_samples=10,
    n_features=2,
    centers=1,
    random_state=1,
)

# Replace both values of the first observation with an extreme value
features[0, :] = 10000

# Create the detector
outlier_detector = EllipticEnvelope(contamination=.1)

# Fit the detector
outlier_detector.fit(features)

# Predict outliers
outlier_detector.predict(features)

##########

# Take the first column as a single feature
feature = features[:, 0]

# Define a function that returns the indices of outliers
def indicies_of_outliers(x):
    """Return the indices of values lying outside the 1.5 * IQR fences.

    The result is whatever ``np.where`` returns for the outlier mask,
    i.e. a tuple of index arrays.
    """
    first_quartile, third_quartile = np.percentile(x, [25, 75])
    spread = third_quartile - first_quartile
    low_fence = first_quartile - (spread * 1.5)
    high_fence = third_quartile + (spread * 1.5)
    is_outlier = (x > high_fence) | (x < low_fence)
    return np.where(is_outlier)

# Run the function on the single feature
indicies_of_outliers(feature)

外れ値の取扱い（削除、外れ値のラベリング、特徴量を対数化）¶

対数化することで外れ値の影響が小さくなる

# -*- coding: utf-8 -*-

# Load libraries
import numpy as np
import pandas as pd

# Create the DataFrame (the last row is an obvious outlier)
houses = pd.DataFrame({
    'Price': [534433, 392333, 293222, 4322032],
    'Bathrooms': [2, 3.5, 2, 116],
    'Square_Feet': [1500, 2500, 1500, 48000],
})

# Option 1: filter out observations with 20 or more bathrooms
houses[houses['Bathrooms'] < 20]

##########

# Option 2: flag outliers with a new 0/1 feature based on the same condition
houses["Outlier"] = np.where(houses["Bathrooms"] < 20, 0, 1)

# Display the data
houses

##########

# Option 3: take the log of the feature; np.log is applied to the whole
# column at once instead of element by element
houses["Log_Of_Square_Feet"] = np.log(houses["Square_Feet"])

# Display the data
houses

特徴量の離散化（ビン化）¶

# -*- coding: utf-8 -*-

# Load libraries
import numpy as np
from sklearn.preprocessing import Binarizer

# Create a single-column feature
age = np.array([[6],
                [12],
                [20],
                [36],
                [65]])

# Create the binarizer; `threshold` is passed by keyword because current
# scikit-learn releases no longer accept it positionally
binarizer = Binarizer(threshold=18)

# Transform the feature
binarizer.fit_transform(age)

##########

# Split the feature into several bins
np.digitize(age, bins=[20, 30, 64])

##########

# Split the feature into several bins, with right=True
np.digitize(age, bins=[20, 30, 64], right=True)

##########

# Split the feature into two bins around a single edge
np.digitize(age, bins=[18])

クラスタリングによる観測値のグループ分け¶

# -*- coding: utf-8 -*-

# Load libraries
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

# Generate a synthetic feature matrix
features, _ = make_blobs(
    n_samples=50,
    n_features=2,
    centers=3,
    random_state=1,
)

# Build a DataFrame from the features
dataframe = pd.DataFrame(features, columns=["feature_1", "feature_2"])

# Create a k-means clusterer with three clusters
clusterer = KMeans(n_clusters=3, random_state=0)

# Fit the clusterer
clusterer.fit(features)

# Store the predicted cluster of each observation as a new column
dataframe["group"] = clusterer.predict(features)

# Show the first few observations
dataframe.head(5)

欠損値がある観測値の除外¶

# -*- coding: utf-8 -*-

# Load numpy
import numpy as np

# Create a feature matrix whose last row contains a missing value
features = np.array([
    [1.1, 11.1],
    [2.2, 22.2],
    [3.3, 33.3],
    [4.4, 44.4],
    [np.nan, 55],
])

# Keep only the observations without missing values
# (~ inverts the "row contains NaN" mask)
features[~np.isnan(features).any(axis=1)]

##########

# Load pandas
import pandas as pd

# Load the data into a DataFrame
dataframe = pd.DataFrame(features, columns=["feature_1", "feature_2"])

# Drop the observations that contain missing values
dataframe.dropna()

欠損値の補完（kNNで予測して補完/最頻値/平均値等で補完)¶

# -*- coding: utf-8 -*-

# Load libraries
import numpy as np
from fancyimpute import KNN
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs

# Generate a synthetic feature matrix
features, _ = make_blobs(n_samples=1000,
                         n_features=2,
                         random_state=1)

# Standardize the features
scaler = StandardScaler()
standardized_features = scaler.fit_transform(features)

# Remember the first value of the first feature, then replace it with NaN
true_value = standardized_features[0, 0]
standardized_features[0, 0] = np.nan

# Impute the missing value with k-nearest neighbours.
# Recent fancyimpute releases expose this as fit_transform(); the older
# complete() method no longer exists.
features_knn_imputed = KNN(k=5, verbose=0).fit_transform(standardized_features)

# Compare the true value with the imputed value
print("真の値:", true_value)
print("補完された値:", features_knn_imputed[0, 0])

##########

# Load the imputer; sklearn.preprocessing.Imputer was removed and
# SimpleImputer is its replacement
from sklearn.impute import SimpleImputer

# Create a mean imputer (SimpleImputer always works column-wise)
mean_imputer = SimpleImputer(strategy="mean")

# Impute the matrix that actually contains the NaN. The unscaled
# `features` array has no missing value, so imputing it would just echo
# the original number on a different scale from `true_value`.
features_mean_imputed = mean_imputer.fit_transform(standardized_features)

# Compare the true value with the imputed value
print("真の値:", true_value)
print("補完された値:", features_mean_imputed[0, 0])

名義カテゴリ特徴量の数値化¶

数値の大小関係がない場合は注意すること

# -*- coding: utf-8 -*-

# Load libraries
import numpy as np
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer

# Create a nominal categorical feature
feature = np.array([["Texas"],
                    ["California"],
                    ["Texas"],
                    ["Delaware"],
                    ["Texas"]])

# Create the one-hot encoder
one_hot = LabelBinarizer()

# One-hot encode the feature
one_hot.fit_transform(feature)

##########

# Show the feature classes
one_hot.classes_

##########

# Reverse the one-hot encoding
one_hot.inverse_transform(one_hot.transform(feature))

##########

# Load pandas
import pandas as pd

# Create dummy variables from the feature
pd.get_dummies(feature[:, 0])

##########

# Create a multi-class feature
# NOTE(review): "Delware" looks like a typo for "Delaware"; it is left
# unchanged because correcting it changes the resulting class list.
multiclass_feature = [("Texas", "Florida"),
                      ("California", "Alabama"),
                      ("Texas", "Florida"),
                      ("Delware", "Florida"),
                      ("Texas", "Alabama")]

# Create the multi-class one-hot encoder
one_hot_multiclass = MultiLabelBinarizer()

# One-hot encode the multi-class feature
one_hot_multiclass.fit_transform(multiclass_feature)

##########

# Show the classes; the fitted attribute is `classes_` (trailing
# underscore), and plain `classes` raises AttributeError after fitting
one_hot_multiclass.classes_

順序カテゴリ特徴量の数値化¶

値の強度の順序が明確になるように数値変換する

# -*- coding: utf-8 -*-

# Load pandas
import pandas as pd

# Create an ordinal categorical feature
dataframe = pd.DataFrame({"Score": ["Low", "Low", "Medium", "Medium", "High"]})

# Map each level to a number that reflects its order
scale_mapper = {"Low": 1, "Medium": 2, "High": 3}

# Replace the feature values using the mapping
dataframe["Score"].replace(scale_mapper)

##########

# Same feature with one extra level between "Medium" and "High"
dataframe = pd.DataFrame(
    {
        "Score": [
            "Low",
            "Low",
            "Medium",
            "Medium",
            "High",
            "Barely More Than Medium",
        ]
    }
)

# Evenly spaced integer codes
scale_mapper = {
    "Low": 1,
    "Medium": 2,
    "Barely More Than Medium": 3,
    "High": 4,
}

dataframe["Score"].replace(scale_mapper)

##########

# Codes whose spacing places the extra level just above "Medium"
scale_mapper = {
    "Low": 1,
    "Medium": 2,
    "Barely More Than Medium": 2.1,
    "High": 3,
}

dataframe["Score"].replace(scale_mapper)

特徴量辞書の数値化¶

# -*- coding: utf-8 -*-

# Load the vectorizer
from sklearn.feature_extraction import DictVectorizer

# Create a list of feature dictionaries
data_dict = [{"Red": 2, "Blue": 4},
             {"Red": 4, "Blue": 3},
             {"Red": 1, "Yellow": 2},
             {"Red": 2, "Yellow": 2}]

# Create the dictionary vectorizer (dense output)
dictvectorizer = DictVectorizer(sparse=False)

# Convert the dictionaries into a feature matrix
features = dictvectorizer.fit_transform(data_dict)

# Display the feature matrix
features

##########

# Get the feature names; get_feature_names() was removed from
# scikit-learn, and get_feature_names_out() is its replacement
feature_names = dictvectorizer.get_feature_names_out()

# Display the feature names
feature_names

##########

# Load pandas
import pandas as pd

# Build a DataFrame from the features
pd.DataFrame(features, columns=feature_names)

##########

# Create word-count dictionaries for four documents
doc_1_word_count = {"Red": 2, "Blue": 4}
doc_2_word_count = {"Red": 4, "Blue": 3}
doc_3_word_count = {"Red": 1, "Yellow": 2}
doc_4_word_count = {"Red": 2, "Yellow": 2}

# Collect them in a list
doc_word_counts = [doc_1_word_count,
                   doc_2_word_count,
                   doc_3_word_count,
                   doc_4_word_count]

# Convert the list of word-count dictionaries into a feature matrix
dictvectorizer.fit_transform(doc_word_counts)

欠損クラス値の補完¶

# -*- coding: utf-8 -*-

# Load libraries
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# Create a feature matrix whose first column is a categorical feature
X = np.array([[0, 2.10, 1.45],
              [1, 1.18, 1.33],
              [0, 1.22, 1.27],
              [1, -0.21, -1.19]])

# Create a feature matrix with missing values in the categorical feature
X_with_nan = np.array([[np.nan, 0.87, 1.31],
                       [np.nan, -0.67, -0.22]])

# Train a KNN classifier that predicts the category from the other features
clf = KNeighborsClassifier(3, weights='distance')
trained_model = clf.fit(X[:, 1:], X[:, 0])

# Predict the class of the missing values
imputed_values = trained_model.predict(X_with_nan[:, 1:])

# Join the predicted classes with the other features
X_with_imputed = np.hstack((imputed_values.reshape(-1, 1), X_with_nan[:, 1:]))

# Stack the two feature matrices
np.vstack((X_with_imputed, X))

##########

# Load the imputer; sklearn.preprocessing.Imputer was removed and
# SimpleImputer is its replacement
from sklearn.impute import SimpleImputer

# Stack the two feature matrices
X_complete = np.vstack((X_with_nan, X))

# Fill missing values with each column's most frequent value
# (SimpleImputer always works column-wise, so no axis argument is needed)
imputer = SimpleImputer(strategy='most_frequent')

imputer.fit_transform(X_complete)

不均衡なクラスの取扱¶

# -*- coding: utf-8 -*-

# Load libraries
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

# Load the iris dataset
iris = load_iris()

# Take the feature matrix and target vector, dropping the first 40 observations
features = iris.data[40:, :]
target = iris.target[40:]

# Build a binary target: 0 where the class is 0, otherwise 1
target = np.where((target == 0), 0, 1)

# Display the now-imbalanced target vector
target

##########

# Define explicit class weights
weights = {0: .9, 1: 0.1}

# Create a random forest classifier with those weights
RandomForestClassifier(class_weight=weights)

##########

# Create a random forest classifier with class_weight set to "balanced"
RandomForestClassifier(class_weight="balanced")

##########

# Indices of the observations of each class
i_class0 = np.where(target == 0)[0]
i_class1 = np.where(target == 1)[0]

# Number of observations in each class
n_class0, n_class1 = len(i_class0), len(i_class1)

# Downsample: for every class-0 observation, draw one class-1 observation
# at random without replacement
i_class1_downsampled = np.random.choice(i_class1, size=n_class0, replace=False)

# Join the class-0 targets with the downsampled class-1 targets
np.hstack((target[i_class0], target[i_class1_downsampled]))

##########

# Join the class-0 features with the downsampled class-1 features
# and show the first five rows
np.vstack((features[i_class0, :], features[i_class1_downsampled, :]))[0:5]

##########

# Upsample: for every class-1 observation, draw one class-0 observation
# at random with replacement
i_class0_upsampled = np.random.choice(i_class0, size=n_class1, replace=True)

# Join the upsampled class-0 targets with the class-1 targets
np.concatenate((target[i_class0_upsampled], target[i_class1]))

##########

# Join the upsampled class-0 features with the class-1 features
# and show the first five rows
np.vstack((features[i_class0_upsampled, :], features[i_class1, :]))[0:5]

テキストのクリーニング(ピリオドの削除/大文字変換/正規表現による置換)¶

# -*- coding: utf-8 -*-

# Load the regular-expression library
import re

# Create the raw text
text_data = [
    "   Interrobang. By Aishwarya Henriette     ",
    "Parking And Going. By Karl Gautier",
    "    Today Is The night. By Jarek Prakash   ",
]

# Strip leading and trailing whitespace
strip_whitespace = [raw.strip() for raw in text_data]

# Display the text
strip_whitespace

##########

# Remove periods
remove_periods = [stripped.replace(".", "") for stripped in strip_whitespace]

# Display the text
remove_periods

##########

# Define a function that upper-cases a string
def capitalizer(string: str) -> str:
    """Return ``string`` converted to upper case."""
    uppercased = string.upper()
    return uppercased

# Apply the function to every string
[capitalizer(text) for text in remove_periods]

##########

# Pattern matching any single ASCII letter
_ASCII_LETTER = re.compile(r"[a-zA-Z]")

# Define a function that masks letters
def replace_letters_with_X(string: str) -> str:
    """Return ``string`` with every ASCII letter replaced by "X"."""
    return _ASCII_LETTER.sub("X", string)

# Apply the function to every string
[replace_letters_with_X(text) for text in remove_periods]

HTMLのパースとクリーニング¶

# -*- coding: utf-8 -*-

# Load the HTML parser
from bs4 import BeautifulSoup

# Create some HTML text
html = """
       <div class='full_name'><span style='font-weight:bold'>
       Masego</span> Azra</div>
       """

# Parse the HTML
soup = BeautifulSoup(html, "lxml")

# Find the div whose class is "full_name" and show its stripped text
full_name_div = soup.find("div", class_="full_name")
full_name_div.text.strip()

Cross_val_Scoring_method¶

https://scikit-learn.org/stable/modules/model_evaluation.html

主成分分析

from sklearn.decomposition import PCA

# Create a PCA reducer that keeps 10 components
# (the original note adds that embedding PCA in a pipeline applies the
# dimensionality reduction automatically during training)
pca = PCA(n_components=10, random_state=1)

# Fit PCA on X, then replace X with its projection
# NOTE(review): `X` is not defined in this section — presumably the caller's
# feature matrix; confirm before running.
X = pca.fit(X).transform(X)

ラベルエンコーディング¶

# LabelEncoder is not imported anywhere else in this file, so import it here
from sklearn.preprocessing import LabelEncoder

# Categorical columns to encode
# NOTE(review): `train_df` is not defined in this section — presumably the
# caller's training DataFrame containing these columns; confirm before running.
cols = ('AC', 'SOURCE', 'QUADRANT', 'CNDTN', 'BATHRM_ALL',
       'ROOM_ALL', 'AYB_GROUP', 'SALEDATE_YEAR')

# Replace each categorical column with integer labels, using a fresh
# encoder per column
for c in cols:
    train_df[c] = LabelEncoder().fit_transform(list(train_df[c].values))

# Print the resulting shape
print('Shape all_data: {}'.format(train_df.shape))

Scikit-Learn Basic¶

サンプルデータのロード¶

シミュレーションによるサンプルデータの作成¶

特徴量のスケール変換¶

特徴量の標準化¶

特徴量の正規化¶

多項式特徴量と交互作用特徴量の生成¶

独自の特徴量の変換器を作る¶

外れ値の検出と削除¶

外れ値の取扱い（削除、外れ値のラベリング、特徴量を対数化）¶

特徴量の離散化（ビン化）¶

クラスタリングによる観測値のグループ分け¶

欠損値がある観測値の除外¶

欠損値の補完（kNNで予測して補完/最頻値/平均値等で補完)¶

名義カテゴリ特徴量の数値化¶

順序カテゴリ特徴量の数値化¶

特徴量辞書の数値化¶

欠損クラス値の補完¶

不均衡なクラスの取扱¶

テキストのクリーニング(ピリオドの削除/大文字変換/正規表現による置換)¶

HTMLのパースとクリーニング¶

Cross_val_Scoring_method¶

ラベルエンコーディング¶

値の傾いている数値データの絞り込みと変換¶