Stats Model
必要モジュールの読み込み
import numpy as np
import scipy as sp
import pandas as pd
from pandas import Series, DataFrame
from scipy import stats
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression,LinearRegression
import random
import japanize_matplotlib
import copy
from sklearn.covariance import GraphicalLasso
from sklearn.covariance import GraphicalLassoCV
import pydot
from IPython.display import Image, display
# 可視化ライブラリ
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from statsmodels.graphics.factorplots import interaction_plot
%matplotlib inline
pd.set_option('display.max_columns', 100)
#plt.style.use('seaborn-darkgrid')
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import classification_report, roc_curve, confusion_matrix, roc_auc_score
from sklearn.metrics import mean_squared_error, silhouette_samples
from sklearn.cluster import KMeans
# 小数第3位まで表示
%precision 3
# ランダムシードの固定
np.random.seed(0)
データ読み込み
input_dir = "/content/drive/MyDrive/11_Python/casual_inference/input/"
output_dir = "/content/drive/MyDrive/11_Python/casual_inference/output/"
pd.set_option("display.max_colwidth", 50)
df = pd.read_csv(input_dir + "sample.csv")
print(df.shape)
df.head()
基礎集計
sns.set(font="IPAexGothic")
fig = plt.figure(figsize=(7, 7))
ax = fig.add_subplot(1, 1, 1)
ax = sns.barplot(x='cm_dummy', y='gamesecond', data=df, ax=ax)
ax.set_xticklabels(['CM見てない(z=0)', 'CM見た(z=1)'])
ax.set_xlabel('')
ax.set_ylabel('平均ゲーム利用時間 [秒]');
モデル
target_col = 'cm_dummy'
tags_covariate=['gamedummy',
'area_kanto',
'area_keihan',
'area_tokai',
'area_keihanshin',
'age',
'sex',
'marry_dummy'
,'job_dummy1',
'job_dummy2',
'job_dummy3',
'job_dummy4',
'job_dummy5',
'job_dummy6',
'job_dummy7',
'job_dummy8',
'inc',
'pmoney',
'fam_str_dummy1',
'fam_str_dummy2',
'fam_str_dummy3',
'fam_str_dummy4',
'fam_str_dummy5',
'child_dummy',
'TVwatch_day',
'gamesecond']
df_cm_0 = df[df.cm_dummy==0].reset_index(drop=True)
print(df_cm_0.shape)
df_cm_1 = df[df.cm_dummy==1].reset_index(drop=True)
print(df_cm_1.shape)
AXES = ["ax_{}".format(i) for i in range(len(tags_covariate))]
figsize = (21, 5 * len(tags_covariate))
fig = plt.figure(figsize=figsize)
for i in range(len(tags_covariate)):
array0 = df_cm_0[tags_covariate[i]].values
array1 = df_cm_1[tags_covariate[i]].values
AXES[i] = plt.subplot2grid((len(tags_covariate), 1), (i, 0), rowspan=1, colspan=1)
dum_ax = AXES[i]
dum_ax.hist(array0, density=True, bins=50, alpha=0.4, label="with CM")
dum_ax.hist(array1, density=True, bins=50, alpha=0.4, label="w/o CM")
dum_ax.set_title(tags_covariate[i], fontsize=16)
dum_ax.legend(fontsize=16)
傾向スコアマッチング (ロジスティク回帰)
X = df[tags_covariate].values
y = df.cm_dummy.values
clf = LogisticRegression(solver="lbfgs", C=0.1)
#clf = GradientBoostingClassifier()
X = (X - X.mean(axis=0)) / X.std(axis=0)
clf.fit(X, y)
y_pred = clf.predict(X)
y_pred_proba = clf.predict_proba(X)
fpr, tpr, thresholds = roc_curve(y, y_pred_proba[:, 1])
#print(classification_report(y, y_pred))
print(confusion_matrix(y, y_pred, labels=[0, 1]))
fig, ax = plt.subplots(ncols=1, figsize=(7, 7))
ax.plot(fpr, tpr, marker="o")
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.0])
ax.set_xlabel("False Positive", fontsize=16)
ax.set_ylabel("True Positive", fontsize=16)
plt.title("AUC : {}".format(roc_auc_score(y, y_pred_proba[:, 1])), fontsize=16)
tags_ps=['cm_dummy',
'gamedummy',
'area_kanto',
'area_keihan',
'area_tokai',
'area_keihanshin',
'age',
'sex',
'marry_dummy'
,'job_dummy1',
'job_dummy2',
'job_dummy3',
'job_dummy4',
'job_dummy5',
'job_dummy6',
'job_dummy7',
'job_dummy8',
'inc',
'pmoney',
'fam_str_dummy1',
'fam_str_dummy2',
'fam_str_dummy3',
'fam_str_dummy4',
'fam_str_dummy5',
'child_dummy',
'TVwatch_day',
'gamesecond'
]
df = df[tags_ps].copy()
df["ps"] = clf.predict_proba(X)[:, 1]
df.head()
figsize = (10,8)
fig = plt.figure(figsize=figsize)
#可視化
propensity0 = df.ps[df.cm_dummy==0]
propensity1 = df.ps[df.cm_dummy==1]
bins = np.arange(0, 1.05, 0.05)
top0, _ = np.histogram(propensity0, bins=bins)
top1, _ = np.histogram(propensity1, bins=bins)
plt.ylim(-1000,1000)
plt.axhline(0, c="black")
plt.bar(bins[:-1]+ 0.025, top0, width=0.04, facecolor="tomato", label="0")
plt.bar(bins[:-1]+ 0.025, -top1, width=0.04, facecolor="cornflowerblue", label="1")
plt.axhline(0, c="black")
for x, y in zip(bins, top0):
plt.text(x + 0.025, y + 10, str(y), ha="center", va="bottom")
for x, y in zip(bins, top1):
plt.text(x + 0.025, -y - 10, str(y), ha="center", va="top")
plt.title("Propensity Score",fontsize=20)
plt.legend(fontsize=15)
マッチング
#マッチングの範囲を指定
inter = np.arange(0,1,0.005)
#マッチング後のデータを入れるためのからデータフレーム
dat_match=pd.DataFrame()
#一つ目の条件で、CM0とCM1のどちらか一方が0人の場合を除外する。
for i in inter:
match0 = df[(df.cm_dummy==0) &(i-0.005 < df.ps)& (df.ps<i)]
match1 = df[(df.cm_dummy==1) &(i-0.005 < df.ps)& (df.ps<i)]
#二つ目の条件で人数が少ない方に合わせてランダムにマッチングする
if (match0.shape[0]!=0)&(match1.shape[0]!=0):
if match0.shape[0]<=match1.shape[0]:
match1= match1.iloc[random.sample(range(0,match1.shape[0]),match0.shape[0]),:]
dat_match = pd.concat([dat_match,match0,match1])
else :
match0= match0.iloc[random.sample(range(0,match0.shape[0]),match1.shape[0]),:]
dat_match = pd.concat([dat_match,match0,match1])
figsize = (10,8)
fig = plt.figure(figsize=figsize)
propensity0 = dat_match.ps[dat_match.cm_dummy==0]
propensity1 = dat_match.ps[dat_match.cm_dummy==1]
bins = np.arange(0, 1.05, 0.05)
top0, _ = np.histogram(propensity0, bins=bins)
top1, _ = np.histogram(propensity1, bins=bins)
plt.ylim(-1000,1000)
plt.axhline(0, c="black")
plt.bar(bins[:-1]+ 0.025, top0, width=0.04, facecolor="tomato", label="0")
plt.bar(bins[:-1]+ 0.025, -top1, width=0.04, facecolor="cornflowerblue", label="1")
plt.axhline(0, c="black")
for x, y in zip(bins, top0):
plt.text(x + 0.025, y + 10, str(y), ha="center", va="bottom")
for x, y in zip(bins, top1):
plt.text(x + 0.025, -y - 10, str(y), ha="center", va="top")
plt.title("Propensity Score",fontsize=20)
plt.legend(fontsize=15)
dat_match["ps_gp"] = round(dat_match.ps/0.1)
dat0 = dat_match[dat_match.cm_dummy==0]
dat1 = dat_match[dat_match.cm_dummy==1]
for i in range(2,10):
res = stats.ttest_ind(dat0.gamesecond[dat0.ps_gp==i],dat1.gamesecond[dat1.ps_gp==i])
diff = np.mean(dat0.gamesecond[dat0.ps_gp==i])-np.mean(dat1.gamesecond[dat1.ps_gp==i])
col = "ps"+str(i)
print(" {:} {:5.1f} (p: {:1.3f})".format(col,diff,res.pvalue))