ndarray – ブロードキャスト

2020-07-04 / tau / コメントする

1次元の場合

以下の配列を元の配列とする。

a = np.arange(4)
print(a)

# [0 1 2 3]

a = np.arange(4)

print(a)

# [0 1 2 3]

数値は1次元配列に拡張されて、要素ごとに演算される。

b = 2
print(a + b)
print(a * b)

# 2 -> [2, 2, 2, 2]

# [2 3 4 5]
# [0 2 4 6]

b = 2

print(a + b)

print(a * b)

# 2 -> [2, 2, 2, 2]

# [2 3 4 5]

# [0 2 4 6]

要素が1つの配列（リスト）は同じサイズの配列に拡張されて、要素ごとに演算される。

b = np.array([2])
print(a + b)
print(a * b)

# [2] -> [2, 2, 2, 2]

# [2 3 4 5]
# [0 2 4 6]

b = np.array([2])

print(a + b)

print(a * b)

# [2] -> [2, 2, 2, 2]

# [2 3 4 5]

# [0 2 4 6]

2次元の場合

以下の配列を元の配列とする。

a = np.arange(9).reshape(3, 3)
print(a)

# [[0 1 2]
#  [3 4 5]
#  [6 7 8]]

a = np.arange(9).reshape(3, 3)

print(a)

# [[0 1 2]

# [3 4 5]

# [6 7 8]]

数値は2次元配列に拡張されて、要素ごとに計算される。

b = 2
print(a + b)

# 2 -> [[2 2 2]
#       [2 2 2]
#       [2 2 2]]

# [[ 2  3  4]
#  [ 5  6  7]
# [ 8  9 10]]

b = 2

print(a + b)

# 2 -> [[2 2 2]

# [2 2 2]

# [2 2 2]]

# [[ 2 3 4]

# [ 5 6 7]

# [ 8 9 10]]

要素が一つの配列は2次元に拡張されて、要素ごとに計算される。

b = [2]
print(a + b)

# [2] -> [[2 2 2]
#         [2 2 2]
#         [2 2 2]]

# [[ 2  3  4]
#  [ 5  6  7]
#  [ 8  9 10]]

b = [2]

print(a + b)

# [2] -> [[2 2 2]

# [2 2 2]

# [2 2 2]]

# [[ 2 3 4]

# [ 5 6 7]

# [ 8 9 10]]

列数と同じ要素数の1次元配列（リスト）は、同じ列数の2次元配列に拡張されて計算される。

b = [1, 2, 3]
print(a + b)

# [1 2 3] -> [[1 2 3]
#             [1 2 3]
#             [1 2 3]]

# [[ 1  3  5]
#  [ 4  6  8]
#  [ 7  9 11]]

b = [1, 2, 3]

print(a + b)

# [1 2 3] -> [[1 2 3]

# [1 2 3]

# [1 2 3]]

# [[ 1 3 5]

# [ 4 6 8]

# [ 7 9 11]]

行数と同じ要素数の列ベクトルは、同じ行数の2次元配列に拡張されて計算される。

b = np.array([1, 2, 3]).reshape(-1, 1)
print(a + b)

# [[1]      [[1 1 1]
#  [2]  ->   [2 2 2]
#  [3]]      [3 3 3]]

# [[ 1  2  3]
#  [ 5  6  7]
#  [ 9 10 11]]

b = np.array([1, 2, 3]).reshape(-1, 1)

print(a + b)

# [[1] [[1 1 1]

# [2] -> [2 2 2]

# [3]] [3 3 3]]

# [[ 1 2 3]

# [ 5 6 7]

# [ 9 10 11]]

Python/pyplot – 決定境界の描き方

2020-07-02 / tau / コメントする

決定境界の描き方として以前ループを使った泥臭い方法を考えたが、meshgridを使って数行で書けることを知ったのでまとめ。

結論としては以下のコードの19～25行目の7行で、次の手順により決定境界を描いている。

2つの特徴量の全領域をカバーする値をnumpy.linspace()で生成
numpy.meshgrid()で2次元のグリッドに変換
各特徴量のメッシュグリッドを1次元に変形し、縦2列の配列化
predict()メソッドでその配列の各座標に対応する予測値を計算（結果は1次元配列）
結果の配列をmeshgridと同じ形状の2次元配列に変形
contour/contourf()で決定境界を描画

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.neighbors import KNeighborsClassifier

X, y = make_moons(n_samples=100, noise=0.25, random_state=3)

x0_min, x0_max = -2.0, 2.5
x1_min, x1_max = -1.0, 1.5

knn = KNeighborsClassifier(n_neighbors=3).fit(X, y)

fig, ax = plt.subplots()

color0, color1 = 'tab:blue', 'tab:orange'
ax.scatter(X[y==0][:, 0], X[y==0][:, 1], marker='o')
ax.scatter(X[y==1][:, 0], X[y==1][:, 1], marker='^')

f0 = np.linspace(x0_min, x0_max, 200)
f1 = np.linspace(x1_min, x1_max, 200)
f0, f1 = np.meshgrid(f0, f1)
pred = knn.predict(np.hstack([f0.reshape(-1, 1), f1.reshape(-1, 1)])) \
        .reshape(f0.shape)
ax.contour(f0, f1, pred, levels=[0.5])
ax.contourf(f0, f1, pred, levels=1, colors=[color0, color1], alpha=0.25)

ax.set_xlim(x0_min, x0_max)
ax.set_ylim(x1_min, x1_max)
ax.set_xlabel("Feature-0")
ax.set_ylabel("Feature-1")

plt.show()

import numpy as np

import matplotlib.pyplot as plt

from sklearn.datasets import make_moons

from sklearn.neighbors import KNeighborsClassifier

X, y = make_moons(n_samples=100, noise=0.25, random_state=3)

x0_min, x0_max = -2.0, 2.5

x1_min, x1_max = -1.0, 1.5

knn = KNeighborsClassifier(n_neighbors=3).fit(X, y)

fig, ax = plt.subplots()

color0, color1 = 'tab:blue', 'tab:orange'

ax.scatter(X[y==0][:, 0], X[y==0][:, 1], marker='o')

ax.scatter(X[y==1][:, 0], X[y==1][:, 1], marker='^')

f0 = np.linspace(x0_min, x0_max, 200)

f1 = np.linspace(x1_min, x1_max, 200)

f0, f1 = np.meshgrid(f0, f1)

pred = knn.predict(np.hstack([f0.reshape(-1, 1), f1.reshape(-1, 1)])) \

.reshape(f0.shape)

ax.contour(f0, f1, pred, levels=[0.5])

ax.contourf(f0, f1, pred, levels=1, colors=[color0, color1], alpha=0.25)

ax.set_xlim(x0_min, x0_max)

ax.set_ylim(x1_min, x1_max)

ax.set_xlabel("Feature-0")

ax.set_ylabel("Feature-1")

plt.show()

具体的な変数の変形状況を要素数4の少ない例で示すと以下の通り。

まず、2つの特徴量の範囲の数列を生成する。

f0 = np.linspace(x0_min, x0_max, 4)
f1 = np.linspace(x1_min, x1_max, 4)
print(f0)
print(f1)

# [-2.  -0.5  1.   2.5]
# [-1.         -0.16666667  0.66666667  1.5       ]

f0 = np.linspace(x0_min, x0_max, 4)

f1 = np.linspace(x1_min, x1_max, 4)

print(f0)

print(f1)

# [-2. -0.5 1. 2.5]

# [-1. -0.16666667 0.66666667 1.5 ]

それらの数列を、meshgridで2次元配列に変形する。

f0, f1 = np.meshgrid(f0, f1)
print(f0)
print(f1)

# [[-2.  -0.5  1.   2.5]
#  [-2.  -0.5  1.   2.5]
#  [-2.  -0.5  1.   2.5]
#  [-2.  -0.5  1.   2.5]]
# [[-1.         -1.         -1.         -1.        ]
#  [-0.16666667 -0.16666667 -0.16666667 -0.16666667]
#  [ 0.66666667  0.66666667  0.66666667  0.66666667]
#  [ 1.5         1.5         1.5         1.5       ]]

f0, f1 = np.meshgrid(f0, f1)

print(f0)

print(f1)

# [[-2. -0.5 1. 2.5]

# [-2. -0.5 1. 2.5]

# [-2. -0.5 1. 2.5]]

# [[-1. -1. -1. -1. ]

# [-0.16666667 -0.16666667 -0.16666667 -0.16666667]

# [ 0.66666667 0.66666667 0.66666667 0.66666667]

# [ 1.5 1.5 1.5 1.5 ]]

予測モデルに与える変数は各特徴量を列とする2次元配列とする必要があるので、まず上の2次元配列をそれぞれ1次元に変形。この変形では、2次元配列の各行を連ねていった1行の配列を列ベクトルにした形になる。

print(f0.reshape(-1, 1))
print(f1.reshape(-1, 1))

# [[-2. ]
#  [-0.5]
#  [ 1. ]
#  [ 2.5]
#  [-2. ]
#  [-0.5]
#  [ 1. ]
#  [ 2.5]
#  [-2. ]
#  [-0.5]
#  [ 1. ]
#  [ 2.5]
#  [-2. ]
#  [-0.5]
#  [ 1. ]
#  [ 2.5]]
# [[-1.        ]
#  [-1.        ]
#  [-1.        ]
#  [-1.        ]
#  [-0.16666667]
#  [-0.16666667]
#  [-0.16666667]
#  [-0.16666667]
#  [ 0.66666667]
#  [ 0.66666667]
#  [ 0.66666667]
#  [ 0.66666667]
#  [ 1.5       ]
#  [ 1.5       ]
#  [ 1.5       ]
#  [ 1.5       ]]

print(f0.reshape(-1, 1))

print(f1.reshape(-1, 1))

# [[-2. ]

# [-0.5]

# [ 1. ]

# [ 2.5]

# [-2. ]

# [-0.5]

# [ 1. ]

# [ 2.5]

# [-2. ]

# [-0.5]

# [ 1. ]

# [ 2.5]

# [-2. ]

# [-0.5]

# [ 1. ]

# [ 2.5]]

# [[-1. ]

# [-1. ]

# [-0.16666667]

# [ 0.66666667]

# [ 1.5 ]

# [ 1.5 ]]

次に2つの列ベクトルを横方向に並べて、総計算データ数×特徴量数(2)の2次元配列とする。

print(np.hstack([f0.reshape(-1, 1), f1.reshape(-1, 1)]))

# [[-2.         -1.        ]
#  [-0.5        -1.        ]
#  [ 1.         -1.        ]
#  [ 2.5        -1.        ]
#  [-2.         -0.16666667]
#  [-0.5        -0.16666667]
#  [ 1.         -0.16666667]
#  [ 2.5        -0.16666667]
#  [-2.          0.66666667]
#  [-0.5         0.66666667]
#  [ 1.          0.66666667]
#  [ 2.5         0.66666667]
#  [-2.          1.5       ]
#  [-0.5         1.5       ]
#  [ 1.          1.5       ]
#  [ 2.5         1.5       ]]

print(np.hstack([f0.reshape(-1, 1), f1.reshape(-1, 1)]))

# [[-2. -1. ]

# [-0.5 -1. ]

# [ 1. -1. ]

# [ 2.5 -1. ]

# [-2. -0.16666667]

# [-0.5 -0.16666667]

# [ 1. -0.16666667]

# [ 2.5 -0.16666667]

# [-2. 0.66666667]

# [-0.5 0.66666667]

# [ 1. 0.66666667]

# [ 2.5 0.66666667]

# [-2. 1.5 ]

# [-0.5 1.5 ]

# [ 1. 1.5 ]

# [ 2.5 1.5 ]]

この配列の各座標に対する予測値を、predict()メソッドで予測。この結果は、1次元化されたf0やf1と同じく、2次元のmeshgridの各行を横に連ねたものになっている。

print(knn.predict(np.hstack([f0.reshape(-1, 1), f1.reshape(-1, 1)])))

# [0 0 1 1 0 1 1 1 0 0 0 1 0 0 0 1]

print(knn.predict(np.hstack([f0.reshape(-1, 1), f1.reshape(-1, 1)])))

# [0 0 1 1 0 1 1 1 0 0 0 1 0 0 0 1]

この結果を、meshgrid化されたf0（またはf1）と同じ形に変形。これで予測結果がf0×f1平面の各座標に対応した予測値の2次元配列となっている。

print(knn.predict(np.hstack([f0.reshape(-1, 1), f1.reshape(-1, 1)])).reshape(f0.shape))

# [[0 0 1 1]
#  [0 1 1 1]
#  [0 0 0 1]
#  [0 0 0 1]]

print(knn.predict(np.hstack([f0.reshape(-1, 1), f1.reshape(-1, 1)])).reshape(f0.shape))

# [[0 0 1 1]

# [0 1 1 1]

# [0 0 0 1]

# [0 0 0 1]]

この結果を使い、contour()/contourf()で決定境界あるいは決定領域を描画。

pred = knn.predict(np.hstack([f0.reshape(-1, 1), f1.reshape(-1, 1)])) \
        .reshape(f0.shape)
ax.contour(f0, f1, pred, levels=[0.5])
ax.contourf(f0, f1, pred, levels=1, colors=[color0, color1], alpha=0.25)

pred = knn.predict(np.hstack([f0.reshape(-1, 1), f1.reshape(-1, 1)])) \

.reshape(f0.shape)

ax.contour(f0, f1, pred, levels=[0.5])

ax.contourf(f0, f1, pred, levels=1, colors=[color0, color1], alpha=0.25)

ここでlevelsの指定は以下のようにしている。

まずcontour()の場合、ドキュメンテーションには“If an int n, use n data intervals; i.e. draw n+1 contour lines. The level heights are automatically chosen.”と書かれているので、levels=0と指定すると0＋1本の線が描かれると考えたが以下のような警告が出て線の位置がずれた。

UserWarning: No contour levels were found within the data range.
  ax.contour(f0, f1, pred, levels=0)

UserWarning: No contour levels were found within the data range. ax.contour(f0, f1, pred, levels=0)

そこでlevels=[0.5]と2つのクラス値0と1の間をとると適切に表示される。

なおcontourf()のときは、levels=1として2つの領域が描かれる。

Breast Cancerデータセット – SVM

2020-06-29 / tau / コメントする

過学習？

書籍”Pythonではじめる機械学習”の”2.3.7.4 SVMパラメータの調整”の最後の方で、scikit-learnのSVMをBreast Cancerデータセットに適用した例が示されている（カーネル法によるSVMについてはこちらにまとめている）。

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

ds = load_breast_cancer()
X, y = ds.data, ds.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

svc = SVC(gamma='auto').fit(X_train, y_train)

print("Training score: {:.3f}".format(svc.score(X_train, y_train)))
print("Test score    : {:.3f}".format(svc.score(X_test, y_test)))

# Training score: 1.000
# Test score    : 0.629

import numpy as np

import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer

from sklearn.model_selection import train_test_split

from sklearn.svm import SVC

ds = load_breast_cancer()

X, y = ds.data, ds.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

svc = SVC(gamma='auto').fit(X_train, y_train)

print("Training score: {:.3f}".format(svc.score(X_train, y_train)))

print("Test score : {:.3f}".format(svc.score(X_test, y_test)))

# Training score: 1.000

# Test score : 0.629

ここで、原典ではSVC()の引数を指定せずデフォルトのままとしているが、そのまま実行すると以下のような結果になった。

Training score: 0.904
Test score    : 0.937

1 2	Training score: 0.904 Test score : 0.937

scikit-learnのドキュメンテーションによると、

Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’.

if gamma='scale' (default) is passed then it uses 1 / (n_features * X.var()) as value of gamma,
if ‘auto’, uses 1 / n_features.

Changed in version 0.22: The default value of gamma changed from ‘auto’ to ‘scale’.

とされていて、gammaのデフォルト設定が変わったようである。新しい仕様ではデフォルト(gamma='scale')でgammaの値がデータの分散X.var()に応じてスケーリングされるため、どちらかといえば適合不足の状態になる。先のコードでは明示的にgamma='auto'を設定し、書籍と同じ結果を得ている。

特徴量データのサイズの違い

Breast Cancerデータの30の特徴量について、各々の分布状況を箱髭図で描いてみた。縦軸の対数スケールに対してでも、各特徴量がかなりばらついており、1万倍～100万倍ほどの違いがあることがわかる。

import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

ds = load_breast_cancer()
X, y = ds.data, ds.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

fig, ax = plt.subplots(figsize=(6.4, 6))
fig.subplots_adjust(bottom=0.3)

ax.boxplot(X_train, showfliers=False)
ax.set_xticklabels(ds.feature_names, rotation=270, fontsize=8)
ax.set_yscale('log')

plt.show()

import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer

from sklearn.model_selection import train_test_split

ds = load_breast_cancer()

X, y = ds.data, ds.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

fig, ax = plt.subplots(figsize=(6.4, 6))

fig.subplots_adjust(bottom=0.3)

ax.boxplot(X_train, showfliers=False)

ax.set_xticklabels(ds.feature_names, rotation=270, fontsize=8)

ax.set_yscale('log')

plt.show()

データの前処理

データのスケールを揃えるために使われるMinMaxScalerでは、各特徴量の訓練データを最小値と最大値でスケーリングし、0～1に納まるようにする。具体的には、特徴量ごとに最小値を引いて、最大値－最小値のレンジで除する。

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

ds = load_breast_cancer()
X, y = ds.data, ds.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

min_features = X_train.min(axis=0)
max_features = X_train.max(axis=0)
ranges = max_features - min_features

X_train_scaled = (X_train - min_features) / ranges
print("Scaled training data:")
print("Minimum for each feature\n{}".format(X_train_scaled.min(axis=0)))
print("Maximum for each feature\n{}".format(X_train_scaled.max(axis=0)))
print()
X_test_scaled = (X_test - min_features) / ranges
print("Scaled test data:")
print("Minimum for each feature\n{}".format(X_test_scaled.min(axis=0)))
print("Maximum for each feature\n{}".format(X_test_scaled.max(axis=0)))

from sklearn.datasets import load_breast_cancer

from sklearn.model_selection import train_test_split

from sklearn.svm import SVC

ds = load_breast_cancer()

X, y = ds.data, ds.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

min_features = X_train.min(axis=0)

max_features = X_train.max(axis=0)

ranges = max_features - min_features

X_train_scaled = (X_train - min_features) / ranges

print("Scaled training data:")

print("Minimum for each feature\n{}".format(X_train_scaled.min(axis=0)))

print("Maximum for each feature\n{}".format(X_train_scaled.max(axis=0)))

print()

X_test_scaled = (X_test - min_features) / ranges

print("Scaled test data:")

print("Minimum for each feature\n{}".format(X_test_scaled.min(axis=0)))

print("Maximum for each feature\n{}".format(X_test_scaled.max(axis=0)))

この結果、訓練データの各特徴量の最小値はすべて0となり、最大値はすべて1となる。

Scaled training data:
Minimum for each feature
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]
Maximum for each feature
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1.]

Scaled training data:

Minimum for each feature

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

0. 0. 0. 0. 0. 0.]

Maximum for each feature

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.

1. 1. 1. 1. 1. 1.]

テストデータに対してもスケーリングを行うが、ここで使う最小値とレンジは訓練データのものとし、訓練データとテストデータでスケーリングに歪がでないようにする。その結果、スケーリング後のテストデータには、最小値が0より小さい値や最大値が1より大きい値が出ている。

Scaled test data:
Minimum for each feature
[ 0.03540158  0.04190871  0.02895446  0.01497349  0.14260888  0.04999658
  0.          0.          0.07222222  0.00589722  0.00105015 -0.00057494
  0.00067851 -0.0007963   0.05148726  0.01434497  0.          0.
  0.04195752  0.01113138  0.03678406  0.01252665  0.03366702  0.01400904
  0.08531995  0.01833687  0.          0.          0.00749064  0.02367834]
Maximum for each feature
[0.76809125 1.22697095 0.75813696 0.64750795 1.20310633 1.11643038
 0.99906279 0.90606362 0.93232323 0.94903117 0.45573058 0.72623944
 0.48593507 0.31641282 1.36082713 1.2784499  0.36313131 0.77476795
 1.32643996 0.72672498 0.82106012 0.87553305 0.77887345 0.67803775
 0.78603975 0.87843331 0.93450479 1.0024113  0.76384782 0.58743277]

Scaled test data:

Minimum for each feature

[ 0.03540158 0.04190871 0.02895446 0.01497349 0.14260888 0.04999658

0. 0. 0.07222222 0.00589722 0.00105015 -0.00057494

0.00067851 -0.0007963 0.05148726 0.01434497 0. 0.

0.04195752 0.01113138 0.03678406 0.01252665 0.03366702 0.01400904

0.08531995 0.01833687 0. 0. 0.00749064 0.02367834]

Maximum for each feature

[0.76809125 1.22697095 0.75813696 0.64750795 1.20310633 1.11643038

0.99906279 0.90606362 0.93232323 0.94903117 0.45573058 0.72623944

0.48593507 0.31641282 1.36082713 1.2784499 0.36313131 0.77476795

1.32643996 0.72672498 0.82106012 0.87553305 0.77887345 0.67803775

0.78603975 0.87843331 0.93450479 1.0024113 0.76384782 0.58743277]

スケーリングされた訓練データとテストデータについてスコアを計算すると以下のようになり、先ほどの過学習の状態から適合不足の状態となった。尚この結果は、新しいSVCクラスにおいてgammaをデフォルトの'scale'のままとしたときの傾向と似ていて、若干の適合不足となっている。

svc = SVC(gamma='auto').fit(X_train_scaled, y_train)

print("Training score:{:.3f}".format(svc.score(X_train_scaled, y_train)))
print("Test score    :{:.3f}".format(svc.score(X_test_scaled, y_test)))

# Training score:0.948
# Test score    :0.951

svc = SVC(gamma='auto').fit(X_train_scaled, y_train)

print("Training score:{:.3f}".format(svc.score(X_train_scaled, y_train)))

print("Test score :{:.3f}".format(svc.score(X_test_scaled, y_test)))

# Training score:0.948

# Test score :0.951

パラメーター調整

上記の適合不足の結果に対して、パラメーターを変化させてみる。デフォルトのC=1からC=1000としてみると、訓練スコア、テストスコアとも改善された。テストスコアはランダムフォレストや決定木の勾配ブースティングの結果と同じになっている。

svc = SVC(C=1000, gamma='auto').fit(X_train_scaled, y_train)

print("Training score:{:.3f}".format(svc.score(X_train_scaled, y_train)))
print("Test score    :{:.3f}".format(svc.score(X_test_scaled, y_test)))

# Training score:0.988
# Test score    :0.972

svc = SVC(C=1000, gamma='auto').fit(X_train_scaled, y_train)

print("Training score:{:.3f}".format(svc.score(X_train_scaled, y_train)))

print("Test score :{:.3f}".format(svc.score(X_test_scaled, y_test)))

# Training score:0.988

# Test score :0.972

さらにいくつかのCとgammaで試してみると、特にスコアがいいのは以下のケースだった。なおgamma=1, 10の場合、C=100, 1000, 10000に対して訓練スコアが1.000、テストスコアが0.95程度で全て過学習となった。

C	gamma	訓練スコア	テストスコア
1000	auto	0.988	0.972
1000	0.01	0.986	0.979
100	0.1	0.986	0.972

ndarray.min/max – 配列の最小値と最大値

2020-06-28 / tau / コメントする

ndarray.min()/max()は、配列の最小値／最大値を返すメソッド。また、ndarray.argmin()/argmax()は、最小／最大の要素のインデックスを返す（axisを指定した場合はインデックスの配列を返す）。

なお、numpy.amin()/amax()、numpy.argmin()/argmax()もほぼ同じ動作をする。

以下、次の配列で動作を確認する。

import numpy as np

a = np.array([
    [1, 3, 2],
    [4, 6, 5],
    [7, 9, 8]
])

import numpy as np

a = np.array([

[1, 3, 2],

[4, 6, 5],

[7, 9, 8]

])

引数に何も指定しない場合、配列の全要素の中の最小値と最大値を返す。このとき、argmin/argmaxでは、配列をreshape(-1)で1次元化したときのインデックスが返される。

print(a.min(), a.max())
# 1 9

print(a.argmin(), a.argmax())
# 0 8

print(a.min(), a.max())

# 1 9

print(a.argmin(), a.argmax())

# 0 8

引数にaxis=0を指定すると、各列ベクトルの行方向の中での最小値／最大値を返す。以下の例では、各列ごとの最小値／最大値とそれらに対する行インデックスが配列で返されている。

axis=0の0を2次元配列の引数の位置と考えると0番目の引数で、各列における行の位置を表す。これはargmin/argmaxの意味合いと符合する。

print(a.min(axis=0), a.max(axis=0))
# [1 3 2] [7 9 8]

print(a.argmin(axis=0), a.argmax(axis=0))
# [0 0 0] [2 2 2]

print(a.min(axis=0), a.max(axis=0))

# [1 3 2] [7 9 8]

print(a.argmin(axis=0), a.argmax(axis=0))

# [0 0 0] [2 2 2]

引数にaxis=1を指定すると、各行ベクトルの列方向の中での最小値／最大値を返す。以下の例では、各行ごとの最小値／最大値とそれらに対する列インデックスが配列で返されている。

axis=1の1を2次元配列の引数の位置と考えると1番目の引数で、各行における列の位置を表す。これはargmin/argmaxの意味合いと符合する。

print(a.min(axis=1), a.max(axis=1))
# [1 4 7] [3 6 9]

print(a.argmin(axis=1), a.argmax(axis=1))
# [0 0 0] [1 1 1]

print(a.min(axis=1), a.max(axis=1))

# [1 4 7] [3 6 9]

print(a.argmin(axis=1), a.argmax(axis=1))

# [0 0 0] [1 1 1]

pyplot – グラフ要素のフォントサイズ

2020-06-28 / tau / コメントする

グラフ全体のフォントサイズ

pyplot.rcParams（辞書形式の設定オブジェクト）で基準のフォントサイズを変更。matplotlibのデフォルトはfont.size=10。以下は全体のフォントサイズを大きくした例。

import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(-np.pi, np.pi)
ys = np.sin(x)
yc = np.cos(x)

plt.rcParams['font.size'] = 15
fig, ax = plt.subplots()
fig.subplots_adjust(left=0.2)

ax.set_title("Axes Title")
ax.plot(x, ys, label="sin x")
ax.plot(x, yc, label="cos x")
ax.set_ylabel("sin/cos")
ax.legend()

plt.show()

import numpy as np

import matplotlib.pyplot as plt

x = np.linspace(-np.pi, np.pi)

ys = np.sin(x)

yc = np.cos(x)

plt.rcParams['font.size'] = 15

fig, ax = plt.subplots()

fig.subplots_adjust(left=0.2)

ax.set_title("Axes Title")

ax.plot(x, ys, label="sin x")

ax.plot(x, yc, label="cos x")

ax.set_ylabel("sin/cos")

ax.legend()

plt.show()

個別要素のフォントサイズ

タイトル、軸ラベル、軸目盛、凡例について個別にフォントサイズを指定した例。

import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(-np.pi, np.pi)
ys = np.sin(x)
yc = np.cos(x)

fig, ax = plt.subplots()

ax.set_title("Axes Title", fontsize=20)
ax.plot(x, ys, label="sin x")
ax.plot(x, yc, label="cos x")
ax.set_ylabel("sin/cos", fontsize=10)
ax.tick_params(labelsize=7)
ax.legend(fontsize=7)

plt.show()

import numpy as np

import matplotlib.pyplot as plt

x = np.linspace(-np.pi, np.pi)

ys = np.sin(x)

yc = np.cos(x)

fig, ax = plt.subplots()

ax.set_title("Axes Title", fontsize=20)

ax.plot(x, ys, label="sin x")

ax.plot(x, yc, label="cos x")

ax.set_ylabel("sin/cos", fontsize=10)

ax.tick_params(labelsize=7)

ax.legend(fontsize=7)

plt.show()

Python – 多重ループの一重化

2020-06-27 / tau / コメントする

概要

以下のような二重ループを、一重ループで実現する方法。

years = [2000, 2010, 2020]
seasons = ["Spring", "Summer", "Autumn", "Winter"]

for year in years:
    for season in seasons:
        print(year, season)

# 2000 Spring
# 2000 Summer
# 2000 Autumn
# 2000 Winter
# 2010 Spring
# 2010 Summer
# 2010 Autumn
# 2010 Winter
# 2020 Spring
# 2020 Summer
# 2020 Autumn
# 2020 Winter

years = [2000, 2010, 2020]

seasons = ["Spring", "Summer", "Autumn", "Winter"]

for year in years:

for season in seasons:

print(year, season)

# 2000 Spring

# 2000 Summer

# 2000 Autumn

# 2000 Winter

# 2010 Spring

# 2010 Summer

# 2010 Autumn

# 2010 Winter

# 2020 Spring

# 2020 Summer

# 2020 Autumn

# 2020 Winter

内包表記による方法

内包表記の中で二重ループを回し、1つのリストを生成する。

years = [2000, 2010, 2020]
seasons = ["Spring", "Summer", "Autumn", "Winter"]

lst = [(year, season) for year in years for season in seasons]

for year, season in lst:
    print(year, season)

years = [2000, 2010, 2020]

seasons = ["Spring", "Summer", "Autumn", "Winter"]

lst = [(year, season) for year in years for season in seasons]

for year, season in lst:

print(year, season)

`itertools.product`による方法

itertoolsライブラリーにあるproduct()は、引数に与えた各イテラブルの要素の直積（タプル）を順に返すイテレーターを生成する。リストそのものではないので、一度ループで消費すると再利用できない点に注意。

from itertools import product

years = [2000, 2010, 2020]
seasons = ["Spring", "Summer", "Autumn", "Winter"]

iter = product(years, seasons)

for year, season in iter:
    print(year, season)

from itertools import product

years = [2000, 2010, 2020]

seasons = ["Spring", "Summer", "Autumn", "Winter"]

iter = product(years, seasons)

for year, season in iter:

print(year, season)

numpy – r_とc_

2020-06-27 / tau / コメントする

概要

numpy.r_ / numpy.c_は配列を結合するオブジェクト。r_は縦方向に配列を結合し、c_は横方向に配列を結合する。vstack() / hstack()やlinspace()と似たような使い方ができるが、少し癖がある。

配列と数値を混在させて結合できる
スライスでステップ数や分割数を指定して数列をつくれる
vstack()やhstack()の代わりに使える

vstack()やhstack()と同じように使う。

import numpy as np

a1 = np.array([
    [1, 2, 3],
    [4, 5, 6]
])
a2 = np.array([
    [10, 20, 30],
    [40, 50, 60]
])

print(np.r_[a1, a2])
# [[ 1  2  3]
#  [ 4  5  6]
#  [10 20 30]
#  [40 50 60]]

print(np.c_[a1, a2])
# [[ 1  2  3 10 20 30]
#  [ 4  5  6 40 50 60]]

import numpy as np

a1 = np.array([

[1, 2, 3],

[4, 5, 6]

])

a2 = np.array([

[10, 20, 30],

[40, 50, 60]

])

print(np.r_[a1, a2])

# [[ 1 2 3]

# [ 4 5 6]

# [10 20 30]

# [40 50 60]]

print(np.c_[a1, a2])

# [[ 1 2 3 10 20 30]

# [ 4 5 6 40 50 60]]

`r_`について

numpy.r_で2次元配列に1行だけ追加するとき、1次元配列のままだと“次元が異なる”というエラーになる。素直にvstack()を使った方がよい。

a3 = np.array([10, 20, 30])
#print(np.r_[a1, a3])
# -> ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 1 has 1 dimension(s)

print(np.r_[a1, a3.reshape(1, -1)])
# [[ 1  2  3]
#  [ 4  5  6]
#  [10 20 30]]

a3 = np.array([10, 20, 30])

#print(np.r_[a1, a3])

# -> ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 1 has 1 dimension(s)

print(np.r_[a1, a3.reshape(1, -1)])

# [[ 1 2 3]

# [ 4 5 6]

# [10 20 30]]

r_のデフォルトで1次元配列同士を結合すると、単に横方向に結合される。配列と要素が混在していてもok。文字列の配列も結合できるが、文字列要素が混在するとエラーになる。

print(np.r_[[1, 2, 3], 4, 5, [6, 7]])
# [1 2 3 4 5 6 7]

print(np.r_[['A', 'B', 'C'], ['D', 'E']])
# ['A' 'B' 'C' 'D' 'E']

#print(np.r_[['A', 'B', 'C'], 'D', 'E', ['E', 'F']])
# -> ValueError: special directives must be the first entry.

print(np.r_[[1, 2, 3], 4, 5, [6, 7]])

# [1 2 3 4 5 6 7]

print(np.r_[['A', 'B', 'C'], ['D', 'E']])

# ['A' 'B' 'C' 'D' 'E']

#print(np.r_[['A', 'B', 'C'], 'D', 'E', ['E', 'F']])

# -> ValueError: special directives must be the first entry.

スライスを使って数列を生成。

print(np.r_[:10])
# [0 1 2 3 4 5 6 7 8 9]

print(np.r_[4:10:2])
# [4 6 8]

print(np.r_[0.5:5.5:0.5])
# [0.5 1.  1.5 2.  2.5 3.  3.5 4.  4.5 5. ]

print(np.r_[:10])

# [0 1 2 3 4 5 6 7 8 9]

print(np.r_[4:10:2])

# [4 6 8]

print(np.r_[0.5:5.5:0.5])

# [0.5 1. 1.5 2. 2.5 3. 3.5 4. 4.5 5. ]

3つ目の引数に'j'をつけてnumpy.linspace()と同様の動作。このときは終了値が含まれる。

print(np.r_[0:10:5j])
# [ 0.   2.5  5.   7.5 10. ]

1 2	print(np.r_[0:10:5j]) # [ 0. 2.5 5. 7.5 10. ]

`c_`について

numpy.c_で2次元配列にその行数と同じ要素数の1次元配列を結合すると、列ベクトルとみなされて1列追加される。hstack()が1次元配列を列ベクトル化する必要があるのに比べると手軽。

a5 = np.array([10, 20])
print(np.c_[a1, a5])
# [[ 1  2  3 10]
#  [ 4  5  6 20]]

a5 = np.array([10, 20])

print(np.c_[a1, a5])

# [[ 1 2 3 10]

# [ 4 5 6 20]]

さらに要素数が同じ1次元配列同士を結合すると、それらが列ベクトルとみなされて結合される。

b1 = np.array([1, 2, 3])
b2 = np.array([4, 5, 6])
print(np.c_[b1, b2])

# [[1 4]
#  [2 5]
#  [3 6]]

b1 = np.array([1, 2, 3])

b2 = np.array([4, 5, 6])

print(np.c_[b1, b2])

# [[1 4]

# [2 5]

# [3 6]]

空の配列に対して順次列ベクトルを追加する場合には、empty((n, 0), dtype=type)で行数n・列数0の配列を準備する。

b0 = np.empty((3, 0), dtype=int)
b0 = np.c_[b0, b1]
print(b0)
# [[1]
#  [2]
#  [3]]

b0 = np.c_[b0, b2]
print(b0)
# [[1 4]
#  [2 5]
#  [3 6]]

b0 = np.empty((3, 0), dtype=int)

b0 = np.c_[b0, b1]

print(b0)

# [[1]

# [2]

# [3]]

b0 = np.c_[b0, b2]

print(b0)

# [[1 4]

# [2 5]

# [3 6]]

SVM～カーネル法

2020-06-24 / tau / コメントする

概要

書籍”Pythonではじめる機械学習”の”2.3.7 カーネル法を用いたサポートベクタマシン”の写経。

線形特徴量の非線形化

線形モデルでは分離不可能なデータ

以下は、scikit-learnのmake_blobs()により生成した2特徴量、2クラス分類のデータに線形サポートベクターマシンを適用した例。このとき、決定境界は以下のように得られる。

(1) $\begin{gather*} b + w_0 f_0 + w_1 f_1 = 0\\ b \approx -0.2817,\; w_0 \approx 0.1261,\; w_1 \approx -0.0918 \end{gather*}$

決定境界より上側では多項式の値は負となり、下側では正となるが、この境界は明らかに2つのクラスを分割していない。このように単純な例でも、線形モデルでは的確なクラス分類はできない。

以下のコードでは、原典と以下が異なっている。

収束しないという警告を受けて、LinearSVCのmax_iterをデフォルトの1000より大きな値としている

import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.svm import LinearSVC

X, y = make_blobs(centers=4, random_state=8)
y = y % 2

linsvm = LinearSVC(max_iter=5500).fit(X, y)

y_min, y_max = -15, 15
b = linsvm.intercept_
w0 = linsvm.coef_[0][0]
w1 = linsvm.coef_[0][1]
x_lower = -(b + w1 * y_min) / w0
x_upper = -(b + w1 * y_max) / w0

fig, ax = plt.subplots()
X0 = X[y==0]
X1 = X[y==1]
ax.scatter(X0[:, 0], X0[:, 1], marker='o', s=60, ec='k')
ax.scatter(X1[:, 0], X1[:, 1], marker='^', s=60, ec='k')
ax.plot([x_lower, x_upper], [y_min, y_max], linewidth=2, c='tab:green')
ax.set_ylim(y_min, y_max)
ax.set_xlabel("Feature-0")
ax.set_ylabel("Feature-1")
plt.show()

import matplotlib.pyplot as plt

from sklearn.datasets import make_blobs

from sklearn.svm import LinearSVC

X, y = make_blobs(centers=4, random_state=8)

y = y % 2

linsvm = LinearSVC(max_iter=5500).fit(X, y)

y_min, y_max = -15, 15

b = linsvm.intercept_

w0 = linsvm.coef_[0][0]

w1 = linsvm.coef_[0][1]

x_lower = -(b + w1 * y_min) / w0

x_upper = -(b + w1 * y_max) / w0

fig, ax = plt.subplots()

X0 = X[y==0]

X1 = X[y==1]

ax.scatter(X0[:, 0], X0[:, 1], marker='o', s=60, ec='k')

ax.scatter(X1[:, 0], X1[:, 1], marker='^', s=60, ec='k')

ax.plot([x_lower, x_upper], [y_min, y_max], linewidth=2, c='tab:green')

ax.set_ylim(y_min, y_max)

ax.set_xlabel("Feature-0")

ax.set_ylabel("Feature-1")

plt.show()

非線形特徴量の追加

ここで、特徴量1の2乗を新たな特徴量として加える。この場合、3つの特徴量に対して3次元空間内に各点が位置し、それぞれがクラス0/1に属している。新たな特徴量の追加によって、その軸の方向に各点が立ち上がり、真ん中の三角形の点群と両側の丸印の点群が平面でうまく分割できそうである。

このデータセットに対して、線形SVMを適用し、決定境界を描いたのが以下の画像。特徴量が2つの場合の決定境界は直線だったが、特徴が3つになると決定境界は平面となる。予想通り、単純な平面で2つのクラスが分けられている。この決定境界の式は以下のようになる。

(2) $\begin{gather*} b + w_0 f_0 + w_1 f_1 + w_2 {f_1}^2 = 0\\ b \approx 1.1734,\; w_0 \approx 0.1301,\; w_1 \approx -0.2203,\; w_2 \approx -0.0597 \end{gather*}$

この平面に対して上側（f₁²が小さい側）では多項式の値は正となり、その反対側では負となる。

以下のコードは、原典と以下が異なっている。

収束しないという警告を受けて、LinearSVCのmax_iterをデフォルトの1000より大きな値としている
Axes3Dの生成の仕方を最新のバージョンに合ったものとしている
- 原典ではFigureオブジェクトを生成し、それをビューに関する引数とともにAxes3Dコンストラクターに渡している
- 本コードでは、subplotsの引数でprojectionを指定してFigureとAxes3Dを同時に生成し、view_init()でビューに関する引数を指定

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.svm import LinearSVC
from mpl_toolkits.mplot3d import Axes3D

X, y = make_blobs(centers=4, random_state=8)
y = y % 2
X_new = np.hstack((X, X[:, 1].reshape(-1, 1)**2))
X0, X1 = X_new[y==0], X_new[y==1]

linsvc = LinearSVC(max_iter=3700).fit(X_new, y)
intercept = linsvc.intercept_
coef = linsvc.coef_.ravel()

fig, ax = plt.subplots(subplot_kw=dict(projection='3d'))
ax.view_init(elev=-152, azim=-23)

u = np.linspace(X_new[:, 0].min() - 2, X_new[:, 1].max() + 2)
v = np.linspace(X_new[:, 0].min() - 2, X_new[:, 1].max() + 2)
u, v = np.meshgrid(u, v)
w = -(coef[0] * u + coef[1] * v + intercept) / coef[2]

ax.scatter(X0[:, 0], X0[:, 1], X0[:, 2], marker='o', s=40, ec='k')
ax.scatter(X1[:, 0], X1[:, 1], X1[:, 2], marker='^', s=40, ec='k')
ax.plot_wireframe(u, v, w, rstride=8, cstride=8, color='tab:green', alpha=0.5)

ax.set_xlabel("Feature-0")
ax.set_ylabel("Feature-1")
ax.set_zlabel("Feature-1**2")

plt.show()

import numpy as np

import matplotlib.pyplot as plt

from sklearn.datasets import make_blobs

from sklearn.svm import LinearSVC

from mpl_toolkits.mplot3d import Axes3D

X, y = make_blobs(centers=4, random_state=8)

y = y % 2

X_new = np.hstack((X, X[:, 1].reshape(-1, 1)**2))

X0, X1 = X_new[y==0], X_new[y==1]

linsvc = LinearSVC(max_iter=3700).fit(X_new, y)

intercept = linsvc.intercept_

coef = linsvc.coef_.ravel()

fig, ax = plt.subplots(subplot_kw=dict(projection='3d'))

ax.view_init(elev=-152, azim=-23)

u = np.linspace(X_new[:, 0].min() - 2, X_new[:, 1].max() + 2)

v = np.linspace(X_new[:, 0].min() - 2, X_new[:, 1].max() + 2)

u, v = np.meshgrid(u, v)

w = -(coef[0] * u + coef[1] * v + intercept) / coef[2]

ax.scatter(X0[:, 0], X0[:, 1], X0[:, 2], marker='o', s=40, ec='k')

ax.scatter(X1[:, 0], X1[:, 1], X1[:, 2], marker='^', s=40, ec='k')

ax.plot_wireframe(u, v, w, rstride=8, cstride=8, color='tab:green', alpha=0.5)

ax.set_xlabel("Feature-0")

ax.set_ylabel("Feature-1")

ax.set_zlabel("Feature-1**2")

plt.show()

元の特徴量に対する決定境界

上の例では特徴量は3つだが、最後の特徴量は2つ目の特徴量f₁から計算される量であり、実質は2つの特徴量が決まれば決定境界が決まる。3次元空間内の平面の決定境界を以下のように書きなおすと、このことが確認できる。

(3) $\begin{align*} f_0 &= \frac{-b - w_1 f_1 - w_2 {f_1}^2}{w_0} \\ &= -9.02 +1.69 f_1 + 0.46 {f_1}^2 \\ &= 0.46(f_1 +1.83)^2 - 10.56 \end{align*}$

これを2つの特徴量に対する決定境界として描画したのが以下の図で、境界が2次関数となっているのが確認できる。

SVMそのものは線形の決定境界しか得られないが、非線形化した特徴量を追加することによって、より複雑な決定境界とすることができる。

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.svm import LinearSVC
from mpl_toolkits.mplot3d import Axes3D

X, y = make_blobs(centers=4, random_state=8)
y = y % 2
X_new = np.hstack((X, X[:, 1].reshape(-1, 1)**2))

linsvc = LinearSVC(max_iter=3700).fit(X_new, y)
intercept = linsvc.intercept_

x_min, x_max = -12, 12
y_min, y_max = -15, 15
u = np.linspace(x_min, x_max, 500)
v = np.linspace(y_min, y_max, 500)
u, v = np.meshgrid(u, v)
w = v**2
decision = linsvc.predict(np.c_[u.ravel(), v.ravel(), w.ravel()])

fig, ax = plt.subplots()

ax.scatter(X_new[y==0, 0], X_new[y==0, 1], marker='o', s=60, ec='k')
ax.scatter(X_new[y==1, 0], X_new[y==1, 1], marker='^', s=60, ec='k')
ax.contourf(u, v, decision.reshape(u.shape),
    levels=1, colors=['tab:blue', 'tab:orange'], alpha=0.4)
ax.contour(u, v, decision.reshape(u.shape), levels=1, colors='k')

ax.set_xlim(x_min, x_max)
ax.set_ylim(y_min, y_max)
ax.set_xlabel("Feature-0")
ax.set_ylabel("Feature-1")

plt.show()

import numpy as np

import matplotlib.pyplot as plt

from sklearn.datasets import make_blobs

from sklearn.svm import LinearSVC

from mpl_toolkits.mplot3d import Axes3D

X, y = make_blobs(centers=4, random_state=8)

y = y % 2

X_new = np.hstack((X, X[:, 1].reshape(-1, 1)**2))

linsvc = LinearSVC(max_iter=3700).fit(X_new, y)

intercept = linsvc.intercept_

x_min, x_max = -12, 12

y_min, y_max = -15, 15

u = np.linspace(x_min, x_max, 500)

v = np.linspace(y_min, y_max, 500)

u, v = np.meshgrid(u, v)

w = v**2

decision = linsvc.predict(np.c_[u.ravel(), v.ravel(), w.ravel()])

fig, ax = plt.subplots()

ax.scatter(X_new[y==0, 0], X_new[y==0, 1], marker='o', s=60, ec='k')

ax.scatter(X_new[y==1, 0], X_new[y==1, 1], marker='^', s=60, ec='k')

ax.contourf(u, v, decision.reshape(u.shape),

levels=1, colors=['tab:blue', 'tab:orange'], alpha=0.4)

ax.contour(u, v, decision.reshape(u.shape), levels=1, colors='k')

ax.set_xlim(x_min, x_max)

ax.set_ylim(y_min, y_max)

ax.set_xlabel("Feature-0")

ax.set_ylabel("Feature-1")

plt.show()

カーネルトリック

概要

上記の例では特徴量の1つを2次として新たな特徴量とした。特徴量の非線形化としては、このように特徴量の累乗とするほか、異なる特徴量同士の積を交互作用として導入することが考えられる。ただし、特徴量の数が多くなった時に、それらの全ての組み合わせに対する積を考えると、計算量が膨れ上がる。カーネルトリック(kernel trick)とは、拡張された特徴量空間でのデータ間の距離（内積）を、実際に特徴量を拡張する計算をせずに求める方法らしい。

受け売りをそのまま書いておくと、SVMで広く用いられているカーネルトリックのマッピング方法は以下の2つとのこと。

多項式カーネル(polynomial kernel)：もとの特徴量の特定の次数までの全ての多項式を計算
放射基底関数(radial basis function: RBF)カーネルとも呼ばれるガウシアンカーネル：直感的には全次数の全ての多項式を考えるが、次数が高くなるにつれて特徴量の重要性を小さくする

以下はforgeデータセットに対して、カーネルトリックを用いたSVCを適用した例。直線はLinearSVCによる決定境界で、曲線はガウシアンカーネル(RBF)によるSVCの決定境界で、カーネル関数は以下のような形。

(4) $\begin{equation*} k_{\rm rbf}(x_1, x_2) = \exp \left( -\gamma || x_1 - x_2 ||^2 \right) \end{equation*}$

scikit-learnのSVCの引数で、kernel='rbf'、C=10、gamma=0.1と指定している。

線形モデルの決定境界が直線なのに対して、カーネルトリックによる決定境界は、非線形化した特徴量を導入していることから曲線となっている。

import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC, LinearSVC
# Compare a LinearSVC decision boundary (line) with an RBF-kernel SVC boundary (curve).
X = np.array( \
[[ 9.96346605,  4.59676542],
 [11.0329545,  -0.16816717],
 [11.54155807,  5.21116083],
 .....  # remaining rows are elided in this listing (placeholder, not runnable as-is)
 [ 9.50169345,  1.93824624],
 [ 9.15072323,  5.49832246],
 [11.563957,    1.3389402 ]]
)
y = np.array( \
[1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0]
)
# Plot window: (min, max) of Feature-0 and of Feature-1.
f0min, f0max = 7.5, 12.5
f1min, f1max = -1, 6
# Linear baseline; its intercept and flattened coefficients define the straight boundary.
linsvc = LinearSVC(max_iter=4000).fit(X, y)
intercept = linsvc.intercept_
coef = linsvc.coef_.ravel()
# RBF-kernel SVC, evaluated on a 400x400 grid covering the plot window.
svc = SVC(kernel='rbf', C=10, gamma=0.1).fit(X, y)
u = np.linspace(f0min, f0max, 400)
v = np.linspace(f1min, f1max, 400)
u, v = np.meshgrid(u, v)
pred = svc.predict(np.c_[u.ravel(), v.ravel()]).reshape(u.shape)

fig, ax = plt.subplots()
# Samples: class 0 as blue circles, class 1 as orange triangles.
ax.scatter(X[y==0][:, 0], X[y==0][:, 1],
    marker='o', s=60, fc='tab:blue', ec='k')
ax.scatter(X[y==1][:, 0], X[y==1][:, 1],
    marker='^', s=60, fc='tab:orange', ec='k')
# Support vectors are redrawn larger; y[svc.support_] gives the label of each one.
sv_class = y[svc.support_]
ax.scatter(svc.support_vectors_[sv_class==0][:, 0],
           svc.support_vectors_[sv_class==0][:, 1],
           marker='o', s=150, fc='tab:blue', ec='blue', linewidth=3)
ax.scatter(svc.support_vectors_[sv_class==1][:, 0],
           svc.support_vectors_[sv_class==1][:, 1],
           marker='^', s=150, fc='tab:orange', ec='red', linewidth=3)
# Linear boundary: solve intercept + coef[0]*f0 + coef[1]*f1 = 0 for f1.
f1 = lambda f0: -(intercept + coef[0]*f0) / coef[1]
ax.plot([f0min, f0max], [f1(f0min), f1(f0max)])
# pred holds 0/1 labels, so the 0.5 level traces the boundary between the two predicted classes.
ax.contour(u, v, pred, levels=[0.5])
# Fix the view to the plot window and hide ticks and tick labels.
ax.set_xlim(f0min, f0max)
ax.set_ylim(f1min, f1max)
ax.tick_params(bottom=False, left=False, labelbottom=False, labelleft=False)
ax.set_xlabel("Feature-0")
ax.set_ylabel("Feature-1")

plt.show()

import numpy as np

import matplotlib.pyplot as plt

from sklearn.svm import SVC, LinearSVC

X = np.array( \

[[ 9.96346605, 4.59676542],

[11.0329545, -0.16816717],

[11.54155807, 5.21116083],

.....

[ 9.50169345, 1.93824624],

[ 9.15072323, 5.49832246],

[11.563957, 1.3389402 ]]

)

y = np.array( \

[1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0]

)

f0min, f0max = 7.5, 12.5

f1min, f1max = -1, 6

linsvc = LinearSVC(max_iter=4000).fit(X, y)

intercept = linsvc.intercept_

coef = linsvc.coef_.ravel()

svc = SVC(kernel='rbf', C=10, gamma=0.1).fit(X, y)

u = np.linspace(f0min, f0max, 400)

v = np.linspace(f1min, f1max, 400)

u, v = np.meshgrid(u, v)

pred = svc.predict(np.c_[u.ravel(), v.ravel()]).reshape(u.shape)

fig, ax = plt.subplots()

ax.scatter(X[y==0][:, 0], X[y==0][:, 1],

marker='o', s=60, fc='tab:blue', ec='k')

ax.scatter(X[y==1][:, 0], X[y==1][:, 1],

marker='^', s=60, fc='tab:orange', ec='k')

sv_class = y[svc.support_]

ax.scatter(svc.support_vectors_[sv_class==0][:, 0],

svc.support_vectors_[sv_class==0][:, 1],

marker='o', s=150, fc='tab:blue', ec='blue', linewidth=3)

ax.scatter(svc.support_vectors_[sv_class==1][:, 0],

svc.support_vectors_[sv_class==1][:, 1],

marker='^', s=150, fc='tab:orange', ec='red', linewidth=3)

f1 = lambda f0: -(intercept + coef[0]*f0) / coef[1]

ax.plot([f0min, f0max], [f1(f0min), f1(f0max)])

ax.contour(u, v, pred, levels=[0.5])

ax.set_xlim(f0min, f0max)

ax.set_ylim(f1min, f1max)

ax.tick_params(bottom=False, left=False, labelbottom=False, labelleft=False)

ax.set_xlabel("Feature-0")

ax.set_ylabel("Feature-1")

plt.show()

scikit-learnのSVCクラスには、サポートベクターに関する以下のパラメーターがある。

support_：データセットにおけるサポートベクターのインデックス（1次元配列）
support_vectors_：サポートベクターの配列（2次元配列）

38～44行目で、これらのパラメーターを使ってサポートベクターを強調表示している。

パラメータ調整

SVCモデルでパラメーターCとgammaの値を変化させたときの決定境界は以下の通り。

gammaはガウシアンカーネルの直径（$\sigma^2$に相当）の逆数で、この値が小さいと直径が大きくなり、より多くの点を近いと判断するようになる。左の方はgammaが小さく広域のデータをまとめようとするため、決定境界は大まかとなり、右の方はgammaが大きく近いもの同士をまとめようとする傾向となる。

Cは正則化の強さの逆数で、上の方ほどCの値が小さく正則化が強く効くため、決定境界はよりまっすぐとなり、下の方ほど正則化が弱く個々のデータの影響を受ける。

import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from itertools import product

X = np.array( \
[[ 9.96346605,  4.59676542],
 [11.0329545,  -0.16816717],
 [11.54155807,  5.21116083],
 .....
 [ 9.50169345,  1.93824624],
 [ 9.15072323,  5.49832246],
 [11.563957,    1.3389402 ]]
)
y = np.array( \
[1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0]
)

f0min, f0max = 7.5, 12.5
f1min, f1max = -1, 6
u = np.linspace(f0min, f0max, 400)
v = np.linspace(f1min, f1max, 400)
u, v = np.meshgrid(u, v)

C_list = [0.1, 1, 1000]
gamma_list = [0.1, 1, 10]
params = product(C_list, gamma_list)

plt.rcParams['font.size'] = 6
fig, axs = plt.subplots(3, 3, figsize=(6.4, 4.8))
axs_1d = axs.ravel()
fig.subplots_adjust(hspace=0.3)

for ax, param in zip(axs_1d, params):
    svc = SVC(kernel='rbf', C=param[0], gamma=param[1]).fit(X, y)
    pred = svc.predict(
            np.hstack([u.ravel().reshape(-1, 1), v.ravel().reshape(-1, 1)])
        ).reshape(u.shape)
    ax.scatter(X[y==0][:, 0], X[y==0][:, 1], marker='o')
    ax.scatter(X[y==1][:, 0], X[y==1][:, 1], marker='^')
    ax.contour(u, v, pred, levels=[0.5])
    ax.tick_params(left=False, bottom=False, labelleft=False, labelbottom=False)
    ax.set_title("C={:.1f}, gamma={:.1f}".format(param[0], param[1]))

plt.show()

import numpy as np

import matplotlib.pyplot as plt

from sklearn.svm import SVC

from itertools import product

X = np.array( \

[[ 9.96346605, 4.59676542],

[11.0329545, -0.16816717],

[11.54155807, 5.21116083],

.....

[ 9.50169345, 1.93824624],

[ 9.15072323, 5.49832246],

[11.563957, 1.3389402 ]]

)

y = np.array( \

[1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0]

)

f0min, f0max = 7.5, 12.5

f1min, f1max = -1, 6

u = np.linspace(f0min, f0max, 400)

v = np.linspace(f1min, f1max, 400)

u, v = np.meshgrid(u, v)

C_list = [0.1, 1, 1000]

gamma_list = [0.1, 1, 10]

params = product(C_list, gamma_list)

plt.rcParams['font.size'] = 6

fig, axs = plt.subplots(3, 3, figsize=(6.4, 4.8))

axs_1d = axs.ravel()

fig.subplots_adjust(hspace=0.3)

for ax, param in zip(axs_1d, params):

svc = SVC(kernel='rbf', C=param[0], gamma=param[1]).fit(X, y)

pred = svc.predict(

np.hstack([u.ravel().reshape(-1, 1), v.ravel().reshape(-1, 1)])

).reshape(u.shape)

ax.scatter(X[y==0][:, 0], X[y==0][:, 1], marker='o')

ax.scatter(X[y==1][:, 0], X[y==1][:, 1], marker='^')

ax.contour(u, v, pred, levels=[0.5])

ax.tick_params(left=False, bottom=False, labelleft=False, labelbottom=False)

ax.set_title("C={:.1f}, gamma={:.1f}".format(param[0], param[1]))

plt.show()

Breast cancerデータへの適用例

Breast cancerデータへの適用例で、特徴量データの大きさやレンジが大きくばらついていること、特徴量データをそのまま使った場合に過学習となること、特徴量データに前処理を施して正規化(normalize)した場合に精度が向上することを示している。

SVMの特徴

SVMの特徴を受け売りのまままとめておく。

データにわずかな特徴量しかない場合も複雑な決定境界を生成可能（低次元でも高次元でもうまく機能）
サンプルの個数が大きくなるとうまく機能しない（10万サンプルくらいになると、実行時間やメモリ使用量の面で難しくなる）
注意深いデータの前処理とパラメーター調整が必要
検証が難しい（予測に対する理由を理解することが難しい）
RBFの場合、Cやgammaを大きくするとより複雑なモデルになる（2つのパラメーターは強く相関するため、同時に調整する必要がある）

今後の課題～覚え書き

カーネル関数

(5) $\begin{equation*} K(\boldsymbol{x}_1, \boldsymbol{x}_2) = \sum \phi(\boldsymbol{x}_1) \phi(\boldsymbol{x}_2) \end{equation*}$

多項式カーネル

(6) $\begin{equation*} K(\boldsymbol{x}_1, \boldsymbol{x}_2) = (\boldsymbol{x}_1 \cdot \boldsymbol{x}_2 + 1 )^d \end{equation*}$

ガウシアンカーネル

(7) $\begin{equation*} K(\boldsymbol{x}_1, \boldsymbol{x}_2) = \exp \left(- \frac{|| \boldsymbol{x}_1 - \boldsymbol{x}_2 ||^2}{2\sigma^2} \right) \end{equation*}$

SVMの定式化

2020-06-24 / tau / コメントする

SVMの定式化

SVMのクラス分類の条件

2つの特徴量を持つデータが2つのクラスに分かれているとする。ここで下図のように、1つの直線によって、2つのクラスを完全に分離できるとする。

このとき、直線lによって分割したとして、以下の符号によってクラスを分離する。

(1) $\begin{equation*} \left\{ \begin{aligned} a x_1 + b x_2 + c > 0 &\rightarrow \mathrm{Class1} \\ a x_1 + b x_2 + c < 0 & \rightarrow \mathrm{Class2} \end{aligned} \right. \end{equation*}$

ここでラベル変数t_iを導入する。t_iはデータiがClass1/2のいずれに属するかを示す変数で、Class1ならt_i > 0、Class2ならt_i < 0と定義する。

(2) $\begin{equation*} \left\{ \begin{array}{lll} t_i = 1 & x_i \in \rm{Class1} & (a x_{i1} + b x_{i2} + c > 0) \\ t_i = -1 & x_i \in \rm{Class2} & (a x_{i1} + b x_{i2} + c < 0) \\ \end{array} \right. \end{equation*}$

このラベル変数を用いて、クラスの条件式は以下のように統一される。

(3) $\begin{equation*} t_i (a x_{i1} + b x_{i2} + c) > 0 \end{equation*}$

SVMにおいては、すべてのデータについてこの式が満足されるようにa, b, cを決定する。これらはすべてa, b, cに対する制約条件だが、どのようにこれらの値を求めるべきか、その目的関数が必要になる。SVMでは、これをマージン最大化により行う。

マージン最大化

ある直線l₁によって、下図のようにデータセットがClass1/2に分類できるとする。このときl₁に対してClass1/2の最も直線に近いデータを”サポートベクター”と呼ぶ。また、これらのサポートベクターに対応するl₁と平行な直線間の距離を”マージン”と呼ぶ。

ところで、l₁とは異なる別の直線l₂を選ぶと、異なるサポートベクターに対してより大きなマージンを得ることができる。SVMでは、式(3)のもとでこのマージンを最大化するような直線lを探すこととなる。

直線lに対するサポートベクターの対を(x⁺, x⁻)とすると、それぞれからlへの距離dは以下のように表現される。

(4) $\begin{equation*} d = \frac{|a x^+_1 + b x^+_2 + c|}{\sqrt{a^2 + b^2}} = \frac{|a x^-_1 + b x^-_2 + c|}{\sqrt{a^2 + b^2}} \end{equation*}$

ここで直線lはマージンの端にある平行な2つの直線の中央にあることから、上式の分子は同じ値となる。この値でdを除したものを改めて $\tilde{d}$ と置くと、dの最大化問題は $1/\tilde{d}=\sqrt{a^2+b^2}$ の最小化問題となる。これは分子の値が1となるようにa, b, cをスケーリングすることに相当し、このとき式(3)の制約条件は $t_i (a x_{i1} + b x_{i2} + c) \ge 1$ となる。これを加味して、問題は以下の制約条件付き最小化問題となる。

(5) $\begin{align*} \min \; a^2 + b^2 \quad {\rm s.t.} \; t_i (a x_{i1} + b x_{i2} + c) \ge 1 \; (i = 1, \dots, n) \end{align*}$

今後の課題

ここから先の定式化
ソフトマージンの導出

勾配ブースティング

2020-06-21 / tau / コメントする

概要

勾配ブースティング(gradient boosting)は、ランダムフォレストと同じく複数の決定木を組み合わせてモデルを強化する手法。ランダムフォレストと異なる点は、最初から複数の決定木を使うのではなく、1つずつ順番に決定木を増やしていく。その際に追加される決定木はそれぞれ深さ1～5くらいの浅い木（弱学習器：weak learner）で、直前の適合不足を補うように学習する。

勾配ブースティングの主なパラメーターは弱学習器の数(n_estimators)と学習率(learning_rate)で、学習率を大きくすると個々の弱学習器の補正を強化しモデルは複雑になる。

cancerデータへの適用

Pythonのscikit-learnにあるGradientBoostingClassifierをbreast_cancerデータに適用する例。”Pythonではじめる機械学習”の”2.3.6.2 勾配ブースティング回帰木”掲載のコードに沿って確認するが、バージョンの違いのためか、結果が異なる。いくつかのデフォルトのパラメーターを明示的に設定／変更してみたが、書籍に掲載されている結果には至っていない。

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

# Breast-cancer data, split into training and test sets with a fixed seed.
ds = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    ds.data, ds.target, random_state=0)

# 1) Default settings. random_state=0 (was 1) so that this run agrees with
#    the excerpt and the scores quoted in the article, and with the runs below.
gbcf = GradientBoostingClassifier(random_state=0)
gbcf.fit(X_train, y_train)
print("Training score: {:.3f}".format(gbcf.score(X_train, y_train)))
print("Test score    : {:.3f}".format(gbcf.score(X_test, y_test)))

# 2) Depth-1 trees only.
gbcf = GradientBoostingClassifier(max_depth=1, random_state=0)
gbcf.fit(X_train, y_train)
print("Training score: {:.3f}".format(gbcf.score(X_train, y_train)))
print("Test score    : {:.3f}".format(gbcf.score(X_test, y_test)))

# Feature importances of the depth-1 model; the wide left margin leaves room
# for the feature names used as bar labels.
fig, ax = plt.subplots(figsize=(8, 4.8))
fig.subplots_adjust(left=0.3)
ax.barh(ds.feature_names, gbcf.feature_importances_)
ax.set_xlabel("feature importance")
plt.show()

# 3) Learning rate lowered to 0.01.
gbcf = GradientBoostingClassifier(learning_rate=0.01, random_state=0)
gbcf.fit(X_train, y_train)
print("Training score: {:.3f}".format(gbcf.score(X_train, y_train)))
print("Test score    : {:.3f}".format(gbcf.score(X_test, y_test)))

import numpy as np

import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer

from sklearn.model_selection import train_test_split

from sklearn.ensemble import GradientBoostingClassifier

# Breast-cancer data, split into training and test sets with a fixed seed.
ds = load_breast_cancer()
# Parenthesised call: the former "=\" followed by an empty line ended the
# statement after "=" and was a syntax error.
X_train, X_test, y_train, y_test = train_test_split(
    ds.data, ds.target, random_state=0)

# 1) Default settings. random_state=0 (was 1) so that this run agrees with
#    the excerpt and the scores quoted in the article, and with the runs below.
gbcf = GradientBoostingClassifier(random_state=0)
gbcf.fit(X_train, y_train)
print("Training score: {:.3f}".format(gbcf.score(X_train, y_train)))
print("Test score : {:.3f}".format(gbcf.score(X_test, y_test)))

# 2) Depth-1 trees only.
gbcf = GradientBoostingClassifier(max_depth=1, random_state=0)
gbcf.fit(X_train, y_train)
print("Training score: {:.3f}".format(gbcf.score(X_train, y_train)))
print("Test score : {:.3f}".format(gbcf.score(X_test, y_test)))

# Feature importances of the depth-1 model; the wide left margin leaves room
# for the feature names used as bar labels.
fig, ax = plt.subplots(figsize=(8, 4.8))
fig.subplots_adjust(left=0.3)
ax.barh(ds.feature_names, gbcf.feature_importances_)
ax.set_xlabel("feature importance")
plt.show()

# 3) Learning rate lowered to 0.01.
gbcf = GradientBoostingClassifier(learning_rate=0.01, random_state=0)
gbcf.fit(X_train, y_train)
print("Training score: {:.3f}".format(gbcf.score(X_train, y_train)))
print("Test score : {:.3f}".format(gbcf.score(X_test, y_test)))

最初に試したのが以下のコード。ここでテストスコアが書籍にある0.958にならない。min_samples_split=5とすると書籍と同じ結果になるが、以降の特徴量重要度やlearning_rateの変更結果は再現されない。

# Default model with a fixed seed; report accuracy on both splits.
gbcf = GradientBoostingClassifier(random_state=0).fit(X_train, y_train)
for caption, features, labels in (
    ("Training score: {:.3f}", X_train, y_train),
    ("Test score    : {:.3f}", X_test, y_test),
):
    print(caption.format(gbcf.score(features, labels)))

# Training score: 1.000
# Test score    : 0.965

# Default model with a fixed seed; report accuracy on both splits.
gbcf = GradientBoostingClassifier(random_state=0).fit(X_train, y_train)
for caption, features, labels in (
    ("Training score: {:.3f}", X_train, y_train),
    ("Test score : {:.3f}", X_test, y_test),
):
    print(caption.format(gbcf.score(features, labels)))

# Training score: 1.000
# Test score : 0.965

過剰適合に対してmax_depth=1と強力な枝刈りをした場合。この結果は小数点以下3桁の表示で書籍と一致している。

# Depth-1 trees; report accuracy on both splits.
gbcf = GradientBoostingClassifier(max_depth=1, random_state=0).fit(X_train, y_train)
for caption, features, labels in (
    ("Training score: {:.3f}", X_train, y_train),
    ("Test score    : {:.3f}", X_test, y_test),
):
    print(caption.format(gbcf.score(features, labels)))

# Training score: 0.991
# Test score    : 0.972

# Depth-1 trees; report accuracy on both splits.
gbcf = GradientBoostingClassifier(max_depth=1, random_state=0).fit(X_train, y_train)
for caption, features, labels in (
    ("Training score: {:.3f}", X_train, y_train),
    ("Test score : {:.3f}", X_test, y_test),
):
    print(caption.format(gbcf.score(features, labels)))

# Training score: 0.991
# Test score : 0.972

learning_rateをデフォルトの0.1から0.01に変更した場合の結果も書籍と一致する。今回の再現結果では、デフォルト状態からテストスコアは改善されていない。

# Learning rate lowered to 0.01; report accuracy on both splits.
gbcf = GradientBoostingClassifier(learning_rate=0.01, random_state=0).fit(X_train, y_train)
for caption, features, labels in (
    ("Training score: {:.3f}", X_train, y_train),
    ("Test score    : {:.3f}", X_test, y_test),
):
    print(caption.format(gbcf.score(features, labels)))

# Training score: 0.988
# Test score    : 0.965

# Learning rate lowered to 0.01; report accuracy on both splits.
gbcf = GradientBoostingClassifier(learning_rate=0.01, random_state=0).fit(X_train, y_train)
for caption, features, labels in (
    ("Training score: {:.3f}", X_train, y_train),
    ("Test score : {:.3f}", X_test, y_test),
):
    print(caption.format(gbcf.score(features, labels)))

# Training score: 0.988
# Test score : 0.965

なお、事前剪定を強化したケースのグラフが、書籍と大きく異なる。横軸の値が倍ほどになっており、worst concave points、worst perimeter、mean concave pointsが重要度の大半を占めている。書籍では他の多くの特徴量も重要度がある程度高い点と異なっている。

# Depth-1 model whose feature importances are plotted below.
gbcf = GradientBoostingClassifier(max_depth=1, random_state=0).fit(X_train, y_train)

# Horizontal bars, one per feature; the wide left margin leaves room for
# the feature names used as bar labels.
fig, ax = plt.subplots(figsize=(8, 4.8))
fig.subplots_adjust(left=0.3)
importances = gbcf.feature_importances_
ax.barh(ds.feature_names, importances)
ax.set_xlabel("feature importance")
plt.show()

# Depth-1 model whose feature importances are plotted below.
gbcf = GradientBoostingClassifier(max_depth=1, random_state=0).fit(X_train, y_train)

# Horizontal bars, one per feature; the wide left margin leaves room for
# the feature names used as bar labels.
fig, ax = plt.subplots(figsize=(8, 4.8))
fig.subplots_adjust(left=0.3)
importances = gbcf.feature_importances_
ax.barh(ds.feature_names, importances)
ax.set_xlabel("feature importance")
plt.show()

今後確認したい点

勾配ブースティングの基本的な考え方の整理
簡単な事例での勾配ブースティングの挙動確認
回帰への適用
異なるモデルの組み合わせ

1次元の場合

2次元の場合

過学習？

特徴量データのサイズの違い

データの前処理

パラメーター調整

グラフ全体のフォントサイズ

個別要素のフォントサイズ

概要

内包表記による方法

itertools.productによる方法

概要

r_について

c_について

概要

線形特徴量の非線形化

線形モデルでは分離不可能なデータ

非線形特徴量の追加

元の特徴量に対する決定境界

カーネルトリック

概要

パラメータ調整

Breast cancerデータへの適用例

SVMの特徴

今後の課題～覚え書き

SVMの定式化

SVMのクラス分類の条件

マージン最大化

今後の課題

概要

cancerデータへの適用

今後確認したい点

`itertools.product`による方法

`r_`について

`c_`について