Breast cancer データセット – k-近傍法

2020-03-20 / tau / コメントする

概要

breast_cancerデータセットにscikit-learnのKNeighborsClassifierクラスでk-最近傍法を適用した結果。

学習率曲線

breast_cancerデータセットにk-最近傍法を適用し、近傍点数を変化させて学習率の変化をチェック。データセットを学習データとテストデータに分けるときのrandom_stateを変え、近傍点数に伴う変化を見てみた。

irisデータセットの場合に比べると、学習データとテストデータの傾向は落ち着いていて、近傍点数=8で精度が0.92～0.95程度。

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


def train_and_test(X, y, n_neighbors_list, random_state):
    """Score k-NN classifiers on one stratified train/test split.

    The data is split once with ``train_test_split`` (stratified on ``y``
    and seeded by ``random_state``).  A ``KNeighborsClassifier`` is then
    fitted on the training part for every value in ``n_neighbors_list``.

    Returns a pair of lists ``(training_scores, test_scores)`` holding the
    ``score`` of each classifier on the training part and on the test part,
    in the same order as ``n_neighbors_list``.
    """
    split = train_test_split(X, y, stratify=y, random_state=random_state)
    X_train, X_test, y_train, y_test = split

    # fit() hands back the fitted estimator, so models can be built inline.
    fitted = [
        KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
        for k in n_neighbors_list
    ]

    training_scores = [model.score(X_train, y_train) for model in fitted]
    test_scores = [model.score(X_test, y_test) for model in fitted]
    return training_scores, test_scores


# Load the breast-cancer feature matrix and its target labels.
cancer_ds = load_breast_cancer()
X, y = cancer_ds.data, cancer_ds.target

# Neighbour counts 1..15 to sweep, and one split seed per subplot.
n_neighbors_list = np.arange(1, 16, dtype=int)
random_state_list = np.array([0, 1, 2, 3])

fig, axs = plt.subplots(2, 2, figsize=(9.6, 7.2))
fig.subplots_adjust(hspace=0.4)

# Flatten the 2x2 grid so each panel pairs up with one random_state.
axs_1d = axs.ravel()

# Vary only random_state to see how the scores differ between splits.
for ax, random_state in zip(axs_1d, random_state_list):
    scores = train_and_test(X, y, n_neighbors_list, random_state)

    # scores is (training_scores, test_scores); draw them in that order.
    for curve in scores:
        ax.plot(n_neighbors_list, curve)

    ax.set(
        title="random_state={}".format(random_state),
        xlabel="number of neighbors",
        ylim=(0.9, 1.01),
    )

plt.show()

import numpy as np

import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer

from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier

def train_and_test(X, y, n_neighbors_list, random_state):

X_train, X_test, y_train, y_test =\

train_test_split(X, y, stratify=y, random_state=random_state)

training_scores = []

test_scores = []

for n_neighbors in n_neighbors_list:

classifier = KNeighborsClassifier(n_neighbors=n_neighbors)

classifier.fit(X_train, y_train)

training_scores.append(classifier.score(X_train, y_train))

test_scores.append(classifier.score(X_test, y_test))

return training_scores, test_scores

cancer_ds = load_breast_cancer()

X = cancer_ds.data

y = cancer_ds.target

n_neighbors_list = np.arange(1, 16, dtype=int)

random_state_list = np.array([0, 1, 2, 3])

fig, axs = plt.subplots(2, 2, figsize=(9.6, 7.2))

plt.subplots_adjust(hspace=0.4)

axs_1d = axs.reshape(1, -1)[0]

# random_stateを変化させて学習率の違いを見る

for ax, random_state in zip(axs_1d, random_state_list):

training_scores, test_scores =\

train_and_test(X, y, n_neighbors_list, random_state)

ax.plot(n_neighbors_list, training_scores)

ax.plot(n_neighbors_list, test_scores)

ax.set_title("random_state={}".format(random_state))

ax.set_xlabel("number of neighbors")

ax.set_ylim(0.9, 1.01)

plt.show()

irisデータセット – knn

2020-03-20 / tau / コメントする

概要

irisデータセットにscikit-learnのKNeighborsClassifierクラスでk-最近傍法を適用した結果。

学習率曲線

irisデータセットにk-最近傍法を適用し、近傍点数を変化させて学習率の変化をチェック。データセットを学習データとテストデータに分けるときのrandom_stateを変え、近傍点数に伴う変化を見てみた。

レコード数が150と少ないこともあって、random_stateを変えるごとにかなり推移が異なるが、概ね95%の精度が保たれている。

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


def train_and_test(X, y, n_neighbors_list, random_state):
    """Return k-NN training and test scores for each neighbour count.

    ``X`` and ``y`` are split once by ``train_test_split`` (stratified on
    ``y``, seeded with ``random_state``).  For every entry of
    ``n_neighbors_list`` a ``KNeighborsClassifier`` is fitted on the
    training part and scored on both parts.

    The result is ``(training_scores, test_scores)``: two lists ordered
    like ``n_neighbors_list``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=random_state
    )

    def scores_for(k):
        # Fit one classifier and score it on the training and test parts.
        model = KNeighborsClassifier(n_neighbors=k)
        model.fit(X_train, y_train)
        return model.score(X_train, y_train), model.score(X_test, y_test)

    training_scores, test_scores = [], []
    for k in n_neighbors_list:
        on_train, on_test = scores_for(k)
        training_scores.append(on_train)
        test_scores.append(on_test)

    return training_scores, test_scores


# Load the iris feature matrix and its target labels.
iris_ds = load_iris()
X = iris_ds.data
y = iris_ds.target

# Sweep 1..15 neighbours for each of four train/test split seeds.
n_neighbors_list = np.arange(1, 16, dtype=int)
random_state_list = np.array([0, 1, 2, 3])

fig, axs = plt.subplots(2, 2, figsize=(9.6, 7.2))
fig.subplots_adjust(hspace=0.4)

# One flat sequence of the four panels, in row-major order.
axs_1d = axs.flatten()

# Vary only random_state to see how the scores differ between splits.
for panel, seed in zip(axs_1d, random_state_list):
    training_scores, test_scores = train_and_test(
        X, y, n_neighbors_list, seed
    )

    panel.plot(n_neighbors_list, training_scores)
    panel.plot(n_neighbors_list, test_scores)

    panel.set_ylim(0.9, 1.01)
    panel.set_xlabel("number of neighbors")
    panel.set_title("random_state={}".format(seed))

plt.show()

import numpy as np

import matplotlib.pyplot as plt

from sklearn.datasets import load_iris

from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier

def train_and_test(X, y, n_neighbors_list, random_state):

X_train, X_test, y_train, y_test =\

train_test_split(X, y, stratify=y, random_state=random_state)

training_scores = []

test_scores = []

for n_neighbors in n_neighbors_list:

classifier = KNeighborsClassifier(n_neighbors=n_neighbors)

classifier.fit(X_train, y_train)

training_scores.append(classifier.score(X_train, y_train))

test_scores.append(classifier.score(X_test, y_test))

return training_scores, test_scores

iris_ds = load_iris()

X = iris_ds.data

y = iris_ds.target

n_neighbors_list = np.arange(1, 16, dtype=int)

random_state_list = np.array([0, 1, 2, 3])

fig, axs = plt.subplots(2, 2, figsize=(9.6, 7.2))

plt.subplots_adjust(hspace=0.4)

axs_1d = axs.reshape(1, -1)[0]

# random_stateを変化させて学習率の違いを見る

for ax, random_state in zip(axs_1d, random_state_list):

training_scores, test_scores =\

train_and_test(X, y, n_neighbors_list, random_state)

ax.plot(n_neighbors_list, training_scores)

ax.plot(n_neighbors_list, test_scores)

ax.set_title("random_state={}".format(random_state))

ax.set_xlabel("number of neighbors")

ax.set_ylim(0.9, 1.01)

plt.show()

pyplot – legend～凡例

2020-03-20 / tau / コメントする

概要

pyplotの各グラフに凡例を入れるには、legend()メソッドを使う。基本の使い方は以下の通り。

plotやscatterなどでグラフを描く時の引数にlabel="…"でラベルを定義する。ここで設定した文字列が凡例に使われる。
グラフフィールドのオブジェクトのlegend()メソッドを実行する。

import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(-np.pi, np.pi)
s = np.sin(x)
c = np.cos(x)

fig, ax = plt.subplots(figsize=(4.8, 3.6))

# The label given to each plot() call is the text shown in the legend.
for text, values in (("sin", s), ("cos", c)):
    ax.plot(x, values, label=text)

# Build the legend from the labelled lines.
ax.legend()

plt.show()

import numpy as np

import matplotlib.pyplot as plt

x = np.linspace(-np.pi, np.pi)

s = np.sin(x)

c = np.cos(x)

fig, ax = plt.subplots(figsize=(4.8, 3.6))

ax.plot(x, s, label="sin")

ax.plot(x, c, label="cos")

ax.legend()

plt.show()

凡例の位置

標準的な位置指定

凡例の位置はloc引数に対して定義された文字列で指定する。

legend(loc='...')

1	legend(loc='...')

位置指定の文字列は'[縦位置] [横位置]'で指定。

縦位置はupper, center, lowerの何れか、横位置はleft, center, rightの何れかで、縦位置と横位置の間には半角スペースを入れる（たとえば'upper right'）。ただし縦横中心の場合は'center'。

デフォルトは'best'で最も適切な位置が自動で設定される。

import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(-np.pi, np.pi)
s = np.sin(x)
c = np.cos(x)

fig, axs = plt.subplots(3, 3, figsize=(12.8, 9.6))
fig.subplots_adjust(hspace=0.4)

# loc strings arranged to mirror the subplot grid they are demonstrated in.
legends = np.array(
    [
        ['upper left', 'upper center', 'upper right'],
        ['center left', 'center', 'center right'],
        ['lower left', 'lower center', 'lower right'],
    ],
    dtype=object,
)

# Walk both 3x3 arrays in step; each panel is titled with its own loc.
for ax, location in zip(axs.flat, legends.flat):
    ax.set_title(location)
    ax.plot(x, s, label="sin")
    ax.plot(x, c, label="cos")
    ax.legend(loc=location)

plt.show()

import numpy as np

import matplotlib.pyplot as plt

x = np.linspace(-np.pi, np.pi)

s = np.sin(x)

c = np.cos(x)

fig, axs = plt.subplots(3, 3, figsize=(12.8, 9.6))

plt.subplots_adjust(hspace=0.4)

legends = np.empty((3, 3), dtype=object)

legends[0, 0] = 'upper left'

legends[0, 1] = 'upper center'

legends[0, 2] = 'upper right'

legends[1, 0] = 'center left'

legends[1, 1] = 'center'

legends[1, 2] = 'center right'

legends[2, 0] = 'lower left'

legends[2, 1] = 'lower center'

legends[2, 2] = 'lower right'

for row in range(3):

for col in range(3):

axs[row, col].set_title(legends[row, col])

axs[row, col].plot(x, s, label="sin")

axs[row, col].plot(x, c, label="cos")

axs[row, col].legend(loc=legends[row, col])

plt.show()

bboxによる位置指定～凡例の外側への設置

Axes.legend()の位置指定で引数としてbbox_to_anchorを指定することで、グラフの描画領域の相対位置を細かく指定することもできる。

Axes.legend(bbox_to_anchor=(x, y), loc=location_str)

1	Axes.legend(bbox_to_anchor=(x, y), loc=location_str)

bbox_to_anchor=(x, y)

x, yはグラフ描画域の左下を(0, 0)、右上を(1, 1)としたときの相対位置。ここで指定した位置とlocで指定した凡例の基準点を一致させる。

以下のコードは、全てbbox_to_anchor=(1, 1)として凡例を描画域の右上に合わせている。その上でlocで指定した凡例の位置がこの点と一致させられる。

たとえばloc="lower right"とすると凡例の右下の位置が描画域の右上と同じになるように配置される。また、標準では凡例の枠の周りに少しパディングが行われるが、borderaxespad引数で数値を指定することでその間隔を調整できる。

import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(-np.pi, np.pi)
ys = np.sin(x)
yc = np.cos(x)

fig, axes = plt.subplots(2, 2)
ax_1d = axes.ravel()

for ax in ax_1d:
    ax.plot(x, ys, label="sin x")
    ax.plot(x, yc, label="cos x")

# Every legend is anchored at (1, 1) of its Axes; only loc (and, for the
# first and last panel, borderaxespad) differs between the four panels.
legend_options = [
    {"loc": 'lower right', "borderaxespad": 0},
    {"loc": 'lower left'},
    {"loc": 'upper right'},
    {"loc": 'upper left', "borderaxespad": 0},
]
for ax, options in zip(ax_1d, legend_options):
    ax.legend(bbox_to_anchor=(1, 1), **options)

plt.show()

import numpy as np

import matplotlib.pyplot as plt

x = np.linspace(-np.pi, np.pi)

ys = np.sin(x)

yc = np.cos(x)

fig, axes = plt.subplots(2, 2)

ax_1d = axes.reshape(axes.size)

for ax in ax_1d:

ax.plot(x, ys, label="sin x")

ax.plot(x, yc, label="cos x")

ax_1d[0].legend(bbox_to_anchor=(1, 1), loc='lower right', borderaxespad=0)

ax_1d[1].legend(bbox_to_anchor=(1, 1), loc='lower left')

ax_1d[2].legend(bbox_to_anchor=(1, 1), loc='upper right')

ax_1d[3].legend(bbox_to_anchor=(1, 1), loc='upper left', borderaxespad=0)

plt.show()

ただし、この例では凡例が画面の右側ではみ出て切れてしまっている。このようなときは、tight_layout()をFigureに対して実行することで描画領域に全体を収めることができる。

# Anchor each panel's legend at (1, 1) of its Axes with a per-panel loc.
legend_options = [
    {"loc": 'lower right', "borderaxespad": 0},
    {"loc": 'lower left'},
    {"loc": 'upper right'},
    {"loc": 'upper left', "borderaxespad": 0},
]
for ax, options in zip(ax_1d, legend_options):
    ax.legend(bbox_to_anchor=(1, 1), **options)

# Recompute the subplot layout once all legends are in place.
fig.tight_layout()

ax_1d[0].legend(bbox_to_anchor=(1, 1), loc='lower right', borderaxespad=0)

ax_1d[1].legend(bbox_to_anchor=(1, 1), loc='lower left')

ax_1d[2].legend(bbox_to_anchor=(1, 1), loc='upper right')

ax_1d[3].legend(bbox_to_anchor=(1, 1), loc='upper left', borderaxespad=0)

fig.tight_layout()

凡例の並べ方

凡例はデフォルトでは縦に並べられるが、ncolに整数を指定して凡例の列数を指定できる。

legend(ncol=n)

1	legend(ncol=n)

import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(-np.pi, np.pi)

# Legend label -> curve values; insertion order is the plotting order.
curves = {
    "sin x": np.sin(x),
    "cos x": np.cos(x),
    "sin 2x": np.sin(x * 2),
    "cos 2x": np.cos(x * 2),
}

fig, ax = plt.subplots(figsize=(4.8, 3.6))

for text, values in curves.items():
    ax.plot(x, values, label=text)

# ncol=2 lays the four entries out in two columns.
ax.legend(ncol=2, loc='lower center')

plt.show()

import numpy as np

import matplotlib.pyplot as plt

x = np.linspace(-np.pi, np.pi)

s1 = np.sin(x)

c1 = np.cos(x)

s2 = np.sin(x * 2)

c2 = np.cos(x * 2)

fig, ax = plt.subplots(figsize=(4.8, 3.6))

ax.plot(x, s1, label="sin x")

ax.plot(x, c1, label="cos x")

ax.plot(x, s2, label="sin 2x")

ax.plot(x, c2, label="cos 2x")

ax.legend(ncol=2, loc='lower center')

plt.show()

デザイン等

このほか、デザイン関連で以下のような引数がある

title=[文字列]: 凡例内にタイトルを設定。
fancybox=False/True: Trueを指定すると凡例の枠の角が丸くなる。
shadow=False/True: Trueを指定すると凡例に影がつけられる。

import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(-np.pi, np.pi)
s, c = np.sin(x), np.cos(x)

fig, ax = plt.subplots(figsize=(4.8, 3.6))

ax.plot(x, s, label="sin")
ax.plot(x, c, label="cos")

# Appearance options: a legend title, rounded box corners and a shadow.
legend_style = {
    "title": "legend title",
    "fancybox": True,
    "shadow": True,
}
ax.legend(**legend_style)

plt.show()

import numpy as np

import matplotlib.pyplot as plt

x = np.linspace(-np.pi, np.pi)

s = np.sin(x)

c = np.cos(x)

fig, ax = plt.subplots(figsize=(4.8, 3.6))

ax.plot(x, s, label="sin")

ax.plot(x, c, label="cos")

ax.legend(title="legend title", fancybox=True, shadow=True)

plt.show()

凡例の文字サイズ

凡例本体の文字サイズは、legend()の引数fontsizeで指定する。

凡例のタイトルの文字サイズは、凡例オブジェクトからget_title()でタイトルオブジェクトを取得し、set_fontsize()で設定する。

import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(-np.pi, np.pi)
y_sin, y_cos = np.sin(x), np.cos(x)

fig, ax = plt.subplots()

for text, values in (("sin", y_sin), ("cos", y_cos)):
    ax.plot(x, values, label=text)

# fontsize applies to the legend entries; the title is sized separately
# through the title object returned by get_title().
lg = ax.legend(title="sin/cos", fontsize=12)
legend_title = lg.get_title()
legend_title.set_fontsize(15)

plt.show()

import numpy as np

import matplotlib.pyplot as plt

x = np.linspace(-np.pi, np.pi)

y_sin = np.sin(x)

y_cos = np.cos(x)

fig, ax = plt.subplots()

ax.plot(x, y_sin, label="sin")

ax.plot(x, y_cos, label="cos")

lg = ax.legend(title="sin/cos", fontsize=12)

lg.get_title().set_fontsize(15)

plt.show()

この2行の手続きは、以下のようにチェインによって1行で書くこともできる。

ax.legend(title="sin/cos", fontsize=12).get_title().set_fontsize(15)

1	ax.legend(title="sin/cos", fontsize=12).get_title().set_fontsize(15)

引数`handles`、`labels`

legend(handles, labels)という指定方法。公式ドキュメントではこちらが先に示されている。

Axes.get_legend_handles_labels()の戻り値として、グラフ要素のhandleとそれに対するlabelのリストが得られる。

この方法は、後述のように複数のグラフの凡例をまとめて扱うときに利用する。

import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(-np.pi, np.pi)
s, c = np.sin(x), np.cos(x)

fig, ax = plt.subplots(figsize=(4.8, 3.6))

for text, values in (("sin", s), ("cos", c)):
    ax.plot(x, values, label=text)

# Fetch the labelled artists and their label strings, then pass them to
# legend() explicitly.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels)

# Print the handles first, then the labels, one list per line.
for listing in (handles, labels):
    print(listing)

plt.show()

# Output shown in the article:
# [<matplotlib.lines.Line2D object at 0x142220F0>, <matplotlib.lines.Line2D object at 0x14222370>]
# ['sin', 'cos']

import numpy as np

import matplotlib.pyplot as plt

x = np.linspace(-np.pi, np.pi)

s = np.sin(x)

c = np.cos(x)

fig, ax = plt.subplots(figsize=(4.8, 3.6))

ax.plot(x, s, label="sin")

ax.plot(x, c, label="cos")

handles, labels = ax.get_legend_handles_labels()

ax.legend(handles, labels)

print(handles)

print(labels)

plt.show()

# [<matplotlib.lines.Line2D object at 0x142220F0>, <matplotlib.lines.Line2D object at 0x14222370>]

# ['sin', 'cos']

複数グラフの場合の凡例

複数のAxesのグラフの凡例を1つにまとめて表示したい場合は、それぞれのAxesでhandleとlabelを取得しておき、それらを結合してlegend()の引数とする。

import numpy as np
import numpy.random as rnd
import matplotlib.pyplot as plt

x = np.linspace(-np.pi, np.pi)
sn = np.sin(x)
cs = np.cos(x)

fig, axs = plt.subplots(1, 2, figsize=(8, 3.6))

# Pass x explicitly: plot(sn) alone would draw against the sample index
# and leave x unused.
axs[0].plot(x, sn, c='b', label="sin x")
handles0, labels0 = axs[0].get_legend_handles_labels()

axs[1].plot(x, cs, c='r', label="cos x")
handles1, labels1 = axs[1].get_legend_handles_labels()

# A single legend on the first Axes listing the lines of both Axes.
axs[0].legend(handles0 + handles1, labels0 + labels1)

plt.show()

import numpy as np

import numpy.random as rnd

import matplotlib.pyplot as plt

x = np.linspace(-np.pi, np.pi)

sn = np.sin(x)

cs = np.cos(x)

fig, axs = plt.subplots(1, 2, figsize=(8, 3.6))

axs[0].plot(sn, c='b', label="sin x")

handles0, labels0 = axs[0].get_legend_handles_labels()

axs[1].plot(cs, c='r', label="cos x")

handles1, labels1 = axs[1].get_legend_handles_labels()

axs[0].legend(handles0 + handles1, labels0 + labels1)

plt.show()

上の方法だと特定のAxesに凡例が表示されるが、これをまとめたいときには、figureに凡例を表示させる。

import numpy as np
import numpy.random as rnd
import matplotlib.pyplot as plt

x = np.linspace(-np.pi, np.pi)
sn = np.sin(x)
cs = np.cos(x)

fig, axs = plt.subplots(1, 2, figsize=(8, 3.6))

# Pass x explicitly: plot(sn) alone would draw against the sample index
# and leave x unused.
axs[0].plot(x, sn, c='b', label="sin x")
handles0, labels0 = axs[0].get_legend_handles_labels()

axs[1].plot(x, cs, c='r', label="cos x")
handles1, labels1 = axs[1].get_legend_handles_labels()

# Attach the combined legend to the Figure rather than to either Axes.
fig.legend(handles0 + handles1, labels0 + labels1, ncol=2, loc='upper center')

plt.show()

import numpy as np

import numpy.random as rnd

import matplotlib.pyplot as plt

x = np.linspace(-np.pi, np.pi)

sn = np.sin(x)

cs = np.cos(x)

fig, axs = plt.subplots(1, 2, figsize=(8, 3.6))

axs[0].plot(sn, c='b', label="sin x")

handles0, labels0 = axs[0].get_legend_handles_labels()

axs[1].plot(cs, c='r', label="cos x")

handles1, labels1 = axs[1].get_legend_handles_labels()

fig.legend(handles0 + handles1, labels0 + labels1, ncol=2, loc='upper center')

plt.show()

Axes.twinx – 2つのy軸のグラフ

2020-03-20 / tau / コメントする

基本

Axes.twinx()は元のAxesオブジェクトを複製する。ただし新たなAxesには横軸がなく縦軸が逆側にある。2つのグラフを、それぞれに対するy軸とともに重ねて表示したいときに使う。手順は以下の通り。

基本のAxesインスタンスを生成
基本のAxesインスタンスでtwinx()メソッドを実行して2軸目のAxesインスタンスを得る
それぞれのAxesオブジェクトに対して描画、設定

import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(-np.pi, np.pi)
sin, exp = np.sin(x), np.exp(x)

fig, axsin = plt.subplots(figsize=(7.2, 4.8))
fig.subplots_adjust(wspace=0.2)

# twinx() gives a second Axes on the same plot with its own y-axis.
axexp = axsin.twinx()

# Draw each curve on its own Axes and label that Axes' y-axis.
for axis, values, colour, name in (
    (axsin, sin, 'b', "sin"),
    (axexp, exp, 'g', "exp"),
):
    axis.plot(x, values, c=colour)
    axis.set_ylabel(name)

plt.show()

import numpy as np

import matplotlib.pyplot as plt

x = np.linspace(-np.pi, np.pi)

sin = np.sin(x)

exp = np.exp(x)

fig, axsin = plt.subplots(figsize=(7.2, 4.8))

plt.subplots_adjust(wspace=0.2)

axexp = axsin.twinx()

axsin.plot(x, sin, c='b')

axexp.plot(x, exp, c='g')

axsin.set_ylabel("sin")

axexp.set_ylabel("exp")

plt.show()

凡例

twinx()で得られたAxesと元のAxesは異なるインスタンスなので、それぞれの凡例を表示させると、ばらばらの位置になったり完全に重なったりしてしまう。

これらの凡例を一括して扱うには、handleとlabelを取得して結合する方法を使う。

import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(-np.pi, np.pi)
sin, exp = np.sin(x), np.exp(x)

fig, axsin = plt.subplots(figsize=(7.2, 4.8))
axexp = axsin.twinx()

axsin.plot(x, sin, c='b', label="sin")
axexp.plot(x, exp, c='g', label="exp")

axsin.set_ylabel("sin")
axexp.set_ylabel("exp")

# Gather handles and labels from both Axes, sin first and exp second, so
# that one legend can list them together.
handles, labels = [], []
for axis in (axsin, axexp):
    axis_handles, axis_labels = axis.get_legend_handles_labels()
    handles = handles + axis_handles
    labels = labels + axis_labels

axsin.legend(handles, labels, loc='upper left')

plt.show()

import numpy as np

import matplotlib.pyplot as plt

x = np.linspace(-np.pi, np.pi)

sin = np.sin(x)

exp = np.exp(x)

fig, axsin = plt.subplots(figsize=(7.2, 4.8))

axexp = axsin.twinx()

axsin.plot(x, sin, c='b', label="sin")

hs, ls = axsin.get_legend_handles_labels()

axexp.plot(x, exp, c='g', label="exp")

he, le = axexp.get_legend_handles_labels()

axsin.set_ylabel("sin")

axexp.set_ylabel("exp")

axsin.legend(hs + he, ls + le, loc='upper left')

plt.show()

Breast cancer wisconsinデータセット

2020-03-20 / tau / コメントする

概要

breast cancerデータはUCIの機械学習リポジトリにあるBreast Cancer Wisconsin (Diagnostic) Data Setのコピーで、乳腺腫瘤の穿刺吸引細胞診(fine needle aspirate (FNA) of a breast mass)のデジタル画像から計算されたデータ。

乳癌に関する細胞テストの様々な数値と、その結果のデータセット。569人の被検者の複数の腫瘤に関する細胞診の結果得られた30個の特徴量と、各被検者の診断結果（悪性／良性：malignant/benign）が格納されている。

ここではPythonのscikit-learnにあるbreast_cancerデータの使い方をまとめる。

データの取得とデータ構造

Pythonで扱う場合、scikit-learnのdatasetsモジュールにあるload_breast_cancer()でデータを取得できる。データはBunchクラスのオブジェクト。

from sklearn.datasets import load_breast_cancer

cancer_ds = load_breast_cancer()

# Print every entry of the loaded object as "key:" followed by its value.
for key, value in cancer_ds.items():
    print("{}:\n{}\n".format(key, value))

from sklearn.datasets import load_breast_cancer

cancer_ds = load_breast_cancer()

for key, value in zip(cancer_ds.keys(), cancer_ds.values()):

print("{}:\n{}\n".format(key, value))

データの構造は辞書型で、569人の細胞診の結果に関する30個の特徴量をレコードとした配列、各腫瘤の診断結果など。

data:
[[1.799e+01 1.038e+01 1.228e+02 ... 2.654e-01 4.601e-01 1.189e-01]
 [2.057e+01 1.777e+01 1.329e+02 ... 1.860e-01 2.750e-01 8.902e-02]
 [1.969e+01 2.125e+01 1.300e+02 ... 2.430e-01 3.613e-01 8.758e-02]
 ...
 [1.660e+01 2.808e+01 1.083e+02 ... 1.418e-01 2.218e-01 7.820e-02]
 [2.060e+01 2.933e+01 1.401e+02 ... 2.650e-01 4.087e-01 1.240e-01]
 [7.760e+00 2.454e+01 4.792e+01 ... 0.000e+00 2.871e-01 7.039e-02]]

target:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0
 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1
 .....
 1 1 1 1 1 1 0 1 0 1 1 0 1 1 1 1 1 0 0 1 0 1 0 1 1 1 1 1 0 1 1 0 1 0 1 0 0
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 0 0 0 0 0 0 1]

target_names:
['malignant' 'benign']

DESCR:
.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 3 is Mean Radius, field
        13 is Radius SE, field 23 is Worst Radius.

        - class:
                - WDBC-Malignant
                - WDBC-Benign

    :Summary Statistics:

    ===================================== ====== ======
                                           Min    Max
    ===================================== ====== ======
    radius (mean):                        6.981  28.11
    texture (mean):                       9.71   39.28
    perimeter (mean):                     43.79  188.5
    area (mean):                          143.5  2501.0
    smoothness (mean):                    0.053  0.163
    compactness (mean):                   0.019  0.345
    concavity (mean):                     0.0    0.427
    concave points (mean):                0.0    0.201
    symmetry (mean):                      0.106  0.304
    fractal dimension (mean):             0.05   0.097
    radius (standard error):              0.112  2.873
    texture (standard error):             0.36   4.885
    perimeter (standard error):           0.757  21.98
    area (standard error):                6.802  542.2
    smoothness (standard error):          0.002  0.031
    compactness (standard error):         0.002  0.135
    concavity (standard error):           0.0    0.396
    concave points (standard error):      0.0    0.053
    symmetry (standard error):            0.008  0.079
    fractal dimension (standard error):   0.001  0.03
    radius (worst):                       7.93   36.04
    texture (worst):                      12.02  49.54
    perimeter (worst):                    50.41  251.2
    area (worst):                         185.2  4254.0
    smoothness (worst):                   0.071  0.223
    compactness (worst):                  0.027  1.058
    concavity (worst):                    0.0    1.252
    concave points (worst):               0.0    0.291
    symmetry (worst):                     0.156  0.664
    fractal dimension (worst):            0.055  0.208
    ===================================== ====== ======

    :Missing Attribute Values: None

    :Class Distribution: 212 - Malignant, 357 - Benign

    :Creator:  Dr. William H. Wolberg, W. Nick Street, Olvi L. Mangasarian

    :Donor: Nick Street

    :Date: November, 1995

This is a copy of UCI ML Breast Cancer Wisconsin (Diagnostic) datasets.
https://goo.gl/U2Uwz2

Features are computed from a digitized image of a fine needle
aspirate (FNA) of a breast mass.  They describe
characteristics of the cell nuclei present in the image.

Separating plane described above was obtained using
Multisurface Method-Tree (MSM-T) [K. P. Bennett, "Decision Tree
Construction Via Linear Programming." Proceedings of the 4th
Midwest Artificial Intelligence and Cognitive Science Society,
pp. 97-101, 1992], a classification method which uses linear
programming to construct a decision tree.  Relevant features
were selected using an exhaustive search in the space of 1-4
features and 1-3 separating planes.

The actual linear program used to obtain the separating plane
in the 3-dimensional space is that described in:
[K. P. Bennett and O. L. Mangasarian: "Robust Linear
Programming Discrimination of Two Linearly Inseparable Sets",
Optimization Methods and Software 1, 1992, 23-34].

This database is also available through the UW CS ftp server:

ftp ftp.cs.wisc.edu
cd math-prog/cpo-dataset/machine-learn/WDBC/

.. topic:: References

   - W.N. Street, W.H. Wolberg and O.L. Mangasarian. Nuclear feature extraction 
     for breast tumor diagnosis. IS&T/SPIE 1993 International Symposium on 
     Electronic Imaging: Science and Technology, volume 1905, pages 861-870,
     San Jose, CA, 1993.
   - O.L. Mangasarian, W.N. Street and W.H. Wolberg. Breast cancer diagnosis and 
     prognosis via linear programming. Operations Research, 43(4), pages 570-577, 
     July-August 1995.
   - W.H. Wolberg, W.N. Street, and O.L. Mangasarian. Machine learning techniques
     to diagnose breast cancer from fine-needle aspirates. Cancer Letters 77 (1994) 
     163-171.

feature_names:
['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']

filename:
C:\Users\tomo\AppData\Local\Programs\Python\Python37-32\lib\site-packages\sklearn\datasets\data\breast_cancer.csv

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

data:

[[1.799e+01 1.038e+01 1.228e+02 ... 2.654e-01 4.601e-01 1.189e-01]

[2.057e+01 1.777e+01 1.329e+02 ... 1.860e-01 2.750e-01 8.902e-02]

[1.969e+01 2.125e+01 1.300e+02 ... 2.430e-01 3.613e-01 8.758e-02]

...

[1.660e+01 2.808e+01 1.083e+02 ... 1.418e-01 2.218e-01 7.820e-02]

[2.060e+01 2.933e+01 1.401e+02 ... 2.650e-01 4.087e-01 1.240e-01]

[7.760e+00 2.454e+01 4.792e+01 ... 0.000e+00 2.871e-01 7.039e-02]]

target:

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0

1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1

.....

1 1 1 1 1 1 0 1 0 1 1 0 1 1 1 1 1 0 0 1 0 1 0 1 1 1 1 1 0 1 1 0 1 0 1 0 0

1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

1 1 1 1 1 1 1 0 0 0 0 0 0 1]

target_names:

['malignant' 'benign']

DESCR:

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset

--------------------------------------------

**Data Set Characteristics:**

:Number of Instances: 569

:Number of Attributes: 30 numeric, predictive attributes and the class

:Attribute Information:

- radius (mean of distances from center to points on the perimeter)

- texture (standard deviation of gray-scale values)

- perimeter

- area

- smoothness (local variation in radius lengths)

- compactness (perimeter^2 / area - 1.0)

- concavity (severity of concave portions of the contour)

- concave points (number of concave portions of the contour)

- symmetry

- fractal dimension ("coastline approximation" - 1)

The mean, standard error, and "worst" or largest (mean of the three

largest values) of these features were computed for each image,

resulting in 30 features. For instance, field 3 is Mean Radius, field

13 is Radius SE, field 23 is Worst Radius.

- class:

- WDBC-Malignant

- WDBC-Benign

:Summary Statistics:

===================================== ====== ======

Min Max

===================================== ====== ======

radius (mean): 6.981 28.11

texture (mean): 9.71 39.28

perimeter (mean): 43.79 188.5

area (mean): 143.5 2501.0

smoothness (mean): 0.053 0.163

compactness (mean): 0.019 0.345

concavity (mean): 0.0 0.427

concave points (mean): 0.0 0.201

symmetry (mean): 0.106 0.304

fractal dimension (mean): 0.05 0.097

radius (standard error): 0.112 2.873

texture (standard error): 0.36 4.885

perimeter (standard error): 0.757 21.98

area (standard error): 6.802 542.2

smoothness (standard error): 0.002 0.031

compactness (standard error): 0.002 0.135

concavity (standard error): 0.0 0.396

concave points (standard error): 0.0 0.053

symmetry (standard error): 0.008 0.079

fractal dimension (standard error): 0.001 0.03

radius (worst): 7.93 36.04

texture (worst): 12.02 49.54

perimeter (worst): 50.41 251.2

area (worst): 185.2 4254.0

smoothness (worst): 0.071 0.223

compactness (worst): 0.027 1.058

concavity (worst): 0.0 1.252

concave points (worst): 0.0 0.291

symmetry (worst): 0.156 0.664

fractal dimension (worst): 0.055 0.208

===================================== ====== ======

:Missing Attribute Values: None

:Class Distribution: 212 - Malignant, 357 - Benign

:Creator: Dr. William H. Wolberg, W. Nick Street, Olvi L. Mangasarian

:Donor: Nick Street

:Date: November, 1995

This is a copy of UCI ML Breast Cancer Wisconsin (Diagnostic) datasets.

https://goo.gl/U2Uwz2

Features are computed from a digitized image of a fine needle

aspirate (FNA) of a breast mass. They describe

characteristics of the cell nuclei present in the image.

Separating plane described above was obtained using

Multisurface Method-Tree (MSM-T) [K. P. Bennett, "Decision Tree

Construction Via Linear Programming." Proceedings of the 4th

Midwest Artificial Intelligence and Cognitive Science Society,

pp. 97-101, 1992], a classification method which uses linear

programming to construct a decision tree. Relevant features

were selected using an exhaustive search in the space of 1-4

features and 1-3 separating planes.

The actual linear program used to obtain the separating plane

in the 3-dimensional space is that described in:

[K. P. Bennett and O. L. Mangasarian: "Robust Linear

Programming Discrimination of Two Linearly Inseparable Sets",

Optimization Methods and Software 1, 1992, 23-34].

This database is also available through the UW CS ftp server:

ftp ftp.cs.wisc.edu

cd math-prog/cpo-dataset/machine-learn/WDBC/

.. topic:: References

- W.N. Street, W.H. Wolberg and O.L. Mangasarian. Nuclear feature extraction

for breast tumor diagnosis. IS&T/SPIE 1993 International Symposium on

Electronic Imaging: Science and Technology, volume 1905, pages 861-870,

San Jose, CA, 1993.

- O.L. Mangasarian, W.N. Street and W.H. Wolberg. Breast cancer diagnosis and

prognosis via linear programming. Operations Research, 43(4), pages 570-577,

July-August 1995.

- W.H. Wolberg, W.N. Street, and O.L. Mangasarian. Machine learning techniques

to diagnose breast cancer from fine-needle aspirates. Cancer Letters 77 (1994)

163-171.

feature_names:

['mean radius' 'mean texture' 'mean perimeter' 'mean area'

'mean smoothness' 'mean compactness' 'mean concavity'

'mean concave points' 'mean symmetry' 'mean fractal dimension'

'radius error' 'texture error' 'perimeter error' 'area error'

'smoothness error' 'compactness error' 'concavity error'

'concave points error' 'symmetry error' 'fractal dimension error'

'worst radius' 'worst texture' 'worst perimeter' 'worst area'

'worst smoothness' 'worst compactness' 'worst concavity'

'worst concave points' 'worst symmetry' 'worst fractal dimension']

filename:

C:\Users\tomo\AppData\Local\Programs\Python\Python37-32\lib\site-packages\sklearn\datasets\data\breast_cancer.csv

データのキーは以下のようになっている。

# Load the breast cancer dataset and list the keys of the returned container.
from sklearn.datasets import load_breast_cancer

cancer_ds = load_breast_cancer()

print(cancer_ds.keys())

# dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

from sklearn.datasets import load_breast_cancer

cancer_ds = load_breast_cancer()

print(cancer_ds.keys())

# dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

データの内容

`'data'`～特徴量データセット

569人の細胞診結果に対する30個の特徴量のデータを格納した2次元配列。列のインデックス(0～29)が30個の特徴量に対応している。

'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,

1.189e-01],

[2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,

8.902e-02],

[1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,

8.758e-02],

...,

[1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,

7.820e-02],

[2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,

1.240e-01],

[7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,

7.039e-02]])

`'target'`～診断結果に対応したコード

各被検者の診断結果（悪性：malignant、良性：benign）を格納した0/1のコードの配列。569個の腫瘤に対応した1次元配列（0：悪性が212、1：良性が357）。

'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       .....
       1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1])

'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,

0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,

.....

1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,

1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1])

`'target_names'`～診断結果

診断結果、悪性(malignant)／良性(benign)が定義されている。

'target_names': array(['malignant', 'benign'], dtype='<U9')

1	'target_names': array(['malignant', 'benign'], dtype='<U9')

診断結果とコードの関係は以下の通り。

malignant	0
benign	1

`'feature_names'`～特徴名

データの格納順はDESCRの後。細胞診の結果得られた30個の特徴量の名前。

腫瘤に関する以下の10の属性について、それぞれ平均(mean)、標準誤差(error)、最悪値(worst)の3種類、合計30の特性値に対する名前が格納されている。ここでworstは各属性に関する最大側の値（DESCRによれば、大きい方から3つの値の平均）となっている。

radius：半径（中心から外周までの平均）
texture：テクスチャーのグレースケールの標準偏差
perimeter：外周長
area：面積
smoothness：滑らかさ（半径の長さの局所的な変動）
compactness：コンパクト性（外周長²÷面積－1.0）
concavity：コンターの凹部強度
concave points：コンターの凹点の数
symmetry：対称性
fractal dimension：フラクタル次元

 'feature_names': array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

'feature_names': array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',

'mean smoothness', 'mean compactness', 'mean concavity',

'mean concave points', 'mean symmetry', 'mean fractal dimension',

'radius error', 'texture error', 'perimeter error', 'area error',

'smoothness error', 'compactness error', 'concavity error',

'concave points error', 'symmetry error',

'fractal dimension error', 'worst radius', 'worst texture',

'worst perimeter', 'worst area', 'worst smoothness',

'worst compactness', 'worst concavity', 'worst concave points',

'worst symmetry', 'worst fractal dimension'], dtype='<U23')

特徴名とコードの関係は以下の通り。

	mean	error	worst
radius	0	10	20
texture	1	11	21
perimeter	2	12	22
area	3	13	23
smoothness	4	14	24
compactness	5	15	25
concavity	6	16	26
concave points	7	17	27
symmetry	8	18	28
fractal dimension	9	19	29

`'filename'`～ファイル名

これも格納順はDESCRの後で、CSVファイルの位置が示されている。1行目にはデータ数、特徴量数、クラス名（malignant、benign）が並んでおり、その後に569行のレコードに対する30列の特徴量と1列の診断結果データが格納されている。このファイルにはfeature_namesやDESCRに当たるデータは格納されていない。

'filename': 'C:...\\lib\\site-packages\\sklearn\\datasets\\data\\breast_cancer.csv'

1	'filename': 'C:...\\lib\\site-packages\\sklearn\\datasets\\data\\breast_cancer.csv'

`'DESCR'`～データセットの説明

データセットの説明。print(breast_ds_dataset['DESCR'])のようにprint文で整形表示される。

レコード数569個（悪性：212、良性：357）
属性は、30の数値属性とクラス
→predictiveの意味とclassが単数形なのがわからない

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 3 is Mean Radius, field
        13 is Radius SE, field 23 is Worst Radius.

        - class:
                - WDBC-Malignant
                - WDBC-Benign

    :Summary Statistics:

    ===================================== ====== ======
                                           Min    Max
    ===================================== ====== ======
    radius (mean):                        6.981  28.11
    texture (mean):                       9.71   39.28
    perimeter (mean):                     43.79  188.5
    area (mean):                          143.5  2501.0
    smoothness (mean):                    0.053  0.163
    compactness (mean):                   0.019  0.345
    concavity (mean):                     0.0    0.427
    concave points (mean):                0.0    0.201
    symmetry (mean):                      0.106  0.304
    fractal dimension (mean):             0.05   0.097
    radius (standard error):              0.112  2.873
    texture (standard error):             0.36   4.885
    perimeter (standard error):           0.757  21.98
    area (standard error):                6.802  542.2
    smoothness (standard error):          0.002  0.031
    compactness (standard error):         0.002  0.135
    concavity (standard error):           0.0    0.396
    concave points (standard error):      0.0    0.053
    symmetry (standard error):            0.008  0.079
    fractal dimension (standard error):   0.001  0.03
    radius (worst):                       7.93   36.04
    texture (worst):                      12.02  49.54
    perimeter (worst):                    50.41  251.2
    area (worst):                         185.2  4254.0
    smoothness (worst):                   0.071  0.223
    compactness (worst):                  0.027  1.058
    concavity (worst):                    0.0    1.252
    concave points (worst):               0.0    0.291
    symmetry (worst):                     0.156  0.664
    fractal dimension (worst):            0.055  0.208
    ===================================== ====== ======

    :Missing Attribute Values: None

    :Class Distribution: 212 - Malignant, 357 - Benign

    :Creator:  Dr. William H. Wolberg, W. Nick Street, Olvi L. Mangasarian

    :Donor: Nick Street

    :Date: November, 1995

This is a copy of UCI ML Breast Cancer Wisconsin (Diagnostic) datasets.
https://goo.gl/U2Uwz2

Features are computed from a digitized image of a fine needle
aspirate (FNA) of a breast mass.  They describe
characteristics of the cell nuclei present in the image.

Separating plane described above was obtained using
Multisurface Method-Tree (MSM-T) [K. P. Bennett, "Decision Tree
Construction Via Linear Programming." Proceedings of the 4th
Midwest Artificial Intelligence and Cognitive Science Society,
pp. 97-101, 1992], a classification method which uses linear
programming to construct a decision tree.  Relevant features
were selected using an exhaustive search in the space of 1-4
features and 1-3 separating planes.

The actual linear program used to obtain the separating plane
in the 3-dimensional space is that described in:
[K. P. Bennett and O. L. Mangasarian: "Robust Linear
Programming Discrimination of Two Linearly Inseparable Sets",
Optimization Methods and Software 1, 1992, 23-34].

This database is also available through the UW CS ftp server:

ftp ftp.cs.wisc.edu
cd math-prog/cpo-dataset/machine-learn/WDBC/

.. topic:: References

   - W.N. Street, W.H. Wolberg and O.L. Mangasarian. Nuclear feature extraction 
     for breast tumor diagnosis. IS&T/SPIE 1993 International Symposium on 
     Electronic Imaging: Science and Technology, volume 1905, pages 861-870,
     San Jose, CA, 1993.
   - O.L. Mangasarian, W.N. Street and W.H. Wolberg. Breast cancer diagnosis and 
     prognosis via linear programming. Operations Research, 43(4), pages 570-577, 
     July-August 1995.
   - W.H. Wolberg, W.N. Street, and O.L. Mangasarian. Machine learning techniques
     to diagnose breast cancer from fine-needle aspirates. Cancer Letters 77 (1994) 
     163-171.

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset

--------------------------------------------

**Data Set Characteristics:**

:Number of Instances: 569

:Number of Attributes: 30 numeric, predictive attributes and the class

:Attribute Information:

- radius (mean of distances from center to points on the perimeter)

- texture (standard deviation of gray-scale values)

- perimeter

- area

- smoothness (local variation in radius lengths)

- compactness (perimeter^2 / area - 1.0)

- concavity (severity of concave portions of the contour)

- concave points (number of concave portions of the contour)

- symmetry

- fractal dimension ("coastline approximation" - 1)

The mean, standard error, and "worst" or largest (mean of the three

largest values) of these features were computed for each image,

resulting in 30 features. For instance, field 3 is Mean Radius, field

13 is Radius SE, field 23 is Worst Radius.

- class:

- WDBC-Malignant

- WDBC-Benign

:Summary Statistics:

===================================== ====== ======

Min Max

===================================== ====== ======

radius (mean): 6.981 28.11

texture (mean): 9.71 39.28

perimeter (mean): 43.79 188.5

area (mean): 143.5 2501.0

smoothness (mean): 0.053 0.163

compactness (mean): 0.019 0.345

concavity (mean): 0.0 0.427

concave points (mean): 0.0 0.201

symmetry (mean): 0.106 0.304

fractal dimension (mean): 0.05 0.097

radius (standard error): 0.112 2.873

texture (standard error): 0.36 4.885

perimeter (standard error): 0.757 21.98

area (standard error): 6.802 542.2

smoothness (standard error): 0.002 0.031

compactness (standard error): 0.002 0.135

concavity (standard error): 0.0 0.396

concave points (standard error): 0.0 0.053

symmetry (standard error): 0.008 0.079

fractal dimension (standard error): 0.001 0.03

radius (worst): 7.93 36.04

texture (worst): 12.02 49.54

perimeter (worst): 50.41 251.2

area (worst): 185.2 4254.0

smoothness (worst): 0.071 0.223

compactness (worst): 0.027 1.058

concavity (worst): 0.0 1.252

concave points (worst): 0.0 0.291

symmetry (worst): 0.156 0.664

fractal dimension (worst): 0.055 0.208

===================================== ====== ======

:Missing Attribute Values: None

:Class Distribution: 212 - Malignant, 357 - Benign

:Creator: Dr. William H. Wolberg, W. Nick Street, Olvi L. Mangasarian

:Donor: Nick Street

:Date: November, 1995

This is a copy of UCI ML Breast Cancer Wisconsin (Diagnostic) datasets.

https://goo.gl/U2Uwz2

Features are computed from a digitized image of a fine needle

aspirate (FNA) of a breast mass. They describe

characteristics of the cell nuclei present in the image.

Separating plane described above was obtained using

Multisurface Method-Tree (MSM-T) [K. P. Bennett, "Decision Tree

Construction Via Linear Programming." Proceedings of the 4th

Midwest Artificial Intelligence and Cognitive Science Society,

pp. 97-101, 1992], a classification method which uses linear

programming to construct a decision tree. Relevant features

were selected using an exhaustive search in the space of 1-4

features and 1-3 separating planes.

The actual linear program used to obtain the separating plane

in the 3-dimensional space is that described in:

[K. P. Bennett and O. L. Mangasarian: "Robust Linear

Programming Discrimination of Two Linearly Inseparable Sets",

Optimization Methods and Software 1, 1992, 23-34].

This database is also available through the UW CS ftp server:

ftp ftp.cs.wisc.edu

cd math-prog/cpo-dataset/machine-learn/WDBC/

.. topic:: References

- W.N. Street, W.H. Wolberg and O.L. Mangasarian. Nuclear feature extraction

for breast tumor diagnosis. IS&T/SPIE 1993 International Symposium on

Electronic Imaging: Science and Technology, volume 1905, pages 861-870,

San Jose, CA, 1993.

- O.L. Mangasarian, W.N. Street and W.H. Wolberg. Breast cancer diagnosis and

prognosis via linear programming. Operations Research, 43(4), pages 570-577,

July-August 1995.

- W.H. Wolberg, W.N. Street, and O.L. Mangasarian. Machine learning techniques

to diagnose breast cancer from fine-needle aspirates. Cancer Letters 77 (1994)

163-171.

データの利用

データの取得方法

breast_cancerデータセットから各データを取り出すのに、以下の2つの方法がある。

辞書のキーを使って呼び出す（例：breast_cancer_dataset['DESCR']）
キーの文字列をプロパティーに指定する（例：breast_cancer_dataset.DESCR）

全レコードの特徴量データの取得

'data'から、569のレコードに関する30の特徴量が569行30列の2次元配列で得られる。30の特徴量は'feature_names'の30の特徴名に対応している。

# Print the full feature matrix stored under the 'data' key.
from sklearn.datasets import load_breast_cancer

cancer_ds = load_breast_cancer()

# Attribute access (cancer_ds.data) is used here instead of cancer_ds['data'].
print(cancer_ds.data)

# [[1.799e+01 1.038e+01 1.228e+02 ... 2.654e-01 4.601e-01 1.189e-01]
#  [2.057e+01 1.777e+01 1.329e+02 ... 1.860e-01 2.750e-01 8.902e-02]
#  [1.969e+01 2.125e+01 1.300e+02 ... 2.430e-01 3.613e-01 8.758e-02]
#  ...
#  [1.660e+01 2.808e+01 1.083e+02 ... 1.418e-01 2.218e-01 7.820e-02]
#  [2.060e+01 2.933e+01 1.401e+02 ... 2.650e-01 4.087e-01 1.240e-01]
#  [7.760e+00 2.454e+01 4.792e+01 ... 0.000e+00 2.871e-01 7.039e-02]]

from sklearn.datasets import load_breast_cancer

cancer_ds = load_breast_cancer()

print(cancer_ds.data)

# [[1.799e+01 1.038e+01 1.228e+02 ... 2.654e-01 4.601e-01 1.189e-01]

# [2.057e+01 1.777e+01 1.329e+02 ... 1.860e-01 2.750e-01 8.902e-02]

# [1.969e+01 2.125e+01 1.300e+02 ... 2.430e-01 3.613e-01 8.758e-02]

# ...

# [1.660e+01 2.808e+01 1.083e+02 ... 1.418e-01 2.218e-01 7.820e-02]

# [2.060e+01 2.933e+01 1.401e+02 ... 2.650e-01 4.087e-01 1.240e-01]

# [7.760e+00 2.454e+01 4.792e+01 ... 0.000e+00 2.871e-01 7.039e-02]]

特定の特徴量のデータのみ取得

特定の特徴量に関する全レコードのデータを取り出すときにはX[:, n]の形で指定する。

# Extract the values of one feature (one column of the data matrix) for
# every record, and print them together with the feature's name.
from sklearn.datasets import load_breast_cancer

cancer_ds = load_breast_cancer()

features = cancer_ds['feature_names']
X = cancer_ds['data']
# Column index of the feature to extract; index 10 is 'radius error'
# (see the printed output below).
n_feature = 10

# X[:, n] selects column n across all rows.
feature = X[:, n_feature]

print("feature name : {}".format(features[n_feature]))
print("feature data :\n{}".format(feature))

# feature name : radius error
# feature data :
# [1.095  0.5435 0.7456 0.4956 0.7572 0.3345 0.4467 0.5835 0.3063 0.2976
#  0.3795 0.5058 0.9555 0.4033 0.2121 0.37   0.4727 0.5692 0.7582 0.2699
#  0.1852 0.2773 0.4388 0.6917 0.8068 1.046  0.2545 0.8529 0.439  0.6003
#  .....
#  0.2784 0.2542 0.3031 0.2351 0.272  0.346  0.2104 0.1144 0.2957 0.5196
#  0.3163 0.28   0.2409 0.3013 0.2116 0.2199 0.2441 0.5375 0.2254 0.2388
#  0.3645 0.3141 0.2602 0.9622 1.176  0.7655 0.4564 0.726  0.3857]

from sklearn.datasets import load_breast_cancer

cancer_ds = load_breast_cancer()

features = cancer_ds['feature_names']

X = cancer_ds['data']

n_feature = 10

feature = X[:, n_feature]

print("feature name : {}".format(features[n_feature]))

print("feature data :\n{}".format(feature))

# feature name : radius error

# feature data :

# [1.095 0.5435 0.7456 0.4956 0.7572 0.3345 0.4467 0.5835 0.3063 0.2976

# 0.3795 0.5058 0.9555 0.4033 0.2121 0.37 0.4727 0.5692 0.7582 0.2699

# 0.1852 0.2773 0.4388 0.6917 0.8068 1.046 0.2545 0.8529 0.439 0.6003

# .....

# 0.2784 0.2542 0.3031 0.2351 0.272 0.346 0.2104 0.1144 0.2957 0.5196

# 0.3163 0.28 0.2409 0.3013 0.2116 0.2199 0.2441 0.5375 0.2254 0.2388

# 0.3645 0.3141 0.2602 0.9622 1.176 0.7655 0.4564 0.726 0.3857]

特定のクラスのデータのみ抽出

特定のクラス（この場合は診断結果）のレコードのみを抽出する方法。ndarrayの条件による要素抽出を使う。

# Extract only the records that belong to one diagnosis class by indexing
# the feature matrix with a boolean mask built from the target codes.
from sklearn.datasets import load_breast_cancer

cancer_ds = load_breast_cancer()

targets = cancer_ds['target_names']
X = cancer_ds['data']
y = cancer_ds['target']

# Class code to extract; code 0 is 'malignant' (see the output below).
n_class = 0

# X[y == n_class] keeps the rows of X whose target code equals n_class.
print("data for class {}:\n{}".format(targets[n_class], X[y == n_class]))

# data for class malignant:
# [[1.799e+01 1.038e+01 1.228e+02 ... 2.654e-01 4.601e-01 1.189e-01]
#  [2.057e+01 1.777e+01 1.329e+02 ... 1.860e-01 2.750e-01 8.902e-02]
#  [1.969e+01 2.125e+01 1.300e+02 ... 2.430e-01 3.613e-01 8.758e-02]
#  ...
#  [2.013e+01 2.825e+01 1.312e+02 ... 1.628e-01 2.572e-01 6.637e-02]
#  [1.660e+01 2.808e+01 1.083e+02 ... 1.418e-01 2.218e-01 7.820e-02]
#  [2.060e+01 2.933e+01 1.401e+02 ... 2.650e-01 4.087e-01 1.240e-01]]

# Extract only the records that belong to one diagnosis class by indexing
# the feature matrix with a boolean mask built from the target codes.
from sklearn.datasets import load_breast_cancer

cancer_ds = load_breast_cancer()

targets = cancer_ds['target_names']
X = cancer_ds['data']
y = cancer_ds['target']

# Class code to extract; code 0 is 'malignant' (see the output below).
n_class = 0

# X[y == n_class] keeps the rows of X whose target code equals n_class.
print("data for class {}:\n{}".format(targets[n_class], X[y == n_class]))

# data for class malignant:
# [[1.799e+01 1.038e+01 1.228e+02 ... 2.654e-01 4.601e-01 1.189e-01]
#  [2.057e+01 1.777e+01 1.329e+02 ... 1.860e-01 2.750e-01 8.902e-02]
#  [1.969e+01 2.125e+01 1.300e+02 ... 2.430e-01 3.613e-01 8.758e-02]
#  ...
#  [2.013e+01 2.825e+01 1.312e+02 ... 1.628e-01 2.572e-01 6.637e-02]
#  [1.660e+01 2.808e+01 1.083e+02 ... 1.418e-01 2.218e-01 7.820e-02]
#  [2.060e+01 2.933e+01 1.401e+02 ... 2.650e-01 4.087e-01 1.240e-01]]

k-最近傍法 – クラス分類

2020-03-16 / tau / コメントする

概要

k-最近傍法(k nearest neighbors: knn)によるクラス分類は、テストデータの近傍の訓練データからテストデータのクラスを決定する。その手法は単純で、特段の学習処理はせず、訓練データの特徴量とクラスを記憶するのみで、テストデータが与えられたときに近傍点からクラスを決定する。手順は以下の通り。

特徴量とクラス分類の訓練データセットを記憶する
テストデータが与えられたら、特徴量空間の中で近傍点を選ぶ
近傍点のクラスからテストデータのクラスを決定する

パラメーターは近傍点の数で、1以上の自然数を設定できる。

利用方法

scikit-learnのKNeighborsClassifierクラスの利用方法は以下の通り。

sklearn.neighborsからKNeighborsClassifierをインポート
コンストラクターの引数に近傍点数n_neighborsを指定して、KNeighborsClassifierのインスタンスを生成
fit()メソッドに訓練データの特徴量と属性値を与えて学習
predict()メソッドにテストデータの特徴量を指定して、属性値を予測
必要に応じて、kneighbors()メソッドでテストデータの近傍点情報を取得

コンストラクターには、通常n_neighborsで近傍点を指定する。デフォルトはn_neighbors=5。

KNeighborsClassifier(n_neighbors=n): nは近傍点の数。この他の引数に、近傍点を発見するアルゴリズムなどが指定できるようだ。

fit()メソッドに与える訓練データは、特徴量セットと属性値の2つ。

fit(X, y): Xは訓練データセットの特徴量データで、データ数×特徴量数の2次元配列。yは訓練データセットのクラスデータで要素数はデータ数に等しい

テストデータの属性値の予測は、predict()メソッドにテストデータの特徴量を与える。

y = predict(X): Xはテストデータの特徴量データで、データ数×特徴量数の2次元配列。戻り値yは予測されたクラスデータで要素数はデータ数に等しい。

テストデータに対する近傍点の情報を、kneighbors()メソッドで得ることができる。

neigh_dist, neigh_ind = kneighbors(X): Xはテストデータの特徴量データで、データ数×特徴量数の2次元配列。戻り値は、近傍点までの距離neigh_distと近傍点のインデックスneigh_indの2つ。

neigh_dist, neigh_ind = kneighbors(X): テストデータの特徴量Xを引数に与え、近傍点に関する情報を得る。neigh_distは各テストデータから各近傍点までの距離、neigh_indは各テストデータに対する各近傍点のインデックス。いずれも2次元の配列で、テストデータ数×近傍点数の2次元配列となっている。

実行例

以下の例では、n_neighbors=3としてKNeighborsClassifierのインスタンスを準備している。

これに対してfit()メソッドで、2つの特徴量とそれに対するクラス値を持つ訓練データを6個与えている。特徴量データX_trainは行数がデータ数、列数が特徴量の数となる2次元配列を想定している。また属性値y_trainは訓練データ数と同じ要素数の1次元配列。

特徴量1	特徴量2	クラス値
-2	-1	0
-1	-2	0
-0.5	-0.5	0
0.5	0.5	1
1	2	1
2	1	1

これらの訓練データに対して、テストデータの特徴量X_testとして(-0.5, -1.5)、(1, 0)の2つを与えた時の出力を見てみる。

# Minimal k-nearest-neighbors example: fit on six labeled 2-feature points,
# predict the class of two test points, and inspect their neighbors.
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# One row per training sample, one column per feature.
X_train = np.array([
    [-2, -1],
    [-1, -2],
    [-0.5, -0.5],
    [0.5, 0.5],
    [1, 2],
    [2, 1]
])
# Class label of each training row (first three are 0, last three are 1).
y_train = np.array([0, 0, 0, 1, 1, 1])

clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train, y_train)

X_test = np.array([[-0.5, -1.5], [1, 0]])
y_pred = clf.predict(X_test)

# For each test row: distances to its neighbors and their indices in X_train.
neigh_dist, neigh_ind = clf.kneighbors(X=X_test)

print("X_train=\n{}".format(X_train))
print("y_train={}".format(y_train))
print("X_test=\n{}".format(X_test))
print("y_pred={}".format(y_pred))
print("neighbors' distance=\n{}".format(neigh_dist))
print("neighbors' indicies=\n{}".format(neigh_ind))

import numpy as np

from sklearn.neighbors import KNeighborsClassifier

X_train = np.array([

[-2, -1],

[-1, -2],

[-0.5, -0.5],

[0.5, 0.5],

[1, 2],

[2, 1]

])

y_train = np.array([0, 0, 0, 1, 1, 1])

clf = KNeighborsClassifier(n_neighbors=3)

clf.fit(X_train, y_train)

X_test = np.array([[-0.5, -1.5], [1, 0]])

y_pred = clf.predict(X_test)

neigh_dist, neigh_ind = clf.kneighbors(X=X_test)

print("X_train=\n{}".format(X_train))

print("y_train={}".format(y_train))

print("X_test=\n{}".format(X_test))

print("y_pred={}".format(y_pred))

print("neighbors' distance=\n{}".format(neigh_dist))

print("neighbors' indicies=\n{}".format(neigh_ind))

このコードの実行結果は以下の通り。

X_train=
[[-2.  -1. ]
 [-1.  -2. ]
 [-0.5 -0.5]
 [ 0.5  0.5]
 [ 1.   2. ]
 [ 2.   1. ]]
y_train=[0 0 0 1 1 1]
X_test=
[[-0.5 -1.5]
 [ 1.   0. ]]
y_pred=[0 1]
neighbors' distance=
[[0.70710678 1.         1.58113883]
 [0.70710678 1.41421356 1.58113883]]
neighbors' indicies=
[[1 2 0]
 [3 5 2]]

X_train=

[[-2. -1. ]

[-1. -2. ]

[-0.5 -0.5]

[ 0.5 0.5]

[ 1. 2. ]

[ 2. 1. ]]

y_train=[0 0 0 1 1 1]

X_test=

[[-0.5 -1.5]

[ 1. 0. ]]

y_pred=[0 1]

neighbors' distance=

[[0.70710678 1. 1.58113883]

[0.70710678 1.41421356 1.58113883]]

neighbors' indicies=

[[1 2 0]

[3 5 2]]

属性値の予測結果については、2つのテストデータに対して2つのクラス値0と1が返されている。

kneighbors()メソッドの戻り値から、1つ目のテストデータにはインデックスが1, 2, 0の3つの点とそれぞれへの距離0.7071, 1, 1.5811が、2つ目のテストデータにはインデックスが3, 5, 2の点とそれぞれへの距離0.7071, 1.4142, 1.5811が得られる。

1つ目のテストデータ(-0.5, -1.5)からの距離
- X_train[1]=(-1, -2)→ $\sqrt{(-0.5)^2+(-0.5)^2}\approx 0.7071$
- X_train[2]=(-0.5, -0.5)→ $\sqrt{0^2+(-1)^2} = 1$
- X_train[0]=(-2, -1)→ $\sqrt{(-1.5)^2+0.5^2} \approx 1.5811$
2つ目のテストデータ(1, 0)からの距離
- X_train[3]=(0.5, 0.5)→ $\sqrt{(-0.5)^2+0.5^2}\approx 0.7071$
- X_train[5]=(2, 1)→ $\sqrt{1^2+1^2}\approx 1.4142$
- X_train[2]=(-0.5, -0.5)→ $\sqrt{(-1.5)^2+(-0.5)^2} \approx 1.5811$

y_predは、テストデータごとに3つの近傍点のクラス値から多数決でクラス値を決定している。

1つ目のテストデータの属性値
- y_train[1]=0、y_train[2]=0、y_train[0]=0の多数決→0
2つ目のテストデータの属性値
- y_train[3]=1、y_train[5]=1、y_train[2]=0の多数決→1

この様子を特徴量平面上に描いたのが以下の図である。各点の色は、各データのクラスを示していて、下方の点は3つの近傍点のクラスが全て0なのでテストデータのクラスも0、右方の点は近傍点のうち2つがクラス1で1つがクラス0なのでテストデータのクラスは多数決で1となっている様子がわかる。

# Plot the k-nearest-neighbors example: training points colored by class,
# test points as stars, and dotted lines from each test point to its
# nearest training points.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

# One row per training sample, one column per feature.
X_train = np.array([
    [-2, -1],
    [-1, -2],
    [-0.5, -0.5],
    [0.5, 0.5],
    [1, 2],
    [2, 1]
])
# Class label of each training row (first three are 0, last three are 1).
y_train = np.array([0, 0, 0, 1, 1, 1])

clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train, y_train)

X_test = np.array([[-0.5, -1.5], [1, 0]])
y_pred = clf.predict(X_test)

# neigh_ind[i] holds the indices into X_train of the neighbors of X_test[i].
neigh_dist, neigh_ind = clf.kneighbors(X=X_test)

fig, ax = plt.subplots()

# Split the training points by class so each class gets its own series.
X0 = X_train[y_train==0]
X1 = X_train[y_train==1]

ax.scatter(X0[:, 0], X0[:, 1], label="class-0")
ax.scatter(X1[:, 0], X1[:, 1], label="class-1")
ax.scatter(X_test[:, 0], X_test[:, 1], marker='*', s=120, label="Test data")

# Connect every test point to each of its neighbors with a dotted line.
for tests, ind in zip(X_test, neigh_ind):
    for neigh in ind:
        ax.plot(
            [tests[0], X_train[neigh][0]], [tests[1], X_train[neigh][1]],
            color='k', linestyle='dotted')

# Column 0 is drawn on the x axis and column 1 on the y axis.
ax.set_xlabel("feature 0")
ax.set_ylabel("feature 1")

ax.legend(loc='upper left')

plt.show()

# Plot the k-nearest-neighbors example: training points colored by class,
# test points as stars, and dotted lines from each test point to its
# nearest training points.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

# One row per training sample, one column per feature.
X_train = np.array([
    [-2, -1],
    [-1, -2],
    [-0.5, -0.5],
    [0.5, 0.5],
    [1, 2],
    [2, 1]
])
# Class label of each training row (first three are 0, last three are 1).
y_train = np.array([0, 0, 0, 1, 1, 1])

clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train, y_train)

X_test = np.array([[-0.5, -1.5], [1, 0]])
y_pred = clf.predict(X_test)

# neigh_ind[i] holds the indices into X_train of the neighbors of X_test[i].
neigh_dist, neigh_ind = clf.kneighbors(X=X_test)

fig, ax = plt.subplots()

# Split the training points by class so each class gets its own series.
X0 = X_train[y_train==0]
X1 = X_train[y_train==1]

ax.scatter(X0[:, 0], X0[:, 1], label="class-0")
ax.scatter(X1[:, 0], X1[:, 1], label="class-1")
ax.scatter(X_test[:, 0], X_test[:, 1], marker='*', s=120, label="Test data")

# Connect every test point to each of its neighbors with a dotted line.
for tests, ind in zip(X_test, neigh_ind):
    for neigh in ind:
        ax.plot(
            [tests[0], X_train[neigh][0]], [tests[1], X_train[neigh][1]],
            color='k', linestyle='dotted')

# Column 0 is drawn on the x axis and column 1 on the y axis.
ax.set_xlabel("feature 0")
ax.set_ylabel("feature 1")

ax.legend(loc='upper left')

plt.show()

各種データに対する適用例

ndarray – 2次元配列の1次元化

2020-03-16 / tau / コメントする

概要

ndarrayの2次元配列を1次元化して扱いたいとき（たとえば2次元のAxes配列を一括で扱いたいとき）の方法に、reshape()メソッド／関数、flatten()メソッド、ravel()メソッド／関数を使う方法がある。

以下、次の配列を使う。

a = np.arange(4).reshape(2, 2)
print(a)

# [[0 1]
#  [2 3]]

a = np.arange(4).reshape(2, 2)

print(a)

# [[0 1]

# [2 3]]

ndarrayのメソッド

ndarrayのメソッドのreshape()、flatten()、ravel()を使う。戻り値もndarray。

`reshape()`メソッド

reshape()メソッドで1次元化する場合、reshape(-1)とする。

print(a.reshape(-1))

# [0 1 2 3]

print(a.reshape(-1))

# [0 1 2 3]

1次元の列ベクトルが必要な場合はreshape(-1, 1)。

print(a.reshape(-1, 1))

# [[0]
#  [1]
#  [2]
#  [3]]

print(a.reshape(-1, 1))

# [[0]

# [1]

# [2]

# [3]]

reshape(1, -1)とすると1行になるが、次元が2次元であるため、1つの1次元要素配列を要素に持つ2次元配列となってしまう。

print(a.reshape(1, -1))

# [[0 1 2 3]]

print(a.reshape(1, -1))

# [[0 1 2 3]]

`flatten()`メソッド

flatten()メソッドは、reshape(-1)と同じ効果を持つ。

print(a.flatten())

# [0 1 2 3]

print(a.flatten())

# [0 1 2 3]

`ravel()`メソッド

ravel()メソッドも、reshape(-1)と同じ効果を持つ。

print(a.ravel())

# [0 1 2 3]

print(a.ravel())

# [0 1 2 3]

Numpyの関数

Numpy.reshape()関数、Numpy.ravel()関数は、引数にndarray以外のarray-likeオブジェクトをとることができる。ただし戻り値はndarray。以下の例のbは、たとえばb = [[0, 1], [2, 3]]のようなリスト（array-likeオブジェクト）を想定している。

Numpy.flatten()は定義されていない。

`numpy.reshape()`関数

numpy.reshape()関数は、第2引数で次元・次数を与える。

print(np.reshape(b, -1))

# [0 1 2 3]

print(np.reshape(b, -1))

# [0 1 2 3]

2次元にしたい場合は、第2引数をタプルにする。

print(np.reshape(b, (1, -1)))

# [[0 1 2 3]]

print(np.reshape(b, (-1, 1)))

# [[0]
#  [1]
#  [2]
#  [3]]

print(np.reshape(b, (1, -1)))

# [[0 1 2 3]]

print(np.reshape(b, (-1, 1)))

# [[0]

# [1]

# [2]

# [3]]

`numpy.ravel()`関数

numpy.ravel()関数の機能はravel()メソッドと同じ。

print(np.ravel(b))

# [0 1 2 3]

print(np.ravel(b))

# [0 1 2 3]

戻り値のビューとコピーの違い

reshape()とravel()は可能な限りビューを返す（結果の変更がオリジナルに影響を与える）。

a.reshape(-1)[0] = -1  # assign through the reshaped result
print(a)

# [[-1  1]
#  [ 2  3]]

a[0, 0] = 0  # restore the original value before the second demonstration
a.ravel()[0] = -1  # assign through the raveled result
print(a)

# [[-1  1]
#  [ 2  3]]

a.reshape(-1)[0] = -1

print(a)

# [[-1 1]

# [ 2 3]]

a[0, 0] = 0

a.ravel()[0] = -1

print(a)

# [[-1 1]

# [ 2 3]]

flatten()はコピーを返す（結果の変更はオリジナルに影響しない）。

a.flatten()[0] = -1
print(a)

# [[0 1]
#  [2 3]]

a.flatten()[0] = -1

print(a)

# [[0 1]

# [2 3]]

forgeデータセット

2020-03-15 / tau / コメントする

概要

forgeデータセットは、”Pythonではじめる機械学習”(O’REILLY)中で用いられる架空のデータセットである。

その内容は、2クラスに分類された26個のデータで、2つの特徴量を想定した2次元配列データと各データのクラス分類を示したターゲットデータが得られる。

利用方法

mglearnパッケージから、たとえば以下の方法で利用する。

from mglearn.datasets import make_forge

X, y = make_forge()

from mglearn.datasets import make_forge

X, y = make_forge()

実行するとdeprecatedの警告が出るが、放置してもよいらしい。

内容

特徴量データ

2つの特徴量を持った配列が26個、2次元配列の形で得られる。

[[ 9.96346605,  4.59676542],
 [11.0329545,  -0.16816717],
 [11.54155807,  5.21116083],
 [ 8.69289001,  1.54322016],
 [ 8.1062269,   4.28695977],
 [ 8.30988863,  4.80623966],
 [11.93027136,  4.64866327],
 [ 9.67284681, -0.20283165],
 [ 8.34810316,  5.13415623],
 [ 8.67494727,  4.47573059],
 [ 9.17748385,  5.09283177],
 [10.24028948,  2.45544401],
 [ 8.68937095,  1.48709629],
 [ 8.92229526, -0.63993225],
 [ 9.49123469,  4.33224792],
 [ 9.25694192,  5.13284858],
 [ 7.99815287,  4.8525051 ],
 [ 8.18378052,  1.29564214],
 [ 8.7337095,   2.49162431],
 [ 9.32298256,  5.09840649],
 [10.06393839,  0.99078055],
 [ 9.50048972, -0.26430318],
 [ 8.34468785,  1.63824349],
 [ 9.50169345,  1.93824624],
 [ 9.15072323,  5.49832246],
 [11.563957,    1.3389402 ]]

[[ 9.96346605, 4.59676542],

[11.0329545, -0.16816717],

[11.54155807, 5.21116083],

[ 8.69289001, 1.54322016],

[ 8.1062269, 4.28695977],

[ 8.30988863, 4.80623966],

[11.93027136, 4.64866327],

[ 9.67284681, -0.20283165],

[ 8.34810316, 5.13415623],

[ 8.67494727, 4.47573059],

[ 9.17748385, 5.09283177],

[10.24028948, 2.45544401],

[ 8.68937095, 1.48709629],

[ 8.92229526, -0.63993225],

[ 9.49123469, 4.33224792],

[ 9.25694192, 5.13284858],

[ 7.99815287, 4.8525051 ],

[ 8.18378052, 1.29564214],

[ 8.7337095, 2.49162431],

[ 9.32298256, 5.09840649],

[10.06393839, 0.99078055],

[ 9.50048972, -0.26430318],

[ 8.34468785, 1.63824349],

[ 9.50169345, 1.93824624],

[ 9.15072323, 5.49832246],

[11.563957, 1.3389402 ]]

ターゲットデータ

26個のデータに対する2つのクラス(0, 1)が定められた1次元配列で、クラス0、1がそれぞれ13個ずつとなっている。

[1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0]

1	[1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0]

データ総括

特徴量とクラス分類の組を、データ番号とともに整理すると以下の通り。

 0   9.9634661   4.5967654  1
 1  11.0329545  -0.1681672  0
 2  11.5415581   5.2111608  1
 3   8.6928900   1.5432202  0
 4   8.1062269   4.2869598  0
 5   8.3098886   4.8062397  1
 6  11.9302714   4.6486633  1
 7   9.6728468  -0.2028316  0
 8   8.3481032   5.1341562  1
 9   8.6749473   4.4757306  1
10   9.1774838   5.0928318  1
11  10.2402895   2.4554440  1
12   8.6893709   1.4870963  0
13   8.9222953  -0.6399323  0
14   9.4912347   4.3322479  1
15   9.2569419   5.1328486  1
16   7.9981529   4.8525051  1
17   8.1837805   1.2956421  0
18   8.7337095   2.4916243  0
19   9.3229826   5.0984065  1
20  10.0639384   0.9907806  0
21   9.5004897  -0.2643032  0
22   8.3446878   1.6382435  0
23   9.5016934   1.9382462  0
24   9.1507232   5.4983225  1
25  11.5639570   1.3389402  0

0 9.9634661 4.5967654 1

1 11.0329545 -0.1681672 0

2 11.5415581 5.2111608 1

3 8.6928900 1.5432202 0

4 8.1062269 4.2869598 0

5 8.3098886 4.8062397 1

6 11.9302714 4.6486633 1

7 9.6728468 -0.2028316 0

8 8.3481032 5.1341562 1

9 8.6749473 4.4757306 1

10 9.1774838 5.0928318 1

11 10.2402895 2.4554440 1

12 8.6893709 1.4870963 0

13 8.9222953 -0.6399323 0

14 9.4912347 4.3322479 1

15 9.2569419 5.1328486 1

16 7.9981529 4.8525051 1

17 8.1837805 1.2956421 0

18 8.7337095 2.4916243 0

19 9.3229826 5.0984065 1

20 10.0639384 0.9907806 0

21 9.5004897 -0.2643032 0

22 8.3446878 1.6382435 0

23 9.5016934 1.9382462 0

24 9.1507232 5.4983225 1

25 11.5639570 1.3389402 0

# Print the forge dataset as a table: row number, the two feature values,
# and the class label of each sample.
from mglearn.datasets import make_forge

X, y = make_forge()

# Xw is one row of X (two feature values); yw is the matching entry of y.
for n, (Xw, yw) in enumerate(zip(X, y)):
    print("{:2d}  {:10.7f}  {:10.7f}  {}".format(n, Xw[0], Xw[1], yw))

# Print the forge dataset as a table: row number, the two feature values,
# and the class label of each sample.
from mglearn.datasets import make_forge

X, y = make_forge()

# Xw is one row of X (two feature values); yw is the matching entry of y.
for n, (Xw, yw) in enumerate(zip(X, y)):
    # Two spaces between fields, matching the table shown above.
    print("{:2d}  {:10.7f}  {:10.7f}  {}".format(n, Xw[0], Xw[1], yw))

訓練データとテストデータの分割～train_test_split()

2020-03-14 / tau / コメントする

概要

scikit-learnのtrain_test_split()関数を使うと、与えたデータをいろいろな方法で訓練データとテストデータに切り分けてくれる。

import numpy as np
from sklearn.model_selection import train_test_split

x = np.arange(1, 13)  # the integers 1 through 12
print(x)
# [ 1  2  3  4  5  6  7  8  9 10 11 12]

print(train_test_split(x))  # result is a list of two arrays (output below)
# [array([ 7,  2, 12,  5,  3,  9, 11,  8, 10]), array([1, 6, 4])]

x_train, x_test = train_test_split(x)  # unpack into training and test parts

print("x_train:{}".format(x_train))
print("x_test :{}".format(x_test))
# x_train:[ 6  1 12  7  3  2 11  5  4]
# x_test :[ 8  9 10]

import numpy as np

from sklearn.model_selection import train_test_split

x = np.arange(1, 13)

print(x)

# [ 1 2 3 4 5 6 7 8 9 10 11 12]

print(train_test_split(x))

# [array([ 7, 2, 12, 5, 3, 9, 11, 8, 10]), array([1, 6, 4])]

x_train, x_test = train_test_split(x)

print("x_train:{}".format(x_train))

print("x_test :{}".format(x_test))

# x_train:[ 6 1 12 7 3 2 11 5 4]

# x_test :[ 8 9 10]

8行目で、train_test_split()に配列を与えた結果、それが2つの配列に分割されていることがわかる。

11行目では、その結果を訓練用、テスト用の配列として取得している。

デフォルトでtrain_test_split()は、テスト用データのサイズが与えた配列のサイズの0.25となるように配列を分割する（1つ目のサイズ：2つ目のサイズ＝3:1）。x_testのサイズが12×0.25=3、x_trainのサイズが9となっていることが確認できる。

乱数系列の固定

データの分割にあたって、要素の選択はtrain_test_split()の実行ごとにランダムに行われるが、random_stateパラメーターを指定することで固定できる。

import numpy as np
from sklearn.model_selection import train_test_split

x = np.arange(1, 13)

x_train, x_test = train_test_split(x, random_state=0)
print("x_train:{}".format(x_train))
print("x_test :{}".format(x_test))
# x_train:[11  3  9  2  8 10  4  1  6]
# x_test :[ 7 12  5]

x_train, x_test = train_test_split(x, random_state=0)
print("x_train:{}".format(x_train))
print("x_test :{}".format(x_test))
# x_train:[11  3  9  2  8 10  4  1  6]
# x_test :[ 7 12  5]

x_train, x_test = train_test_split(x, random_state=1)
print("x_train:{}".format(x_train))
print("x_test :{}".format(x_test))
# x_train:[11  2  7  1  8 12 10  9  6]
# x_test :[3 4 5]

x_train, x_test = train_test_split(x, random_state=1)
print("x_train:{}".format(x_train))
print("x_test :{}".format(x_test))
# x_train:[11  2  7  1  8 12 10  9  6]
# x_test :[3 4 5]

import numpy as np

from sklearn.model_selection import train_test_split

x = np.arange(1, 13)

x_train, x_test = train_test_split(x, random_state=0)

print("x_train:{}".format(x_train))

print("x_test :{}".format(x_test))

# x_train:[11 3 9 2 8 10 4 1 6]

# x_test :[ 7 12 5]

x_train, x_test = train_test_split(x, random_state=0)

print("x_train:{}".format(x_train))

print("x_test :{}".format(x_test))

# x_train:[11 3 9 2 8 10 4 1 6]

# x_test :[ 7 12 5]

x_train, x_test = train_test_split(x, random_state=1)

print("x_train:{}".format(x_train))

print("x_test :{}".format(x_test))

# x_train:[11 2 7 1 8 12 10 9 6]

# x_test :[3 4 5]

x_train, x_test = train_test_split(x, random_state=1)

print("x_train:{}".format(x_train))

print("x_test :{}".format(x_test))

# x_train:[11 2 7 1 8 12 10 9 6]

# x_test :[3 4 5]

データのサイズ

テストデータサイズの指定

テストデータのサイズはtest_sizeパラメーターで指定することができる。

以下の例では、テストデータの比率をデフォルトの0.25→0.3に変更しており、テストデータのサイズが4となっている（test_size=0.26としてもx_testのサイズが4になり、テストデータのサイズは切り上げで計算されている）。

比率によってデータサイズを指定する場合は0<test_size<1の実数で指定(0や1.0で指定するとエラー)

x_train, x_test = train_test_split(x, test_size=0.3, random_state=0)
print("x_train:{}".format(x_train))
print("x_test :{}".format(x_test))

# x_train:[ 3  9  2  8 10  4  1  6]
# x_test :[ 7 12  5 11]

x_train, x_test = train_test_split(x, test_size=0.3, random_state=0)

print("x_train:{}".format(x_train))

print("x_test :{}".format(x_test))

# x_train:[ 3 9 2 8 10 4 1 6]

# x_test :[ 7 12 5 11]

テストデータのサイズを比率ではなく実際のサイズ(要素数)で指定することもできる。その場合、test_sizeを1以上の整数で指定。

以下の例ではテストデータのサイズを4として指定している。

x_train, x_test = train_test_split(x, test_size=4, random_state=0)
print("x_train:{}".format(x_train))
print("x_test :{}".format(x_test))

# x_train:[ 3  9  2  8 10  4  1  6]
# x_test :[ 7 12  5 11]

x_train, x_test = train_test_split(x, test_size=4, random_state=0)

print("x_train:{}".format(x_train))

print("x_test :{}".format(x_test))

# x_train:[ 3 9 2 8 10 4 1 6]

# x_test :[ 7 12 5 11]

訓練データサイズの指定

train_sizeパラメーターで訓練データのサイズを指定することもできる。

以下の例ではtrain_size=0.8とし、訓練データサイズが9となっている（訓練データサイズの計算は切り下げで行われている）。

x_train, x_test = train_test_split(x, train_size=0.8, random_state=0)
print("x_train:{}".format(x_train))
print("x_test :{}".format(x_test))

# x_train:[11  3  9  2  8 10  4  1  6]
# x_test :[ 7 12  5]

x_train, x_test = train_test_split(x, train_size=0.8, random_state=0)

print("x_train:{}".format(x_train))

print("x_test :{}".format(x_test))

# x_train:[11 3 9 2 8 10 4 1 6]

# x_test :[ 7 12 5]

訓練データサイズも要素数での指定が可能。

x_train, x_test = train_test_split(x, train_size=10, random_state=0)
print("x_train:{}".format(x_train))
print("x_test :{}".format(x_test))

# x_train:[ 5 11  3  9  2  8 10  4  1  6]
# x_test :[ 7 12]

x_train, x_test = train_test_split(x, train_size=10, random_state=0)

print("x_train:{}".format(x_train))

print("x_test :{}".format(x_test))

# x_train:[ 5 11 3 9 2 8 10 4 1 6]

# x_test :[ 7 12]

データ選択の内部手続

ここで、random_state=0としてtest_sizeやtrain_sizeを変化させたとき、テストデータの要素が現れる順番は変わらないということに気づいた。

x_train, x_test = train_test_split(x, test_size=0.2, random_state=0)
# x_train:[11  3  9  2  8 10  4  1  6]
# x_test :[ 7 12  5]

x_train, x_test = train_test_split(x, test_size=0.3, random_state=0)
# x_train:[ 3  9  2  8 10  4  1  6]
# x_test :[ 7 12  5 11]

x_train, x_test = train_test_split(x, test_size=0.4, random_state=0)
# x_train:[ 9  2  8 10  4  1  6]
# x_test :[ 7 12  5 11  3]

x_train, x_test = train_test_split(x, train_size=10, random_state=0)
# x_train:[ 5 11  3  9  2  8 10  4  1  6]
# x_test :[ 7 12]

x_train, x_test = train_test_split(x, train_size=9, random_state=0)
# x_train:[11  3  9  2  8 10  4  1  6]
# x_test :[ 7 12  5]

x_train, x_test = train_test_split(x, train_size=8, random_state=0)
# x_train:[ 3  9  2  8 10  4  1  6]
# x_test :[ 7 12  5 11]

x_train, x_test = train_test_split(x, test_size=0.2, random_state=0)

# x_train:[11 3 9 2 8 10 4 1 6]

# x_test :[ 7 12 5]

x_train, x_test = train_test_split(x, test_size=0.3, random_state=0)

# x_train:[ 3 9 2 8 10 4 1 6]

# x_test :[ 7 12 5 11]

x_train, x_test = train_test_split(x, test_size=0.4, random_state=0)

# x_train:[ 9 2 8 10 4 1 6]

# x_test :[ 7 12 5 11 3]

x_train, x_test = train_test_split(x, train_size=10, random_state=0)

# x_train:[ 5 11 3 9 2 8 10 4 1 6]

# x_test :[ 7 12]

x_train, x_test = train_test_split(x, train_size=9, random_state=0)

# x_train:[11 3 9 2 8 10 4 1 6]

# x_test :[ 7 12 5]

x_train, x_test = train_test_split(x, train_size=8, random_state=0)

# x_train:[ 3 9 2 8 10 4 1 6]

# x_test :[ 7 12 5 11]

test_size/train_sizeのどちらで指定しても、また比率／要素数の何れで指定しても、常にテストデータの要素は7, 12, 5,…の順番で現れている。

これに対して訓練データの方は、テストデータの要素数が変わると変化するが、テストデータの結果が同じなら訓練データのパターンも同じ。

すなわちtrain_test_split()のサイズ指定は、どのように指定しても一旦テストデータの要素数に変換し、共通の手順でテストデータを選んでいっていると考えられる。

複数データの同時分割

train_test_split()は複数データを同時に分割することもできる。

以下の例では、二つの配列を引数として与えている。その結果は、与えた配列ごとに訓練データ、テストデータの順に並んだリストとして返される。

import numpy as np
from sklearn.model_selection import train_test_split

x = np.arange(1, 9)
y = np.arange(11, 19)

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

print("x_train:{}".format(x_train))
print("x_test :{}".format(x_test))
print("y_train:{}".format(y_train))
print("y_test :{}".format(y_test))

# x_train:[2 8 4 1 6 5]
# x_test :[7 3]
# y_train:[12 18 14 11 16 15]
# y_test :[17 13]

import numpy as np

from sklearn.model_selection import train_test_split

x = np.arange(1, 9)

y = np.arange(11, 19)

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

print("x_train:{}".format(x_train))

print("x_test :{}".format(x_test))

print("y_train:{}".format(y_train))

print("y_test :{}".format(y_test))

# x_train:[2 8 4 1 6 5]

# x_test :[7 3]

# y_train:[12 18 14 11 16 15]

# y_test :[17 13]

これが一般的な使い方で、複数の特徴量に関する個体のデータセットと各個体のクラスに関するデータを、同時に訓練データとテストデータに分割するときに用いられる。

import numpy as np
from sklearn.model_selection import train_test_split

x = np.vstack((np.arange(1, 11), np.arange(11, 21))).T
print("original x:\n{}".format(x))

y = np.arange(21, 31)
print("original y:{}".format(y))

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

print("x_train:\n{}".format(x_train))
print("x_test :\n{}".format(x_test))
print("y_train:{}".format(y_train))
print("y_test :{}".format(y_test))

import numpy as np

from sklearn.model_selection import train_test_split

x = np.vstack((np.arange(1, 11), np.arange(11, 21))).T

print("original x:\n{}".format(x))

y = np.arange(21, 31)

print("original y:{}".format(y))

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

print("x_train:\n{}".format(x_train))

print("x_test :\n{}".format(x_test))

print("y_train:{}".format(y_train))

print("y_test :{}".format(y_test))

元のデータは

original x:
[[ 1 11]
 [ 2 12]
 [ 3 13]
 [ 4 14]
 [ 5 15]
 [ 6 16]
 [ 7 17]
 [ 8 18]
 [ 9 19]
 [10 20]]
original y:[21 22 23 24 25 26 27 28 29 30]

original x:

[[ 1 11]

[ 2 12]

[ 3 13]

[ 4 14]

[ 5 15]

[ 6 16]

[ 7 17]

[ 8 18]

[ 9 19]

[10 20]]

original y:[21 22 23 24 25 26 27 28 29 30]

これを訓練データとテストデータに分割した結果は

x_train:
[[10 20]
 [ 2 12]
 [ 7 17]
 [ 8 18]
 [ 4 14]
 [ 1 11]
 [ 6 16]]
x_test :
[[ 3 13]
 [ 9 19]
 [ 5 15]]
y_train:[30 22 27 28 24 21 26]
y_test :[23 29 25]

x_train:

[[10 20]

[ 2 12]

[ 7 17]

[ 8 18]

[ 4 14]

[ 1 11]

[ 6 16]]

x_test :

[[ 3 13]

[ 9 19]

[ 5 15]]

y_train:[30 22 27 28 24 21 26]

y_test :[23 29 25]

`stratify`による層化(相似化)

train_test_split()による要素の選択はランダムに行われる。この場合、クラス分類のパターンが、元データ、訓練データ、テストデータで異なってくる。

以下の例では、元のデータの0と1の比率が3:5だが、訓練データでは1:4、テストデータでは2:1になっている。ケースによっては特定のクラスが極端に少ない／存在しないということも起こり得る。

import numpy as np
from sklearn.model_selection import train_test_split

y = np.array([0, 0, 0, 1, 1, 1, 1, 1])

y_train, y_test = train_test_split(y, test_size=3, random_state=0)
print("y_train:{}".format(y_train))
print("y_test :{}".format(y_test))

# y_train:[1 1 0 1 1]
# y_test :[1 0 0]

import numpy as np

from sklearn.model_selection import train_test_split

y = np.array([0, 0, 0, 1, 1, 1, 1, 1])

y_train, y_test = train_test_split(y, test_size=3, random_state=0)

print("y_train:{}".format(y_train))

print("y_test :{}".format(y_test))

# y_train:[1 1 0 1 1]

# y_test :[1 0 0]

そこで、stratifyパラメーターで配列を指定すると、その配列でのパターンと同じになるように訓練データとテストデータを分割してくれる。

以下の例では、先の配列を元の配列の0/1のパターンと相似になるように分割している。

y_train, y_test = train_test_split(y, test_size=3, stratify=y, random_state=0)
print("y_train:{}".format(y_train))
print("y_test :{}".format(y_test))

# y_train:[0 1 1 0 1]
# y_test :[1 1 0]

y_train, y_test = train_test_split(y, test_size=3, stratify=y, random_state=0)

print("y_train:{}".format(y_train))

print("y_test :{}".format(y_test))

# y_train:[0 1 1 0 1]

# y_test :[1 1 0]

次の例は、9個体の特徴量データxと各個体のクラス区分データyを、クラスの分布に沿って訓練データとテストデータに分割するイメージ。

import numpy as np
from sklearn.model_selection import train_test_split

x = np.array([10, 10, 10, 11, 11, 11, 11, 11, 11])
y = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1])

x_train, x_test, y_train, y_test =\
    train_test_split(x, y, test_size=3, stratify=y, random_state=0)
print("y_train:{}".format(y_train))
print("y_test :{}".format(y_test))
print("x_train:{}".format(x_train))
print("x_test :{}".format(x_test))

# y_train:[0 1 1 0 1 1]
# y_test :[1 1 0]
# x_train:[10 11 11 10 11 11]
# x_test :[11 11 10]

import numpy as np

from sklearn.model_selection import train_test_split

x = np.array([10, 10, 10, 11, 11, 11, 11, 11, 11])

y = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1])

x_train, x_test, y_train, y_test =\
    train_test_split(x, y, test_size=3, stratify=y, random_state=0)

print("y_train:{}".format(y_train))

print("y_test :{}".format(y_test))

print("x_train:{}".format(x_train))

print("x_test :{}".format(x_test))

# y_train:[0 1 1 0 1 1]

# y_test :[1 1 0]

# x_train:[10 11 11 10 11 11]

# x_test :[11 11 10]

シャッフルの有無

デフォルトでtrain_test_split()は、データの分割にあたって要素の選択をランダムに行うが、shuffle=Falseを指定すると要素の順番を保持する。

import numpy as np
from sklearn.model_selection import train_test_split

x = np.arange(1, 13)

x_train, x_test = train_test_split(x, shuffle=False, random_state=0)

print("x_train:{}".format(x_train))
print("x_test :{}".format(x_test))

# x_train:[1 2 3 4 5 6 7 8 9]
# x_test :[10 11 12]

import numpy as np

from sklearn.model_selection import train_test_split

x = np.arange(1, 13)

x_train, x_test = train_test_split(x, shuffle=False, random_state=0)

print("x_train:{}".format(x_train))

print("x_test :{}".format(x_test))

# x_train:[1 2 3 4 5 6 7 8 9]

# x_test :[10 11 12]

matplotlib.pyplot.scatter – 散布図

2020-03-10 / tau / コメントする

概要

scatterはx座標とy座標のペアを与えて散布図を描く。

scatter(x, y, color/c=color, s=n, marker=marker, edgecolors=color): x、yは散布図の点の座標で、数値の場合は1点、配列の場合は複数の点を描く。color(またはc)とedgecolorsはmatplotlibのcolor指定。markerはmatplotlibのmarkers指定。sはマーカーのサイズ。

基本形

import numpy as np
import numpy.random as rnd
import matplotlib.pyplot as plt

rnd.seed(0)
x = rnd.random(50)
y = rnd.random(50)

fig, ax = plt.subplots(figsize=(4.8, 3.6))

ax.scatter(x, y, s=40, marker='o', color='aquamarine', edgecolors='black')

ax.set_aspect('equal')

plt.show()

import numpy as np

import numpy.random as rnd

import matplotlib.pyplot as plt

rnd.seed(0)

x = rnd.random(50)

y = rnd.random(50)

fig, ax = plt.subplots(figsize=(4.8, 3.6))

ax.scatter(x, y, s=40, marker='o', color='aquamarine', edgecolors='black')

ax.set_aspect('equal')

plt.show()

複数系列

複数系列の場合は、系列ごとにscatterを実行する。

import numpy as np
import numpy.random as rnd
import matplotlib.pyplot as plt

x1 = rnd.random(50) + 0.5
y1 = rnd.random(50) + 1

x2 = rnd.random(50) + 1
y2 = rnd.random(50) + 0.5

fig, ax = plt.subplots(figsize=(6.4, 4.8))

ax.scatter(x1, y1, marker='o', s=40, c='blue', alpha=0.5)
ax.scatter(x2, y2, marker='^', s=80, color='red', alpha=0.5)

ax.set_aspect('equal')

plt.show()

import numpy as np

import numpy.random as rnd

import matplotlib.pyplot as plt

x1 = rnd.random(50) + 0.5

y1 = rnd.random(50) + 1

x2 = rnd.random(50) + 1

y2 = rnd.random(50) + 0.5

fig, ax = plt.subplots(figsize=(6.4, 4.8))

ax.scatter(x1, y1, marker='o', s=40, c='blue', alpha=0.5)

ax.scatter(x2, y2, marker='^', s=80, color='red', alpha=0.5)

ax.set_aspect('equal')

plt.show()

概要

学習率曲線

概要

学習率曲線

概要

凡例の位置

標準的な位置指定

bboxによる位置指定～凡例の外側への設置

凡例の並べ方

デザイン等

凡例の文字サイズ

引数handles、labels

複数グラフの場合の凡例

基本

凡例

概要

データの取得とデータ構造

データの内容

'data'～特徴量データセット

'target'～診断結果に対応したコード

'target_names'～診断結果

'feature_names'～特徴名

'filename'～ファイル名

'DESCR'～データセットの説明

データの利用

データの取得方法

全レコードの特徴量データの取得

特定の特徴量のデータのみ取得

特定のクラスのデータのみ抽出

概要

利用方法

実行例

各種データに対する適用例

概要

ndarrayのメソッド

reshape()メソッド

flatten()メソッド

ravel()メソッド

Numpyの関数

numpy.reshape()関数

numpy.ravel()関数

戻り値のビューとコピーの違い

概要

利用方法

内容

特徴量データ

ターゲットデータ

データ総括

概要

乱数系列の固定

データのサイズ

テストデータサイズの指定

訓練データサイズの指定

データ選択の内部手続

複数データの同時分割

stratifyによる層化(相似化)

シャッフルの有無

概要

基本形

複数系列

引数`handles`、`labels`

`'data'`～特徴量データセット

`'target'`～診断結果に対応したコード

`'target_names'`～診断結果

`'feature_names'`～特徴名

`'filename'`～ファイル名

`'DESCR'`～データセットの説明

`reshape()`メソッド

`flatten()`メソッド

`ravel()`メソッド

`numpy.reshape()`関数

`numpy.ravel()`関数

`stratify`による層化(相似化)