Table of Contents
- 1. Basic Concepts
  - 1.1 Learning Curves
  - 1.2 L1/L2 Norms
  - 1.3 Regularization
- 2. Linear Regression
  - 2.1 Fitting a Sine Function with Linear Regression
  - 2.2 Predicting House Prices with Linear Regression
- 3. Logistic Regression
  - 3.1 The Logistic Regression Cost Function
  - 3.2 Logistic Regression in Practice: Cancer Prediction
1. Basic Concepts
1.1 Learning Curves
A learning curve shows the relationship between model accuracy and the size of the training set: as the training set grows, it tracks how accurately the model fits the training data and how accurately it predicts on the cross-validation set.
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
n_dots = 200
X = np.linspace(0, 1, n_dots)
y = np.sqrt(X) + 0.2*np.random.rand(n_dots) - 0.1;
X = X.reshape(-1, 1)
y = y.reshape(-1, 1)
X.shape, y.shape
((200, 1), (200, 1))
# plot the data
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.plot(X, y, color='tab:blue')

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
# degree sets the order of the polynomial
def polynomial_model(degree=1):
    polynomial_features = PolynomialFeatures(degree=degree,
                                             include_bias=False)
    linear_regression = LinearRegression()
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", linear_regression)])
    return pipeline
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

# train_sizes specifies how the number of training samples grows;
# np.linspace(.1, 1.0, 5) splits 10%~100% of the training set into five equal steps
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.
    title : string
        Title for the chart.
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.
    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum y-values plotted.
    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds.
          - An object to be used as a cross-validation generator.
          - An iterable yielding train/test splits.
        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` is used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.
        Refer to the :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.
    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    # learning_curve gradually increases the number of training samples according to
    # train_sizes and returns the model's scores at each training-set size
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    print("train_sizes:{}, train_scores:{}, test_scores:{}\n\t".format(train_sizes.shape, train_scores.shape, test_scores.shape))
    # print("train_sizes:{}, train_scores:{}, test_scores:{}\n\t".format(train_sizes, train_scores, test_scores))
    # compute the mean and standard deviation of the scores
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    # plt.fill_between can shade a band of one standard deviation around the mean score
    # plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
    #                  train_scores_mean + train_scores_std, alpha=0.1,
    #                  color="r")
    # plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
    #                  test_scores_mean + test_scores_std, alpha=0.1, color="g")
    # plot the mean scores at each training-set size
    plt.plot(train_sizes, train_scores_mean, 'o--', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    # place the legend
    plt.legend(loc="best")
    return plt
# To smooth the learning curve, the cross-validation score is computed 10 times,
# each time re-sampling 20% of the data as the validation set
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
titles = ['Learning Curves (Under Fitting)',
          'Learning Curves',
          'Learning Curves (Over Fitting)']
degrees = [1, 3, 10]
plt.figure(figsize=(18, 4))
for i in range(len(degrees)):
    plt.subplot(1, 3, i + 1)
    plot_learning_curve(polynomial_model(degrees[i]), titles[i], X, y, ylim=(0.75, 1.01), cv=cv)
plt.show()
train_sizes:(5,), train_scores:(5, 10), test_scores:(5, 10)
train_sizes:(5,), train_scores:(5, 10), test_scores:(5, 10)
train_sizes:(5,), train_scores:(5, 10), test_scores:(5, 10)
Remedies for overfitting: collect more training data, or reduce the number of input features.
Remedies for underfitting: add informative features, or add polynomial features.
The F1 score combines precision and recall into one metric, defined as:
$$F_1\ Score = 2\frac{PR}{P+R}$$
where P is precision and R is recall, so a single number summarizes both. If either precision or recall is 0, the F1 score is 0.
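As a quick check on this formula, it agrees with sklearn's f1_score; the labels below are made-up values used only for illustration:

```python
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score

# made-up labels, purely to illustrate the formula
y_true = np.array([1, 1, 1, 0, 0, 1, 0, 1])
y_pred = np.array([1, 0, 1, 0, 1, 1, 0, 1])

P = precision_score(y_true, y_pred)   # P = TP / (TP + FP)
R = recall_score(y_true, y_pred)      # R = TP / (TP + FN)
print(2 * P * R / (P + R))            # F1 from the formula above
print(f1_score(y_true, y_pred))       # sklearn's implementation, same value
```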
1.2 L1/L2 Norms
def L1(x):
    return 1 - np.abs(x)

def L2(x):
    return np.sqrt(1 - np.power(x, 2))

def contour(v, x):
    return 5 - np.sqrt(v - np.power(x + 2, 2))  # 4x1^2 + 9x2^2 = v

def format_spines(title):
    ax = plt.gca()  # gca: get the current axes
    ax.spines['right'].set_color('none')  # hide the right and top spines
    ax.spines['top'].set_color('none')
    ax.xaxis.set_ticks_position('bottom')  # show ticks only on the bottom axis
    ax.spines['bottom'].set_position(('data', 0))  # move the bottom spine to y=0
    ax.yaxis.set_ticks_position('left')
    ax.spines['left'].set_position(('data', 0))  # move the left spine to x=0
    plt.title(title)
    plt.xlim(-4, 4)
    plt.ylim(-4, 4)

plt.figure(figsize=(8.4, 4), dpi=100)
x = np.linspace(-1, 1, 100)
cx = np.linspace(-3, 1, 100)
plt.subplot(1, 2, 1)
format_spines('L1 norm')
plt.plot(x, L1(x), 'r-', x, -L1(x), 'r-')
plt.plot(cx, contour(20, cx), 'r--', cx, contour(15, cx), 'r--', cx, contour(10, cx), 'r--')
plt.subplot(1, 2, 2)
format_spines('L2 norm')
plt.plot(x, L2(x), 'b-', x, -L2(x), 'b-')
plt.plot(cx, contour(19, cx), 'b--', cx, contour(15, cx), 'b--', cx, contour(10, cx), 'b--')
With the L1 norm as the regularization term, the model parameters become sparse: as many entries of the parameter vector as possible are driven to 0, so that only the contributions of the important features are kept.
With the L2 norm as the regularization term, the parameters are kept as small as possible but not exactly 0, so every feature retains a small contribution to the prediction.
As a corollary, the L1 norm as a regularizer has the following uses:
- Feature selection: it drives many entries of the parameter vector to 0, which removes the features that have little influence on the prediction and simplifies the problem. In this sense, L1 regularization addresses overfitting by reducing the number of features.
- Interpretability: once the parameter vector is sparse, only the features with a real influence on the prediction remain, which makes the model's cause-and-effect structure easier to explain. For example, when screening for a certain cancer with 100 features, it is hard to tell which features actually drive a positive result; after sparsification only a few key features are left and the causal picture becomes clear.
L1 regularization is therefore more of an analysis tool than a way to actually solve the model, since it simply removes the unimportant features. In most cases, the L2 norm is the better choice of regularizer for dealing with overfitting.
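A minimal sketch of this sparsity effect, using sklearn's Lasso (L1-regularized) and Ridge (L2-regularized) linear models on synthetic data; the dataset shape and alpha values are assumptions chosen only for illustration:

```python
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Lasso, Ridge

# synthetic data: 50 features, only 5 of which are actually informative
X_demo, y_demo = make_regression(n_samples=200, n_features=50, n_informative=5,
                                 noise=5.0, random_state=0)

lasso = Lasso(alpha=1.0).fit(X_demo, y_demo)   # L1-regularized linear regression
ridge = Ridge(alpha=1.0).fit(X_demo, y_demo)   # L2-regularized linear regression

print('non-zero coefficients with L1 (Lasso):', np.count_nonzero(lasso.coef_))
print('non-zero coefficients with L2 (Ridge):', np.count_nonzero(ridge.coef_))
```

Lasso typically leaves most of the 50 coefficients at exactly zero, while Ridge keeps all of them small but non-zero.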
1.3 Regularization
- Regularized cost function for a linear model:
$$J(\theta)=\frac{1}{2m}\left[\sum^{m}_{i=1}(h_{\theta}(x^{(i)})-y^{(i)})^{2}\right]+\lambda\sum^{n}_{j=1}\theta_{j}^{2}$$
Analysis:
The first half of the function is the cost function of plain linear regression, and the second half is the regularization term. With the regularization term added, the cost is no longer determined solely by the error between predictions and true values; it also depends on the magnitude of the parameters $\theta$. This constrains $\theta$: a particular choice of $\theta$ might make the prediction error $(h_{\theta}(x^{(i)})-y^{(i)})^{2}$ very small, but if the $\theta$ values themselves are large, the total cost still comes out large.
The essential idea is to keep any single parameter $\theta$ from carrying too much weight: when a few parameters dominate, the contributions of the others never show up, and the extra parameter freedom leads to overfitting. When all parameters carry only small, comparable weights, the model fits the problem well without redundant parameters, and overfitting is avoided. The coefficient $\lambda$ tunes the weight of the regularization term and thereby keeps linear regression from overfitting. (A short sklearn sketch illustrating both regularized models follows the logistic-regression case below.)
- Regularized cost function for logistic regression:
$$J(\theta)=-\frac{1}{m}\left[\sum^{m}_{i=1}y^{(i)}\log(h_{\theta}(x^{(i)}))+(1-y^{(i)})\log(1-h_{\theta}(x^{(i)}))\right]+\frac{\lambda}{2m}\sum^{n}_{j=1}\theta_{j}^{2}$$
Again, the approach is simply to add a regularization term to the original loss function.
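Both regularized cost functions are available directly in scikit-learn: Ridge implements the L2-penalized linear regression above (its alpha argument plays the role of $\lambda$), and LogisticRegression exposes the penalty weight through C, which acts roughly as the inverse of $\lambda$. A minimal sketch on synthetic data (the datasets and parameter values are illustrative assumptions):

```python
from sklearn.datasets import make_regression, make_classification
from sklearn.linear_model import Ridge, LogisticRegression

# synthetic datasets, for illustration only
X_reg, y_reg = make_regression(n_samples=200, n_features=20, noise=10.0, random_state=0)
X_clf, y_clf = make_classification(n_samples=200, n_features=20, random_state=0)

# Ridge implements the L2-regularized linear cost above; larger alpha = stronger penalty
for alpha in [0.01, 1.0, 100.0]:
    ridge = Ridge(alpha=alpha).fit(X_reg, y_reg)
    print('alpha={}: train R^2 = {:.4f}'.format(alpha, ridge.score(X_reg, y_reg)))

# LogisticRegression's C is the inverse of the regularization strength (roughly 1/lambda)
for C in [0.01, 1.0, 100.0]:
    clf = LogisticRegression(C=C, solver='liblinear').fit(X_clf, y_clf)
    print('C={}: train accuracy = {:.4f}'.format(C, clf.score(X_clf, y_clf)))
```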
2. Linear Regression
2.1 Fitting a Sine Function with Linear Regression
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
n_dots = 200
# add some random noise
X = np.linspace(-2 * np.pi, 2 * np.pi, n_dots)
Y = np.sin(X) + 0.2 * np.random.rand(n_dots) - 0.1
X = X.reshape(-1, 1)
Y = Y.reshape(-1, 1);
X.shape, Y.shape
((200, 1), (200, 1))
Plot the noisy curve
# fig = plt.subplots(1,1,1)
# plt.plot(X,Y)
# plot the data
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.plot(X, Y, color='tab:blue')
Use a pipeline to chain the two classes together
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
# degree sets the order of the polynomial
def polynomial_model(degree=1):
    # polynomial feature expansion
    polynomial_features = PolynomialFeatures(degree=degree,
                                             include_bias=False)
    # normalize=True rescales the training features before fitting
    # (note: this argument was removed in newer scikit-learn releases; use a StandardScaler step instead)
    linear_regression = LinearRegression(normalize=True)
    # chain the polynomial features and the linear regression model in a pipeline
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", linear_regression)])
    return pipeline
Fit the dataset with polynomials of degree 2, 3, 5, and 10
from sklearn.metrics import mean_squared_error
degrees = [2, 3, 5, 10]
results = []
for d in degrees:
    # build the model: polynomial features followed by linear regression
    model = polynomial_model(degree=d)
    # train
    model.fit(X, Y)
    # evaluate on the training set
    train_score = model.score(X, Y)
    mse = mean_squared_error(Y, model.predict(X))
    results.append({"model": model, "degree": d, "score": train_score, "mse": mse})
for r in results:
    print("degree: {}; train score: {}; mean squared error: {}".format(r["degree"], r["score"], r["mse"]))
degree: 2; train score: 0.1491389252670503; mean squared error: 0.4291288571778199
degree: 3; train score: 0.2754424056436836; mean squared error: 0.3654281311707956
degree: 5; train score: 0.8932535122798162; mean squared error: 0.05383722401155313
degree: 10; train score: 0.9936267162822405; mean squared error: 0.0032143437271831064
from matplotlib.figure import SubplotParams
# adjust the figure size; SubplotParams sets the vertical spacing between subplots
plt.figure(figsize=(10, 5), dpi=100, subplotpars=SubplotParams(hspace=0.3))
for i, r in enumerate(results):
    fig = plt.subplot(2, 2, i + 1)
    plt.xlim(-8, 8)
    plt.title("LinearRegression degree={}".format(r["degree"]))
    plt.scatter(X, Y, s=5, c='b', alpha=0.5)
    plt.plot(X, r["model"].predict(X), 'r-')
The degree-10 polynomial fits the sine function well only within the training interval; outside that interval the fit breaks down, as shown below.
n_dots = 500
# x = np.linspace(-20, 20, n_dots)
x = np.linspace(-2.5 * np.pi, 2.5 * np.pi, n_dots)
x = x.reshape(-1,1)
y = results[-1]['model'].predict(x)
x.shape, y.shape
((500, 1), (500, 1))
fig = plt.subplot(1,1,1)
plt.xlim(-20, 20)
plt.plot(x,y)
Looking at the whole range, every model predicts well only on part of the interval, i.e. the higher-degree fits have overfit; reducing the polynomial degree gives better behavior over a longer interval.
n_dots = 500
# x = np.linspace(-20, 20, n_dots)
x = np.linspace(-2.5 * np.pi, 2.5 * np.pi, n_dots)
x = x.reshape(-1,1)
plt.figure(figsize=(8, 5), dpi=100, subplotpars=SubplotParams(hspace=0.3))
for i in range(4):
    y = results[i]['model'].predict(x)
    plt.subplot(2, 2, i + 1)
    plt.xlim(-20, 20)
    plt.plot(x, y)
2.2 Predicting House Prices with Linear Regression
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_boston
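# note: load_boston was removed in scikit-learn 1.2; with a recent sklearn this import fails
# and the dataset has to be obtained from another source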
boston = load_boston()
X = boston.data
y = boston.target
X.shape, y.shape
((506, 13), (506,))
Inspect the feature names
boston.feature_names
array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')
Split the dataset into training and test sets with an 8:2 ratio
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)
import time
from sklearn.linear_model import LinearRegression
model = LinearRegression()
# record the start time (time.clock was removed in Python 3.8; use time.perf_counter instead)
start = time.perf_counter()
# fit the model on the training set
model.fit(X_train, y_train)
# compute the scores
train_score = model.score(X_train, y_train)
cv_score = model.score(X_test, y_test)
print('elaspe: {0:.6f}; train_score: {1:0.6f}; cv_score: {2:.6f}'.format(time.perf_counter()-start, train_score, cv_score))
elaspe: 0.001749; train_score: 0.723941; cv_score: 0.795262
Neither the training-set score nor the test-set score is very high here, which looks like slight underfitting: the plain linear regression model is too simple. We can either mine more features or add polynomial features.
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
# set the polynomial degree
def polynomial_model(degree=1):
    # polynomial feature expansion
    polynomial_features = PolynomialFeatures(degree=degree,
                                             include_bias=False)
    # normalize=True rescales the training features before fitting
    # (this argument was removed in newer scikit-learn releases; use a StandardScaler step instead)
    linear_regression = LinearRegression(normalize=True)
    # chain the polynomial features and the linear regression model in a pipeline
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", linear_regression)])
    return pipeline
# fit with a degree-2 polynomial
model = polynomial_model(degree=2)
start = time.perf_counter()
model.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
cv_score = model.score(X_test, y_test)
print('elaspe: {0:.6f}; train_score: {1:0.6f}; cv_score: {2:.6f}'.format(time.perf_counter()-start, train_score, cv_score))
elaspe: 0.735317; train_score: 0.930547; cv_score: 0.860049
As we can see, the degree-2 polynomial performs much better than the degree-1 model.
A question to consider: there are 13 input features; going from a degree-1 to a degree-2 polynomial, how many input features are added?
Reference answer: the degree-2 polynomial has 13 single features, $C^{2}_{13}=78$ pairwise product features, and 13 squared features, 104 features in total. That is 91 more than the 13 features of the degree-1 model.
Based on this reference answer, the prediction function for the degree-1 (linear) fit can be written as:
$$h_{\theta}(x) = \theta_{0}+\theta_{1}x_{1}+\theta_{2}x_{2}+\dots+\theta_{n}x_{n}$$
and the prediction function for the degree-2 polynomial fit can be written as:
$$h_{\theta}(x) = \theta_{0}+\theta_{1}x_{1}+\theta_{2}x_{2}+\dots+\theta_{n}x_{n}+\theta_{n+1}x_{1}^{2}+\theta_{n+2}x_{2}^{2}+\dots+\theta_{2n}x_{n}^{2}$$
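As a quick sanity check on that count, PolynomialFeatures reports how many features it generates for 13 inputs; a small standalone sketch (the zero-filled array exists only so the transformer can be fitted):

```python
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

dummy = np.zeros((1, 13))  # 13 input features, as in the house-price data
for degree in [1, 2]:
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    n_out = poly.fit_transform(dummy).shape[1]
    print('degree={}: {} features'.format(degree, n_out))  # prints 13, then 104
```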
Plot the learning curves
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
# train_sizes specifies how the number of training samples grows;
# np.linspace(.1, 1.0, 5) splits 10%~100% of the training set into five equal steps
def plot_learning_curve(plt, estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o--', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
plt.figure(figsize=(18, 4))
title = 'Learning Curves (degree={0})'
degrees = [1, 2, 3]
start = time.perf_counter()
plt.figure(figsize=(18, 4), dpi=200)
for i in range(len(degrees)):
    plt.subplot(1, 3, i + 1)
    plot_learning_curve(plt, polynomial_model(degrees[i]), title.format(degrees[i]), X, y, ylim=(0.01, 1.01), cv=cv)
print('elaspe: {0:.6f}'.format(time.perf_counter()-start))
elaspe: 6.394253
Comparing the predictions of the degree-2 model:
import pandas as pd
result = pd.DataFrame()
pred = model.predict(X_test)[:10]
true = y_test[:10]
result['pred'] = pred
result['true'] = true
result['error'] = pred - true
result.head(10)
| | pred | true | error |
|---|---|---|---|
| 0 | 43.036887 | 44.8 | -1.763113 |
| 1 | 18.716989 | 17.1 | 1.616989 |
| 2 | 12.138758 | 17.8 | -5.661242 |
| 3 | 32.128343 | 33.1 | -0.971657 |
| 4 | 20.503857 | 21.9 | -1.396143 |
| 5 | 22.349194 | 21.0 | 1.349194 |
| 6 | 14.668524 | 18.4 | -3.731476 |
| 7 | 9.946368 | 10.4 | -0.453632 |
| 8 | 21.741542 | 23.1 | -1.358458 |
| 9 | 15.891000 | 20.0 | -4.109000 |
- Batch Gradient Descent
Each parameter update has to traverse the entire training set, so the algorithm becomes inefficient when the training set is large.
- Stochastic Gradient Descent
The key change is to drop the summation: instead of traversing the whole training set, each parameter update uses a single example drawn at random from the training set, which greatly improves training efficiency.
Question: why is it valid to update the parameters from a single randomly chosen sample?
$$J(\theta) = \frac{1}{2m}\sum^{m}_{i=1}(h(x^{(i)})-y^{(i)})^{2}$$
Dividing the sum by 2 is just for computational convenience; dividing by m takes the average, i.e. the average distance from all training points to the prediction function. In stochastic gradient descent a single example is drawn at random from the training set each time; if the number of updates is large enough and the sampling is truly random, then from a probability standpoint these randomly drawn examples behave, on average, like that mean.
As an analogy, suppose a piggy bank holds ten 1-jiao coins, two 5-jiao coins and one 1-yuan coin: 13 coins worth 3 yuan in total. Draw a coin at random 1000 times, recording its value and putting it back each time. The average of the 1000 recorded values (their sum divided by 1000) will be approximately equal to the average value of a coin in the bank (3/13 yuan).
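A minimal sketch of the contrast, taking sklearn's SGDRegressor as the stochastic variant and LinearRegression as the batch (closed-form least-squares) solution; the synthetic data is an assumption used only for illustration:

```python
import numpy as np
from sklearn.linear_model import LinearRegression, SGDRegressor

# synthetic data, for illustration only
rng = np.random.RandomState(0)
X_demo = rng.rand(1000, 3)
y_demo = X_demo @ np.array([2.0, -1.0, 0.5]) + 0.1 * rng.randn(1000)

# batch solution: solves least squares using all samples at once
batch = LinearRegression().fit(X_demo, y_demo)

# stochastic gradient descent: parameters are updated from individual samples
sgd = SGDRegressor(max_iter=1000, tol=1e-6, random_state=0).fit(X_demo, y_demo)

print('batch coefficients:', batch.coef_)
print('SGD coefficients:  ', sgd.coef_)
```

Both should recover coefficients close to [2.0, -1.0, 0.5]; SGD gets there through sample-by-sample updates rather than by processing the whole training set for every step.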
3. Logistic Regression
3.1 The Logistic Regression Cost Function
def f_1(x):
    return -np.log(x)

def f_0(x):
    return -np.log(1 - x)

X = np.linspace(0.01, 0.99, 100)
f = [f_1, f_0]
titles = ["y=1: $-log(h_\\theta(x))$", "y=0: $-log(1 - h_\\theta(x))$"]
plt.figure(figsize=(12, 4))
for i in range(len(f)):
    plt.subplot(1, 2, i + 1)
    plt.title(titles[i])
    plt.xlabel("$h_\\theta(x)$")
    plt.ylabel("$Cost(h_\\theta(x), y)$")
    plt.plot(X, f[i](X), 'r-')
In short, for this binary classification problem: for a positive sample (y=1), the further the predicted value is from 0 (i.e. the closer to 1), the smaller the loss; for a negative sample (y=0), the further the predicted value is from 0, the larger the loss.
3.2 Logistic Regression in Practice: Cancer Prediction
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
# load the data
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target
print('X.shape:{0}, y.shape:{1}; no. positive: {2}; no. negative: {3}'.format(
X.shape, y.shape, y[y==1].shape[0], y[y==0].shape[0]))
X.shape:(569, 30), y.shape:(569,); no. positive: 357; no. negative: 212
cancer.feature_names
array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
'mean smoothness', 'mean compactness', 'mean concavity',
'mean concave points', 'mean symmetry', 'mean fractal dimension',
'radius error', 'texture error', 'perimeter error', 'area error',
'smoothness error', 'compactness error', 'concavity error',
'concave points error', 'symmetry error',
'fractal dimension error', 'worst radius', 'worst texture',
'worst perimeter', 'worst area', 'worst smoothness',
'worst compactness', 'worst concavity', 'worst concave points',
'worst symmetry', 'worst fractal dimension'], dtype='<U23')
Split the data into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
Train a logistic regression model on this data
# train the model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print('train score: {train_score:.6f}; test score: {test_score:.6f}'.format(
train_score=train_score, test_score=test_score))
train score: 0.953846; test score: 0.964912
For classification problems, the value returned by score in sklearn is the fraction of correctly predicted samples (accuracy).
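This is the same value sklearn's accuracy_score reports (a small cross-check, not part of the original notebook):

```python
from sklearn.metrics import accuracy_score

# for classifiers, score() returns accuracy: the fraction of correctly predicted samples
print(accuracy_score(y_test, model.predict(X_test)))  # same value as test_score above
```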
# predict on the test set
y_pred = model.predict(X_test)
print('matchs: {0}/{1}={2}'.format(np.equal(y_pred, y_test).sum(), y_test.shape[0], np.equal(y_pred, y_test).sum()/y_test.shape[0]))
matchs: 110/114=0.9649122807017544
Find the samples predicted with less than 90% confidence
# predicted probabilities: count the samples classified with less than 90% probability
y_pred_proba = model.predict_proba(X_test)
print('sample of predict probability: {0}, y_pred_proba.shape:{1}'.format(y_pred_proba[0], y_pred_proba.shape))
# first column: keep the samples whose predicted probability of being negative is above 0.1, and store them in result
y_pred_proba_0 = y_pred_proba[:, 0] > 0.1
result = y_pred_proba[y_pred_proba_0]
# within result, second column: mark the samples whose predicted probability of being positive is also above 0.1
y_pred_proba_1 = result[:, 1] > 0.1
print("y_pred_proba_1.shape:{}".format(y_pred_proba_1.shape))
sample of predict probability: [9.99999929e-01 7.14244099e-08], y_pred_proba.shape:(114, 2)
y_pred_proba_1.shape:(56,)
Now use logistic regression with degree-2 polynomial features and L1 regularization
import time
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
# add polynomial preprocessing
def polynomial_model(degree=1, **kwarg):
    polynomial_features = PolynomialFeatures(degree=degree,
                                             include_bias=False)
    logistic_regression = LogisticRegression(**kwarg)
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("logistic_regression", logistic_regression)])
    return pipeline
# build the model with degree-2 polynomial features and an L1 penalty
model = polynomial_model(degree=2, penalty='l1', solver='liblinear')
start = time.perf_counter()
model.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
cv_score = model.score(X_test, y_test)
print('elaspe: {0:.6f}; train_score: {1:0.6f}; cv_score: {2:.6f}'.format(
    time.perf_counter()-start, train_score, cv_score))
elaspe: 0.570833; train_score: 1.000000; cv_score: 0.956140
With the L1 norm as the regularization term, the parameters become sparse, which automatically selects the features that actually matter to the model. We can check how many features were not discarded, i.e. how many of the corresponding model parameters $\theta_{j}$ are non-zero.
logistic_regression = model.named_steps['logistic_regression']
print('model parameters shape: {0}; count of non-zero element: {1}'.format(
logistic_regression.coef_.shape,
np.count_nonzero(logistic_regression.coef_)))
model parameters shape: (1, 495); count of non-zero element: 93
The coef_ attribute of the logistic regression model holds the model parameters. The output shows that after adding degree-2 polynomial features, the number of input features grows from 30 to 495, and in the end most of them are discarded: only 93 effective features remain. The sparsifying effect of the L1 penalty does the heavy lifting here; in effect it performs feature dimensionality reduction.
Where do the 495 features come from? With a degree-1 polynomial we would just have the original 30 features; with degree 2 every pair of features is combined into a new feature, giving $C_{30}^{2}=435$ pairwise products, plus the 30 squared features, so in total $30+435+30=495$ features.
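The count can also be read off the fitted pipeline itself, since the powers_ attribute of PolynomialFeatures has one row per generated feature (the step name matches the Pipeline defined above):

```python
poly_step = model.named_steps['polynomial_features']
print('generated polynomial features:', poly_step.powers_.shape[0])  # 495 for 30 inputs at degree 2
```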
Learning curves
How do we know that using the L1 norm as the regularizer improves the algorithm's accuracy?
First, plot the learning curves of the degree-1 and degree-2 polynomial models with L1 regularization:
from utils import plot_learning_curve
from sklearn.model_selection import ShuffleSplit
# to smooth the learning curves, the cross-validation score is computed 10 times
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
title = 'Learning Curves (degree={0}, penalty={1})'
degrees = [1, 2]
penalty = 'l1'
start = time.perf_counter()
plt.figure(figsize=(12, 4), dpi=144)
for i in range(len(degrees)):
    plt.subplot(1, len(degrees), i + 1)
    plot_learning_curve(plt, polynomial_model(degree=degrees[i], penalty=penalty, solver='liblinear', max_iter=300),
                        title.format(degrees[i], penalty), X, y, ylim=(0.8, 1.01), cv=cv)
print('elaspe: {0:.6f}'.format(time.perf_counter()-start))
elaspe: 21.228479
Then plot the learning curves of the degree-1 and degree-2 polynomial models with L2 regularization:
import warnings
warnings.filterwarnings("ignore")
penalty = 'l2'
penalty = 'l2'
start = time.perf_counter()
plt.figure(figsize=(12, 4), dpi=144)
for i in range(len(degrees)):
    plt.subplot(1, len(degrees), i + 1)
    plot_learning_curve(plt, polynomial_model(degree=degrees[i], penalty=penalty, solver='lbfgs'),
                        title.format(degrees[i], penalty), X, y, ylim=(0.8, 1.01), cv=cv)
print('elaspe: {0:.6f}'.format(time.perf_counter()-start))
elaspe: 8.343727
The L1 learning curves take noticeably longer to produce. The reason is that scikit-learn's learning_curve() trains the model many times while building the curve and computes cross-validation scores at each training size; in addition, to make the curve smoother, each point is computed several times and averaged, which is exactly what the ShuffleSplit class is for.
Try fitting the model with a degree-3 polynomial:
# build the model with degree-3 polynomial features and an L1 penalty
model = polynomial_model(degree=3, penalty='l1', solver='liblinear')
X_train.shape, X_test.shape, y_train.shape, y_test.shape
((455, 30), (114, 30), (455,), (114,))
model.fit(X_train, y_train)
test_score = model.score(X_test, y_test)
test_score
0.9649122807017544
logistic_regression2 = model.named_steps['logistic_regression']
print("use feature nums:{}\nall feature nums:{}".format(np.count_nonzero(logistic_regression2.coef_), logistic_regression2.coef_.shape[-1]))
use feature nums:1262
all feature nums:5455
Visualize the prediction results:
import pandas as pd
result = pd.DataFrame()
result['pred'] = model.predict(X_test)
result['true'] = y_test
result['parm'] = result['pred']==result['true']
result.head(10)
| | pred | true | parm |
|---|---|---|---|
| 0 | 0 | 0 | True |
| 1 | 1 | 1 | True |
| 2 | 0 | 0 | True |
| 3 | 1 | 1 | True |
| 4 | 0 | 0 | True |
| 5 | 1 | 1 | True |
| 6 | 1 | 0 | False |
| 7 | 1 | 1 | True |
| 8 | 0 | 0 | True |
| 9 | 0 | 0 | True |