共享單車預測
運用到的資料集如下,可自行下載。
capitalbike-dataset
資料預處理過程
類型變量轉變成one-hot編碼
數值變量進行標準化處理
import numpy as np
import pandas as pd
import torch
from torch.autograd import Variable
import torch.optim as optim
import matplotlib.pyplot as plt
# --- Data loading and preprocessing ---
# Read the dataset into memory; the capitalbike-Dataset folder is expected to
# sit next to this script.
data_path = 'capitalbike-Dataset/hour.csv'
rides = pd.read_csv(data_path)

# Categorical columns to expand into one-hot indicator columns.
dummy_fields = ['season', 'weathersit', 'mnth', 'hr', 'weekday']
for each in dummy_fields:
    # dtype=float keeps the indicator columns numeric: pandas >= 2.0 returns
    # bool dummies by default, which would later make `features.values` an
    # object array that torch.FloatTensor cannot consume.
    dummies = pd.get_dummies(rides[each], prefix=each, drop_first=False, dtype=float)
    rides = pd.concat([rides, dummies], axis=1)  # append dummies to the frame

# Drop unused columns plus the raw categorical columns now replaced by dummies.
fields_to_drop = ['instant', 'dteday', 'atemp', 'workingday',
                  'season', 'weathersit', 'mnth', 'hr', 'weekday']
data = rides.drop(fields_to_drop, axis=1)

# Standardize the continuous columns, keeping (mean, std) per column so that
# predictions can be mapped back to the original scale later.
# NOTE(review): statistics are computed on the FULL dataset (train + test),
# which leaks test information into the scaling — acceptable for a tutorial,
# but compute them on the training split only for real work.
quant_features = ['cnt', 'temp', 'hum', 'windspeed']
scaled_features = {}
for each in quant_features:
    mean, std = data[each].mean(), data[each].std()
    scaled_features[each] = [mean, std]
    data.loc[:, each] = (data[each] - mean) / std
資料集劃分:取後21天資料(21x24)作為測試集
# --- Train/test split ---
# The last 21 days (21 * 24 hourly rows) are held out as the test set.
test_data = data[-21 * 24:]
train_data = data[:-21 * 24]

# Target columns: the total count plus its casual/registered breakdown.
target_fields = ['cnt', 'casual', 'registered']

# Split each partition into feature columns and target columns.
features, target = train_data.drop(target_fields, axis=1), train_data[target_fields]
test_features, test_target = test_data.drop(target_fields, axis=1), test_data[target_fields]

# Convert to numpy arrays. Cast the features to float as well: if the one-hot
# columns are bool, `features.values` is an object array that torch.FloatTensor
# would reject (the original only cast `y`).
x = features.values.astype(float)
y = target['cnt'].values.astype(float)
y = np.reshape(y, [len(y), 1])  # column vector, matching the net's output shape
建構神經網絡
# --- Model definition ---
# A small fully connected regressor: one input unit per feature column,
# 10 sigmoid hidden units, a single linear output.
input_size = features.shape[1]
hidden_size = 10
output_size = 1
batch_size = 128  # mini-batch size consumed by the training loop below

layers = [
    torch.nn.Linear(input_size, hidden_size),
    torch.nn.Sigmoid(),
    torch.nn.Linear(hidden_size, output_size),
]
neu = torch.nn.Sequential(*layers)

# Mean squared error against the standardized counts, optimized with plain SGD.
cost = torch.nn.MSELoss()
optimizer = optim.SGD(neu.parameters(), lr=0.01)
将訓練資料集采用批處理模式(batch size)進行訓練
# --- Training loop (mini-batch SGD) ---
# Train for 1000 epochs; each epoch sweeps the training set in batches of
# `batch_size` samples. Every 100 epochs, record and print the mean batch loss.
losses = []
for i in range(1000):
    batch_loss = []
    # Slicing past the end of a numpy array is safe, so no explicit end clamp
    # is needed; the final batch is simply shorter.
    for start in range(0, len(x), batch_size):
        # The Variable wrapper is deprecated since PyTorch 0.4 — plain tensors
        # carry autograd state themselves.
        xx = torch.FloatTensor(x[start:start + batch_size])
        yy = torch.FloatTensor(y[start:start + batch_size])
        predict = neu(xx)
        loss = cost(predict, yy)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # loss.item() replaces the obsolete loss.data.numpy() idiom and avoids
        # holding tensor references across batches.
        batch_loss.append(loss.item())
    if i % 100 == 0:
        losses.append(np.mean(batch_loss))
        print(i, np.mean(batch_loss))

# Plot the recorded mean losses against their epoch index.
plt.plot(np.arange(len(losses)) * 100, losses)
plt.xlabel('epoch')
plt.ylabel('mse')
loss輸出結果:
0 0.9080135
100 0.2709566
200 0.24481183
300 0.1912091
400 0.12836137
500 0.090683505
600 0.078006424
700 0.072939925
800 0.06986246
900 0.06753944
結果圖如下
測試神經網絡
將訓練好的神經網絡在測試集上進行預測,並將21天的預測資訊與真實資料進行比較
# --- Evaluate on the held-out test set ---
# Run the trained network on the test features and compare the de-standardized
# predictions with the ground truth for the final 21 days.
target = test_target['cnt']
target = target.values.reshape([len(target), 1])
target = target.astype(float)

# Plain tensors instead of the deprecated Variable wrapper; astype(float)
# guards against object-dtype feature arrays (bool one-hot columns).
x = torch.FloatTensor(test_features.values.astype(float))
y = torch.FloatTensor(target)  # kept for parity with the training variables
# no_grad: inference only, skip building the autograd graph.
with torch.no_grad():
    predict = neu(x).numpy()

# Undo the standardization of 'cnt' so the plot shows real ride counts.
fig, ax = plt.subplots(figsize=(10, 7))
mean, std = scaled_features['cnt']
ax.plot(predict * std + mean, label='prediction', linestyle='--')
ax.plot(target * std + mean, label='data')
ax.legend()
ax.set_xlabel('date_time')
ax.set_ylabel('counts')
plt.show()
比較圖如下