天天看點

深度學習與PyTorch實戰——共享單車預測

共享單車預測

運用到的資料集如下，可自行下載。

capitalbike-Dataset

資料預處理過程

類型變量轉變成one-hot編碼

數值變量進行標準化處理

import numpy as np
import pandas as pd
import torch
from torch.autograd import Variable
import torch.optim as optim
import matplotlib.pyplot as plt

# Load the Capital Bikeshare hourly data; place the capitalbike-Dataset
# folder next to this script.
data_path = 'capitalbike-Dataset/hour.csv'
rides = pd.read_csv(data_path)

# Categorical columns: expand each into one-hot indicator columns and
# append them to the frame.
dummy_fields = ['season', 'weathersit', 'mnth', 'hr', 'weekday']
for field in dummy_fields:
    one_hot = pd.get_dummies(rides[field], prefix=field, drop_first=False)
    rides = pd.concat([rides, one_hot], axis=1)

# Drop identifiers, redundant columns, and the now-encoded raw categoricals.
fields_to_drop = ['instant', 'dteday', 'atemp', 'workingday', 'season', 'weathersit', 'mnth', 'hr', 'weekday']
data = rides.drop(fields_to_drop, axis=1)

# Standardize the numeric columns to zero mean / unit std; keep each
# column's (mean, std) so predictions can be mapped back to real counts.
quant_features = ['cnt', 'temp', 'hum', 'windspeed']
scaled_features = {}
for field in quant_features:
    mean, std = data[field].mean(), data[field].std()
    scaled_features[field] = [mean, std]
    data.loc[:, field] = (data[field] - mean) / std

           

資料集劃分:取後21天資料(21x24)作為測試集

# Hold out the last 21 days (21 * 24 hourly rows) as the test set.
test_data = data[-21 * 24:]
train_data = data[:-21 * 24]

# Columns that are prediction targets rather than input features.
target_fields = ['cnt', 'casual', 'registered']

# Split both sets into feature columns and target columns.
features = train_data.drop(target_fields, axis=1)
target = train_data[target_fields]
test_features = test_data.drop(target_fields, axis=1)
test_target = test_data[target_fields]

# Convert to numpy arrays; the model regresses the scaled 'cnt' column,
# reshaped to a (n_samples, 1) column vector.
x = features.values
y = target['cnt'].values.astype(float)
y = y.reshape([len(y), 1])
           

建構神經網絡

# Network architecture: a single hidden layer with sigmoid activation.
# Input width equals the number of feature columns; one scalar output.
input_size = features.shape[1]
hidden_size = 10
output_size = 1
batch_size = 128

neu = torch.nn.Sequential(
    torch.nn.Linear(input_size, hidden_size),
    torch.nn.Sigmoid(),
    torch.nn.Linear(hidden_size, output_size),
)

# Mean-squared-error loss, optimized with plain SGD.
cost = torch.nn.MSELoss()
optimizer = optim.SGD(neu.parameters(), lr=0.01)
           

将訓練資料集采用批處理模式(batch size)進行訓練

# Train the network for 1000 epochs, reading the training set one
# mini-batch of `batch_size` samples at a time.
# NOTE(review): torch.autograd.Variable is deprecated since PyTorch 0.4 —
# plain tensors carry autograd, so batches are built with torch.tensor.
losses = []
for epoch in range(1000):
    batch_loss = []
    for start in range(0, len(x), batch_size):
        # Clamp the final batch to the end of the data.
        end = min(start + batch_size, len(x))
        xx = torch.tensor(x[start:end], dtype=torch.float32)
        yy = torch.tensor(y[start:end], dtype=torch.float32)
        predict = neu(xx)
        loss = cost(predict, yy)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # .item() extracts the scalar loss (replaces deprecated .data.numpy()).
        batch_loss.append(loss.item())
    # Record and print the mean batch loss every 100 epochs.
    if epoch % 100 == 0:
        losses.append(np.mean(batch_loss))
        print(epoch, np.mean(batch_loss))

# Loss curve: one recorded point per 100 epochs.
plt.plot(np.arange(len(losses)) * 100, losses)
plt.xlabel('epoch')
plt.ylabel('mse')
           

loss輸出結果:

0 0.9080135

100 0.2709566

200 0.24481183

300 0.1912091

400 0.12836137

500 0.090683505

600 0.078006424

700 0.072939925

800 0.06986246

900 0.06753944

結果圖如下

深度學習與PyTorch實戰——共享單車預測

測試神經網絡

将訓練好的神經網絡在測試集上進行預測,并将21天的預測資訊與真實資料進行比較

# Evaluate the trained network on the held-out 21 days and compare the
# predictions against the real ride counts.
target = test_target['cnt']
target = target.values.reshape([len(target), 1]).astype(float)

# Build input/label tensors (Variable is deprecated; tensors suffice).
x = torch.tensor(test_features.values, dtype=torch.float32)
y = torch.tensor(target, dtype=torch.float32)

# Inference only: no_grad() avoids building an unnecessary autograd graph.
with torch.no_grad():
    predict = neu(x).numpy()

# Undo the 'cnt' standardization so both curves are in real ride counts.
fig, ax = plt.subplots(figsize=(10, 7))
mean, std = scaled_features['cnt']
ax.plot(predict * std + mean, label='prediction', linestyle='--')
ax.plot(target * std + mean, label='data')
ax.legend()
ax.set_xlabel('date_time')
ax.set_ylabel('counts')
plt.show()

           

比較圖如下

深度學習與PyTorch實戰——共享單車預測

繼續閱讀