Complete Lab 5; add a dataset directory description to the Lab 4 notebook
This commit is contained in:
parent 70e8881691
commit 9756a73bcc
Lab4/0-dataset.ipynb

@@ -59,7 +59,50 @@
    "id": "76238d03",
    "metadata": {},
    "source": [
-    "# 0. Dataset"
+    "# 0. Dataset\n",
+    "\n",
+    "The dataset directory structure is as follows:\n",
+    "\n",
+    "```txt\n",
+    "Lab4\n",
+    "├── dataset\n",
+    "│   ├── Haze\n",
+    "│   │   ├── raw\n",
+    "│   │   │   ├── haze\n",
+    "│   │   │   │   ├── 001.jpg\n",
+    "│   │   │   │   ├── 002.jpg\n",
+    "│   │   │   │   ├── ...\n",
+    "│   │   │   │   └── 520.jpg\n",
+    "│   │   │   └── no_haze\n",
+    "│   │   │       ├── 001.jpg\n",
+    "│   │   │       ├── 002.jpg\n",
+    "│   │   │       ├── ...\n",
+    "│   │   │       └── 520.jpg\n",
+    "│   │   ├── split.csv\n",
+    "│   │   └── split_dataset.py\n",
+    "│   └── Vehicles\n",
+    "│       ├── raw\n",
+    "│       │   ├── bus\n",
+    "│       │   │   ├── bus001.jpg\n",
+    "│       │   │   ├── bus002.jpg\n",
+    "│       │   │   ├── ...\n",
+    "│       │   │   ├── bus218.jpg\n",
+    "│       │   │   └── desktop.ini\n",
+    "│       │   ├── car\n",
+    "│       │   │   ├── car001.jpg\n",
+    "│       │   │   ├── car002.jpg\n",
+    "│       │   │   ├── ...\n",
+    "│       │   │   └── car779.jpg\n",
+    "│       │   └── truck\n",
+    "│       │       ├── truck001.jpg\n",
+    "│       │       ├── truck002.jpg\n",
+    "│       │       ├── ...\n",
+    "│       │       └── truck360.jpg\n",
+    "│       ├── split_dataset.py\n",
+    "│       ├── test.csv\n",
+    "│       └── train.csv\n",
+    "└── ...\n",
+    "```"
    ]
   },
   {
@@ -1877,7 +1920,7 @@
   ],
   "metadata": {
    "kernelspec": {
-    "display_name": "DeepLearningLab",
+    "display_name": "Python 3 (ipykernel)",
     "language": "python",
     "name": "python3"
    },
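Side note: the `haze` / `no_haze` layout above is the standard one-folder-per-class convention, so it can be read with torchvision's `ImageFolder`. A minimal sketch (the transform, image size, and batch size are assumptions; the notebook itself may rely on `split.csv` / `split_dataset.py` for its actual train/test split):

```python
# Minimal sketch, assuming torchvision; the notebook may instead use
# split.csv / split_dataset.py to define its train/test split.
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

transform = transforms.Compose([
    transforms.Resize((224, 224)),  # image size is an assumption
    transforms.ToTensor(),
])

# ImageFolder maps each subfolder (haze/, no_haze/) to a class index.
haze_ds = datasets.ImageFolder('Lab4/dataset/Haze/raw', transform=transform)
haze_loader = DataLoader(haze_ds, batch_size=32, shuffle=True)
print(haze_ds.classes)  # ['haze', 'no_haze']
```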
480  Lab5/1-RNN.ipynb  Normal file
File diff suppressed because one or more lines are too long

458  Lab5/2-LSTM.ipynb  Normal file
File diff suppressed because one or more lines are too long

472  Lab5/3-GRU.ipynb  Normal file
File diff suppressed because one or more lines are too long

354  Lab5/4-param.ipynb  Normal file
File diff suppressed because one or more lines are too long

141  Lab5/dataset.py  Normal file
@@ -0,0 +1,141 @@
import numpy as np
import pandas as pd
import torch
import torch.utils.data as data
import warnings

warnings.filterwarnings("ignore")


# Dataset wrapper around pre-built sequence tensors and their labels
class my_Dataset(data.Dataset):
    def __init__(self, features, labels):
        self.seqs = features
        self.targets = labels

    def __getitem__(self, index):
        return self.seqs[index], self.targets[index]

    def __len__(self):
        return self.seqs.shape[0]


# Air-quality dataset (Krakow)
class KrakowDataset:
    def __init__(self, sensor: int = 171, is_resample: bool = True):
        # Use twelve months of 2017 data
        self.month = ['april-2017', 'august-2017', 'december-2017', 'february-2017',
                      'january-2017', 'july-2017', 'june-2017', 'march-2017',
                      'may-2017', 'november-2017', 'october-2017', 'september-2017']
        raw_data = pd.concat([pd.read_csv(f'./dataset/Krakow-airquality/raw/{month}.csv') for month in self.month])

        # Select the feature columns
        features = ['temperature', 'humidity', 'pressure', 'pm1', 'pm25', 'pm10']
        self.sensor = sensor  # chosen sensor; not every sensor has data
        self.feature_col = ['UTC time'] + [f'{self.sensor}_{fea}' for fea in features]
        data_df = raw_data[[col for col in raw_data.columns if col in self.feature_col]]

        # Sort by timestamp
        data_df['UTC time'] = pd.to_datetime(data_df['UTC time'])
        data_df = data_df.set_index('UTC time').sort_index()

        # Resample onto an hourly grid and fill gaps by linear interpolation
        if is_resample:
            self.start_time, self.end_time = data_df.index.min(), data_df.index.max()
            full_index = pd.date_range(self.start_time, self.end_time, freq='h')
            data_df = data_df.reindex(full_index)
            data_df = data_df.interpolate(method='linear')
        else:
            data_df = data_df.dropna()

        # Min-max normalization to [0, 1]
        self.min = data_df.min()
        self.max = data_df.max()
        self.data = (data_df - self.min) / (self.max - self.min)

    def denormalize(self, x):
        # Map normalized values of the target column back to raw units
        key = f'{self.sensor}_{self.target}'
        return x * (self.max[key] - self.min[key]) + self.min[key]

    def construct_set(self, train_por=0.6, test_por=0.2, window_size=12, target='pm25'):
        # Build sliding-window samples: window_size input steps -> the next step
        train_x = []
        train_y = []
        val_x = []
        val_y = []
        test_x = []
        test_y = []
        self.target = target
        self.feature_col.remove('UTC time')
        self.data = self.data.reset_index()

        # Chronological split: the first train_por of the data is for training
        len_train = int(self.data.shape[0] * train_por)
        train_seqs = self.data[:len_train]
        for i in range(train_seqs.shape[0] - window_size):
            train_seq = train_seqs.loc[i:i + window_size]  # .loc slicing is inclusive
            train_x.append(train_seq.loc[i:i + window_size - 1][self.feature_col].values.tolist())
            train_y.append(train_seq.loc[i + window_size][f'{self.sensor}_{target}'].tolist())

        # The next test_por of the data serves as the validation split
        len_val = int(self.data.shape[0] * (train_por + test_por))
        val_seqs = self.data[len_train:len_val]
        val_seqs = val_seqs.reset_index(drop=True)  # drop=True avoids a stray 'index' column
        for i in range(val_seqs.shape[0] - window_size):
            val_seq = val_seqs.loc[i:i + window_size]
            val_x.append(val_seq.loc[i:i + window_size - 1][self.feature_col].values.tolist())
            val_y.append(val_seq.loc[i + window_size][f'{self.sensor}_{target}'].tolist())

        # The remainder is the test split
        test_seqs = self.data[len_val:]
        test_seqs = test_seqs.reset_index(drop=True)
        for i in range(test_seqs.shape[0] - window_size):
            test_seq = test_seqs.loc[i:i + window_size]
            test_x.append(test_seq.loc[i:i + window_size - 1][self.feature_col].values.tolist())
            test_y.append(test_seq.loc[i + window_size][f'{self.sensor}_{target}'].tolist())

        train_set = my_Dataset(torch.Tensor(train_x), torch.Tensor(train_y))
        val_set = my_Dataset(torch.Tensor(val_x), torch.Tensor(val_y))
        test_set = my_Dataset(torch.Tensor(test_x), torch.Tensor(test_y))
        return train_set, val_set, test_set
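To make the windowing in `construct_set` concrete: with window_size = 3, a series x0…x5 yields the pairs ([x0, x1, x2] → x3), ([x1, x2, x3] → x4), ([x2, x3, x4] → x5). A tiny numpy illustration with hypothetical values:

```python
# Hypothetical mini-example of the sliding-window indexing used above.
import numpy as np

series = np.arange(6, dtype=float)  # stand-in for one normalized feature
window_size = 3
xs, ys = [], []
for i in range(len(series) - window_size):
    xs.append(series[i:i + window_size])  # window_size input steps
    ys.append(series[i + window_size])    # the step right after the window
print(xs[0], ys[0])  # [0. 1. 2.] 3.0
```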
# Traffic-flow dataset
class TrafficDataset:
    def __init__(self, sensor=10, target=0):
        # Pick a suitable detector as the sequence source
        self.raw_data = np.load('./dataset/traffic-flow/raw/traffic.npz')['data']
        self.sensor = sensor
        self.target = target
        # Min-max normalization
        self.min = self.raw_data.min()
        self.max = self.raw_data.max()
        self.data = (self.raw_data - self.min) / (self.max - self.min)

    def denormalize(self, x):
        return x * (self.max - self.min) + self.min

    def construct_set(self, train_por=0.6, test_por=0.2, window_size=12, label=0):
        train_x = []
        train_y = []
        val_x = []
        val_y = []
        test_x = []
        test_y = []

        # Chronological split, mirroring KrakowDataset.construct_set
        len_train = int(self.data.shape[0] * train_por)
        train_seqs = self.data[0:len_train, self.sensor, :]
        for i in range(len_train - window_size):
            # NumPy slices exclude the right endpoint, so i:i + window_size gives
            # exactly window_size input steps (i:i + window_size - 1 dropped one)
            train_x.append(train_seqs[i:i + window_size])
            train_y.append(train_seqs[i + window_size][label])

        len_val = int(self.data.shape[0] * test_por)  # size of the validation split
        val_seqs = self.data[len_train:len_train + len_val, self.sensor, :]
        for i in range(len_val - window_size):
            val_x.append(val_seqs[i:i + window_size])
            val_y.append(val_seqs[i + window_size][label])

        len_test = int(self.data.shape[0] * (1 - train_por - test_por))
        test_seqs = self.data[len_train + len_val:, self.sensor, :]
        for i in range(len_test - window_size):
            test_x.append(test_seqs[i:i + window_size])
            test_y.append(test_seqs[i + window_size][label])

        train_set = my_Dataset(torch.Tensor(train_x), torch.Tensor(train_y))
        val_set = my_Dataset(torch.Tensor(val_x), torch.Tensor(val_y))
        test_set = my_Dataset(torch.Tensor(test_x), torch.Tensor(test_y))
        return train_set, val_set, test_set
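A hedged usage sketch for dataset.py (the batch size and the printed shapes are assumptions based on the defaults above):

```python
# Minimal usage sketch, assuming the default paths and hyperparameters above.
from torch.utils.data import DataLoader
from dataset import KrakowDataset

krakow = KrakowDataset(sensor=171, is_resample=True)
train_set, val_set, test_set = krakow.construct_set(
    train_por=0.6, test_por=0.2, window_size=12, target='pm25')

train_iter = DataLoader(train_set, batch_size=64, shuffle=True)
seqs, targets = next(iter(train_iter))
print(seqs.shape, targets.shape)  # e.g. torch.Size([64, 12, 6]) torch.Size([64])
```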
176  Lab5/utils.py  Normal file
@@ -0,0 +1,176 @@
import math
import torch
from torch.utils import data
import torch.nn as nn
from matplotlib import pyplot as plt
import numpy as np
import time


def mse_fn(y, pred):
    return np.mean((np.array(y) - np.array(pred)) ** 2)


def mae_fn(y, pred):
    return np.mean(np.abs(np.array(y) - np.array(pred)))


def mape_fn(y, pred):
    # Mask out zero targets to avoid division by zero
    mask = y != 0
    y = y[mask]
    pred = pred[mask]
    mape = np.abs((y - pred) / y)
    mape = np.mean(mape) * 100
    return mape


def eval(y, pred):
    # Move tensors to CPU and report [RMSE, MAE, MAPE]
    y = y.cpu().numpy()
    pred = pred.cpu().numpy()
    mse = mse_fn(y, pred)
    rmse = math.sqrt(mse)
    mae = mae_fn(y, pred)
    mape = mape_fn(y, pred)
    return [rmse, mae, mape]
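For instance, with targets [100, 200] and predictions [110, 190], the helpers above give MSE = 100 (so RMSE = 10), MAE = 10, and MAPE = 7.5 (the mean of 10/100 and 10/200, in percent). A hypothetical sanity check:

```python
# Hypothetical sanity check of the metric helpers above.
import numpy as np
from utils import mse_fn, mae_fn, mape_fn

y = np.array([100.0, 200.0])
pred = np.array([110.0, 190.0])
print(mse_fn(y, pred))   # 100.0  -> RMSE would be 10.0
print(mae_fn(y, pred))   # 10.0
print(mape_fn(y, pred))  # 7.5
```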
# Evaluation loop: average loss and regression metrics over a data iterator
@torch.no_grad()
def test(net, data_iter, loss_fn, denormalize_fn, device='cpu'):
    rmse, mae, mape = 0, 0, 0
    batch_count = 0
    total_loss = 0.0
    net.eval()
    for seqs, targets in data_iter:
        seqs = seqs.to(device).float()
        targets = targets.to(device).float()
        y_hat = net(seqs)
        loss = loss_fn(y_hat, targets)

        # Metrics are reported in raw (denormalized) units
        targets = denormalize_fn(targets)
        y_hat = denormalize_fn(y_hat)
        a, b, c = eval(targets.detach(), y_hat.detach())
        rmse += a
        mae += b
        mape += c
        total_loss += loss.detach().cpu().numpy().tolist()
        batch_count += 1
    return [rmse / batch_count, mae / batch_count, mape / batch_count], total_loss / batch_count


def train(net, train_iter, val_iter, test_iter, loss_fn, denormalize_fn, optimizer, num_epoch,
          early_stop=10, device='cpu', num_print_epoch_round=0):
    train_loss_lst = []
    val_loss_lst = []
    train_score_lst = []
    val_score_lst = []
    epoch_time = []

    best_epoch = 0
    best_val_rmse = float('inf')
    early_stop_flag = 0
    for epoch in range(num_epoch):
        net.train()
        epoch_loss = 0
        batch_count = 0
        batch_time = []
        rmse, mae, mape = 0, 0, 0
        for seqs, targets in train_iter:
            batch_s = time.time()
            seqs = seqs.to(device).float()
            targets = targets.to(device).float()
            optimizer.zero_grad()
            y_hat = net(seqs)
            loss = loss_fn(y_hat, targets)
            loss.backward()
            optimizer.step()

            targets = denormalize_fn(targets)
            y_hat = denormalize_fn(y_hat)
            a, b, c = eval(targets.detach(), y_hat.detach())
            rmse += a
            mae += b
            mape += c
            epoch_loss += loss.detach().cpu().numpy().tolist()
            batch_count += 1

            batch_time.append(time.time() - batch_s)

        train_loss = epoch_loss / batch_count
        train_loss_lst.append(train_loss)
        train_score_lst.append([rmse / batch_count, mae / batch_count, mape / batch_count])

        # Validation
        val_score, val_loss = test(net, val_iter, loss_fn, denormalize_fn, device)
        val_score_lst.append(val_score)
        val_loss_lst.append(val_loss)

        epoch_time.append(np.array(batch_time).sum())

        # Print this epoch's results
        if num_print_epoch_round > 0 and (epoch + 1) % num_print_epoch_round == 0:
            print(
                f"Epoch [{epoch + 1}/{num_epoch}],",
                f"Train Loss: {train_loss:.4f},",
                f"Train RMSE: {train_score_lst[-1][0]:.4f},",
                f"Val Loss: {val_loss:.4f},",
                f"Val RMSE: {val_score[0]:.6f},",
                f"Time Use: {epoch_time[-1]:.3f}s"
            )

        # Early stopping on validation RMSE
        if val_score[0] < best_val_rmse:
            best_val_rmse = val_score[0]
            best_epoch = epoch
            early_stop_flag = 0
        else:
            early_stop_flag += 1
            if early_stop_flag == early_stop:
                print(f'The model has not improved for {early_stop} rounds. Stopping early!')
                break

    # Final training summary
    print(
        f'Final result:',
        f'Best validation RMSE {np.array(val_score_lst)[:, 0].min():.4f} at epoch {best_epoch},',
        f'Total time {np.array(epoch_time).sum():.2f}s'
    )

    # Test-set performance
    test_score, test_loss = test(net, test_iter, loss_fn, denormalize_fn, device)
    print(
        'Test result:',
        f'Test RMSE: {test_score[0]},',
        f'Test MAE: {test_score[1]},',
        f'Test MAPE: {test_score[2]}'
    )
    return train_loss_lst, val_loss_lst, train_score_lst, val_score_lst, epoch


def visualize(num_epochs, train_data, test_data, x_label='epoch', y_label='loss'):
    # num_epochs is the last epoch index, so there are num_epochs + 1 points
    x = np.arange(0, num_epochs + 1).astype(dtype=np.int32)
    plt.figure(figsize=(5, 3.5))
    plt.plot(x, train_data, label=f"train_{y_label}", linewidth=1.5)
    plt.plot(x, test_data, label=f"val_{y_label}", linewidth=1.5)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend()
    plt.show()


def plot_metric(score_log):
    # score_log: one [RMSE, MAE, MAPE] row per epoch
    score_log = np.array(score_log)

    plt.figure(figsize=(13, 3.5))
    plt.subplot(1, 3, 1)
    plt.plot(score_log[:, 0], c='#d28ad4')
    plt.ylabel('RMSE')

    plt.subplot(1, 3, 2)
    plt.plot(score_log[:, 1], c='#e765eb')
    plt.ylabel('MAE')

    plt.subplot(1, 3, 3)
    plt.plot(score_log[:, 2], c='#6b016d')
    plt.ylabel('MAPE')

    plt.show()
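A hedged end-to-end sketch tying dataset.py and utils.py together (the GRU model, hidden size, learning rate, and epoch counts are assumptions, not the notebooks' actual settings):

```python
# Hypothetical end-to-end wiring of dataset.py and utils.py.
# Model architecture and hyperparameters are assumptions.
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from dataset import KrakowDataset
from utils import train, visualize, plot_metric

class GRUReg(nn.Module):
    def __init__(self, in_dim=6, hidden=64):
        super().__init__()
        self.rnn = nn.GRU(in_dim, hidden, batch_first=True)
        self.fc = nn.Linear(hidden, 1)

    def forward(self, x):
        out, _ = self.rnn(x)                     # (batch, seq, hidden)
        return self.fc(out[:, -1]).squeeze(-1)   # predict the next step

dataset = KrakowDataset()
train_set, val_set, test_set = dataset.construct_set(window_size=12, target='pm25')
train_iter = DataLoader(train_set, batch_size=64, shuffle=True)
val_iter = DataLoader(val_set, batch_size=64)
test_iter = DataLoader(test_set, batch_size=64)

net = GRUReg()
logs = train(net, train_iter, val_iter, test_iter,
             loss_fn=nn.MSELoss(), denormalize_fn=dataset.denormalize,
             optimizer=torch.optim.Adam(net.parameters(), lr=1e-3),
             num_epoch=50, num_print_epoch_round=5)
train_loss_lst, val_loss_lst, train_score_lst, val_score_lst, last_epoch = logs
visualize(last_epoch, train_loss_lst, val_loss_lst)
plot_metric(val_score_lst)
```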