From 0130448d393304619237af814b89ef7d22e0c85b Mon Sep 17 00:00:00 2001 From: kejingfan Date: Tue, 10 Oct 2023 19:12:07 +0800 Subject: [PATCH] =?UTF-8?q?=E8=A7=A3=E5=86=B3=E6=89=8B=E5=8A=A8softmax?= =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E8=AE=AD=E7=BB=83=E6=A2=AF=E5=BA=A6=E7=88=86?= =?UTF-8?q?=E7=82=B8=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .vscode/settings.json | 6 + .../Pytorch基本操作实验报告-checkpoint.ipynb | 238 +++++++++------ Lab1/Pytorch基本操作实验报告.ipynb | 274 ++++++++++-------- 3 files changed, 308 insertions(+), 210 deletions(-) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..d99f2f3 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,6 @@ +{ + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter" + }, + "python.formatting.provider": "none" +} \ No newline at end of file diff --git a/Lab1/.ipynb_checkpoints/Pytorch基本操作实验报告-checkpoint.ipynb b/Lab1/.ipynb_checkpoints/Pytorch基本操作实验报告-checkpoint.ipynb index a1b30dc..9791e16 100644 --- a/Lab1/.ipynb_checkpoints/Pytorch基本操作实验报告-checkpoint.ipynb +++ b/Lab1/.ipynb_checkpoints/Pytorch基本操作实验报告-checkpoint.ipynb @@ -5,7 +5,7 @@ "id": "3b57686b-7ac8-4897-bf76-3d982b1ff8da", "metadata": {}, "source": [ - "![school-logo](../images/school_logo.png)\n", + "
<div align=\"center\"><img src=\"../images/school_logo.png\" alt=\"school-logo\"></div>\n",
     "\n",
     "<div align=\"center\">\n",
     "<h2>本科生《深度学习》课程</h2>\n",
     "<h1>实验报告</h1>\n",
     "</div>\n",
     "\n",
     "<div align=\"center\">
\n", @@ -125,15 +125,31 @@ "print(result3)" ] }, + { + "cell_type": "markdown", + "id": "bd9bd5cc-b6da-4dd6-a599-76498bc5247d", + "metadata": {}, + "source": [ + "第1、2、3种减法形式实质是一样的。\n", + "\n", + "步骤如下:\n", + "1. 对A、B两个张量进行广播,将A、B向广播的方向复制,得到两个$\\max(A.size(0), B.size(0))\\times \\max(A.size(1), B.size(1))$的张量;\n", + "2. 对广播后的两个张量作差,尺寸不变。\n", + "\n", + "第1种减法形式和第2种是等价的,前者是后者的符号化表示。\n", + "\n", + "第3种形式是手动实现的,将上述两个步骤分别手动实现了。但是torch.Tensor还内置了其他机制,这里仅模拟了广播和作差。" + ] + }, { "cell_type": "markdown", "id": "2489a3ad-f6ff-4561-bb26-e02654090b98", "metadata": {}, "source": [ "## 题目2\n", - "1. **利用Tensor创建两个大小分别$3\\times 2$和$4\\times 2$的随机数矩阵P和Q,要求服从均值为0,标准差0.01为的正态分布;**\n", - "2. **对第二步得到的矩阵Q进行形状变换得到Q的转置Q^T;**\n", - "3. **对上述得到的矩阵P和矩阵Q^T求矩阵相乘。**" + "1. **利用Tensor创建两个大小分别$3\\times 2$和$4\\times 2$的随机数矩阵P和Q,要求服从均值为$0$,标准差$0.01$为的正态分布;**\n", + "2. **对第二步得到的矩阵$Q$进行形状变换得到$Q$的转置$Q^T$;**\n", + "3. **对上述得到的矩阵$P$和矩阵$Q^T$求矩阵相乘。**" ] }, { @@ -147,21 +163,21 @@ "output_type": "stream", "text": [ "矩阵 P:\n", - "tensor([[ 0.0098, -0.0111],\n", - " [-0.0057, 0.0051],\n", - " [-0.0180, 0.0194]])\n", + "tensor([[-0.0131, 0.0147],\n", + " [ 0.0248, -0.0028],\n", + " [-0.0172, 0.0178]])\n", "矩阵 Q:\n", - "tensor([[ 0.0010, -0.0026],\n", - " [-0.0095, -0.0059],\n", - " [-0.0168, 0.0194],\n", - " [ 0.0022, 0.0125]])\n", + "tensor([[ 0.0015, 0.0015],\n", + " [-0.0121, -0.0074],\n", + " [ 0.0072, 0.0039],\n", + " [-0.0032, -0.0061]])\n", "矩阵 QT:\n", - "tensor([[ 0.0010, -0.0095, -0.0168, 0.0022],\n", - " [-0.0026, -0.0059, 0.0194, 0.0125]])\n", + "tensor([[ 0.0015, -0.0121, 0.0072, -0.0032],\n", + " [ 0.0015, -0.0074, 0.0039, -0.0061]])\n", "矩阵相乘的结果:\n", - "tensor([[ 3.8758e-05, -2.7672e-05, -3.7944e-04, -1.1683e-04],\n", - " [-1.8842e-05, 2.4259e-05, 1.9324e-04, 5.0424e-05],\n", - " [-6.8471e-05, 5.7510e-05, 6.7733e-04, 2.0131e-04]])\n" + "tensor([[ 2.8145e-06, 4.9911e-05, -3.6764e-05, -4.7670e-05],\n", + " [ 3.2685e-05, -2.7908e-04, 1.6724e-04, -6.1334e-05],\n", + " [ 1.4138e-06, 7.6416e-05, -5.3995e-05, -5.3379e-05]])\n" ] } ], @@ -207,21 +223,37 @@ "name": "stdout", "output_type": "stream", "text": [ - "梯度(dy_3/dx): 2.0\n" + "仅通过y_1传递的梯度: 2.0\n", + "仅通过y_2传递的梯度: 3.0\n", + "dy_3/dx: 5.0\n" ] } ], "source": [ "x = torch.tensor(1.0, requires_grad=True)\n", + "\n", "y_1 = x ** 2\n", "with torch.no_grad():\n", " y_2 = x ** 3\n", + "y_3 = y_1 + y_2\n", + "y_3.backward()\n", + "print(\"仅通过y_1传递的梯度: \", x.grad.item())\n", "\n", - "y3 = y_1 + y_2\n", + "x.grad.data.zero_()\n", + "with torch.no_grad():\n", + " y_1 = x ** 2\n", + "y_2 = x ** 3\n", + "y_3 = y_1 + y_2\n", + "y_3.backward()\n", + "print(\"仅通过y_2传递的梯度: \", x.grad.item())\n", "\n", - "y3.backward()\n", + "x.grad.data.zero_()\n", + "y_1 = x ** 2\n", + "y_2 = x ** 3\n", + "y_3 = y_1 + y_2\n", + "y_3.backward()\n", "\n", - "print(\"梯度(dy_3/dx): \", x.grad.item())" + "print(\"dy_3/dx: \", x.grad.item())" ] }, { @@ -299,7 +331,9 @@ "主要实现:\n", "- 传入参数:`__init__()`\n", "- 对传入的参数进行更新:`step()`\n", - "- 清空传入参数存储的梯度:`zero_grad()`" + "- 清空传入参数存储的梯度:`zero_grad()`\n", + "\n", + "但是有一点需要注意,就是需要将传进来的`params`参数转化为`list`类型。因为`nn.Module`的`parameters()`方法会以``的类型返回模型的参数,但是该类型变量无法像`list`一样使用`for`循环遍历。" ] }, { @@ -312,6 +346,8 @@ "name": "stdout", "output_type": "stream", "text": [ + "x的初始值: 1.0\n", + "学习率: 0.1\n", "y.backward()之后,x的梯度: 2.0\n", "optimizer_test.step()之后,x的值: 0.800000011920929\n", "optimizer_test.zero_grad()之后,x的梯度: 0.0\n" @@ -319,9 +355,9 @@ } ], "source": [ - "class My_optimizer:\n", + "class My_Optimizer:\n", " def __init__(self, params: list[torch.Tensor], lr: float):\n", 
- " self.params = params\n", + " self.params = list(params)\n", " self.lr = lr\n", "\n", " def step(self):\n", @@ -336,9 +372,12 @@ "\n", "# 测试\n", "x = torch.tensor(1.0, requires_grad=True)\n", - "y = x ** 2\n", - "optimizer_test = My_optimizer([x], lr=0.1)\n", + "print(\"x的初始值: \", x.item())\n", "\n", + "optimizer_test = My_Optimizer([x], lr=0.1)\n", + "print(\"学习率: \", optimizer_test.lr)\n", + "\n", + "y = x ** 2\n", "y.backward()\n", "print(\"y.backward()之后,x的梯度: \", x.grad.item())\n", "\n", @@ -574,18 +613,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "Epoch 1/10, Loss: 685.3895894885063, Acc: 0.9642275829459848\n", - "Epoch 2/10, Loss: 677.4121572375298, Acc: 0.9974711945592333\n", - "Epoch 3/10, Loss: 677.2220785021782, Acc: 0.9990452451894614\n", - "Epoch 4/10, Loss: 677.1839035749435, Acc: 0.9993094710137819\n", - "Epoch 5/10, Loss: 677.1762611865997, Acc: 0.9998272919002676\n", - "Epoch 6/10, Loss: 677.1740638613701, Acc: 0.9999073923880469\n", - "Epoch 7/10, Loss: 677.1739921569824, Acc: 0.9997274632391843\n", - "Epoch 8/10, Loss: 677.1744710803032, Acc: 0.9999882508320989\n", - "Epoch 9/10, Loss: 677.1742913126945, Acc: 0.999904539547138\n", - "Epoch 10/10, Loss: 677.173879802227, Acc: 0.9997605824956097\n", - "Model weights: -0.0010404698550701141, bias: 0.02203504741191864\n", - "Prediction for test data: 0.505248486995697\n" + "Epoch 1/10, Loss: 678.0522713065147, Acc: 0.9949060965876567\n", + "Epoch 2/10, Loss: 677.2863736152649, Acc: 0.9980913352860563\n", + "Epoch 3/10, Loss: 677.197151362896, Acc: 0.9993721880397808\n", + "Epoch 4/10, Loss: 677.1782736182213, Acc: 0.9997903927928914\n", + "Epoch 5/10, Loss: 677.1754664182663, Acc: 0.9996946183328581\n", + "Epoch 6/10, Loss: 677.1741757392883, Acc: 0.9999630627469878\n", + "Epoch 7/10, Loss: 677.1742368340492, Acc: 0.9999474390293509\n", + "Epoch 8/10, Loss: 677.1745658516884, Acc: 0.9999775205877912\n", + "Epoch 9/10, Loss: 677.1739910840988, Acc: 0.9999218865585965\n", + "Epoch 10/10, Loss: 677.1743568778038, Acc: 0.9998403212619357\n", + "Model weights: -0.0020640366710722446, bias: 0.019105462357401848\n", + "Prediction for test data: 0.504260241985321\n" ] } ], @@ -595,11 +634,11 @@ "batch_size = 1024\n", "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n", "\n", - "dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True, num_workers=5, pin_memory=True)\n", + "dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True, num_workers=14, pin_memory=True)\n", "\n", "model = Model_2_1().to(device)\n", "criterion = My_BCELoss()\n", - "optimizer = My_optimizer(model.parameters(), lr=learning_rate)\n", + "optimizer = My_Optimizer(model.parameters(), lr=learning_rate)\n", "\n", "for epoch in range(num_epochs):\n", " total_epoch_loss = 0\n", @@ -687,32 +726,32 @@ "name": "stdout", "output_type": "stream", "text": [ - "Epoch 1/10, Loss: 576.7015416165058, Acc: 0.9735617914738028\n", - "Epoch 2/10, Loss: 565.9262382361084, Acc: 0.9999925140596344\n", - "Epoch 3/10, Loss: 565.9295897295112, Acc: 0.9999952212094322\n", - "Epoch 4/10, Loss: 565.9272355019373, Acc: 0.9999899716045327\n", - "Epoch 5/10, Loss: 565.9276486165418, Acc: 0.9999941261622728\n", - "Epoch 6/10, Loss: 565.9258608743777, Acc: 0.999994099092236\n", - "Epoch 7/10, Loss: 565.9304406750343, Acc: 0.9999997538554865\n", - "Epoch 8/10, Loss: 565.9290585726536, Acc: 0.9999990918784897\n", - "Epoch 9/10, Loss: 565.9277625135361, Acc: 0.9999886345247774\n", - "Epoch 10/10, Loss: 565.9291837050997, Acc: 
0.9999944677252854\n", - "Model weights: -3.712182683343629, bias: 1.8752337556721546\n", - "Prediction for test data: 0.13741241440796031\n" + "Epoch 1/10, Loss: 605.8295383291701, Acc: 0.9582065520342055\n", + "Epoch 2/10, Loss: 576.8061408727621, Acc: 0.9847528457476354\n", + "Epoch 3/10, Loss: 569.2995615954951, Acc: 0.9918221342823426\n", + "Epoch 4/10, Loss: 567.0406008049263, Acc: 0.9953926453617279\n", + "Epoch 5/10, Loss: 566.3090309665234, Acc: 0.9973164146429515\n", + "Epoch 6/10, Loss: 566.0625152740248, Acc: 0.9984245151624715\n", + "Epoch 7/10, Loss: 565.9721677527377, Acc: 0.9990875142056446\n", + "Epoch 8/10, Loss: 565.9461858826305, Acc: 0.999437410787883\n", + "Epoch 9/10, Loss: 565.9334115060377, Acc: 0.9996791976966314\n", + "Epoch 10/10, Loss: 565.9350714258352, Acc: 0.9997795372757307\n", + "Model weights: -3.688023801126246, bias: 1.8659015788034263\n", + "Prediction for test data: 0.139179294539555\n" ] } ], "source": [ - "learning_rate = 1e-2\n", + "learning_rate = 5e-2\n", "num_epochs = 10\n", "batch_size = 1024\n", "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n", "\n", - "dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True, num_workers=5, pin_memory=True)\n", + "dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True, num_workers=14, pin_memory=True)\n", "\n", "model = Model_2_2().to(device)\n", "criterion = nn.BCELoss()\n", - "optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)\n", + "optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)\n", "\n", "for epoch in range(num_epochs):\n", " total_epoch_loss = 0\n", @@ -751,7 +790,7 @@ "id": "e6bff679-f8d2-46cc-bdcb-82af7dab38b3", "metadata": {}, "source": [ - "对比发现,使用torch.nn的内置损失函数和优化器,正确率提升更快。\n", + "对比发现,手动实现的损失函数和优化器与torch.nn的内置损失函数和优化器相比,表现差不多。\n", "\n", "但是为什么相同分布的数据集训练出的权重和偏置,以及预测结果存在较大差别,这个问题的原因还有待我探究。" ] @@ -860,15 +899,16 @@ "output_type": "stream", "text": [ "输入:\n", - "tensor([[ 0.9415, 0.4358, -1.1650, 0.4496, -0.9394],\n", - " [-0.1956, -0.1466, -0.7704, 0.1465, -0.4571],\n", - " [-0.9923, -1.0455, -0.4241, 0.3850, 2.1680]], requires_grad=True)\n", + "tensor([[ 3.3495e+00, -1.5645e+00, 1.3911e-01, 8.2237e-03, 8.6507e-01],\n", + " [ 3.2858e-01, 3.3071e-01, 1.1809e+00, -1.5414e+00, 1.2054e+00],\n", + " [ 9.6236e-04, 1.2167e+00, -1.8887e-01, -9.1634e-01, 1.9415e+00]],\n", + " requires_grad=True)\n", "标签:\n", "tensor([[0., 0., 0., 1., 0.],\n", - " [0., 0., 0., 0., 1.],\n", - " [0., 0., 0., 0., 1.]])\n", - "My_CrossEntropyLoss损失值: 1.1712640523910522\n", - "nn.CrossEntropyLoss损失值: 1.1712640523910522\n" + " [1., 0., 0., 0., 0.],\n", + " [1., 0., 0., 0., 0.]])\n", + "My_CrossEntropyLoss损失值: 2.652722120285034\n", + "nn.CrossEntropyLoss损失值: 2.652722120285034\n" ] } ], @@ -1042,21 +1082,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "Epoch 1/10, Loss: nan, Acc: 0.09999999403953552\n", - "Epoch 2/10, Loss: nan, Acc: 0.09999999403953552\n", - "Epoch 3/10, Loss: nan, Acc: 0.09999999403953552\n", - "Epoch 4/10, Loss: nan, Acc: 0.09999999403953552\n", - "Epoch 5/10, Loss: nan, Acc: 0.09999999403953552\n", - "Epoch 6/10, Loss: nan, Acc: 0.09999999403953552\n", - "Epoch 7/10, Loss: nan, Acc: 0.09999999403953552\n", - "Epoch 8/10, Loss: nan, Acc: 0.09999999403953552\n", - "Epoch 9/10, Loss: nan, Acc: 0.09999999403953552\n", - "Epoch 10/10, Loss: nan, Acc: 0.09999999403953552\n" + "Epoch 1/10, Loss: 100.43975830078125, Acc: 0.4251999855041504\n", + "Epoch 2/10, Loss: 45.485450744628906, Acc: 0.5367000102996826\n", + 
"Epoch 3/10, Loss: 34.95743179321289, Acc: 0.5881999731063843\n", + "Epoch 4/10, Loss: 30.256790161132812, Acc: 0.6238999962806702\n", + "Epoch 5/10, Loss: 27.338891983032227, Acc: 0.6474999785423279\n", + "Epoch 6/10, Loss: 25.360095977783203, Acc: 0.6664999723434448\n", + "Epoch 7/10, Loss: 23.8934326171875, Acc: 0.6789999604225159\n", + "Epoch 8/10, Loss: 22.703121185302734, Acc: 0.6876999735832214\n", + "Epoch 9/10, Loss: 21.799795150756836, Acc: 0.6959999799728394\n", + "Epoch 10/10, Loss: 21.04413414001465, Acc: 0.7023999691009521\n" ] } ], "source": [ - "learning_rate = 5e-3\n", + "learning_rate = 5e-1\n", "num_epochs = 10\n", "batch_size = 4096\n", "num_classes = 10\n", @@ -1065,17 +1105,17 @@ "transform = transforms.Compose(\n", " [\n", " transforms.ToTensor(),\n", - " transforms.Normalize((0.5,), (0.5,)),\n", + " transforms.Normalize((0.5,), (1.0,)),\n", " ]\n", ")\n", "train_dataset = datasets.FashionMNIST(root=\"./dataset\", train=True, transform=transform, download=True)\n", "test_dataset = datasets.FashionMNIST(root=\"./dataset\", train=False, transform=transform, download=True)\n", - "train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size,shuffle=True, num_workers=4, pin_memory=True)\n", - "test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size,shuffle=True, num_workers=4, pin_memory=True)\n", + "train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size,shuffle=True, num_workers=14, pin_memory=True)\n", + "test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size,shuffle=True, num_workers=14, pin_memory=True)\n", "\n", "model = Model_3_1(num_classes).to(device)\n", "criterion = My_CrossEntropyLoss()\n", - "optimizer = My_optimizer(model.parameters(), lr=learning_rate)\n", + "optimizer = My_Optimizer(model.parameters(), lr=learning_rate)\n", "\n", "for epoch in range(num_epochs):\n", " total_epoch_loss = 0\n", @@ -1109,7 +1149,11 @@ "id": "a49d0165-aeb7-48c0-9b67-956bb08cb356", "metadata": {}, "source": [ - "这里发现梯度爆炸。暂时无法解决。" + "在这里我遇到了梯度爆炸的问题。\n", + "\n", + "原来我在数据预处理中使用`transforms.Normalize((0.5,), (0.5,))`进行归一化,但是这样导致了梯度爆炸。\n", + "\n", + "将第二个参数方差改为1.0后,成功解决了梯度爆炸的问题。" ] }, { @@ -1162,28 +1206,28 @@ { "cell_type": "code", "execution_count": 19, - "id": "a58a23e1-368c-430a-ad62-0e256dff564d", + "id": "6d241c05-b153-4f56-a845-0f2362f6459b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Epoch 1/10, Loss: 15.949012756347656, Acc: 0.7468000054359436\n", - "Epoch 2/10, Loss: 9.318169593811035, Acc: 0.7906999588012695\n", - "Epoch 3/10, Loss: 8.015625953674316, Acc: 0.8120999932289124\n", - "Epoch 4/10, Loss: 7.471133708953857, Acc: 0.8168999552726746\n", - "Epoch 5/10, Loss: 7.215029239654541, Acc: 0.8253999948501587\n", - "Epoch 6/10, Loss: 7.007692337036133, Acc: 0.8244999647140503\n", - "Epoch 7/10, Loss: 6.847175598144531, Acc: 0.828499972820282\n", - "Epoch 8/10, Loss: 6.6865668296813965, Acc: 0.8323000073432922\n", - "Epoch 9/10, Loss: 6.595873832702637, Acc: 0.8307999968528748\n", - "Epoch 10/10, Loss: 6.535965919494629, Acc: 0.8348999619483948\n" + "Epoch 1/10, Loss: 18.918968200683594, Acc: 0.7260000109672546\n", + "Epoch 2/10, Loss: 12.184525489807129, Acc: 0.7475000023841858\n", + "Epoch 3/10, Loss: 10.786707878112793, Acc: 0.7612999677658081\n", + "Epoch 4/10, Loss: 10.06576919555664, Acc: 0.7705000042915344\n", + "Epoch 5/10, Loss: 9.591888427734375, Acc: 0.7785999774932861\n", + "Epoch 6/10, Loss: 9.247062683105469, Acc: 0.7856999635696411\n", + "Epoch 7/10, Loss: 
8.989615440368652, Acc: 0.7890999913215637\n", + "Epoch 8/10, Loss: 8.772100448608398, Acc: 0.792199969291687\n", + "Epoch 9/10, Loss: 8.593544006347656, Acc: 0.7978000044822693\n", + "Epoch 10/10, Loss: 8.453678131103516, Acc: 0.7997999787330627\n" ] } ], "source": [ - "learning_rate = 5e-3\n", + "learning_rate = 5e-2\n", "num_epochs = 10\n", "batch_size = 4096\n", "num_classes = 10\n", @@ -1197,12 +1241,12 @@ ")\n", "train_dataset = datasets.FashionMNIST(root=\"./dataset\", train=True, transform=transform, download=True)\n", "test_dataset = datasets.FashionMNIST(root=\"./dataset\", train=False, transform=transform, download=True)\n", - "train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)\n", - "test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)\n", + "train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=14, pin_memory=True)\n", + "test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True, num_workers=14, pin_memory=True)\n", "\n", "model = Model_3_2(num_classes).to(device)\n", "criterion = nn.CrossEntropyLoss()\n", - "optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)\n", + "optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)\n", "\n", "for epoch in range(num_epochs):\n", " total_epoch_loss = 0\n", @@ -1238,7 +1282,9 @@ "id": "59555b67-1650-4e1a-a98e-7906878bf3d0", "metadata": {}, "source": [ - "与手动实现的softmax回归相比较,nn.CrossEntropyLoss比手动实现的My_CrossEntropyLoss更加稳定,没有出现梯度爆炸的情况。" + "与手动实现的softmax回归相比较,nn.CrossEntropyLoss比手动实现的My_CrossEntropyLoss更加稳定,对输入数据的兼容性更强,没有出现梯度爆炸的情况。\n", + "\n", + "总体表现上,torch.nn的内置功能相对手动实现的功能,正确率提升更快,最终正确率更高。" ] }, { @@ -1250,9 +1296,9 @@ "\n", "通过完成本次Pytorch基本操作实验,让我对Pytorch框架有了更加深入的理解。我接触深度学习主要是在大语言模型领域,比较熟悉微调大模型,但是涉及到底层的深度学习知识,我还有很多短板和不足。这次实验对我这方面的锻炼让我收获良多。\n", "\n", - "首先是数据集的设置。如果数据没有进行归一化,很容易出现梯度爆炸。这是在我以前直接使用图片数据集的经历中没有遇到过的问题。\n", + "首先是数据集的设置。如果数据没有合理进行归一化,很容易出现梯度爆炸。这是在我以前直接使用图片数据集的经历中没有遇到过的问题。\n", "\n", - "在实现logistic回归模型时,通过手动实现各个组件如优化器、线性层等,让我对这些模块的工作原理有了更清晰的认识。尤其是在实现广播机制时,需要充分理解张量操作的维度变换规律。而使用Pytorch内置模块进行实现时,通过继承nn.Module可以自动获得許多功能,使代码更加简洁。\n", + "在实现logistic回归模型时,通过手动实现各个组件如优化器、线性层等,让我对这些模块的工作原理有了更清晰的认识。尤其是在实现广播机制时,需要充分理解张量操作的维度变换规律。而使用Pytorch内置模块进行实现时,通过继承nn.Module可以自动获得许多功能,使代码更加简洁。\n", "\n", "在实现softmax回归时,则遇到了更大的困难。手动实现的模型很容易出现梯度爆炸的问题,而使用Pytorch内置的损失函数和优化器则可以稳定训练。这让我意识到了选择合适的优化方法的重要性。另外,Pytorch强大的自动微分机制也是构建深度神经网络的重要基础。\n", "\n", diff --git a/Lab1/Pytorch基本操作实验报告.ipynb b/Lab1/Pytorch基本操作实验报告.ipynb index dc0a7fd..9791e16 100644 --- a/Lab1/Pytorch基本操作实验报告.ipynb +++ b/Lab1/Pytorch基本操作实验报告.ipynb @@ -5,7 +5,7 @@ "id": "3b57686b-7ac8-4897-bf76-3d982b1ff8da", "metadata": {}, "source": [ - "![school-logo](../images/school_logo.png)\n", + "
<div align=\"center\"><img src=\"../images/school_logo.png\" alt=\"school-logo\"></div>\n",
     "\n",
     "<div align=\"center\">\n",
     "<h2>本科生《深度学习》课程</h2>\n",
     "<h1>实验报告</h1>\n",
     "</div>\n",
     "\n",
     "<div align=\"center\">
\n", @@ -36,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 1, "id": "a4e12268-bad4-44c4-92d5-883624d93e25", "metadata": {}, "outputs": [], @@ -69,7 +69,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 2, "id": "79ea46db-cf49-436c-9b5b-c6562d0da9e2", "metadata": {}, "outputs": [ @@ -125,20 +125,36 @@ "print(result3)" ] }, + { + "cell_type": "markdown", + "id": "bd9bd5cc-b6da-4dd6-a599-76498bc5247d", + "metadata": {}, + "source": [ + "第1、2、3种减法形式实质是一样的。\n", + "\n", + "步骤如下:\n", + "1. 对A、B两个张量进行广播,将A、B向广播的方向复制,得到两个$\\max(A.size(0), B.size(0))\\times \\max(A.size(1), B.size(1))$的张量;\n", + "2. 对广播后的两个张量作差,尺寸不变。\n", + "\n", + "第1种减法形式和第2种是等价的,前者是后者的符号化表示。\n", + "\n", + "第3种形式是手动实现的,将上述两个步骤分别手动实现了。但是torch.Tensor还内置了其他机制,这里仅模拟了广播和作差。" + ] + }, { "cell_type": "markdown", "id": "2489a3ad-f6ff-4561-bb26-e02654090b98", "metadata": {}, "source": [ "## 题目2\n", - "1. **利用Tensor创建两个大小分别$3\\times 2$和$4\\times 2$的随机数矩阵P和Q,要求服从均值为0,标准差0.01为的正态分布;**\n", - "2. **对第二步得到的矩阵Q进行形状变换得到Q的转置Q^T;**\n", - "3. **对上述得到的矩阵P和矩阵Q^T求矩阵相乘。**" + "1. **利用Tensor创建两个大小分别$3\\times 2$和$4\\times 2$的随机数矩阵P和Q,要求服从均值为$0$,标准差$0.01$为的正态分布;**\n", + "2. **对第二步得到的矩阵$Q$进行形状变换得到$Q$的转置$Q^T$;**\n", + "3. **对上述得到的矩阵$P$和矩阵$Q^T$求矩阵相乘。**" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 3, "id": "41e4ee02-1d05-4101-b3f0-477bac0277fb", "metadata": {}, "outputs": [ @@ -147,21 +163,21 @@ "output_type": "stream", "text": [ "矩阵 P:\n", - "tensor([[-0.0094, -0.0073],\n", - " [-0.0087, -0.0008],\n", - " [-0.0012, 0.0103]])\n", + "tensor([[-0.0131, 0.0147],\n", + " [ 0.0248, -0.0028],\n", + " [-0.0172, 0.0178]])\n", "矩阵 Q:\n", - "tensor([[ 0.0094, -0.0126],\n", - " [-0.0082, 0.0005],\n", - " [-0.0079, -0.0101],\n", - " [-0.0002, -0.0161]])\n", + "tensor([[ 0.0015, 0.0015],\n", + " [-0.0121, -0.0074],\n", + " [ 0.0072, 0.0039],\n", + " [-0.0032, -0.0061]])\n", "矩阵 QT:\n", - "tensor([[ 0.0094, -0.0082, -0.0079, -0.0002],\n", - " [-0.0126, 0.0005, -0.0101, -0.0161]])\n", + "tensor([[ 0.0015, -0.0121, 0.0072, -0.0032],\n", + " [ 0.0015, -0.0074, 0.0039, -0.0061]])\n", "矩阵相乘的结果:\n", - "tensor([[ 4.8768e-06, 7.3478e-05, 1.4821e-04, 1.2020e-04],\n", - " [-7.1462e-05, 7.1558e-05, 7.7439e-05, 1.4915e-05],\n", - " [-1.4130e-04, 1.4922e-05, -9.4810e-05, -1.6629e-04]])\n" + "tensor([[ 2.8145e-06, 4.9911e-05, -3.6764e-05, -4.7670e-05],\n", + " [ 3.2685e-05, -2.7908e-04, 1.6724e-04, -6.1334e-05],\n", + " [ 1.4138e-06, 7.6416e-05, -5.3995e-05, -5.3379e-05]])\n" ] } ], @@ -199,7 +215,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 4, "id": "951512cd-d915-4d04-959f-eb99d1971e2d", "metadata": {}, "outputs": [ @@ -207,21 +223,37 @@ "name": "stdout", "output_type": "stream", "text": [ - "梯度(dy_3/dx): 2.0\n" + "仅通过y_1传递的梯度: 2.0\n", + "仅通过y_2传递的梯度: 3.0\n", + "dy_3/dx: 5.0\n" ] } ], "source": [ "x = torch.tensor(1.0, requires_grad=True)\n", + "\n", "y_1 = x ** 2\n", "with torch.no_grad():\n", " y_2 = x ** 3\n", + "y_3 = y_1 + y_2\n", + "y_3.backward()\n", + "print(\"仅通过y_1传递的梯度: \", x.grad.item())\n", "\n", - "y3 = y_1 + y_2\n", + "x.grad.data.zero_()\n", + "with torch.no_grad():\n", + " y_1 = x ** 2\n", + "y_2 = x ** 3\n", + "y_3 = y_1 + y_2\n", + "y_3.backward()\n", + "print(\"仅通过y_2传递的梯度: \", x.grad.item())\n", "\n", - "y3.backward()\n", + "x.grad.data.zero_()\n", + "y_1 = x ** 2\n", + "y_2 = x ** 3\n", + "y_3 = y_1 + y_2\n", + "y_3.backward()\n", "\n", - "print(\"梯度(dy_3/dx): \", x.grad.item())" + "print(\"dy_3/dx: \", x.grad.item())" ] }, { @@ -250,7 +282,7 
@@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 5, "id": "e31b86ec-4114-48dd-8d73-fe4e0686419a", "metadata": {}, "outputs": [ @@ -299,12 +331,14 @@ "主要实现:\n", "- 传入参数:`__init__()`\n", "- 对传入的参数进行更新:`step()`\n", - "- 清空传入参数存储的梯度:`zero_grad()`" + "- 清空传入参数存储的梯度:`zero_grad()`\n", + "\n", + "但是有一点需要注意,就是需要将传进来的`params`参数转化为`list`类型。因为`nn.Module`的`parameters()`方法会以``的类型返回模型的参数,但是该类型变量无法像`list`一样使用`for`循环遍历。" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 6, "id": "0297066c-9fc1-448d-bdcb-29a6f1519117", "metadata": {}, "outputs": [ @@ -312,6 +346,8 @@ "name": "stdout", "output_type": "stream", "text": [ + "x的初始值: 1.0\n", + "学习率: 0.1\n", "y.backward()之后,x的梯度: 2.0\n", "optimizer_test.step()之后,x的值: 0.800000011920929\n", "optimizer_test.zero_grad()之后,x的梯度: 0.0\n" @@ -319,9 +355,9 @@ } ], "source": [ - "class My_optimizer:\n", + "class My_Optimizer:\n", " def __init__(self, params: list[torch.Tensor], lr: float):\n", - " self.params = params\n", + " self.params = list(params)\n", " self.lr = lr\n", "\n", " def step(self):\n", @@ -336,9 +372,12 @@ "\n", "# 测试\n", "x = torch.tensor(1.0, requires_grad=True)\n", - "y = x ** 2\n", - "optimizer_test = My_optimizer([x], lr=0.1)\n", + "print(\"x的初始值: \", x.item())\n", "\n", + "optimizer_test = My_Optimizer([x], lr=0.1)\n", + "print(\"学习率: \", optimizer_test.lr)\n", + "\n", + "y = x ** 2\n", "y.backward()\n", "print(\"y.backward()之后,x的梯度: \", x.grad.item())\n", "\n", @@ -364,7 +403,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 7, "id": "8e18695a-d8c5-4f77-8b5c-de40d9240fb9", "metadata": {}, "outputs": [ @@ -446,7 +485,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 8, "id": "e7de7e4b-a084-4793-812e-46e8550ecd8d", "metadata": {}, "outputs": [], @@ -495,7 +534,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 9, "id": "c39fbafb-62e4-4b8c-9d65-6718d25f2970", "metadata": {}, "outputs": [ @@ -566,7 +605,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 10, "id": "5612661e-2809-4d46-96c2-33ee9f44116d", "metadata": {}, "outputs": [ @@ -574,18 +613,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "Epoch 1/10, Loss: 681.7797573804855, Acc: 0.9645753421871553\n", - "Epoch 2/10, Loss: 677.2049961090088, Acc: 0.9990700532071279\n", - "Epoch 3/10, Loss: 677.1804099082947, Acc: 0.9996768410577491\n", - "Epoch 4/10, Loss: 677.175698697567, Acc: 0.9996360650992927\n", - "Epoch 5/10, Loss: 677.1747546195984, Acc: 0.999986148984189\n", - "Epoch 6/10, Loss: 677.1744914650917, Acc: 0.9998796786709696\n", - "Epoch 7/10, Loss: 677.1742819547653, Acc: 0.9999521451026462\n", - "Epoch 8/10, Loss: 677.1738398075104, Acc: 0.9999777880946412\n", - "Epoch 9/10, Loss: 677.1740134358406, Acc: 0.9997993523341308\n", - "Epoch 10/10, Loss: 677.1745718121529, Acc: 0.9998104022783462\n", - "Model weights: -0.0036095045506954193, bias: 0.016485782340168953\n", - "Prediction for test data: 0.5032190084457397\n" + "Epoch 1/10, Loss: 678.0522713065147, Acc: 0.9949060965876567\n", + "Epoch 2/10, Loss: 677.2863736152649, Acc: 0.9980913352860563\n", + "Epoch 3/10, Loss: 677.197151362896, Acc: 0.9993721880397808\n", + "Epoch 4/10, Loss: 677.1782736182213, Acc: 0.9997903927928914\n", + "Epoch 5/10, Loss: 677.1754664182663, Acc: 0.9996946183328581\n", + "Epoch 6/10, Loss: 677.1741757392883, Acc: 0.9999630627469878\n", + "Epoch 7/10, Loss: 677.1742368340492, Acc: 0.9999474390293509\n", + "Epoch 8/10, Loss: 677.1745658516884, 
Acc: 0.9999775205877912\n", + "Epoch 9/10, Loss: 677.1739910840988, Acc: 0.9999218865585965\n", + "Epoch 10/10, Loss: 677.1743568778038, Acc: 0.9998403212619357\n", + "Model weights: -0.0020640366710722446, bias: 0.019105462357401848\n", + "Prediction for test data: 0.504260241985321\n" ] } ], @@ -595,11 +634,11 @@ "batch_size = 1024\n", "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n", "\n", - "dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True, num_workers=5, pin_memory=True)\n", + "dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True, num_workers=14, pin_memory=True)\n", "\n", "model = Model_2_1().to(device)\n", "criterion = My_BCELoss()\n", - "optimizer = My_optimizer(model.parameters(), lr=learning_rate)\n", + "optimizer = My_Optimizer(model.parameters(), lr=learning_rate)\n", "\n", "for epoch in range(num_epochs):\n", " total_epoch_loss = 0\n", @@ -653,7 +692,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 11, "id": "fa121afd-a1af-4193-9b54-68041e0ed068", "metadata": {}, "outputs": [], @@ -679,7 +718,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 12, "id": "93b0fdb6-be8b-4663-b59e-05ed19a9ea09", "metadata": {}, "outputs": [ @@ -687,32 +726,32 @@ "name": "stdout", "output_type": "stream", "text": [ - "Epoch 1/10, Loss: 582.1114150944223, Acc: 0.9622313076669672\n", - "Epoch 2/10, Loss: 565.93256158834, Acc: 0.9999686256703629\n", - "Epoch 3/10, Loss: 565.9305296230643, Acc: 0.9999988205402547\n", - "Epoch 4/10, Loss: 565.9292865398384, Acc: 0.9999988799203948\n", - "Epoch 5/10, Loss: 565.928863850198, Acc: 0.9999991768121363\n", - "Epoch 6/10, Loss: 565.9304914128694, Acc: 0.9999969140456769\n", - "Epoch 7/10, Loss: 565.9264041730053, Acc: 0.9999955753695261\n", - "Epoch 8/10, Loss: 565.9313891761873, Acc: 0.9999980937154029\n", - "Epoch 9/10, Loss: 565.9266170542029, Acc: 0.9999949410275989\n", - "Epoch 10/10, Loss: 565.9337094448973, Acc: 0.9999975812010478\n", - "Model weights: -3.7012964947839575, bias: 1.8774806436910758\n", - "Prediction for test data: 0.13897650708244993\n" + "Epoch 1/10, Loss: 605.8295383291701, Acc: 0.9582065520342055\n", + "Epoch 2/10, Loss: 576.8061408727621, Acc: 0.9847528457476354\n", + "Epoch 3/10, Loss: 569.2995615954951, Acc: 0.9918221342823426\n", + "Epoch 4/10, Loss: 567.0406008049263, Acc: 0.9953926453617279\n", + "Epoch 5/10, Loss: 566.3090309665234, Acc: 0.9973164146429515\n", + "Epoch 6/10, Loss: 566.0625152740248, Acc: 0.9984245151624715\n", + "Epoch 7/10, Loss: 565.9721677527377, Acc: 0.9990875142056446\n", + "Epoch 8/10, Loss: 565.9461858826305, Acc: 0.999437410787883\n", + "Epoch 9/10, Loss: 565.9334115060377, Acc: 0.9996791976966314\n", + "Epoch 10/10, Loss: 565.9350714258352, Acc: 0.9997795372757307\n", + "Model weights: -3.688023801126246, bias: 1.8659015788034263\n", + "Prediction for test data: 0.139179294539555\n" ] } ], "source": [ - "learning_rate = 1e-2\n", + "learning_rate = 5e-2\n", "num_epochs = 10\n", "batch_size = 1024\n", "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n", "\n", - "dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True, num_workers=5, pin_memory=True)\n", + "dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True, num_workers=14, pin_memory=True)\n", "\n", "model = Model_2_2().to(device)\n", "criterion = nn.BCELoss()\n", - "optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)\n", + "optimizer = 
torch.optim.SGD(model.parameters(), lr=learning_rate)\n", "\n", "for epoch in range(num_epochs):\n", " total_epoch_loss = 0\n", @@ -751,7 +790,7 @@ "id": "e6bff679-f8d2-46cc-bdcb-82af7dab38b3", "metadata": {}, "source": [ - "对比发现,使用torch.nn的内置损失函数和优化器,正确率提升更快。\n", + "对比发现,手动实现的损失函数和优化器与torch.nn的内置损失函数和优化器相比,表现差不多。\n", "\n", "但是为什么相同分布的数据集训练出的权重和偏置,以及预测结果存在较大差别,这个问题的原因还有待我探究。" ] @@ -787,7 +826,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 13, "id": "e605f1b0-1d32-410f-bddf-402a85ccc9ff", "metadata": {}, "outputs": [ @@ -851,7 +890,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 14, "id": "759a3bb2-b5f4-4ea5-a2d7-15f0c4cdd14b", "metadata": {}, "outputs": [ @@ -860,15 +899,16 @@ "output_type": "stream", "text": [ "输入:\n", - "tensor([[ 0.4113, 1.0890, -0.4301, -0.1975, 2.2331],\n", - " [ 0.7901, 1.8117, -2.3197, -0.8144, -0.5751],\n", - " [-1.8110, -0.5550, -0.2773, 2.3990, 0.1804]], requires_grad=True)\n", + "tensor([[ 3.3495e+00, -1.5645e+00, 1.3911e-01, 8.2237e-03, 8.6507e-01],\n", + " [ 3.2858e-01, 3.3071e-01, 1.1809e+00, -1.5414e+00, 1.2054e+00],\n", + " [ 9.6236e-04, 1.2167e+00, -1.8887e-01, -9.1634e-01, 1.9415e+00]],\n", + " requires_grad=True)\n", "标签:\n", - "tensor([[0., 1., 0., 0., 0.],\n", + "tensor([[0., 0., 0., 1., 0.],\n", " [1., 0., 0., 0., 0.],\n", - " [0., 0., 0., 1., 0.]])\n", - "My_CrossEntropyLoss损失值: 1.1033374071121216\n", - "nn.CrossEntropyLoss损失值: 1.1033374071121216\n" + " [1., 0., 0., 0., 0.]])\n", + "My_CrossEntropyLoss损失值: 2.652722120285034\n", + "nn.CrossEntropyLoss损失值: 2.652722120285034\n" ] } ], @@ -913,7 +953,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 15, "id": "74322629-8325-4823-b80f-f28182d577c1", "metadata": {}, "outputs": [ @@ -974,7 +1014,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 16, "id": "bb31a75e-464c-4b94-b927-b219a765e35d", "metadata": {}, "outputs": [], @@ -1034,7 +1074,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 17, "id": "d816dae1-5fbe-4c29-9597-19d66b5eb6b4", "metadata": {}, "outputs": [ @@ -1042,21 +1082,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "Epoch 1/10, Loss: nan, Acc: 0.09999999403953552\n", - "Epoch 2/10, Loss: nan, Acc: 0.09999999403953552\n", - "Epoch 3/10, Loss: nan, Acc: 0.09999999403953552\n", - "Epoch 4/10, Loss: nan, Acc: 0.09999999403953552\n", - "Epoch 5/10, Loss: nan, Acc: 0.09999999403953552\n", - "Epoch 6/10, Loss: nan, Acc: 0.09999999403953552\n", - "Epoch 7/10, Loss: nan, Acc: 0.09999999403953552\n", - "Epoch 8/10, Loss: nan, Acc: 0.09999999403953552\n", - "Epoch 9/10, Loss: nan, Acc: 0.09999999403953552\n", - "Epoch 10/10, Loss: nan, Acc: 0.09999999403953552\n" + "Epoch 1/10, Loss: 100.43975830078125, Acc: 0.4251999855041504\n", + "Epoch 2/10, Loss: 45.485450744628906, Acc: 0.5367000102996826\n", + "Epoch 3/10, Loss: 34.95743179321289, Acc: 0.5881999731063843\n", + "Epoch 4/10, Loss: 30.256790161132812, Acc: 0.6238999962806702\n", + "Epoch 5/10, Loss: 27.338891983032227, Acc: 0.6474999785423279\n", + "Epoch 6/10, Loss: 25.360095977783203, Acc: 0.6664999723434448\n", + "Epoch 7/10, Loss: 23.8934326171875, Acc: 0.6789999604225159\n", + "Epoch 8/10, Loss: 22.703121185302734, Acc: 0.6876999735832214\n", + "Epoch 9/10, Loss: 21.799795150756836, Acc: 0.6959999799728394\n", + "Epoch 10/10, Loss: 21.04413414001465, Acc: 0.7023999691009521\n" ] } ], "source": [ - "learning_rate = 5e-3\n", + "learning_rate = 5e-1\n", "num_epochs = 10\n", "batch_size = 
4096\n", "num_classes = 10\n", @@ -1065,17 +1105,17 @@ "transform = transforms.Compose(\n", " [\n", " transforms.ToTensor(),\n", - " transforms.Normalize((0.5,), (0.5,)),\n", + " transforms.Normalize((0.5,), (1.0,)),\n", " ]\n", ")\n", "train_dataset = datasets.FashionMNIST(root=\"./dataset\", train=True, transform=transform, download=True)\n", "test_dataset = datasets.FashionMNIST(root=\"./dataset\", train=False, transform=transform, download=True)\n", - "train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size,shuffle=True, num_workers=4, pin_memory=True)\n", - "test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size,shuffle=True, num_workers=4, pin_memory=True)\n", + "train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size,shuffle=True, num_workers=14, pin_memory=True)\n", + "test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size,shuffle=True, num_workers=14, pin_memory=True)\n", "\n", "model = Model_3_1(num_classes).to(device)\n", "criterion = My_CrossEntropyLoss()\n", - "optimizer = My_optimizer(model.parameters(), lr=learning_rate)\n", + "optimizer = My_Optimizer(model.parameters(), lr=learning_rate)\n", "\n", "for epoch in range(num_epochs):\n", " total_epoch_loss = 0\n", @@ -1109,7 +1149,11 @@ "id": "a49d0165-aeb7-48c0-9b67-956bb08cb356", "metadata": {}, "source": [ - "这里发现梯度爆炸。暂时无法解决。" + "在这里我遇到了梯度爆炸的问题。\n", + "\n", + "原来我在数据预处理中使用`transforms.Normalize((0.5,), (0.5,))`进行归一化,但是这样导致了梯度爆炸。\n", + "\n", + "将第二个参数方差改为1.0后,成功解决了梯度爆炸的问题。" ] }, { @@ -1134,7 +1178,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 18, "id": "0163b9f7-1019-429c-8c29-06436d0a4c98", "metadata": {}, "outputs": [], @@ -1161,29 +1205,29 @@ }, { "cell_type": "code", - "execution_count": 38, - "id": "a58a23e1-368c-430a-ad62-0e256dff564d", + "execution_count": 19, + "id": "6d241c05-b153-4f56-a845-0f2362f6459b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Epoch 1/10, Loss: 15.768913269042969, Acc: 0.7530999779701233\n", - "Epoch 2/10, Loss: 9.122207641601562, Acc: 0.7967000007629395\n", - "Epoch 3/10, Loss: 7.9603657722473145, Acc: 0.8100999593734741\n", - "Epoch 4/10, Loss: 7.427120208740234, Acc: 0.8179000020027161\n", - "Epoch 5/10, Loss: 7.115703582763672, Acc: 0.8248999714851379\n", - "Epoch 6/10, Loss: 6.900459289550781, Acc: 0.8259999752044678\n", - "Epoch 7/10, Loss: 6.802896976470947, Acc: 0.8269000053405762\n", - "Epoch 8/10, Loss: 6.687209606170654, Acc: 0.832099974155426\n", - "Epoch 9/10, Loss: 6.6183180809021, Acc: 0.833299994468689\n", - "Epoch 10/10, Loss: 6.531178951263428, Acc: 0.8341999650001526\n" + "Epoch 1/10, Loss: 18.918968200683594, Acc: 0.7260000109672546\n", + "Epoch 2/10, Loss: 12.184525489807129, Acc: 0.7475000023841858\n", + "Epoch 3/10, Loss: 10.786707878112793, Acc: 0.7612999677658081\n", + "Epoch 4/10, Loss: 10.06576919555664, Acc: 0.7705000042915344\n", + "Epoch 5/10, Loss: 9.591888427734375, Acc: 0.7785999774932861\n", + "Epoch 6/10, Loss: 9.247062683105469, Acc: 0.7856999635696411\n", + "Epoch 7/10, Loss: 8.989615440368652, Acc: 0.7890999913215637\n", + "Epoch 8/10, Loss: 8.772100448608398, Acc: 0.792199969291687\n", + "Epoch 9/10, Loss: 8.593544006347656, Acc: 0.7978000044822693\n", + "Epoch 10/10, Loss: 8.453678131103516, Acc: 0.7997999787330627\n" ] } ], "source": [ - "learning_rate = 5e-3\n", + "learning_rate = 5e-2\n", "num_epochs = 10\n", "batch_size = 4096\n", "num_classes = 10\n", @@ -1197,12 +1241,12 @@ ")\n", "train_dataset = 
datasets.FashionMNIST(root=\"./dataset\", train=True, transform=transform, download=True)\n", "test_dataset = datasets.FashionMNIST(root=\"./dataset\", train=False, transform=transform, download=True)\n", - "train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)\n", - "test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)\n", + "train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=14, pin_memory=True)\n", + "test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True, num_workers=14, pin_memory=True)\n", "\n", "model = Model_3_2(num_classes).to(device)\n", "criterion = nn.CrossEntropyLoss()\n", - "optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)\n", + "optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)\n", "\n", "for epoch in range(num_epochs):\n", " total_epoch_loss = 0\n", @@ -1238,7 +1282,9 @@ "id": "59555b67-1650-4e1a-a98e-7906878bf3d0", "metadata": {}, "source": [ - "与手动实现的softmax回归相比较,nn.CrossEntropyLoss比手动实现的My_CrossEntropyLoss更加稳定,没有出现梯度爆炸的情况。" + "与手动实现的softmax回归相比较,nn.CrossEntropyLoss比手动实现的My_CrossEntropyLoss更加稳定,对输入数据的兼容性更强,没有出现梯度爆炸的情况。\n", + "\n", + "总体表现上,torch.nn的内置功能相对手动实现的功能,正确率提升更快,最终正确率更高。" ] }, { @@ -1250,7 +1296,7 @@ "\n", "通过完成本次Pytorch基本操作实验,让我对Pytorch框架有了更加深入的理解。我接触深度学习主要是在大语言模型领域,比较熟悉微调大模型,但是涉及到底层的深度学习知识,我还有很多短板和不足。这次实验对我这方面的锻炼让我收获良多。\n", "\n", - "首先是数据集的设置。如果数据没有进行归一化,很容易出现梯度爆炸。这是在我以前直接使用图片数据集的经历中没有遇到过的问题。\n", + "首先是数据集的设置。如果数据没有合理进行归一化,很容易出现梯度爆炸。这是在我以前直接使用图片数据集的经历中没有遇到过的问题。\n", "\n", "在实现logistic回归模型时,通过手动实现各个组件如优化器、线性层等,让我对这些模块的工作原理有了更清晰的认识。尤其是在实现广播机制时,需要充分理解张量操作的维度变换规律。而使用Pytorch内置模块进行实现时,通过继承nn.Module可以自动获得许多功能,使代码更加简洁。\n", "\n",
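
Three short notes on the patch follow.

Note on the broadcasting write-up added for 题目1: the new markdown cell describes the third subtraction form as expand-then-subtract. A minimal sketch of those two steps is below; the operand names A and B and the use of Tensor.expand are illustrative assumptions, since the diff itself only shows print(result3).

    import torch

    A = torch.arange(6.0).reshape(3, 2)  # 3x2 operand
    B = torch.tensor([[10.0, 20.0]])     # 1x2 operand

    # Step 1: broadcast both operands to the common shape
    # (max(A.size(0), B.size(0)), max(A.size(1), B.size(1))).
    rows = max(A.size(0), B.size(0))
    cols = max(A.size(1), B.size(1))
    A_b, B_b = A.expand(rows, cols), B.expand(rows, cols)

    # Step 2: element-wise difference; the shape stays (rows, cols).
    result3 = A_b - B_b
    assert torch.equal(result3, A - B)  # agrees with the built-in broadcast subtract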
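Note on the list(params) change in My_Optimizer.__init__: the accompanying cell text says the value returned by nn.Module.parameters() cannot be traversed with a for loop like a list. More precisely, it is a generator, which can be traversed, but only once; since step() and zero_grad() loop over self.params on every call, an unmaterialized generator would be empty from the second call onward. A standalone demonstration (nn.Linear is only an example module here, not the patch's model):

    import torch.nn as nn

    model = nn.Linear(4, 2)

    gen = model.parameters()         # a generator object
    print(sum(1 for _ in gen))       # 2 -- weight and bias
    print(sum(1 for _ in gen))       # 0 -- the generator is now exhausted

    params = list(model.parameters())  # materialize once...
    print(len(params), len(params))    # 2 2 -- ...then reuse as often as needed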
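Note on the gradient-explosion fix itself: transforms.Normalize(mean, std) divides by its second argument, so that argument is a standard deviation rather than the variance the new markdown cell calls it; std=0.5 maps [0, 1] pixel values to [-1, 1], doubling their scale (and the resulting gradients) relative to std=1.0. A complementary and widely used safeguard is to make the hand-written cross-entropy numerically stable with the log-sum-exp trick. The body of My_CrossEntropyLoss is not visible in this diff, so the sketch below is an assumed minimal form, not the author's code:

    import torch
    import torch.nn.functional as F

    def stable_cross_entropy(logits: torch.Tensor, onehot: torch.Tensor) -> torch.Tensor:
        # Subtracting the row-wise max leaves softmax unchanged but keeps
        # exp() from overflowing -- the usual source of NaN losses.
        shifted = logits - logits.max(dim=1, keepdim=True).values
        log_probs = shifted - shifted.exp().sum(dim=1, keepdim=True).log()
        return -(onehot * log_probs).sum(dim=1).mean()

    logits = torch.randn(3, 5) * 100                      # deliberately huge logits
    onehot = torch.eye(5)[torch.tensor([3, 0, 0])]
    print(stable_cross_entropy(logits, onehot))           # finite
    print(F.cross_entropy(logits, onehot.argmax(dim=1)))  # same value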