fix exp2 pytorch rewrite fatal train issue

2025-11-30 22:01:56 +08:00
parent 48fcdfcc80
commit 43b807679f
13 changed files with 738 additions and 112 deletions
--- a/exp2/models/.gitignore
+++ b/exp2/models/.gitignore
@@ -1,2 +1,3 @@
 # Ignore every saved model files
-*.pth
+*.pth
+*.ckpt
--- a/exp2/modified/dataset.py
+++ b/exp2/modified/dataset.py
@@ -3,47 +3,9 @@ import numpy
 import torch
 from torch.utils.data import DataLoader, Dataset
 from torchvision.transforms import v2 as tvtrans
-from torchvision import datasets
-import torch.nn.functional as F
-
-
-class CNN(torch.nn.Module):
-    """卷积神经网络模型"""
-
-    def __init__(self):
-        super(CNN, self).__init__()
-
-        self.conv1 = torch.nn.Conv2d(1, 32, kernel_size=(3, 3))
-        self.pool1 = torch.nn.MaxPool2d(kernel_size=(2, 2))
-        self.conv2 = torch.nn.Conv2d(32, 64, kernel_size=(3, 3))
-        self.pool2 = torch.nn.MaxPool2d(kernel_size=(2, 2))
-        self.conv3 = torch.nn.Conv2d(64, 64, kernel_size=(3, 3))
-        self.flatten = torch.nn.Flatten()
-        # 28x28过第一轮卷积后变为26x26，过第一轮池化后变为13x13
-        # 过第二轮卷积后变为11x11，过第二轮池化后变为5x5
-        # 过第三轮卷积后变为3x3。
-        # 最后一轮卷积核个数为64。
-        self.fc1 = torch.nn.Linear(64 * 3 * 3, 64)
-        torch.nn.init.xavier_normal_(self.fc1.weight)
-        torch.nn.init.zeros_(self.fc1.bias)
-        self.fc2 = torch.nn.Linear(64, 10)
-        torch.nn.init.xavier_normal_(self.fc2.weight)
-        torch.nn.init.zeros_(self.fc2.bias)
-
-    def forward(self, x):
-        x = F.relu(self.conv1(x))
-        x = self.pool1(x)
-        x = F.relu(self.conv2(x))
-        x = self.pool2(x)
-        x = F.relu(self.conv3(x))
-        x = self.flatten(x)
-        x = F.relu(self.fc1(x))
-        x = self.fc2(x)
-        return F.softmax(x, dim=1)
-

 class MnistDataset(Dataset):
-    """用于加载Mnist的自定义数据集"""
+    """用于加载Mnist数据的自定义数据集"""

    shape: int
    transform: tvtrans.Transform
@@ -101,9 +63,10 @@ class MnistDataSource:
        ])

        # 创建数据集
-        train_dataset = MnistDataset(train_images, train_labels, transform=trans)
-        test_dataset = MnistDataset(test_images, test_labels, transform=trans)
-
+        train_dataset = MnistDataset(train_images, train_labels, 
+                                     transform=trans)
+        test_dataset = MnistDataset(test_images, test_labels, 
+                                    transform=trans)

        # 赋值到自身
        self.train_loader = DataLoader(dataset=train_dataset,
--- a/exp2/modified/model.py
+++ b/exp2/modified/model.py
@@ -0,0 +1,53 @@
+import torch
+import torch.nn.functional as F
+
+class Cnn(torch.nn.Module):
+    """CNN for 28x28 single-channel digit images; forward() returns raw logits."""
+
+    def __init__(self):
+        super().__init__()
+
+        self.conv1 = torch.nn.Conv2d(1, 32, kernel_size=(3, 3))
+        self.pool1 = torch.nn.MaxPool2d(kernel_size=(2, 2))
+        self.conv2 = torch.nn.Conv2d(32, 64, kernel_size=(3, 3))
+        self.pool2 = torch.nn.MaxPool2d(kernel_size=(2, 2))
+        self.conv3 = torch.nn.Conv2d(64, 64, kernel_size=(3, 3))
+        self.flatten = torch.nn.Flatten()
+        # Spatial size: 28x28 -> 26x26 after conv1 -> 13x13 after pool1
+        # -> 11x11 after conv2 -> 5x5 after pool2
+        # -> 3x3 after conv3.
+        # conv3 has 64 output channels, so the flattened size is 64 * 3 * 3.
+        self.fc1 = torch.nn.Linear(64 * 3 * 3, 64)
+        self.fc2 = torch.nn.Linear(64, 10)
+
+        # Initialise the model parameters.
+        self._initialize_weights()
+
+    def _initialize_weights(self):
+        """Initialise both Linear layers like the TensorFlow/Keras Dense defaults."""
+        # Keras defaults: kernel_initializer='glorot_uniform', bias_initializer='zeros'.
+        # Glorot *uniform* is xavier_uniform_ in PyTorch; xavier_normal_ would be
+        # glorot_normal, which does not match the TensorFlow original.
+        torch.nn.init.xavier_uniform_(self.fc1.weight)
+        torch.nn.init.zeros_(self.fc1.bias)
+        torch.nn.init.xavier_uniform_(self.fc2.weight)
+        torch.nn.init.zeros_(self.fc2.bias)
+
+    def forward(self, x):
+        """Map a (batch, 1, 28, 28) float tensor to (batch, 10) raw logits."""
+        x = F.relu(self.conv1(x))
+        x = self.pool1(x)
+        x = F.relu(self.conv2(x))
+        x = self.pool2(x)
+        x = F.relu(self.conv3(x))
+        x = self.flatten(x)
+        x = F.relu(self.fc1(x))
+        x = self.fc2(x)
+        # NOTE: never return F.softmax(x, dim=1) here.
+        # This code was ported from TensorFlow, whose cross-entropy loss takes
+        # probabilities, whereas PyTorch's cross-entropy expects logits, i.e.
+        # the scores *before* softmax. So the raw output is returned, and any
+        # caller that needs probabilities must apply softmax itself.
+        return x
+
+
--- a/exp2/modified/predict.py
+++ b/exp2/modified/predict.py
@@ -1,18 +1,21 @@
 from pathlib import Path
 import sys
-import torch
 import numpy
+import torch
+import torch.nn.functional as F
 from PIL import Image, ImageFile
 import matplotlib.pyplot as plt
-from mnist import CNN
+from model import Cnn

 sys.path.append(str(Path(__file__).resolve().parent.parent.parent))
-import gpu_utils
+import pytorch_gpu_utils


 class PredictResult:
+    """预测的结果"""

    possibilities: torch.Tensor
+    """预测结果，是每个数字不同的概率，是经过softmax后的数值"""

    def __init__(self, possibilities: torch.Tensor):
        self.possibilities = possibilities
@@ -29,47 +32,54 @@ class PredictResult:

 class Predictor:
    device: torch.device
-    cnn: CNN
+    model: Cnn

    def __init__(self):
-        self.device = gpu_utils.get_gpu_device()
-        self.cnn = CNN().to(self.device)
+        self.device = pytorch_gpu_utils.get_gpu_device()
+        self.model = Cnn().to(self.device)

        # 加载保存好的模型参数
        file_path = Path(__file__).resolve().parent.parent / 'models' / 'cnn.pth'
-        self.cnn.load_state_dict(torch.load(file_path))
+        self.model.load_state_dict(torch.load(file_path))
+
+    def generic_predict(self, in_data: torch.Tensor) -> PredictResult:
+        """
+        Shared prediction backend: the other predict_* methods build a tensor and pass it here.
+
+        :param in_data: input tensor; its shape must be 28x28 and its dtype float32.
+        """
+        # Move the tensor to the device the model lives on.
+        in_data = in_data.to(self.device)
+        # The model needs two extra leading dimensions, so unsqueeze twice:
+        # once for the grayscale channel and once for the batch.
+        # This is equivalent to running with batch size = 1.
+        in_data = in_data.unsqueeze(0).unsqueeze(0)
+        # Run the model; it outputs values without softmax, so softmax is applied here.
+        with torch.no_grad():
+            out_data = self.model(in_data)
+            out_data = F.softmax(out_data, dim=-1)
+            return PredictResult(out_data)
+       

    def predict_sketchpad(self, image: list[list[bool]]) -> PredictResult:
-        input = torch.Tensor(image).float().to(self.device)
+        input = torch.Tensor(image).float()
        assert(input.dim() == 2)
        assert(input.size(0) == 28)
        assert(input.size(1) == 28)

-        # 为了满足要求，要在第一维度挤出2下
-        # 一次是灰度通道，一次是批次。
-        # 相当于batch size = 1的计算
-        input = input.unsqueeze(0).unsqueeze(0)
-       
-        # 预测
-        with torch.no_grad():
-            output = self.cnn(input)
-            return PredictResult(output)
+        return self.generic_predict(input)

    def predict_image(self, image: ImageFile.ImageFile) -> PredictResult:
        # 确保图像为灰度图像，然后转换为numpy数组。
        # 注意这里的numpy数组是只读的，所以要先拷贝一份
        grayscale_image = image.convert('L')
        numpy_data = numpy.reshape(grayscale_image, (28, 28), copy=True)
-        # 转换到Tensor，设置dtype并传到GPU上
-        data = torch.from_numpy(numpy_data).float().to(self.device)
+        # 转换到Tensor，设置dtype
+        data = torch.from_numpy(numpy_data).float()
        # 归一化到255，又因为图像输入是白底黑字，需要做转换。
        data.div_(255.0).sub_(1).mul_(-1)

-        # 同理，挤出维度并预测
-        input = data.unsqueeze(0).unsqueeze(0)
-        with torch.no_grad():
-            output = self.cnn(input)
-            return PredictResult(output)
+        return self.generic_predict(data)

 def main():
    predictor = Predictor()
@@ -91,5 +101,6 @@ def main():


 if __name__ == "__main__":
+    pytorch_gpu_utils.print_gpu_availability()
    main()
    
--- a/exp2/modified/sketchpad.py
+++ b/exp2/modified/sketchpad.py
@@ -2,11 +2,10 @@ from pathlib import Path
 import sys
 import typing
 import tkinter as tk
-from tkinter import messagebox
 from predict import PredictResult, Predictor

 sys.path.append(str(Path(__file__).resolve().parent.parent.parent))
-import gpu_utils
+import pytorch_gpu_utils


 class SketchpadApp:
@@ -182,7 +181,8 @@ class SketchpadApp:
    # endregion

 if __name__ == "__main__":
-    gpu_utils.print_gpu_availability()
+    pytorch_gpu_utils.print_gpu_availability()
+    
    predictor = Predictor()

    root = tk.Tk()
--- a/exp2/modified/train.py
+++ b/exp2/modified/train.py
@@ -7,10 +7,11 @@ import ignite.engine
 import ignite.metrics
 from ignite.engine import Engine, Events
 from ignite.handlers.tqdm_logger import ProgressBar
-from mnist import CNN, MnistDataSource
+from dataset import MnistDataSource
+from model import Cnn

 sys.path.append(str(Path(__file__).resolve().parent.parent.parent))
-import gpu_utils
+import pytorch_gpu_utils


 class Trainer:
@@ -19,12 +20,12 @@ class Trainer:

    device: torch.device
    data_source: MnistDataSource
-    model: CNN
+    model: Cnn

    def __init__(self):
-        self.device = gpu_utils.get_gpu_device()
+        self.device = pytorch_gpu_utils.get_gpu_device()
        self.data_source = MnistDataSource(Trainer.N_BATCH_SIZE)
-        self.model = CNN().to(self.device)
+        self.model = Cnn().to(self.device)
        # 展示模型结构。批次为指定批次数量，通道只有一个灰度通道，大小28x28。
        torchinfo.summary(self.model, (Trainer.N_BATCH_SIZE, 1, 28, 28))

@@ -101,7 +102,7 @@ def main():

 #     device: torch.device
 #     data_source: MnistDataSource
-#     model: CNN
+#     model: Cnn

 #     trainer: Engine
 #     train_evaluator: Engine
@@ -109,7 +110,7 @@ def main():

 #     def __init__(self):
 #         self.device = gpu_utils.get_gpu_device()
-#         self.model = CNN().to(self.device)
+#         self.model = Cnn().to(self.device)
 #         self.data_source = MnistDataSource(batch_size=N_BATCH_SIZE)
 #         # 展示模型结构。批次为指定批次数量，通道只有一个灰度通道，大小28x28。
 #         torchinfo.summary(self.model, (N_BATCH_SIZE, 1, 28, 28))
@@ -188,5 +189,5 @@ def main():


 if __name__ == "__main__":
-    gpu_utils.print_gpu_availability()
+    pytorch_gpu_utils.print_gpu_availability()
    main()
--- a/exp2/source/predict.py
+++ b/exp2/source/predict.py
@@ -5,13 +5,6 @@ import matplotlib.pyplot as plt

 from train import CNN

-'''
-python 3.9 
-tensorflow 2.0.0b0
-pillow(PIL) 4.3.0
-''' 
-
-
 class Predict(object):
    def __init__(self):
        latest = tf.train.latest_checkpoint('./ckpt')
--- a/exp2/source/train.py
+++ b/exp2/source/train.py
@@ -1,11 +1,13 @@
-import os
+from pathlib import Path
+import sys
 import tensorflow as tf
-from tensorflow.keras import datasets, layers, models
+import keras
+from keras import datasets, layers, models
+
+sys.path.append(str(Path(__file__).resolve().parent.parent.parent))
+import tensorflow_gpu_util
+

-'''
-python 3.9
-tensorflow 2.0.0b0
-''' 
 class CNN(object):
    def __init__(self):
        model = models.Sequential()
@@ -26,8 +28,7 @@ class CNN(object):
 class DataSource(object):
    def __init__(self):
        # mnist数据集存储的位置，如何不存在将自动下载
-        data_path = os.path.abspath(os.path.dirname(
-            __file__)) + '.'
+        data_path = Path(__file__).resolve().parent.parent / 'datasets' / 'mnist.npz'
        (train_images, train_labels), (test_images,
                                       test_labels) = datasets.mnist.load_data(path=data_path)
        # 6万张训练图片，1万张测试图片
@@ -43,19 +44,20 @@ class Train:
        self.cnn = CNN()
        self.data = DataSource()
    def train(self):
-        check_path = './ckpt/cp-{epoch:04d}.ckpt'
+        check_path = Path(__file__).resolve().parent.parent / 'models' / 'cnn.ckpt'
        # period 每隔5epoch保存一次
-        save_model_cb = tf.keras.callbacks.ModelCheckpoint(
-            check_path, save_weights_only=True, verbose=1, period=5)
+        save_model_cb = keras.callbacks.ModelCheckpoint(
+            str(check_path), save_weights_only=True, verbose=1, period=5)
        self.cnn.model.compile(optimizer='adam',
                               loss='sparse_categorical_crossentropy',
                               metrics=['accuracy'])
        self.cnn.model.fit(self.data.train_images, self.data.train_labels,
-                           epochs=5, callbacks=[save_model_cb])
+                           epochs=5, batch_size=1000, callbacks=[save_model_cb])
        test_loss, test_acc = self.cnn.model.evaluate(
            self.data.test_images, self.data.test_labels)
-        print("准确率: %.4f，共测试了%d张图片 " % (test_acc, len(self.data.test_labels)))
+        print("准确率: %.4f, 共测试了%d张图片 " % (test_acc, len(self.data.test_labels)))

 if __name__ == "__main__":
-    app = Train()
-    app.train()
+    tensorflow_gpu_util.print_gpu_availability()
+    #app = Train()
+    #app.train()