A PyTorch CIFAR-10 Demo Using Raw Data Instead of TorchVision

Virtually all the PyTorch CIFAR-10 image recognition examples I’ve seen use the built-in TorchVision CIFAR-10 dataset. Using a built-in dataset is fine for experiments but I wanted to code up a CIFAR-10 demo from raw data so that I’d be sure I knew how everything works.

CIFAR-10 is a dataset of 50,000 training images and 10,000 test images. For my demo, I pruned the raw data down to 5,000 training images and 1,000 test images. Each CIFAR-10 image is color (i.e., 3 channels), and just 32×32 pixels in size. Each pixel channel is an integer between 0 and 255. The images are one of 10 classes: plane, car, bird, cat, deer, dog, frog, horse, ship, truck.

After a couple of hours of work (while sitting on an airplane), I got a demo up and running. The demo reached nearly 100% accuracy on the training data (which is expected — a network of this capacity can essentially memorize such a small training set), and 47.7% accuracy on the test data (which is pretty decent for only using 1/10 of the available training images).

Converting the raw CIFAR-10 data to text files was an interesting challenge. See https://jamesmccaffreyblog.com/2022/03/10/fetching-cifar-10-data-and-saving-as-a-text-file/. For my prediction model, I used a relatively simple architecture that I borrowed from the PyTorch documentation example, and that I used before with the TorchVision built-in data. See https://jamesmccaffreyblog.com/2020/10/29/yet-another-cifar-10-example-using-pytorch/.

A good way to use time on an airplane flight.



Image classification isn’t easy. Here are three portraits, classified by Internet searches for “Italian”, “Asian”, and “Russian”. Left: By Pier Toffoletti. Center: By Dodi Ballada. Right: By Galya Bukova. If I were given the images and the labels, I don’t think I could match them.


Demo code:

# cifar_raw_data_cnn.py
# PyTorch 1.10.0-CPU Anaconda3-2020.02  Python 3.7.6
# Windows 10/11 

import numpy as np
import torch as T
# Entire demo runs on the CPU; the pruned dataset is small enough
# that a GPU isn't needed.
device = T.device('cpu')

# -----------------------------------------------------------

class CIFAR10_Dataset(T.utils.data.Dataset):
  # Each line of the source text file holds 3072 comma-delimited
  # pixel values (0-255) followed by the class label (0-9).
  def __init__(self, src_file):
    # pull all 3073 columns in one pass as float32
    raw = np.loadtxt(src_file, usecols=range(0,3073),
      delimiter=",", comments="#", dtype=np.float32)

    pixels = raw[:, 0:3072]   # every row, first 3072 columns
    pixels /= 255.0           # scale channel values into [0, 1]
    pixels = pixels.reshape(-1, 3, 32, 32)  # (n, chnls, H, W)
    labels = raw[:, 3072]     # final column; must stay 1-D

    self.x_data = T.tensor(pixels,
      dtype=T.float32).to(device)
    self.y_data = T.tensor(labels,
      dtype=T.int64).to(device)

  def __len__(self):
    return self.x_data.shape[0]

  def __getitem__(self, idx):
    # return (image tensor, integer label) for one item
    return (self.x_data[idx], self.y_data[idx])

# -----------------------------------------------------------

class Net(T.nn.Module):
  # LeNet-style CNN: two conv+pool stages feeding a
  # 400-120-84-10 fully connected classifier head.
  def __init__(self):
    super(Net, self).__init__()
    self.conv1 = T.nn.Conv2d(3, 6, 5)   # in-chnls, out-chnls, kernel
    self.conv2 = T.nn.Conv2d(6, 16, 5)
    self.pool = T.nn.MaxPool2d(2, 2)    # kernel, stride
    self.fc1 = T.nn.Linear(16 * 5 * 5, 120)
    self.fc2 = T.nn.Linear(120, 84)
    self.fc3 = T.nn.Linear(84, 10)

  def forward(self, x):
    relu = T.nn.functional.relu
    x = self.pool(relu(self.conv1(x)))  # 32x32 -> 28x28 -> 14x14
    x = self.pool(relu(self.conv2(x)))  # 14x14 -> 10x10 -> 5x5
    x = x.view(-1, 16 * 5 * 5)          # flatten to 400 features
    x = relu(self.fc1(x))
    x = relu(self.fc2(x))
    # raw logits -- no activation because CrossEntropyLoss()
    # applies log-softmax itself
    return self.fc3(x)

# -----------------------------------------------------------

def accuracy(model, ds):
  # Fraction of items in ds whose max-logit class matches the
  # target label. The whole dataset is scored as one batch.
  single_batch = T.utils.data.DataLoader(ds,
    batch_size=len(ds), shuffle=False)
  num_right = 0
  for (pixels, labels) in single_batch:
    with T.no_grad():       # inference only -- skip autograd
      logits = model(pixels)
    (_, preds) = T.max(logits, 1)
    num_right += (preds == labels).sum().item()

  return num_right / len(ds)

# -----------------------------------------------------------

def main():
  # 0. reproducibility and banner
  print("\nBegin CIFAR-10 with raw data CNN demo ")
  np.random.seed(1)
  T.manual_seed(1)

  # 1. build Dataset objects from the raw text files
  print("\nCreating 5000 train and 1000 test datasets ")
  train_ds = CIFAR10_Dataset(".\\Data\\cifar10_train_5000.txt")
  test_ds = CIFAR10_Dataset(".\\Data\\cifar10_test_1000.txt")

  bat_size = 10
  train_ldr = T.utils.data.DataLoader(train_ds,
    batch_size=bat_size, shuffle=True)

  # 2. create network
  print("\nCreating CNN with 2 conv and 400-120-84-10 ")
  net = Net().to(device)

  # 3. training hyperparameters, loss, optimizer
  max_epochs = 100
  ep_log_interval = 10
  lrn_rate = 0.005
  loss_func = T.nn.CrossEntropyLoss()  # applies log-softmax itself
  optimizer = T.optim.SGD(net.parameters(), lr=lrn_rate)

  print("\nbat_size = %3d " % bat_size)
  print("loss = " + str(loss_func))
  print("optimizer = SGD")
  print("max_epochs = %3d " % max_epochs)
  print("lrn_rate = %0.3f " % lrn_rate)

  print("\nStarting training")
  net = net.train()
  for epoch in range(0, max_epochs):
    epoch_loss = 0  # running total for this epoch
    for (X, Y) in train_ldr:  # X = pixel batch, Y = labels
      optimizer.zero_grad()
      loss_obj = loss_func(net(X), Y)
      epoch_loss += loss_obj.item()
      loss_obj.backward()
      optimizer.step()
    if epoch % ep_log_interval == 0:
      print("epoch = %4d  | loss = %10.4f  | " %
        (epoch, epoch_loss), end="")
      net.eval()  # toggle to eval mode just for scoring
      acc = accuracy(net, train_ds)
      net.train()
      print(" acc = %6.4f " % acc)
  print("Done ")

  # 4. evaluate model accuracy on held-out data
  print("\nComputing model accuracy")
  net.eval()
  acc_test = accuracy(net, test_ds)  # entire test set at once
  print("Accuracy on test data = %0.4f" % acc_test)

  # 5. TODO: save trained model
  # 6. TODO: use model to make a prediction

  print("\nEnd CIFAR-10 CNN demo ")

if __name__ == "__main__":
  main()
This entry was posted in PyTorch. Bookmark the permalink.