A PyTorch Neural Network without Using the torch.nn Module

Just out of curiosity, I decided to try and implement a PyTorch neural network at a low level. By that I mean not using the torch.nn module which encapsulates a lot of functionality.

I coded up a demo. There were a lot more details to take care of than I thought there’d be, and writing the code took a lot longer than I thought it would.

This blog post doesn't introduce any new general concepts — it's all about the code details. I have implemented hundreds of neural networks from scratch using C# and Python, and I have implemented hundreds of neural networks using PyTorch. Even so, I had quite a difficult time implementing a PyTorch neural network without using the torch.nn module.

But I learned a lot from the conceptual challenge and so it was time well spent.



Three Disneyland ride concepts by Disney artist Bruce Bushman (1911-1972). Left: Trip to the Moon attraction in Tomorrowland. Center: Dumbo attraction in Fantasyland. Right: Jungle Cruise attraction in Adventureland.


Demo code:

# iris_scratch.py
# Iris data without using torch.nn

# inspired by a similar experiment at:
# https://pytorch.org/tutorials/beginner/nn_tutorial.html

import numpy as np
import torch as T
device = T.device('cpu')

# -----------------------------------------------------------

class BasicNet():  # 4-10-3
  """Iris classifier: 4 inputs -> 10 tanh hidden nodes -> 3 log-softmax
  outputs.

  Weights and biases are plain leaf tensors with requires_grad=True so
  that a training loop can update them manually (no torch.nn involved).
  """

  def __init__(self, device=None):
    # Optional device keeps the old no-argument constructor working
    # while removing the dependence on a module-level global.
    if device is None:
      device = T.device('cpu')
    # Create tensors directly on the target device. The pattern
    # randn(..., requires_grad=True).to(device) yields a NON-leaf
    # tensor when device != cpu, so .grad would stay None and manual
    # in-place weight updates would silently break.
    self.ih_wts = T.randn((4, 10), dtype=T.float32,
      device=device, requires_grad=True)
    self.h_bias = T.zeros(10, dtype=T.float32,
      device=device, requires_grad=True)
    self.ho_wts = T.randn((10, 3), dtype=T.float32,
      device=device, requires_grad=True)
    self.o_bias = T.zeros(3, dtype=T.float32,
      device=device, requires_grad=True)

  def log_softmax(self, x):
    # Numerically stable log-softmax: shift each row by its max before
    # exponentiating so exp() cannot overflow for large logits.
    # Mathematically identical to x - log(sum(exp(x))).
    shifted = x - x.max(dim=-1, keepdim=True)[0]
    return shifted - shifted.exp().sum(-1).log().unsqueeze(-1)

  def __call__(self, x):
    # x is [bs,4]; returns [bs,3] log-probabilities
    h = T.tanh(T.matmul(x, self.ih_wts) + self.h_bias)
    o = self.log_softmax(T.matmul(h, self.ho_wts) + 
      self.o_bias)
    return o

# -----------------------------------------------------------

class IrisDataset(T.utils.data.Dataset):
  """Iris data as a torch Dataset.

  Each file line looks like "5.0, 3.5, 1.3, 0.3, 0": four float
  predictor values followed by an integer class label (0, 1 or 2).
  """

  def __init__(self, src_file, num_rows=None, device=None):
    # Optional device keeps the old two-argument call working while
    # removing the dependence on a module-level global.
    if device is None:
      device = T.device('cpu')
    # Read the file once and split columns (the original version read
    # the same file twice, once for predictors and once for labels).
    all_xy = np.loadtxt(src_file, max_rows=num_rows,
      usecols=range(0, 5), delimiter=",", skiprows=0,
      dtype=np.float32)
    all_xy = all_xy.reshape(-1, 5)  # loadtxt returns 1-D for a single row
    tmp_x = all_xy[:, 0:4]
    tmp_y = all_xy[:, 4].astype(np.int64)  # labels are small ints; exact cast

    self.x_data = T.tensor(tmp_x, dtype=T.float32).to(device)
    self.y_data = T.tensor(tmp_y, dtype=T.int64).to(device)

  def __len__(self):
    # Number of data items.
    return len(self.x_data)

  def __getitem__(self, idx):
    # A DataLoader sampler may pass a tensor index; convert it first.
    if T.is_tensor(idx):
      idx = idx.tolist()
    preds = self.x_data[idx]  # predictors, shape [4]
    spcs = self.y_data[idx]   # species label, int64 scalar
    return preds, spcs

# -----------------------------------------------------------

def nll_loss(predicted, target):
  """Mean negative log-likelihood.

  predicted : [bs, nc] log-probabilities (e.g. log_softmax output)
  target    : [bs] int64 class labels
  """
  # Pick out each row's log-probability at its true class, then
  # average and negate.
  rows = T.arange(target.shape[0])
  picked = predicted[rows, target]
  return -(picked.mean())

# -----------------------------------------------------------

def accuracy(model, dataset):
  """Return classification accuracy of model over dataset in [0.0, 1.0].

  Assumes the caller has already put the model in eval mode where that
  applies. The model's output is argmax-ed per row, so it works for
  logits or log-probabilities alike.
  """
  n = len(dataset)
  if n == 0:
    return 0.0  # avoid division by zero on an empty dataset

  # One full-dataset batch instead of the original batch_size=1 loop:
  # identical result, a single forward pass instead of n of them.
  dataldr = T.utils.data.DataLoader(dataset, batch_size=n,
    shuffle=False)
  X, Y = next(iter(dataldr))
  with T.no_grad():
    oupt = model(X)  # [n, nc] scores

  preds = T.argmax(oupt, dim=1)        # predicted class per item
  n_correct = (preds == Y).sum().item()
  return n_correct / n

# -----------------------------------------------------------

def main():
  # End-to-end demo: load the Iris training data, train the
  # from-scratch network with hand-coded SGD, measure accuracy, and
  # predict the species of one previously unseen item.
  print("\nBegin Iris problem with no torch.nn ")

  # 0. prepare - seed both generators for reproducibility
  T.manual_seed(1)
  np.random.seed(1)

  # 1. load data (Windows-style relative path; first 120 rows)
  train_file = ".\\Data\\iris_train.txt"
  train_ds = IrisDataset(train_file, num_rows=120)
  
  bat_size = 4
  train_ldr = T.utils.data.DataLoader(train_ds,
    batch_size=bat_size, shuffle=True)
  
  # 2. create network
  net = BasicNet()

  # 3. train model
  max_epochs = 80
  ep_log_interval = 10  # print progress every 10 epochs
  lr = 0.01

  loss_func = nll_loss  # custom NLL loss defined above
  # optimizer = T.optim.SGD(net.parameters(),
  #   lr=lrn_rate)  # no parameters
  # (no optimizer object: the update and zero-grad steps below do the
  # equivalent of SGD by hand)

  print("\nbat_size = %3d " % bat_size)
  print("loss = " + " custom nll_loss" )
  print("optimizer = custom code")
  print("max_epochs = %3d " % max_epochs)
  print("lrn_rate = %0.3f " % lr)

  print("\nStarting training")
  for epoch in range(0, max_epochs):
    epoch_loss = 0  # accumulated loss for one full epoch
  
    for (batch_idx, batch) in enumerate(train_ldr):
      X = batch[0]  # [bat_size,4] predictors (here bat_size = 4)
      Y = batch[1]  # [bat_size] labels; already flattened by Dataset
      oupt = net(X)
      loss_val = loss_func(oupt, Y)  # a tensor
      epoch_loss += loss_val.item()  # accumulate
 
      loss_val.backward()  # compute gradients
        
      # equivalent of torch optimizer.step(): plain SGD, done with
      # no_grad so the leaf variables are updated in place without
      # being recorded in the autograd graph
      with T.no_grad():  # update weights
        net.ih_wts -= net.ih_wts.grad * lr  
        net.h_bias -= net.h_bias.grad * lr
        net.ho_wts -= net.ho_wts.grad * lr
        net.o_bias -= net.o_bias.grad * lr

      # equivalent of torch optimizer.zero_grad(): gradients
      # accumulate by default, so reset before the next backward()
      net.ih_wts.grad.zero_()  # get ready for next update
      net.h_bias.grad.zero_()
      net.ho_wts.grad.zero_()
      net.o_bias.grad.zero_()

    if epoch % ep_log_interval == 0:
      print("epoch = %6d   |   loss = %12.4f " % \
      (epoch, epoch_loss) )
      
  print("Done ")

  # 4. evaluate model accuracy
  print("\nComputing model accuracy")
  acc = accuracy(net, train_ds)  # item-by-item
  print("Accuracy on train data = %0.4f" % acc)
  
  # 5. make a prediction for one unlabeled item
  print("\nPredicting species for [6.1, 3.1, 5.1, 1.1]: ")
  unk = np.array([[6.1, 3.1, 5.1, 1.1]], dtype=np.float32)
  unk = T.tensor(unk, dtype=T.float32).to(device) 

  with T.no_grad():
    logits = net(unk)  # log-softmax values; do not sum to 1.0
  probs = T.softmax(logits, dim=1)  # convert to pseudo-probabilities
  T.set_printoptions(precision=4)
  print(probs)

  print("\nEnd ")

# -----------------------------------------------------------

# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
  main()
This entry was posted in PyTorch. Bookmark the permalink.