PyTorch Word Embedding Layer from Scratch

The PyTorch neural library has a torch.nn.Embedding() layer that converts a word integer token to a vector. For example, “the” = 5 might be converted to a vector like [0.1234, -1.1044, 0.9876, 1.0234], assuming the embed_dim = 4. The values of the embedding vector are learned during training.

I tried to look up the source code for the Embedding class but quickly discovered a complex nightmare of dozens of C++ and Python files. So, to test my knowledge, I implemented a PyTorch embedding layer from scratch.



Left: IMDB example using built-in torch.nn.Embedding layer. Right: Same example using a from-scratch MyEmbedding layer.


It was almost too easy because an embedding is just a lookup table where indices represent the word/token and the rows are the embedding values. My demo greatly simplifies by leaving out options like padding (padding_idx) and the max-norm weight constraint.

class MyEmbedding(T.nn.Module):
  # A from-scratch embedding layer: a trainable lookup table where
  # row i holds the embedding vector for token ID i.
  def __init__(self, vocab_size, embed_dim):
    super(MyEmbedding, self).__init__()
    # weight has shape (vocab_size, embed_dim); wrapping it in
    # nn.Parameter registers it so it is updated during training.
    self.weight = \
      T.nn.Parameter(T.zeros((vocab_size, embed_dim), \
        dtype=T.float32))
    T.nn.init.uniform_(self.weight, -0.10, +0.10)
    # T.nn.init.normal_(self.weight)  # mean = 0, stddev = 1

  def forward(self, x):
    # Fancy indexing: each token ID in x selects its row of weight.
    return self.weight[x]

The embedding values are a matrix of trainable weights with size vocab_size by embed_dim. The built-in Embedding layer initializes using Normal with mean = 0, std dev = 1. Just to experiment, for my from-scratch MyEmbedding layer I used Uniform with range (-0.10, +0.10) initialization.

To test my custom embedding layer, I grabbed my standard IMDB movie review LSTM example. I ran the example using the built-in torch.nn.Embedding() layer, and then I edited the program to use the custom MyEmbedding() layer. Both versions worked quite well.

I searched the Internet for information about implementing an embedding layer from scratch and found wildly conflicting and contradictory information. Much of it involved one-hot encoding of the input token. My demo seems to make sense to me and to work fine but there’s a chance I could be wrong somehow.

I can’t think of any scenarios where it would be useful to implement a custom embedding layer but there might be some such situations. I did this experiment just to explore how embeddings work and increase my understanding.



Everyone has mental context embeddings. You can make all kinds of inferences about these three photos. All three women are about to be embedded in jail but only one committed a serious crime.


Demo code. Replace “lt”, “gt”, “lte”, “gte” with Boolean operator symbols. Getting the IMDB data is a major challenge. See jamesmccaffrey.wordpress.com/2022/01/17/imdb-movie-review-sentiment-analysis-using-an-lstm-with-pytorch/

# imdb_lstm.py
# uses preprocessed data instead of built-in data
# batch_first geometry
# PyTorch 1.10.0-CPU Anaconda3-2020.02  Python 3.7.6
# Windows 10/11

import numpy as np
import torch as T
device = T.device('cpu')

# -----------------------------------------------------------

class MyEmbedding(T.nn.Module):
  # Minimal from-scratch embedding layer: a trainable lookup table of
  # shape (vocab_size, embed_dim), indexed directly by token ID.
  def __init__(self, vocab_size, embed_dim):
    super(MyEmbedding, self).__init__()
    table = T.zeros((vocab_size, embed_dim), dtype=T.float32)
    self.weight = T.nn.Parameter(table)  # registered for training
    # Uniform(-0.10, +0.10) init; the built-in layer uses N(0, 1).
    T.nn.init.uniform_(self.weight, -0.10, +0.10)
    # T.nn.init.normal_(self.weight)  # mean = 0, stddev = 1

  def forward(self, x):
    # Each token ID in x selects the corresponding row of the table.
    return self.weight[x]

# -----------------------------------------------------------

class LSTM_Net(T.nn.Module):
  # Binary sentiment classifier: token IDs -> embedding -> LSTM ->
  # dropout -> linear -> sigmoid probability.
  def __init__(self):
    # vocab_size = 129892 for the preprocessed IMDB data
    super(LSTM_Net, self).__init__()
    # Bug fix: as posted, BOTH embedding lines were commented out, so
    # self.embed was never defined and forward() raised AttributeError.
    # Enable the built-in layer; MyEmbedding is a drop-in replacement.
    self.embed = T.nn.Embedding(129892, 32)    # built-in
    # self.embed = MyEmbedding(129892, 32)     # from scratch
    self.lstm = T.nn.LSTM(32, 100, batch_first=True)
    self.do1 = T.nn.Dropout(0.20)
    self.fc1 = T.nn.Linear(100, 1)  # binary
 
  def forward(self, x):
    # x = review/sentence. length = fixed w/ padding (front)
    z = self.embed(x)  # expand each token to 32 values
    z = z.reshape(-1, 50, 32)  # (batch, seq, embed) geometry
    lstm_oupt, (h_n, c_n) = self.lstm(z)
    z = lstm_oupt[:,-1]  # output at last time step, shape [bs,100]
    z = self.do1(z)
    z = T.sigmoid(self.fc1(z))  # probability in [0,1] for BCELoss()
    return z 

# -----------------------------------------------------------

class IMDB_Dataset(T.utils.data.Dataset):
  # Each line of the source file: 50 token IDs then a 0/1 sentiment
  # label, space-delimited; '#' lines are comments.
  def __init__(self, src_file):
    raw = np.loadtxt(src_file, usecols=range(0,51),
      delimiter=" ", comments="#", dtype=np.int64)
    # First 50 columns are token IDs; last column is the label.
    self.x_data = T.tensor(raw[:, :50], dtype=T.int64).to(device)
    labels = T.tensor(raw[:, 50], dtype=T.float32).to(device)
    self.y_data = labels.reshape(-1, 1)  # 2D float32 for BCELoss

  def __len__(self):
    return self.x_data.shape[0]

  def __getitem__(self, idx):
    # Return (tokens, target) pair for one review.
    return (self.x_data[idx], self.y_data[idx])

# -----------------------------------------------------------

def accuracy(model, dataset):
  # Percentage of dataset items the model classifies correctly,
  # thresholding the model's single [0.0, 1.0] output at 0.5.
  # Assumes the caller has already set model.eval().
  num_correct = 0; num_wrong = 0
  ldr = T.utils.data.DataLoader(dataset,
    batch_size=1, shuffle=False)
  for (batch_idx, batch) in enumerate(ldr):
    X = batch[0]  # inputs
    Y = batch[1]  # target sentiment label 0 or 1

    with T.no_grad():
      oupt = model(X)  # single [0.0, 1.0]
    # Bug fix: restored the real comparison operators that the blog
    # rendered as "lt" / "gte" placeholder strings.
    if oupt < 0.5 and Y == 0:
      num_correct += 1
    elif oupt >= 0.5 and Y == 1:
      num_correct += 1
    else:
      num_wrong += 1
    
  acc = (num_correct * 100.0) / (num_correct + num_wrong)
  return acc

# -----------------------------------------------------------

def main():
  # Train and evaluate an LSTM sentiment classifier on preprocessed
  # IMDB reviews (50 tokens per review, front-padded).
  # 0. get started
  print("\nBegin PyTorch IMDB LSTM demo ")
  print("Using only reviews with 50 or less words ")
  T.manual_seed(3)   # reproducible init and batch order
  np.random.seed(3)

  # 1. load data 
  print("\nLoading preprocessed train and test data ")
  # Windows-style relative paths; each line = 50 token IDs + a label.
  train_file = ".\\Data\\imdb_train_50w.txt"
  train_ds = IMDB_Dataset(train_file) 

  test_file = ".\\Data\\imdb_test_50w.txt"
  test_ds = IMDB_Dataset(test_file) 

  bat_size = 16
  train_ldr = T.utils.data.DataLoader(train_ds,
    batch_size=bat_size, shuffle=True, drop_last=False)
  n_train = len(train_ds)
  n_test = len(test_ds)
  print("Num train = %d Num test = %d " % (n_train, n_test))

# -----------------------------------------------------------

  # 2. create network
  print("\nCreating LSTM binary classifier ")
  # NOTE(review): as posted, LSTM_Net has both embedding lines
  # commented out; one must be uncommented or net.embed fails below.
  net = LSTM_Net().to(device)

  # 3. train model
  loss_func = T.nn.BCELoss()  # binary cross entropy
  lrn_rate = 0.001
  optimizer = T.optim.Adam(net.parameters(), lr=lrn_rate)
  max_epochs = 10  #30
  log_interval = 5  # display progress

  print("\nbatch size = " + str(bat_size))
  print("loss func = " + str(loss_func))
  print("optimizer = Adam ")
  print("learn rate = %0.4f " % lrn_rate)
  print("max_epochs = %d " % max_epochs)

  # show initial embedding weights; input() pauses until Enter
  print(net.embed.weight)
  input()

  print("\nStarting training ")
  net.train()  # set training mode
  for epoch in range(0, max_epochs):
    tot_err = 0.0  # accumulated loss for one epoch
    for (batch_idx, batch) in enumerate(train_ldr):
      X = batch[0]  # [bs,50]
      Y = batch[1]
      optimizer.zero_grad()
      oupt = net(X)
      loss_val = loss_func(oupt, Y) 
      tot_err += loss_val.item()
      loss_val.backward()  # compute gradients
      optimizer.step()     # update weights
  
    if epoch % log_interval == 0:
      print("epoch = %4d  |" % epoch, end="")
      print("   loss = %10.4f  |" % tot_err, end="")
      net.eval()  # disable dropout while measuring accuracy
      train_acc = accuracy(net, train_ds)
      print("  acc = %8.2f%%" % train_acc)
      net.train()

  print("Training complete")

  # show trained embedding weights; input() pauses until Enter
  print(net.embed.weight)
  input()

# -----------------------------------------------------------

  # 4. evaluate model
  net.eval()
  test_acc = accuracy(net, test_ds)
  print("\nAccuracy on test data = %8.2f%%" % test_acc)

  # 5. save model
  print("\nSaving trained model state")
  # fn = ".\\Models\\imdb_model.pt"
  # T.save(net.state_dict(), fn)

  # saved_model = Net()
  # saved_model.load_state_dict(T.load(fn))
  # use saved_model to make prediction(s)

  # 6. use model
  print("\nSentiment for \"the movie was a great \
waste of my time\"")
  print("0 = negative, 1 = positive ")
  # token IDs were looked up by hand from the vocabulary
  review = np.array([4, 20, 16, 6, 86, 425, 7, 58, 64],
    dtype=np.int64)  # cheating . . 
  padding = np.zeros(50-len(review), dtype=np.int64)
  review = np.concatenate([padding, review])  # pad at the front
  review = T.tensor(review, dtype=T.int64).to(device)
  
  net.eval()
  with T.no_grad():
    prediction = net(review)  # sigmoid output: a probability, not log-probs
  print("raw output : ", end="")
  print("%0.4f " % prediction.item())
  
  print("\nEnd PyTorch IMDB LSTM sentiment demo")

if __name__ == "__main__":
  main()
This entry was posted in PyTorch. Bookmark the permalink.