The PyTorch neural library has a torch.nn.Embedding() layer that converts a word integer token to a vector. For example, “the” = 5 might be converted to a vector like [0.1234, -1.1044, 0.9876, 1.0234], assuming the embed_dim = 4. The values of the embedding vector are learned during training.
I tried to look up the source code for the Embedding class but quickly discovered a complex nightmare of dozens of C++ and Python files. So, to test my knowledge, I implemented a PyTorch embedding layer from scratch.

Left: IMDB example using built-in torch.nn.Embedding layer. Right: Same example using a from-scratch MyEmbedding layer.
It was almost too easy because an embedding is just a lookup table where indices represent the word/token and the rows are the embedding values. My demo greatly simplifies by leaving out options like padding (padding_idx) and embedding-weight normalization (max_norm).
class MyEmbedding(T.nn.Module):
    """A minimal embedding layer: a trainable (vocab_size, embed_dim) lookup table.

    Each integer token ID selects one row of the weight matrix; the row
    values are learned during training like any other parameter.
    """
    def __init__(self, vocab_size, embed_dim):
        super(MyEmbedding, self).__init__()
        table = T.zeros((vocab_size, embed_dim), dtype=T.float32)
        self.weight = T.nn.Parameter(table)
        # Experiment: Uniform(-0.10, +0.10) init instead of the
        # built-in layer's default Normal(mean=0, std=1).
        T.nn.init.uniform_(self.weight, -0.10, +0.10)
        # T.nn.init.normal_(self.weight)  # mean = 0, stddev = 1

    def forward(self, x):
        # Plain row lookup: result shape is x.shape + (embed_dim,).
        return self.weight[x]
The embedding values are a matrix of trainable weights with size vocab_size by embed_dim. The built-in Embedding layer initializes using Normal with mean = 0, std dev = 1. Just to experiment, for my from-scratch MyEmbedding layer I used Uniform with range (-0.10, +0.10) initialization.
To test my custom embedding layer, I grabbed my standard IMDB movie review LSTM example. I ran the example using the built-in torch.nn.Embedding() layer, and then I edited the program to use the custom MyEmbedding() layer. Both versions worked quite well.
I searched the Internet for information about implementing an embedding layer from scratch and found wildly conflicting and contradictory information. Much of it involved one-hot encoding of the input token. My demo seems to make sense to me and to work fine but there’s a chance I could be wrong somehow.
I can’t think of any scenarios where it would be useful to implement a custom embedding layer but there might be some such situations. I did this experiment just to explore how embeddings work and increase my understanding.

Everyone has mental context embeddings. You can make all kinds of inferences about these three photos. All three women are about to be embedded in jail but only one committed a serious crime.
Demo code. Replace “lt”, “gt”, “lte”, “gte” with Boolean operator symbols. Getting the IMDB data is a major challenge. See jamesmccaffrey.wordpress.com/2022/01/17/imdb-movie-review-sentiment-analysis-using-an-lstm-with-pytorch/
# imdb_lstm.py
# uses preprocessed data instead of built-in data
# batch_first geometry
# PyTorch 1.10.0-CPU Anaconda3-2020.02 Python 3.7.6
# Windows 10/11
import numpy as np
import torch as T
device = T.device('cpu')
# -----------------------------------------------------------
class MyEmbedding(T.nn.Module):
    """From-scratch embedding layer: a trainable (vocab_size, embed_dim) lookup table."""
    def __init__(self, vocab_size, embed_dim):
        super(MyEmbedding, self).__init__()
        table = T.zeros((vocab_size, embed_dim), dtype=T.float32)
        self.weight = T.nn.Parameter(table)
        # Uniform(-0.10, +0.10) init, unlike the built-in layer's
        # default Normal(mean=0, std=1).
        T.nn.init.uniform_(self.weight, -0.10, +0.10)
        # T.nn.init.normal_(self.weight)  # mean = 0, stddev = 1

    def forward(self, x):
        # Integer token IDs in x index rows of the weight matrix.
        return self.weight[x]
# -----------------------------------------------------------
class LSTM_Net(T.nn.Module):
    """Binary sentiment classifier: embedding -> LSTM -> dropout -> linear -> sigmoid.

    Input: int64 token IDs, fixed length 50 (front-padded reviews).
    Output: shape [batch, 1] probability in [0, 1], suitable for BCELoss().
    """
    def __init__(self):
        super(LSTM_Net, self).__init__()
        # vocab_size = 129892 for the preprocessed IMDB data.
        # BUG FIX: both embedding lines were commented out, so forward()
        # raised AttributeError on self.embed. Enable the built-in layer;
        # swap in MyEmbedding to use the from-scratch version instead.
        self.embed = T.nn.Embedding(129892, 32)   # built-in
        # self.embed = MyEmbedding(129892, 32)    # from scratch
        self.lstm = T.nn.LSTM(32, 100, batch_first=True)
        self.do1 = T.nn.Dropout(0.20)
        self.fc1 = T.nn.Linear(100, 1)  # single output node: binary classification

    def forward(self, x):
        # x = review/sentence. length = fixed w/ padding (front)
        z = self.embed(x)          # expand each token to 32 values
        z = z.reshape(-1, 50, 32)  # force [batch, seq, embed] geometry
        lstm_oupt, (h_n, c_n) = self.lstm(z)
        z = lstm_oupt[:, -1]       # last time step -> shape [bs, 100]
        z = self.do1(z)
        z = T.sigmoid(self.fc1(z)) # probability in [0,1] for BCELoss()
        return z
# -----------------------------------------------------------
class IMDB_Dataset(T.utils.data.Dataset):
    """Preprocessed IMDB reviews: each line is 50 space-delimited token IDs
    followed by a 0/1 sentiment label.
    """
    def __init__(self, src_file):
        # Read all 51 columns at once; '#' lines are comments.
        raw = np.loadtxt(src_file, usecols=range(0, 51),
            delimiter=" ", comments="#", dtype=np.int64)
        tokens = raw[:, 0:50]  # cols [0,50) = the 50 token IDs
        labels = raw[:, 50]    # final col = sentiment label
        self.x_data = T.tensor(tokens, dtype=T.int64).to(device)
        # Labels become float32 2D [n,1] to match BCELoss expectations.
        self.y_data = T.tensor(labels,
            dtype=T.float32).to(device).reshape(-1, 1)

    def __len__(self):
        return self.x_data.shape[0]

    def __getitem__(self, idx):
        return (self.x_data[idx], self.y_data[idx])
# -----------------------------------------------------------
def accuracy(model, dataset):
    """Return classification accuracy (percent) of model over dataset.

    Iterates one item at a time (batch_size=1). Assumes model outputs a
    single value in [0.0, 1.0] and that model.eval() was already called.
    """
    num_correct = 0; num_wrong = 0
    ldr = T.utils.data.DataLoader(dataset,
        batch_size=1, shuffle=False)
    for (batch_idx, batch) in enumerate(ldr):
        X = batch[0]  # inputs (token IDs)
        Y = batch[1]  # target sentiment label 0 or 1
        with T.no_grad():
            oupt = model(X)  # single value in [0.0, 1.0]
        # BUG FIX: the posted code used placeholder strings "lt"/"gte"
        # instead of the actual comparison operators; restored here.
        if oupt < 0.5 and Y == 0:
            num_correct += 1
        elif oupt >= 0.5 and Y == 1:
            num_correct += 1
        else:
            num_wrong += 1
    acc = (num_correct * 100.0) / (num_correct + num_wrong)
    return acc
# -----------------------------------------------------------
def main():
    """Demo driver: load preprocessed IMDB data, train the LSTM classifier,
    evaluate on test data, and score one hand-built review.

    NOTE(review): requires the preprocessed data files under .\\Data\\ and
    an enabled self.embed layer in LSTM_Net (the blog listing shows both
    embedding lines commented out; one must be uncommented for
    net.embed.weight below to exist).
    """
    # 0. get started
    print("\nBegin PyTorch IMDB LSTM demo ")
    print("Using only reviews with 50 or less words ")
    T.manual_seed(3)      # reproducibility for weight init / shuffling
    np.random.seed(3)
    # 1. load data
    print("\nLoading preprocessed train and test data ")
    train_file = ".\\Data\\imdb_train_50w.txt"
    train_ds = IMDB_Dataset(train_file)
    test_file = ".\\Data\\imdb_test_50w.txt"
    test_ds = IMDB_Dataset(test_file)
    bat_size = 16
    train_ldr = T.utils.data.DataLoader(train_ds,
        batch_size=bat_size, shuffle=True, drop_last=False)
    n_train = len(train_ds)
    n_test = len(test_ds)
    print("Num train = %d Num test = %d " % (n_train, n_test))
    # -----------------------------------------------------------
    # 2. create network
    print("\nCreating LSTM binary classifier ")
    net = LSTM_Net().to(device)
    # 3. train model
    loss_func = T.nn.BCELoss()  # binary cross entropy; net outputs sigmoid probs
    lrn_rate = 0.001
    optimizer = T.optim.Adam(net.parameters(), lr=lrn_rate)
    max_epochs = 10 #30
    log_interval = 5  # display progress every 5 epochs
    print("\nbatch size = " + str(bat_size))
    print("loss func = " + str(loss_func))
    print("optimizer = Adam ")
    print("learn rate = %0.4f " % lrn_rate)
    print("max_epochs = %d " % max_epochs)
    # Show embedding weights before training for comparison afterwards.
    print(net.embed.weight)
    input()  # deliberate pause so the weights can be inspected
    print("\nStarting training ")
    net.train()  # set training mode (enables dropout)
    for epoch in range(0, max_epochs):
        tot_err = 0.0  # accumulated loss for one epoch
        for (batch_idx, batch) in enumerate(train_ldr):
            X = batch[0]  # [bs,50] token IDs
            Y = batch[1]  # [bs,1] float labels
            optimizer.zero_grad()
            oupt = net(X)
            loss_val = loss_func(oupt, Y)
            tot_err += loss_val.item()
            loss_val.backward()  # compute gradients
            optimizer.step()     # update weights
        if epoch % log_interval == 0:
            print("epoch = %4d |" % epoch, end="")
            print(" loss = %10.4f |" % tot_err, end="")
            net.eval()  # accuracy() assumes eval mode
            train_acc = accuracy(net, train_ds)
            print(" acc = %8.2f%%" % train_acc)
            net.train()
    print("Training complete")
    # Show embedding weights again: they change because they are trainable.
    print(net.embed.weight)
    input()  # deliberate pause
    # -----------------------------------------------------------
    # 4. evaluate model
    net.eval()
    test_acc = accuracy(net, test_ds)
    print("\nAccuracy on test data = %8.2f%%" % test_acc)
    # 5. save model
    print("\nSaving trained model state")
    # fn = ".\\Models\\imdb_model.pt"
    # T.save(net.state_dict(), fn)
    # saved_model = Net()
    # saved_model.load_state_dict(T.load(fn))
    # use saved_model to make prediction(s)
    # 6. use model
    print("\nSentiment for \"the movie was a great \
waste of my time\"")
    print("0 = negative, 1 = positive ")
    # Token IDs were looked up by hand from the vocabulary ("cheating").
    review = np.array([4, 20, 16, 6, 86, 425, 7, 58, 64],
        dtype=np.int64)  # cheating . .
    # Front-pad with zeros to the fixed length of 50 tokens.
    padding = np.zeros(50-len(review), dtype=np.int64)
    review = np.concatenate([padding, review])
    review = T.tensor(review, dtype=T.int64).to(device)
    net.eval()
    with T.no_grad():
        prediction = net(review)  # sigmoid output in [0,1], not log-probs
    print("raw output : ", end="")
    print("%0.4f " % prediction.item())
    print("\nEnd PyTorch IMDB LSTM sentiment demo")

if __name__ == "__main__":
    main()
.NET Test Automation Recipes
Software Testing
SciPy Programming Succinctly
Keras Succinctly
R Programming
2026 Visual Studio Live
2025 Summer MLADS Conference
2026 DevIntersection Conference
2025 Machine Learning Week
2025 Ai4 Conference
2026 G2E Conference
2026 iSC West Conference
You must be logged in to post a comment.