I work at a very large tech company. One of my job responsibilities is to deliver machine learning classes to software engineers, data scientists, and researchers. Now that covid shows signs of relenting, I expect to resume in-person classes soon (knock on wood). I’ve been reviewing some of my old examples and decided to update my introductory example of reinforcement learning (RL) — using Q-learning to solve a maze.
Solving a maze is the Hello World of RL. It’s a problem that’s easy to understand and involves most of the RL fundamentals: States, Actions, Rewards, Q-tables, and the Bellman equation.
There are a lot of ideas packed into the demo, but explaining them is exactly what I do in my classes at work.
Good fun.

Three old maze games I’ve played. Left: In the Fascination game (circa 1960), two players race to move three metal balls through a maze. Center: In the Puzzling Pyramid game (circa 1960), players set up hidden mazes and opponents had to navigate a metal ball up the hidden maze using a magnetic wand. When the ball hit a baffle it would fall to the bottom and the player had to start over. Right: In Mind Maze (circa 1970) each of two players set up a hidden maze for their opponent — same idea as Puzzling Pyramid.
Demo code:
# q_maze_2.py
# Anaconda3-2020.02 Python 3.7.6
import numpy as np
# =============================================================
def show_maze():
    """Display the hard-coded 15-cell maze as ASCII art."""
    # hard-coded hack: the maze layout is fixed for this demo
    maze_lines = [
        "\nThe maze to solve is: \n",
        " ==================================================",
        " | 0 . 1 | 2 . 3 . 4 |",
        " | . | . . |",
        " | start . | . . |",
        " | . | . . |",
        " | . . . . =========|========= . . . . . . . . . .|",
        " | 5 . 6 | 7 . 8 | 9 |",
        " | . | . | |",
        " | . | . | |",
        " | . | . | |",
        " | . . . . ==========. . . . . =========|. . . . .|",
        " | 10 . 11 . 12 . 13 | 14 |",
        " | . . . | |",
        " | . . . | goal |",
        " | . . . | |",
        " ==================================================",
    ]
    for line in maze_lines:
        print(line)
def my_print(Q):
    """Print the Q matrix with state labels on rows and columns.

    Q : 2-D numpy array (indexed Q[i, j]); values are shown with %6.2f.
    Works for any number of states, not just the demo's 15.
    """
    # hard-coded hack for this problem only
    rows = len(Q); cols = len(Q[0])
    # Column header aligned with the " %6.2f" (7-char) value fields below.
    # (The original header was a garbled string-continuation literal.)
    print("   " + "".join("%7d" % j for j in range(cols)))
    for i in range(rows):
        print("%d " % i, end="")
        if i < 10:  # fix: source had the placeholder text "less-than"
            print(" ", end="")  # pad single-digit row labels to 3 chars
        for j in range(cols):
            print(" %6.2f" % Q[i, j], end="")
        print("")
    print("")
def get_poss_next_states(s, A, ns):
    """Return the list of states reachable from state s.

    A is the 0/1 feasibility (action) matrix; A[s, j] == 1 means the
    move s -> j is allowed. ns is the total number of states.
    """
    return [j for j in range(ns) if A[s, j] == 1]
def get_rnd_next_state(s, A, ns):
    """Pick a uniformly random feasible next state from state s.

    Same feasibility scan as get_poss_next_states, inlined here; the
    single np.random.randint call preserves the RNG sequence.
    """
    candidates = [j for j in range(ns) if A[s, j] == 1]
    pick = np.random.randint(0, len(candidates))
    return candidates[pick]
# =============================================================
def train(A, R, Q, gamma, lrn_rate, goal, ns, max_epochs):
    """Fill in Q in place using tabular Q-learning over random episodes.

    A : 0/1 feasibility matrix; A[s, t] == 1 iff the move s -> t is allowed.
    R : reward matrix; R[s, t] is the reward for moving s -> t.
    Q : quality matrix, updated in place (the function's result).
    gamma : discount weight applied to the best future Q value.
    lrn_rate : learning rate in the Bellman update.
    goal : terminal state index; an episode ends on reaching it.
    ns : number of states.
    max_epochs : number of training episodes.
    """
    max_moves = 100  # safety cap so a non-terminating episode still stops
    for _ in range(max_epochs):
        # BUG FIX: reset the move counter every episode. The original set it
        # once before the epoch loop, so after 100 total moves every later
        # episode broke out after a single step, crippling training.
        move = 0
        curr_s = np.random.randint(0, ns)  # random start state
        while True:
            next_s = get_rnd_next_state(curr_s, A, ns)
            poss_next_next_states = get_poss_next_states(next_s, A, ns)
            # best current Q over the successors of next_s (sentinel if none)
            max_Q = -9999.99
            for nn_s in poss_next_next_states:
                q = Q[next_s, nn_s]
                if q > max_Q:  # fix: source had the placeholder "greater-than"
                    max_Q = q
            # Bellman update: Q = [(1-a) * Q] + [a * (rt + (g * maxQ))]
            Q[curr_s, next_s] = ((1 - lrn_rate) * Q[curr_s, next_s]) + \
                (lrn_rate * (R[curr_s, next_s] + (gamma * max_Q)))
            curr_s = next_s
            if curr_s == goal:
                break
            if move == max_moves:
                break
            move += 1
# =============================================================
def walk(start, goal, Q):
    """Print the greedy path from start to goal by following argmax of Q."""
    state = start
    print(str(state) + "-}", end="")
    while state != goal:
        # greedy step: best-quality successor of the current state
        state = np.argmax(Q[state])
        print(str(state) + "-}", end="")
    print("done")
# =============================================================
def main():
    """Set up the 15-state maze, train a Q matrix, and walk start -> goal."""
    np.random.seed(1)
    show_maze()
    print("\nPress (enter) to continue")
    _ = input()  # pause so the user can look at the maze
    print("Setting up maze States, Actions, Rewards in memory")
    # A[s, t] == 1 means the move s -> t is feasible.
    # dtype fix: np.int was deprecated in NumPy 1.20 and removed in 1.24.
    A = np.zeros(shape=[15, 15], dtype=int)  # Actions
    A[0, 1] = 1; A[0, 5] = 1; A[1, 0] = 1; A[2, 3] = 1; A[3, 2] = 1
    A[3, 4] = 1; A[3, 8] = 1; A[4, 3] = 1; A[4, 9] = 1; A[5, 0] = 1
    A[5, 6] = 1; A[5, 10] = 1; A[6, 5] = 1; A[7, 8] = 1; A[7, 12] = 1
    A[8, 3] = 1; A[8, 7] = 1; A[9, 4] = 1; A[9, 14] = 1; A[10, 5] = 1
    A[10, 11] = 1; A[11, 10] = 1; A[11, 12] = 1; A[12, 7] = 1
    A[12, 11] = 1; A[12, 13] = 1; A[13, 12] = 1; A[14, 14] = 1
    # BUG FIX: R must be a float matrix. With the original integer dtype
    # every -0.1 step penalty silently truncated to 0, so only the single
    # +10 goal reward survived.
    R = np.zeros(shape=[15, 15], dtype=np.float32)  # Rewards
    R[0, 1] = -0.1; R[0, 5] = -0.1; R[1, 0] = -0.1; R[2, 3] = -0.1
    R[3, 2] = -0.1; R[3, 4] = -0.1; R[3, 8] = -0.1; R[4, 3] = -0.1
    R[4, 9] = -0.1; R[5, 0] = -0.1; R[5, 6] = -0.1; R[5, 10] = -0.1
    R[6, 5] = -0.1; R[7, 8] = -0.1; R[7, 12] = -0.1; R[8, 3] = -0.1
    R[8, 7] = -0.1; R[9, 4] = -0.1; R[9, 14] = 10.0; R[10, 5] = -0.1
    R[10, 11] = -0.1; R[11, 10] = -0.1; R[11, 12] = -0.1
    R[12, 7] = -0.1; R[12, 11] = -0.1; R[12, 13] = -0.1
    R[13, 12] = -0.1; R[14, 14] = -0.1
    # =============================================================
    Q = np.zeros(shape=[15, 15], dtype=np.float32)  # Quality
    print("Analyzing maze with RL Q-learning")
    start = 0; goal = 14
    ns = 15  # number of states
    gamma = 0.5  # weight of future rewards
    lrn_rate = 0.5  # pct of new Q to retain in update
    max_epochs = 1000
    train(A, R, Q, gamma, lrn_rate, goal, ns, max_epochs)
    print("Done ")
    print("The Q matrix is: \n ")
    my_print(Q)
    print("Using Q to go from start (0) to goal (14)")
    walk(start, goal, Q)

if __name__ == "__main__":
    main()

.NET Test Automation Recipes
Software Testing
SciPy Programming Succinctly
Keras Succinctly
R Programming
2026 Visual Studio Live
2025 Summer MLADS Conference
2026 DevIntersection Conference
2025 Machine Learning Week
2025 Ai4 Conference
2026 G2E Conference
2026 iSC West Conference
You must be logged in to post a comment.