Reinforcement Q-Learning to Solve a Maze

I work at a very large tech company. One of my job responsibilities is to deliver machine learning classes to software engineers, data scientists, and researchers. Now that covid shows signs of relenting, I expect to resume in-person classes soon (knock on wood). I’ve been reviewing some of my old examples and decided to update my introductory example of reinforcement learning (RL) — using Q-learning to solve a maze.

Solving a maze is the Hello World of RL. It’s a problem that’s easy to understand and involves most of the RL fundamentals: States, Actions, Rewards, Q-tables, and the Bellman equation.

The demo packs in a lot of ideas, but explaining them is exactly what I do in my classes at work.

Good fun.



Three old maze games I’ve played. Left: In the Fascination game (circa 1960), two players race to move three metal balls through a maze. Center: In the Puzzling Pyramid game (circa 1960), players set up hidden mazes and opponents had to navigate a metal ball up the hidden maze using a magnetic wand. When the ball hit a baffle it would fall to the bottom and the player had to start over. Right: In Mind Maze (circa 1970) each of two players set up a hidden maze for their opponent — same idea as Puzzling Pyramid.


Demo code:

# q_maze_2.py
# Anaconda3-2020.02  Python 3.7.6

import numpy as np

# =============================================================

def show_maze():
  # Hard-coded ASCII picture of the 15-cell maze (states 0-14).
  # Dots are passable cell boundaries; '=' and '|' runs are walls.
  picture = (
    " ==================================================",
    " |      0 .       1 |       2 .       3 .       4 |",
    " |        .         |         .         .         |",
    " | start  .         |         .         .         |",
    " |        .         |         .         .         |",
    " | . . . . =========|========= . . . . . . . . . .|",
    " |      5 .       6 |       7 .       8 |       9 |",
    " |        .         |         .         |         |",
    " |        .         |         .         |         |",
    " |        .         |         .         |         |",
    " | . . . . ==========. . . . . =========|. . . . .|",
    " |     10 .      11 .      12 .      13 |      14 |",
    " |        .         .         .         |         |",
    " |        .         .         .         |  goal   |",
    " |        .         .         .         |         |",
    " ==================================================",
  )
  print("\nThe maze to solve is: \n")
  for row in picture:
    print(row)

def my_print(Q):
  """Pretty-print the 15x15 Q matrix with column and row labels.

  Q: 2-D indexable (NumPy array expected, since Q[i,j] indexing is used).
  Hard-coded hack for this problem only: the header assumes 15 columns.
  """
  rows = len(Q); cols = len(Q[0])
  # One logical header string; adjacent literals are concatenated.
  header = ("       0      1      2      3      4      5"
            "      6      7      8      9      10     11     12"
            "     13     14")
  print(header)
  for i in range(rows):
    print("%d " % i, end="")
    # BUG FIX: the scraped source had the placeholder text "less-than"
    # here instead of the '<' operator. Pad single-digit row labels so
    # columns line up for rows 10-14.
    if i < 10: print(" ", end="")
    for j in range(cols): print(" %6.2f" % Q[i,j], end="")
    print("")
  print("")

def get_poss_next_states(s, A, ns):
  # Return every state reachable from state s in one move, i.e. each
  # column j of row s in the 0/1 Action matrix A where A[s,j] == 1.
  return [j for j in range(ns) if A[s, j] == 1]

def get_rnd_next_state(s, A, ns):
  # From state s, pick one feasible next state uniformly at random.
  # (Feasible = A[s,j] == 1; the randint call matches the original so a
  # fixed NumPy seed reproduces the exact same exploration sequence.)
  candidates = [j for j in range(ns) if A[s, j] == 1]
  pick = np.random.randint(0, len(candidates))
  return candidates[pick]

# =============================================================

def train(A, R, Q, gamma, lrn_rate, goal, ns, max_epochs):
  """Fill in Q in place via tabular Q-learning with random exploration.

  A: ns x ns 0/1 Action (adjacency) matrix; A[i,j]=1 means i -> j allowed.
  R: ns x ns Reward matrix for each move.
  Q: ns x ns Quality matrix, updated in place (this is the output).
  gamma: discount weight on future rewards; lrn_rate: Bellman step size.
  goal: terminal state index; ns: number of states.
  """
  max_moves = 100  # safety cap so one epoch can't wander forever
  for _ in range(max_epochs):
    curr_s = np.random.randint(0, ns)  # random start state each epoch
    # BUG FIX: the move counter must reset every epoch. It was set once
    # before the epoch loop, so after 100 cumulative moves every later
    # epoch broke out after a single move and training mostly stopped.
    move = 0
    while True:
      next_s = get_rnd_next_state(curr_s, A, ns)
      poss_next_next_states = get_poss_next_states(next_s, A, ns)

      # Best Q attainable from the landing state (the "greater-than"
      # placeholder in the scraped source is the '>' operator).
      max_Q = -9999.99
      for nn_s in poss_next_next_states:
        if Q[next_s, nn_s] > max_Q:
          max_Q = Q[next_s, nn_s]

      # Bellman update:  Q = [(1-a) * Q]  +  [a * (r + (g * maxQ))]
      Q[curr_s][next_s] = ((1 - lrn_rate) * Q[curr_s][next_s]) + \
        (lrn_rate * (R[curr_s][next_s] + (gamma * max_Q)))

      curr_s = next_s
      if curr_s == goal: break
      move += 1
      if move == max_moves: break

# =============================================================

def walk(start, goal, Q):
  """Print the greedy path from start to goal using the learned Q matrix.

  At each state, follows the action with the largest Q value. The scraped
  source printed "-}" — restored to the intended "->" arrow (same HTML
  mangling as the "less-than"/"greater-than" placeholders elsewhere).
  """
  curr = start
  print(str(curr) + "->", end="")
  # A shortest greedy path never needs more steps than there are states;
  # guard against looping forever on an untrained or cyclic Q.
  max_steps = len(Q)
  steps = 0
  while curr != goal:
    # renamed from 'next' to avoid shadowing the builtin
    next_s = int(np.argmax(Q[curr]))
    print(str(next_s) + "->", end="")
    curr = next_s
    steps += 1
    if steps > max_steps:
      print("stuck")
      return
  print("done")

# =============================================================

def main():
  """Set up the 15-state maze, learn Q, then walk the greedy path 0 -> 14."""
  np.random.seed(1)  # reproducible exploration
  show_maze()

  print("\nPress (enter) to continue")
  input()

  print("Setting up maze States, Actions, Rewards in memory")
  # BUG FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
  # plain int is the supported spelling.
  A = np.zeros(shape=[15,15], dtype=int)  # Actions: A[i,j]=1 => i -> j legal
  A[0,1] = 1; A[0,5] = 1; A[1,0] = 1; A[2,3] = 1; A[3,2] = 1
  A[3,4] = 1; A[3,8] = 1; A[4,3] = 1; A[4,9] = 1; A[5,0] = 1
  A[5,6] = 1; A[5,10] = 1; A[6,5] = 1; A[7,8] = 1; A[7,12] = 1
  A[8,3] = 1; A[8,7] = 1; A[9,4] = 1; A[9,14] = 1; A[10,5] = 1
  A[10,11] = 1; A[11,10] = 1; A[11,12] = 1; A[12,7] = 1
  A[12,11] = 1; A[12,13] = 1; A[13,12] = 1; A[14,14] = 1

  # BUG FIX: R must be a float matrix. The original dtype=np.int silently
  # truncated every -0.1 step penalty to 0, so all non-goal moves were
  # free and the small "prefer short paths" signal was lost.
  R = np.zeros(shape=[15,15], dtype=np.float32)  # Rewards
  R[0,1] = -0.1; R[0,5] = -0.1; R[1,0] = -0.1; R[2,3] = -0.1
  R[3,2] = -0.1; R[3,4] = -0.1; R[3,8] = -0.1; R[4,3] = -0.1
  R[4,9] = -0.1; R[5,0] = -0.1; R[5,6] = -0.1; R[5,10] = -0.1
  R[6,5] = -0.1; R[7,8] = -0.1; R[7,12] = -0.1; R[8,3] = -0.1
  R[8,7] = -0.1; R[9,4] = -0.1; R[9,14] = 10.0; R[10,5] = -0.1
  R[10,11] = -0.1; R[11,10] = -0.1; R[11,12] = -0.1
  R[12,7] =  -0.1; R[12,11] = -0.1; R[12,13] = -0.1
  R[13,12] = -0.1; R[14,14] = -0.1

  Q = np.zeros(shape=[15,15], dtype=np.float32)  # Quality (learned)

  print("Analyzing maze with RL Q-learning")
  start = 0; goal = 14
  ns = 15             # number of states
  gamma = 0.5         # weight of future rewards
  lrn_rate = 0.5      # pct of new Q to retain in update
  max_epochs = 1000
  train(A, R, Q, gamma, lrn_rate, goal, ns, max_epochs)
  print("Done ")

  print("The Q matrix is: \n ")
  my_print(Q)

  print("Using Q to go from start (0) to goal (14)")

  walk(start, goal, Q)

if __name__ == "__main__":
  main()
This entry was posted in Machine Learning, Miscellaneous. Bookmark the permalink.