Q-learning is a reinforcement learning algorithm for problems built around an action-reward loop. Below is a simple Python implementation of Q-learning that solves a small maze problem.
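At its core, the algorithm maintains a table of values Q(s, a) and, after every transition (s, a, r, s'), applies the standard update Q(s, a) ← (1 − α)·Q(s, a) + α·(r + γ·max_a' Q(s', a')), where α is the learning rate and γ the discount factor. This is exactly the rule implemented in update_q_table in the code below.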
import numpy as np

# Example maze; each cell is encoded as a number:
# 0 = free cell, 1 = obstacle, 9 = goal
maze = np.array([
    [0, 0, 0, 1],
    [1, 1, 0, 1],
    [0, 0, 0, 0],
    [1, 0, 9, 1]
])

# Q-table: one row per cell, one column per action, initialised to zero
q_table = np.zeros((maze.shape[0] * maze.shape[1], 4))

# Hyperparameters
alpha = 0.1    # learning rate
gamma = 0.9    # discount factor
epsilon = 0.1  # exploration rate for the epsilon-greedy policy

# Actions: up, down, left, right
actions = ['up', 'down', 'left', 'right']

# Return the actions that keep the agent on the grid and off obstacles
def get_possible_actions(state):
    possible = []
    if state[0] > 0 and maze[state[0] - 1, state[1]] != 1:
        possible.append('up')
    if state[0] < maze.shape[0] - 1 and maze[state[0] + 1, state[1]] != 1:
        possible.append('down')
    if state[1] > 0 and maze[state[0], state[1] - 1] != 1:
        possible.append('left')
    if state[1] < maze.shape[1] - 1 and maze[state[0], state[1] + 1] != 1:
        possible.append('right')
    return possible

# Epsilon-greedy action selection, restricted to valid moves
def choose_action(state):
    possible_actions = get_possible_actions(state)
    if np.random.uniform(0, 1) < epsilon:
        return np.random.choice(possible_actions)
    state_idx = np.ravel_multi_index(state, maze.shape)
    q_values = q_table[state_idx][[actions.index(a) for a in possible_actions]]
    return possible_actions[np.argmax(q_values)]

# Q-learning update for one (state, action, reward, next state) transition
def update_q_table(state, action, reward, new_state):
    state_idx = np.ravel_multi_index(state, maze.shape)
    new_state_idx = np.ravel_multi_index(new_state, maze.shape)
    action_idx = actions.index(action)
    max_future_q = np.max(q_table[new_state_idx])
    current_q = q_table[state_idx][action_idx]
    new_q = (1 - alpha) * current_q + alpha * (reward + gamma * max_future_q)
    q_table[state_idx][action_idx] = new_q

# Main Q-learning loop
for episode in range(1000):
    state = (0, 0)  # start each episode at the top-left free cell
    done = False
    while not done:
        action = choose_action(state)
        if action == 'up':
            new_state = (state[0] - 1, state[1])
        elif action == 'down':
            new_state = (state[0] + 1, state[1])
        elif action == 'left':
            new_state = (state[0], state[1] - 1)
        else:
            new_state = (state[0], state[1] + 1)
        if maze[new_state] == 9:   # goal reached: big reward, episode ends
            reward = 10
            done = True
        else:                      # small step cost for every other move
            reward = -1
        update_q_table(state, action, reward, new_state)
        state = new_state

# Print the trained Q-table
print("Trained Q-table:")
print(q_table)
This code implements a simple Q-learning algorithm for a simplified maze problem. The maze in this example is a small matrix in which 0 marks a free cell, 1 an obstacle, and 9 the goal. The core of the algorithm is choosing an action for the current state and then updating the Q-table from the observed reward. In real problems, the definition of states and actions and the way the environment is modelled may differ. To see the learned behaviour rather than raw Q-values, you can read the greedy policy out of the table, as in the sketch below.
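The following is a minimal sketch, reusing the maze, actions, q_table, and get_possible_actions defined above, that prints the highest-valued valid action for every free cell after training:

# For each free cell, print the valid action with the highest Q-value
for row in range(maze.shape[0]):
    for col in range(maze.shape[1]):
        if maze[row, col] != 0:
            continue  # skip obstacles and the goal cell
        state_idx = np.ravel_multi_index((row, col), maze.shape)
        valid = get_possible_actions((row, col))
        q_values = [q_table[state_idx][actions.index(a)] for a in valid]
        print(f"cell ({row}, {col}): {valid[int(np.argmax(q_values))]}")

If training has converged, the printed actions trace a path from the start cell toward the goal cell marked 9.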