Before asking for help, I apologize for my English; I'm from Switzerland, so it isn't my first language. I'm building a reinforcement learning bot that learns to play Flappy Bird, using Q-learning with linear function approximation, but it doesn't seem to learn anything. Can someone help me? My agent keeps receiving the same reward, and I can't tell whether my code is broken or whether I simply need to train for much longer, like 8 hours. My code is below. Please don't judge too harshly, as I am just starting to learn about reinforcement learning:
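For reference, this is the per-action TD update I understand linear Q-learning is supposed to perform. It's a minimal, self-contained sketch with made-up shapes and values, separate from my actual script below, so `alpha`, `phi_s`, and the other names are just illustrative:

```python
import numpy as np

alpha, gamma = 0.01, 0.95
w = np.zeros((12, 2))                 # one weight column per action
phi_s = np.random.rand(12)            # features of the current state
phi_s2 = np.random.rand(12)           # features of the next state
a, r, done = 1, 1.0, False            # taken action, reward, terminal flag

q_sa = phi_s @ w[:, a]                                    # current estimate q(s, a)
target = r if done else r + gamma * np.max(phi_s2 @ w)    # bootstrapped TD target
w[:, a] += alpha * (target - q_sa) * phi_s                # update only the taken action's column
```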
```python
import flappy_bird_gymnasium  # registers the FlappyBird-v0 environment with gymnasium
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from random import uniform
env = gym.make("FlappyBird-v0", render_mode="rgb_array")
lr = 0.01              # learning rate
decay = 0.0000001      # epsilon decay rate
max_steps = 1000       # step cap per episode (currently unused)
gamma = 0.95           # discount factor
min_eps = 0.001        # final exploration rate
max_eps = 1            # initial exploration rate
neps = 100000          # number of training episodes
W = np.random.rand(12, 2)  # one weight column per action, 12 observation features
print(env.observation_space)
def plot_win_rate(rewards_all_episodes):
    rewards_optimal = np.array(rewards_all_episodes)
    rewards_optimal = np.array([0 if x == -1 else x for x in rewards_optimal])
    rewards_optimal = rewards_optimal.cumsum()
    win_rate_optimal = rewards_optimal / np.arange(1, len(rewards_all_episodes) + 1)
    plt.plot(np.arange(1, len(rewards_all_episodes) + 1), win_rate_optimal)
    plt.xlabel('Episode')
    plt.ylabel('Win Rate')
    plt.title('Win Rate per Episode')
    plt.show()
def f(state):
    # Feature vector: the raw 12-dimensional observation as a (12, 1) column.
    return np.array(state).reshape(12, -1)
def epsilon_greedy(W, epsilon, state):
    features = f(state)
    if np.random.uniform(0, 1) > epsilon:
        # Exploit: pick the action with the highest estimated Q-value.
        action = np.argmax(np.dot(W.T, features))
        explored = 0
    else:
        # Explore: flap only ~7% of the time, otherwise do nothing.
        action = 0 if uniform(0, 1) > 0.07 else 1
        explored = 1
    return action, explored
def gradient(X, W, Y):
    # Least-squares gradient; currently unused by the training loop below.
    m = X.shape[0]
    return (1 / m) * (X.T @ (X @ W - Y))

def regression(X, W, Y):
    # Gradient-descent step for the regression above; also currently unused.
    W -= lr * gradient(X, W, Y)
    return W
def update(W, state, reward, newstate):
    # Q-value estimates for the current and next state, shape (1, 2), one entry per action.
    q_values = np.dot(f(state).T, W)
    next_q_values = np.dot(f(newstate).T, W)
    # TD step: note this updates the weight columns of BOTH actions,
    # since the action actually taken is not passed in.
    W += lr * np.outer(f(state), (reward + gamma * np.max(next_q_values) - q_values))
    return W
def train(neps, min_eps, max_eps, decay, env, max_steps, W, fr):
    rewards_all_episodes = []
    for episode in tqdm(range(1, neps + 1)):
        # Exponentially decayed exploration rate.
        epsilon = min_eps + (max_eps - min_eps) * np.exp(-decay * episode)
        state, _ = env.reset()
        rewards = 0
        X = []  # rewards collected during the episode (currently unused afterwards)
        Y = []  # states collected during the episode (currently unused afterwards)
        while True:
            action, explored = epsilon_greedy(W, epsilon, state)
            new_state, reward, done, _, _ = env.step(action)
            W = update(W, state, reward, new_state)
            rewards += reward
            X.append(reward)
            Y.append(state)
            if done:
                rewards_all_episodes.append(rewards)
                break
            state = new_state
        if episode % fr == 0:
            q_values = np.dot(f(state).T, W)
            print(W)
            print("epsilon ", epsilon)
            print("\nQ-values:", q_values)
            print("Episode {}: total reward: {}".format(episode, rewards))
    return W, rewards_all_episodes
W, rewards_all_episodes = train(neps, min_eps, max_eps, decay, env, max_steps, W, 2500)
plot_win_rate(rewards_all_episodes)
np.save("trained_weightsQL.npy", W)
print(rewards_all_episodes)

# Keep the process running after training finishes.
while True:
    pass
```
I even tried a DQN algorithm instead, but nothing works either.
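For completeness, here is roughly how I would load the saved weights and run the greedy policy afterwards. This is not part of the script above, so the environment name and `render_mode` here are assumptions on my side:

```python
import flappy_bird_gymnasium  # noqa: F401  (registers FlappyBird-v0)
import gymnasium as gym
import numpy as np

W = np.load("trained_weightsQL.npy")
env = gym.make("FlappyBird-v0", render_mode="human")

state, _ = env.reset()
total = 0.0
while True:
    features = np.array(state).reshape(12, 1)
    action = int(np.argmax(W.T @ features))  # purely greedy, no exploration
    state, reward, terminated, truncated, _ = env.step(action)
    total += reward
    if terminated or truncated:
        break

print("evaluation episode reward:", total)
env.close()
```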