% Tabular Q-learning on a 4-row x 12-column grid world.
% Relies on two helpers defined outside this section:
%   e_greedy(q_values, epsi)        -> action index used to address Q(:,:,action)
%   gridmove(state, action, env)    -> [next_state, reward]
% NOTE(review): exact semantics of both helpers are not visible here — confirm
% against their definitions.

% --- Environment description -------------------------------------------
env.protect = zeros(4,12);   % in this section only its size is used (to size Q)
env.cliff = [ 0 0 0 0 0 0 0 0 0 0 0 0;
              0 0 0 0 0 0 0 0 0 0 0 0;
              0 0 0 0 0 0 0 0 0 0 0 0;
              0 1 1 1 1 1 1 1 1 1 1 0 ];   % 1 marks cells between start and goal on the bottom row
env.start = [4,1];    % [row, col]
env.goal  = [4,12];   % [row, col]

% --- Hyper-parameters ---------------------------------------------------
n_episode = 1000;
epsi  = 0.1;     % epsilon greedy parameter
gamma = 0.999;   % discount factor
beta  = 0.1;     % learning rate
% NOTE: `gamma` and `beta` shadow the MATLAB built-in functions of the same
% names for the remainder of the script.

%
% Q-learning
% off policy
%
cum_r = zeros(1,n_episode);          % cumulative reward collected in each episode
Q = zeros([size(env.protect),4]);    % Q(row, col, action); third dimension holds 4 actions
for i = 1:n_episode
    state = env.start;
    nextaction = e_greedy(shiftdim(Q(state(1),state(2),:)), epsi);
    reward = 0;
    % An episode ends when a step yields reward -100 or the goal cell is reached.
    % NOTE(review): -100 is presumably the cliff penalty returned by gridmove — confirm.
    while reward ~= -100
        if isequal(state, env.goal)
            break
        end
        action = nextaction;
        % Remember the cell we are leaving; it is the entry that gets updated.
        prex = state(2);
        prey = state(1);
        [state, reward] = gridmove(state,action,env);
        % Off-policy target: bootstrap from the greedy (max) value of the new
        % state, regardless of which action is actually taken next.
        Q(prey,prex,action) = Q(prey,prex,action) + beta * ( reward + gamma * max(Q(state(1),state(2),:),[],3) - Q(prey,prex,action) );
        % Behaviour policy: epsilon-greedy choice for the next step.
        nextaction = e_greedy(shiftdim(Q(state(1),state(2),:)), epsi);
        cum_r(i) = reward + cum_r(i);
    end
end
plot(cum_r)