Policy Iteration Code (2020. 12. 6. 20:53)
environment.py
Modified lines: 9-10, 32-34, 70-72, 100, 158, 217-219
import tkinter as tk
from tkinter import Button
import time
import numpy as np
from PIL import ImageTk, Image

PhotoImage = ImageTk.PhotoImage
UNIT = 100  # pixels per cell
HEIGHT = 5  # grid world height
WIDTH = 5  # grid world width
TRANSITION_PROB = 1  # state transition probability: 1, so the "left" action always moves left
POSSIBLE_ACTIONS = [0, 1, 2, 3]  # up, down, left, right
ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)]  # actions as coordinate offsets
REWARDS = []


class GraphicDisplay(tk.Tk):
    def __init__(self, agent):
        super(GraphicDisplay, self).__init__()
        self.title('Policy Iteration')
        self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT + 50))
        self.texts = []
        self.arrows = []
        self.env = Env()
        self.agent = agent
        self.evaluation_count = 0
        self.improvement_count = 0
        self.is_moving = 0
        (self.up, self.down, self.left, self.right), self.shapes = self.load_images()
        self.canvas = self._build_canvas()
        self.text_reward(2, 2, "R : 1.0")
        self.text_reward(1, 2, "R : -1.0")
        self.text_reward(2, 1, "R : -1.0")

    def _build_canvas(self):
        canvas = tk.Canvas(self, bg='white',
                           height=HEIGHT * UNIT,
                           width=WIDTH * UNIT)
        # initialize the buttons
        iteration_button = Button(self, text="Evaluate",
                                  command=self.evaluate_policy)
        iteration_button.configure(width=10, activebackground="#33B5E5")
        canvas.create_window(WIDTH * UNIT * 0.13, HEIGHT * UNIT + 10,
                             window=iteration_button)
        policy_button = Button(self, text="Improve",
                               command=self.improve_policy)
        policy_button.configure(width=10, activebackground="#33B5E5")
        canvas.create_window(WIDTH * UNIT * 0.37, HEIGHT * UNIT + 10,
                             window=policy_button)
        policy_button = Button(self, text="move", command=self.move_by_policy)
        policy_button.configure(width=10, activebackground="#33B5E5")
        canvas.create_window(WIDTH * UNIT * 0.62, HEIGHT * UNIT + 10,
                             window=policy_button)
        policy_button = Button(self, text="reset", command=self.reset)
        policy_button.configure(width=10, activebackground="#33B5E5")
        canvas.create_window(WIDTH * UNIT * 0.87, HEIGHT * UNIT + 10,
                             window=policy_button)

        # create the grid lines
        for col in range(0, WIDTH * UNIT, UNIT):  # 0, 100, ..., 400
            x0, y0, x1, y1 = col, 0, col, HEIGHT * UNIT
            canvas.create_line(x0, y0, x1, y1)
        for row in range(0, HEIGHT * UNIT, UNIT):  # 0, 100, ..., 400
            x0, y0, x1, y1 = 0, row, HEIGHT * UNIT, row
            canvas.create_line(x0, y0, x1, y1)

        # add images to the canvas
        self.rectangle = canvas.create_image(50, 50, image=self.shapes[0])
        canvas.create_image(250, 150, image=self.shapes[1])
        canvas.create_image(150, 250, image=self.shapes[1])
        canvas.create_image(250, 250, image=self.shapes[2])

        canvas.pack()

        return canvas

    def load_images(self):
        up = PhotoImage(Image.open("../img/up.png").resize((13, 13)))
        right = PhotoImage(Image.open("../img/right.png").resize((13, 13)))
        left = PhotoImage(Image.open("../img/left.png").resize((13, 13)))
        down = PhotoImage(Image.open("../img/down.png").resize((13, 13)))
        rectangle = PhotoImage(Image.open("../img/rectangle.png").resize((65, 65)))
        triangle = PhotoImage(Image.open("../img/triangle.png").resize((65, 65)))
        circle = PhotoImage(Image.open("../img/circle.png").resize((65, 65)))
        return (up, down, left, right), (rectangle, triangle, circle)

    def reset(self):
        if self.is_moving == 0:
            self.evaluation_count = 0
            self.improvement_count = 0
            for i in self.texts:
                self.canvas.delete(i)
            for i in self.arrows:
                self.canvas.delete(i)
            self.agent.value_table = [[0.0] * WIDTH for _ in range(HEIGHT)]
            self.agent.policy_table = ([[[0.25, 0.25, 0.25, 0.25]] * WIDTH
                                        for _ in range(HEIGHT)])
            self.agent.policy_table[2][2] = []
            x, y = self.canvas.coords(self.rectangle)
            self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)

    def text_value(self, row, col, contents, font='Helvetica', size=10,
                   style='normal', anchor="nw"):
        origin_x, origin_y = 85, 70
        x, y = origin_y + (UNIT * col), origin_x + (UNIT * row)
        font = (font, str(size), style)
        text = self.canvas.create_text(x, y, fill="black", text=contents,
                                       font=font, anchor=anchor)
        return self.texts.append(text)

    def text_reward(self, row, col, contents, font='Helvetica', size=10,
                    style='normal', anchor="nw"):
        origin_x, origin_y = 5, 5
        x, y = origin_y + (UNIT * col), origin_x + (UNIT * row)
        font = (font, str(size), style)
        text = self.canvas.create_text(x, y, fill="black", text=contents,
                                       font=font, anchor=anchor)
        return self.texts.append(text)

    def rectangle_move(self, action):
        base_action = np.array([0, 0])
        location = self.find_rectangle()
        self.render()
        if action == 0 and location[0] > 0:  # up
            base_action[1] -= UNIT
        elif action == 1 and location[0] < HEIGHT - 1:  # down
            base_action[1] += UNIT
        elif action == 2 and location[1] > 0:  # left
            base_action[0] -= UNIT
        elif action == 3 and location[1] < WIDTH - 1:  # right
            base_action[0] += UNIT
        # move agent
        self.canvas.move(self.rectangle, base_action[0], base_action[1])

    def find_rectangle(self):
        temp = self.canvas.coords(self.rectangle)
        x = (temp[0] / 100) - 0.5
        y = (temp[1] / 100) - 0.5
        return int(y), int(x)

    def move_by_policy(self):
        if self.improvement_count != 0 and self.is_moving != 1:
            self.is_moving = 1

            x, y = self.canvas.coords(self.rectangle)
            self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)

            x, y = self.find_rectangle()
            while len(self.agent.policy_table[x][y]) != 0:
                self.after(100,
                           self.rectangle_move(self.agent.get_action([x, y])))
                x, y = self.find_rectangle()
            self.is_moving = 0

    def draw_one_arrow(self, col, row, policy):
        if col == 2 and row == 2:
            return

        if policy[0] > 0:  # up
            origin_x, origin_y = 50 + (UNIT * row), 10 + (UNIT * col)
            self.arrows.append(self.canvas.create_image(origin_x, origin_y,
                                                        image=self.up))
        if policy[1] > 0:  # down
            origin_x, origin_y = 50 + (UNIT * row), 90 + (UNIT * col)
            self.arrows.append(self.canvas.create_image(origin_x, origin_y,
                                                        image=self.down))
        if policy[2] > 0:  # left
            origin_x, origin_y = 10 + (UNIT * row), 50 + (UNIT * col)
            self.arrows.append(self.canvas.create_image(origin_x, origin_y,
                                                        image=self.left))
        if policy[3] > 0:  # right
            origin_x, origin_y = 90 + (UNIT * row), 50 + (UNIT * col)
            self.arrows.append(self.canvas.create_image(origin_x, origin_y,
                                                        image=self.right))

    def draw_from_policy(self, policy_table):
        for i in range(HEIGHT):
            for j in range(WIDTH):
                self.draw_one_arrow(i, j, policy_table[i][j])

    def print_value_table(self, value_table):
        for i in range(WIDTH):
            for j in range(HEIGHT):
                self.text_value(i, j, round(value_table[i][j], 2))

    def render(self):
        time.sleep(0.1)
        self.canvas.tag_raise(self.rectangle)
        self.update()

    # runs when the Evaluate button is pressed = policy evaluation
    def evaluate_policy(self):
        self.evaluation_count += 1
        for i in self.texts:
            self.canvas.delete(i)
        self.agent.policy_evaluation()
        self.print_value_table(self.agent.value_table)

    # runs when the Improve button is pressed = policy improvement
    def improve_policy(self):
        self.improvement_count += 1
        for i in self.arrows:
            self.canvas.delete(i)
        self.agent.policy_improvement()
        self.draw_from_policy(self.agent.policy_table)


class Env:
    def __init__(self):
        self.transition_probability = TRANSITION_PROB  # state transition probability
        self.width = WIDTH  # width
        self.height = HEIGHT  # height
        self.reward = [[0] * WIDTH for _ in range(HEIGHT)]
        self.possible_actions = POSSIBLE_ACTIONS
        self.reward[2][2] = 1  # reward 1 at (2, 2), the circle: the goal
        self.reward[1][2] = -1  # reward -1 at (1, 2), a triangle: an obstacle
        self.reward[2][1] = -1  # reward -1 at (2, 1), a triangle: an obstacle
        self.all_state = []

        for x in range(WIDTH):
            for y in range(HEIGHT):
                state = [x, y]
                self.all_state.append(state)

    def get_reward(self, state, action):  # reward received at the next state
        next_state = self.state_after_action(state, action)  # the next state comes from state_after_action
        return self.reward[next_state[0]][next_state[1]]  # return the reward at that next state

    def state_after_action(self, state, action_index):  # which state the action leads to
        action = ACTIONS[action_index]
        return self.check_boundary([state[0] + action[0], state[1] + action[1]])

    @staticmethod  # usable without creating an Env instance
    def check_boundary(state):  # clamp states that leave the grid
        state[0] = (0 if state[0] < 0 else WIDTH - 1
                    if state[0] > WIDTH - 1 else state[0])
        state[1] = (0 if state[1] < 0 else HEIGHT - 1
                    if state[1] > HEIGHT - 1 else state[1])
        return state

    def get_transition_prob(self, state, action):  # return the state transition probability
        return self.transition_probability

    def get_all_states(self):  # return every state
        return self.all_state
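The Env class at the bottom is independent of the GUI: constructing it does not open a window, although importing environment.py still loads tkinter and PIL. A minimal sketch of how state_after_action and check_boundary clamp moves at the grid edge (the specific calls are illustrative only, not part of the original script):

# sketch: exercising Env without the GUI (assumes environment.py is on the import path)
from environment import Env

env = Env()
print(env.state_after_action([0, 0], 0))   # action 0 (up) at the top edge is clamped -> [0, 0]
print(env.state_after_action([0, 0], 3))   # action 3 (right) -> [0, 1]
print(env.get_reward([2, 3], 2))           # moving left from [2, 3] reaches the goal [2, 2] -> 1
print(env.get_transition_prob([0, 0], 1))  # transitions are deterministic -> 1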
policy_iteration.py

Modified lines: 15, 29, 49
import numpy as np
from environment import GraphicDisplay, Env


class PolicyIteration:
    def __init__(self, env):
        # the environment object
        self.env = env
        # initialize the value function as a 2D list
        self.value_table = [[0.0] * env.width for _ in range(env.height)]
        # initialize the policy with equal probability for up, down, left, right
        self.policy_table = [[[0.25, 0.25, 0.25, 0.25]] * env.width
                             for _ in range(env.height)]
        # terminal state
        self.policy_table[2][2] = []
        # discount factor
        self.discount_factor = 0.9

    # policy evaluation: compute the next value function with the Bellman expectation equation
    # (the process of finding the true value function of the current policy)
    def policy_evaluation(self):
        # initialize the next value table
        next_value_table = [[0.00] * self.env.width
                            for _ in range(self.env.height)]

        # apply the Bellman expectation equation to every state
        for state in self.env.get_all_states():
            value = 0.0
            # value of the terminal state = 0
            if state == [2, 2]:
                next_value_table[state[0]][state[1]] = value
                continue

            # Bellman expectation equation
            for action in self.env.possible_actions:  # for each action: up, down, left, right
                next_state = self.env.state_after_action(state, action)  # next state for that action
                reward = self.env.get_reward(state, action)  # reward for that transition
                next_value = self.get_value(next_state)  # value of the next state
                value += (self.get_policy(state)[action] *  # expectation: sum over actions (not a max!)
                          (reward + self.discount_factor * next_value))

            next_value_table[state[0]][state[1]] = value  # update the value of this state

        self.value_table = next_value_table  # replace the value table

    # greedy policy improvement with respect to the current value function (policy update)
    def policy_improvement(self):
        next_policy = self.policy_table  # start from the current policy table
        for state in self.env.get_all_states():  # for every state
            if state == [2, 2]:
                continue

            value_list = []
            # initialize the policy to return
            result = [0.0, 0.0, 0.0, 0.0]

            # for every action, compute reward + (discount factor * next state value) = the Q-function
            for index, action in enumerate(self.env.possible_actions):
                next_state = self.env.state_after_action(state, action)  # next state
                reward = self.env.get_reward(state, action)  # reward for that action
                next_value = self.get_value(next_state)  # value of the next state
                value = reward + self.discount_factor * next_value
                value_list.append(value)  # append the Q-value

            # act greedily over the actions with the maximum return, e.g. value_list = [3, 1, 3, 2]
            max_idx_list = np.argwhere(value_list == np.amax(value_list))  # amax gives 3, argwhere gives the indices of the largest Q-values
            max_idx_list = max_idx_list.flatten().tolist()  # e.g. [[0], [2]] => flatten to the 1D list [0, 2]
            prob = 1 / len(max_idx_list)  # length 2, so each action gets probability 0.5

            for idx in max_idx_list:
                result[idx] = prob

            next_policy[state[0]][state[1]] = result  # e.g. [0.5, 0, 0.5, 0]

        self.policy_table = next_policy

    # sample an action in a given state according to the policy
    def get_action(self, state):
        policy = self.get_policy(state)
        policy = np.array(policy)
        return np.random.choice(4, 1, p=policy)[0]  # 4 = number of actions, 1 = sample one action, p = the policy probabilities

    # return the policy for a state
    def get_policy(self, state):
        return self.policy_table[state[0]][state[1]]

    # return the value of a state
    def get_value(self, state):
        return self.value_table[state[0]][state[1]]


if __name__ == "__main__":
    env = Env()
    policy_iteration = PolicyIteration(env)
    grid_world = GraphicDisplay(policy_iteration)
    grid_world.mainloop()
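For reference, policy_evaluation performs one sweep of the Bellman expectation backup, v_{k+1}(s) = Σ_a π(a|s) [ r(s, a) + γ·v_k(s') ], and policy_improvement then makes the policy greedy with respect to the updated value table. The GUI runs one sweep per button press, but the same classes can be driven headlessly. Below is a minimal sketch, assuming the listing above is saved as policy_iteration.py; the fixed sweep count is my own choice, not part of the original script:

# sketch: run policy iteration without the tkinter GUI
from environment import Env
from policy_iteration import PolicyIteration

agent = PolicyIteration(Env())

# alternate one evaluation sweep and one greedy improvement;
# 30 sweeps is more than enough for this 5x5 grid world
for _ in range(30):
    agent.policy_evaluation()
    agent.policy_improvement()

for row in agent.value_table:    # final value table, rounded like the GUI display
    print([round(v, 2) for v in row])
print(agent.get_policy([0, 0]))  # greedy action probabilities at the start state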