Value Iteration Code (2020. 12. 6. 21:01)
environment.py
import tkinter as tk
import time
import numpy as np
import random
from PIL import ImageTk, Image

PhotoImage = ImageTk.PhotoImage
UNIT = 100  # pixels per cell
HEIGHT = 5  # grid world height
WIDTH = 5   # grid world width
TRANSITION_PROB = 1
POSSIBLE_ACTIONS = [0, 1, 2, 3]  # up, down, left, right
ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)]  # actions as coordinate offsets
REWARDS = []


class GraphicDisplay(tk.Tk):
    def __init__(self, value_iteration):
        super(GraphicDisplay, self).__init__()
        self.title('Value Iteration')
        self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT + 50))
        self.texts = []
        self.arrows = []
        self.env = Env()
        self.agent = value_iteration
        self.iteration_count = 0
        self.improvement_count = 0
        self.is_moving = 0
        (self.up, self.down, self.left,
         self.right), self.shapes = self.load_images()
        self.canvas = self._build_canvas()
        self.text_reward(2, 2, "R : 1.0")
        self.text_reward(1, 2, "R : -1.0")
        self.text_reward(2, 1, "R : -1.0")

    def _build_canvas(self):
        canvas = tk.Canvas(self, bg='white',
                           height=HEIGHT * UNIT,
                           width=WIDTH * UNIT)
        # initialize the buttons
        iteration_button = tk.Button(self, text="Calculate",
                                     command=self.calculate_value)
        iteration_button.configure(width=10, activebackground="#33B5E5")
        canvas.create_window(WIDTH * UNIT * 0.13, (HEIGHT * UNIT) + 10,
                             window=iteration_button)

        policy_button = tk.Button(self, text="Print Policy",
                                  command=self.print_optimal_policy)
        policy_button.configure(width=10, activebackground="#33B5E5")
        canvas.create_window(WIDTH * UNIT * 0.37, (HEIGHT * UNIT) + 10,
                             window=policy_button)

        policy_button = tk.Button(self, text="Move",
                                  command=self.move_by_policy)
        policy_button.configure(width=10, activebackground="#33B5E5")
        canvas.create_window(WIDTH * UNIT * 0.62, (HEIGHT * UNIT) + 10,
                             window=policy_button)

        policy_button = tk.Button(self, text="Clear", command=self.clear)
        policy_button.configure(width=10, activebackground="#33B5E5")
        canvas.create_window(WIDTH * UNIT * 0.87, (HEIGHT * UNIT) + 10,
                             window=policy_button)

        # create the grid lines (every UNIT pixels)
        for col in range(0, WIDTH * UNIT, UNIT):  # vertical lines
            x0, y0, x1, y1 = col, 0, col, HEIGHT * UNIT
            canvas.create_line(x0, y0, x1, y1)
        for row in range(0, HEIGHT * UNIT, UNIT):  # horizontal lines
            x0, y0, x1, y1 = 0, row, HEIGHT * UNIT, row
            canvas.create_line(x0, y0, x1, y1)

        # add the images to the canvas
        self.rectangle = canvas.create_image(50, 50, image=self.shapes[0])
        canvas.create_image(250, 150, image=self.shapes[1])
        canvas.create_image(150, 250, image=self.shapes[1])
        canvas.create_image(250, 250, image=self.shapes[2])

        canvas.pack()

        return canvas

    def load_images(self):
        PhotoImage = ImageTk.PhotoImage
        up = PhotoImage(Image.open("../img/up.png").resize((13, 13)))
        right = PhotoImage(Image.open("../img/right.png").resize((13, 13)))
        left = PhotoImage(Image.open("../img/left.png").resize((13, 13)))
        down = PhotoImage(Image.open("../img/down.png").resize((13, 13)))
        rectangle = PhotoImage(Image.open("../img/rectangle.png").resize((65, 65)))
        triangle = PhotoImage(Image.open("../img/triangle.png").resize((65, 65)))
        circle = PhotoImage(Image.open("../img/circle.png").resize((65, 65)))
        return (up, down, left, right), (rectangle, triangle, circle)

    def clear(self):
        if self.is_moving == 0:
            self.iteration_count = 0
            self.improvement_count = 0
            for i in self.texts:
                self.canvas.delete(i)
            for i in self.arrows:
                self.canvas.delete(i)

            self.agent.value_table = [[0.0] * WIDTH for _ in range(HEIGHT)]

            x, y = self.canvas.coords(self.rectangle)
            self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)

    def reset(self):
        self.update()
        time.sleep(0.5)
        self.canvas.delete(self.rectangle)
        return self.canvas.coords(self.rectangle)

    def text_value(self, row, col, contents, font='Helvetica', size=12,
                   style='normal', anchor="nw"):
        origin_x, origin_y = 85, 70
        x, y = origin_y + (UNIT * col), origin_x + (UNIT * row)
        font = (font, str(size), style)
        text = self.canvas.create_text(x, y, fill="black", text=contents,
                                       font=font, anchor=anchor)
        return self.texts.append(text)

    def text_reward(self, row, col, contents, font='Helvetica', size=12,
                    style='normal', anchor="nw"):
        origin_x, origin_y = 5, 5
        x, y = origin_y + (UNIT * col), origin_x + (UNIT * row)
        font = (font, str(size), style)
        text = self.canvas.create_text(x, y, fill="black", text=contents,
                                       font=font, anchor=anchor)
        return self.texts.append(text)

    def rectangle_move(self, action):
        base_action = np.array([0, 0])
        location = self.find_rectangle()
        self.render()
        if action == 0 and location[0] > 0:  # up
            base_action[1] -= UNIT
        elif action == 1 and location[0] < HEIGHT - 1:  # down
            base_action[1] += UNIT
        elif action == 2 and location[1] > 0:  # left
            base_action[0] -= UNIT
        elif action == 3 and location[1] < WIDTH - 1:  # right
            base_action[0] += UNIT

        self.canvas.move(self.rectangle, base_action[0],
                         base_action[1])  # move the agent

    def find_rectangle(self):
        temp = self.canvas.coords(self.rectangle)
        x = (temp[0] / 100) - 0.5
        y = (temp[1] / 100) - 0.5
        return int(y), int(x)

    def move_by_policy(self):
        if self.improvement_count != 0 and self.is_moving != 1:
            self.is_moving = 1
            x, y = self.canvas.coords(self.rectangle)
            self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)

            x, y = self.find_rectangle()
            while len(self.agent.get_action([x, y])) != 0:
                action = random.sample(self.agent.get_action([x, y]), 1)[0]
                # rectangle_move runs immediately; after(100) then pauses for 100 ms
                self.after(100, self.rectangle_move(action))
                x, y = self.find_rectangle()
            self.is_moving = 0

    def draw_one_arrow(self, col, row, action):
        if col == 2 and row == 2:
            return
        if action == 0:  # up
            origin_x, origin_y = 50 + (UNIT * row), 10 + (UNIT * col)
            self.arrows.append(self.canvas.create_image(origin_x, origin_y,
                                                        image=self.up))
        elif action == 1:  # down
            origin_x, origin_y = 50 + (UNIT * row), 90 + (UNIT * col)
            self.arrows.append(self.canvas.create_image(origin_x, origin_y,
                                                        image=self.down))
        elif action == 3:  # right
            origin_x, origin_y = 90 + (UNIT * row), 50 + (UNIT * col)
            self.arrows.append(self.canvas.create_image(origin_x, origin_y,
                                                        image=self.right))
        elif action == 2:  # left
            origin_x, origin_y = 10 + (UNIT * row), 50 + (UNIT * col)
            self.arrows.append(self.canvas.create_image(origin_x, origin_y,
                                                        image=self.left))

    def draw_from_values(self, state, action_list):
        i = state[0]
        j = state[1]
        for action in action_list:
            self.draw_one_arrow(i, j, action)

    def print_values(self, values):
        for i in range(WIDTH):
            for j in range(HEIGHT):
                self.text_value(i, j, round(values[i][j], 2))

    def render(self):
        time.sleep(0.1)
        self.canvas.tag_raise(self.rectangle)
        self.update()

    def calculate_value(self):
        self.iteration_count += 1
        for i in self.texts:
            self.canvas.delete(i)
        self.agent.value_iteration()
        self.print_values(self.agent.value_table)

    def print_optimal_policy(self):
        self.improvement_count += 1
        for i in self.arrows:
            self.canvas.delete(i)
        for state in self.env.get_all_states():
            action = self.agent.get_action(state)
            self.draw_from_values(state, action)


class Env:
    def __init__(self):
        self.transition_probability = TRANSITION_PROB
        self.width = WIDTH  # width of the grid world
        self.height = HEIGHT  # height of the grid world
        self.reward = [[0] * WIDTH for _ in range(HEIGHT)]
        self.possible_actions = POSSIBLE_ACTIONS
        self.reward[2][2] = 1  # reward 1 for the circle
        self.reward[1][2] = -1  # reward -1 for a triangle
        self.reward[2][1] = -1  # reward -1 for a triangle
        self.all_state = []

        for x in range(WIDTH):
            for y in range(HEIGHT):
                state = [x, y]
                self.all_state.append(state)

    def get_reward(self, state, action):
        next_state = self.state_after_action(state, action)
        return self.reward[next_state[0]][next_state[1]]

    def state_after_action(self, state, action_index):
        action = ACTIONS[action_index]
        return self.check_boundary([state[0] + action[0], state[1] + action[1]])

    @staticmethod
    def check_boundary(state):
        state[0] = (0 if state[0] < 0 else WIDTH - 1
                    if state[0] > WIDTH - 1 else state[0])
        state[1] = (0 if state[1] < 0 else HEIGHT - 1
                    if state[1] > HEIGHT - 1 else state[1])
        return state

    def get_transition_prob(self, state, action):
        return self.transition_probability

    def get_all_states(self):
        return self.all_state
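Env is the part the agent actually interacts with: a deterministic 5x5 grid-world MDP with reward +1 at the circle (2, 2) and -1 at the two triangles. As a quick sanity check, the MDP interface can be exercised without opening the GUI; a minimal sketch, assuming environment.py is importable (tkinter and Pillow must be installed since the module imports them, but no window is created):

from environment import Env

env = Env()
print(len(env.get_all_states()))          # 25 states in the 5x5 grid
print(env.state_after_action([0, 0], 0))  # "up" from the corner is clipped back to [0, 0]
print(env.get_reward([2, 3], 2))          # stepping left from [2, 3] lands on the circle: 1
print(env.get_reward([1, 1], 3))          # stepping right from [1, 1] lands on a triangle: -1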
value_iteration.py

import numpy as np
from environment import GraphicDisplay, Env


class ValueIteration:
    def __init__(self, env):
        # environment object
        self.env = env
        # value function initialized as a 2D list
        self.value_table = [[0.0] * env.width for _ in range(env.height)]
        # discount factor
        self.discount_factor = 0.9

    # compute the next value function with the Bellman optimality equation
    def value_iteration(self):
        # initialize the next value function
        next_value_table = [[0.0] * self.env.width
                            for _ in range(self.env.height)]

        # apply the Bellman optimality equation to every state
        for state in self.env.get_all_states():
            # the value of the terminal state is 0
            if state == [2, 2]:
                next_value_table[state[0]][state[1]] = 0.0
                continue

            # Bellman optimality equation
            value_list = []
            for action in self.env.possible_actions:
                next_state = self.env.state_after_action(state, action)
                reward = self.env.get_reward(state, action)
                next_value = self.get_value(next_state)
                value_list.append(reward + self.discount_factor * next_value)

            # take the maximum as the next value
            # (this is what differs from the Bellman expectation equation)
            next_value_table[state[0]][state[1]] = max(value_list)

        self.value_table = next_value_table

    # return the greedy action(s) from the current value function
    def get_action(self, state):
        if state == [2, 2]:
            return []

        # for every action compute the Q-value:
        # reward + (discount factor * value of the next state)
        value_list = []
        for action in self.env.possible_actions:
            next_state = self.env.state_after_action(state, action)
            reward = self.env.get_reward(state, action)
            next_value = self.get_value(next_state)
            value = reward + self.discount_factor * next_value
            value_list.append(value)

        # return the action(s) with the maximum Q-value (several if tied).
        # Unlike policy iteration, no explicit policy is returned: with the
        # optimality equation the policy stays implicit, and actions are
        # chosen purely from the maximum of the value function.
        max_idx_list = np.argwhere(value_list == np.amax(value_list))
        action_list = max_idx_list.flatten().tolist()
        return action_list

    def get_value(self, state):
        return self.value_table[state[0]][state[1]]


if __name__ == "__main__":
    env = Env()
    value_iteration = ValueIteration(env)
    grid_world = GraphicDisplay(value_iteration)
    grid_world.mainloop()
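For reference, because TRANSITION_PROB is fixed at 1, the backup that value_iteration() performs on each sweep is the deterministic special case of the Bellman optimality equation (in LaTeX notation):

V_{k+1}(s) = \max_{a} \left[ r(s, a) + \gamma V_k(s') \right], \qquad s' = \mathrm{state\_after\_action}(s, a), \quad \gamma = 0.9

get_action(state) returns every action index attaining that maximum, which is why the GUI can draw several arrows in one cell when the Q-values tie.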
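If the button-driven GUI is not needed (or the ../img assets are missing), the agent can also be run headless; a minimal sketch, assuming the two files above are saved as environment.py and value_iteration.py (importing them requires tkinter and Pillow, but no window opens because GraphicDisplay is never instantiated):

from environment import Env
from value_iteration import ValueIteration

env = Env()
agent = ValueIteration(env)

# each call to value_iteration() is one full sweep: one Bellman backup per state
for _ in range(50):
    agent.value_iteration()

for row in agent.value_table:
    print([round(v, 2) for v in row])

print(agent.get_action([0, 0]))  # greedy action indices at the start state (0: up, 1: down, 2: left, 3: right)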