  • Q-Learning Code (off-policy TD control) => learning policy != behavior policy
    IT & Computer Engineering / Deep Learning  2020. 12. 6. 23:49

    Because SARSA's behavior policy and its learning policy are the same (on-policy), the agent can get trapped in the grid world.

    For example, if in a previous episode the agent was following the correct path but then took an action into an obstacle, the value of the current state drops,

    and in later episodes it never enters that state again -> even though the state is actually on the correct path, the agent avoids it and just keeps circling in place.

    Q-learning overcomes this.

    Q-learning keeps the learning policy and the behavior policy separate.

    It updates with the Bellman optimality equation (see the sketch below).
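
    The difference shows up directly in the update rule. Below is a minimal standalone sketch (not the code from this post; ALPHA, GAMMA, q_table and the sample values are illustrative assumptions): SARSA bootstraps from the next action a' that the behavior policy actually chose, while Q-learning bootstraps from the max over the next state's Q-values, so a bad exploratory action in s' does not drag the update down.

    # Minimal sketch: SARSA vs. Q-learning update for one <s, a, r, s'(, a')> sample.
    # ALPHA, GAMMA, q_table and the sample values are illustrative assumptions,
    # not taken from the code in this post.
    ALPHA = 0.01   # step size
    GAMMA = 0.9    # discount factor


    def sarsa_update(q_table, s, a, r, s_next, a_next):
        # on-policy: bootstrap from the action a' the behavior policy actually took
        td_target = r + GAMMA * q_table[s_next][a_next]
        q_table[s][a] += ALPHA * (td_target - q_table[s][a])


    def q_learning_update(q_table, s, a, r, s_next):
        # off-policy: bootstrap from the best action in s' (Bellman optimality equation),
        # no matter which action the epsilon-greedy behavior policy takes next
        td_target = r + GAMMA * max(q_table[s_next])
        q_table[s][a] += ALPHA * (td_target - q_table[s][a])


    if __name__ == "__main__":
        # toy table with two states and four actions
        q_table = {"[0, 0]": [0.0, 0.0, 0.0, 0.0],
                   "[1, 0]": [0.5, -1.0, 0.2, 0.0]}
        q_learning_update(q_table, "[0, 0]", 1, 0, "[1, 0]")  # bootstraps from max(...) = 0.5
        sarsa_update(q_table, "[0, 0]", 1, 0, "[1, 0]", 1)    # bootstraps from q_table["[1, 0]"][1] = -1.0
        print(q_table["[0, 0]"])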

     

    environment.py

     

    import time
    import numpy as np
    import tkinter as tk
    from PIL import ImageTk, Image

    np.random.seed(1)
    PhotoImage = ImageTk.PhotoImage
    UNIT = 100  # pixels per cell
    HEIGHT = 5  # grid world height (rows)
    WIDTH = 5   # grid world width (columns)


    class Env(tk.Tk):
        def __init__(self):
            super(Env, self).__init__()
            self.action_space = ['u', 'd', 'l', 'r']
            self.n_actions = len(self.action_space)
            self.title('Q Learning')
            self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT))
            self.shapes = self.load_images()
            self.canvas = self._build_canvas()
            self.texts = []

        def _build_canvas(self):
            canvas = tk.Canvas(self, bg='white',
                               height=HEIGHT * UNIT,
                               width=WIDTH * UNIT)
            # draw the grid lines
            for c in range(0, WIDTH * UNIT, UNIT):  # 0~400 by 100
                x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT
                canvas.create_line(x0, y0, x1, y1)
            for r in range(0, HEIGHT * UNIT, UNIT):  # 0~400 by 100
                x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r
                canvas.create_line(x0, y0, x1, y1)

            # add the images to the canvas
            self.rectangle = canvas.create_image(50, 50, image=self.shapes[0])
            self.triangle1 = canvas.create_image(250, 150, image=self.shapes[1])
            self.triangle2 = canvas.create_image(150, 250, image=self.shapes[1])
            self.circle = canvas.create_image(250, 250, image=self.shapes[2])

            canvas.pack()

            return canvas

        def load_images(self):
            rectangle = PhotoImage(
                Image.open("../img/rectangle.png").resize((65, 65)))
            triangle = PhotoImage(
                Image.open("../img/triangle.png").resize((65, 65)))
            circle = PhotoImage(
                Image.open("../img/circle.png").resize((65, 65)))

            return rectangle, triangle, circle

        def text_value(self, row, col, contents, action, font='Helvetica', size=10,
                       style='normal', anchor="nw"):

            if action == 0:
                origin_x, origin_y = 7, 42
            elif action == 1:
                origin_x, origin_y = 85, 42
            elif action == 2:
                origin_x, origin_y = 42, 5
            else:
                origin_x, origin_y = 42, 77

            x, y = origin_y + (UNIT * col), origin_x + (UNIT * row)
            font = (font, str(size), style)
            text = self.canvas.create_text(x, y, fill="black", text=contents,
                                           font=font, anchor=anchor)
            return self.texts.append(text)

        def print_value_all(self, q_table):
            for i in self.texts:
                self.canvas.delete(i)
            self.texts.clear()
            for i in range(HEIGHT):
                for j in range(WIDTH):
                    for action in range(0, 4):
                        state = [i, j]
                        if str(state) in q_table.keys():
                            temp = q_table[str(state)][action]
                            self.text_value(j, i, round(temp, 3), action)

        def coords_to_state(self, coords):
            x = int((coords[0] - 50) / 100)
            y = int((coords[1] - 50) / 100)
            return [x, y]

        def state_to_coords(self, state):
            x = int(state[0] * 100 + 50)
            y = int(state[1] * 100 + 50)
            return [x, y]

        def reset(self):
            self.update()
            time.sleep(0.5)
            x, y = self.canvas.coords(self.rectangle)
            self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
            self.render()
            # return the agent's grid state after moving it back to the start
            return self.coords_to_state(self.canvas.coords(self.rectangle))

        def step(self, action):
            state = self.canvas.coords(self.rectangle)
            base_action = np.array([0, 0])
            self.render()

            if action == 0:  # up
                if state[1] > UNIT:
                    base_action[1] -= UNIT
            elif action == 1:  # down
                if state[1] < (HEIGHT - 1) * UNIT:
                    base_action[1] += UNIT
            elif action == 2:  # left
                if state[0] > UNIT:
                    base_action[0] -= UNIT
            elif action == 3:  # right
                if state[0] < (WIDTH - 1) * UNIT:
                    base_action[0] += UNIT

            # move the agent
            self.canvas.move(self.rectangle, base_action[0], base_action[1])
            # raise the agent (red rectangle) to the top of the canvas
            self.canvas.tag_raise(self.rectangle)
            next_state = self.canvas.coords(self.rectangle)

            # reward function
            if next_state == self.canvas.coords(self.circle):
                reward = 100
                done = True
            elif next_state in [self.canvas.coords(self.triangle1),
                                self.canvas.coords(self.triangle2)]:
                reward = -100
                done = True
            else:
                reward = 0
                done = False

            next_state = self.coords_to_state(next_state)
            return next_state, reward, done

        def render(self):
            time.sleep(0.03)
            self.update()
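
    As a side note, here is a tiny standalone sketch of the coordinate mapping used above (it simply mirrors coords_to_state / state_to_coords from environment.py, no tkinter needed): with UNIT = 100, cell centers sit at 50, 150, 250, ... pixels, so canvas coordinates and grid indices convert as follows.

    # Standalone sketch mirroring Env.coords_to_state / Env.state_to_coords above
    # (no tkinter needed; cell centers are at 50, 150, 250, ... pixels).
    def coords_to_state(coords):
        return [int((coords[0] - 50) / 100), int((coords[1] - 50) / 100)]


    def state_to_coords(state):
        return [int(state[0] * 100 + 50), int(state[1] * 100 + 50)]


    if __name__ == "__main__":
        print(coords_to_state([250.0, 150.0]))  # -> [2, 1]     (triangle1 position)
        print(state_to_coords([2, 2]))          # -> [250, 250] (circle / goal position)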

     

    agent.py

     

    import numpy as np
    import random
    from environment import Env
    from collections import defaultdict


    class QLearningAgent:
        def __init__(self, actions):
            self.actions = actions
            self.step_size = 0.01
            self.discount_factor = 0.9
            self.epsilon = 0.9
            self.q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0])

        # update the Q-function from an <s, a, r, s'> sample
        # => difference from SARSA: the next action in the next state is not needed;
        #    just update toward the largest of the four Q-values of the next state
        def learn(self, state, action, reward, next_state):
            state, next_state = str(state), str(next_state)
            q_1 = self.q_table[state][action]  # Q-value of the current state-action pair
            # Q-function update using the Bellman optimality equation
            q_2 = reward + self.discount_factor * max(self.q_table[next_state])  # take the max because it is the optimality equation
            self.q_table[state][action] += self.step_size * (q_2 - q_1)

        # return an action chosen by the epsilon-greedy policy over the Q-function
        def get_action(self, state):
            if np.random.rand() < self.epsilon:
                # return a random action
                action = np.random.choice(self.actions)
            else:
                # return the action according to the Q-function
                state = str(state)
                q_list = self.q_table[state]
                action = arg_max(q_list)
            return action


    # return the best action according to the Q-values (ties broken randomly)
    def arg_max(q_list):
        max_idx_list = np.argwhere(q_list == np.amax(q_list))
        max_idx_list = max_idx_list.flatten().tolist()
        return random.choice(max_idx_list)


    if __name__ == "__main__":
        env = Env()
        agent = QLearningAgent(actions=list(range(env.n_actions)))

        for episode in range(1000):
            # reset the environment and get the initial state
            state = env.reset()

            while True:
                # render the game environment
                env.render()
                # choose an action for the current state
                action = agent.get_action(state)
                # take the action, then receive the next state, the reward,
                # and whether the episode is done
                next_state, reward, done = env.step(action)
                # update the Q-function with the <s, a, r, s'> sample
                agent.learn(state, action, reward, next_state)

                state = next_state

                # display all Q-values on the screen
                env.print_value_all(agent.q_table)

                if done:
                    break
