import numpy as np
from itertools import count
import random
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd as autograd
from torch.autograd import Variable
from torch.nn import init

# The plotting stack is only needed by animate() / animate_group(); guarding
# the imports lets the environment classes be used on machines without it.
try:
    import matplotlib.mlab as mlab
    import matplotlib.pyplot as plt
    import matplotlib.animation
except ImportError:
    matplotlib = mlab = plt = None

try:
    from IPython.display import HTML, display
except ImportError:
    HTML = display = None

GOAL_VALUE = 1
EDGE_VALUE = -1
HOLE_VALUE = -1
VISIBLE_RADIUS = 1
GRID_SIZE = 8
NUM_HOLES = 4


def _split_grid_size(grid_size):
    """Return ``(size_y, size_x)`` for a scalar (square grid) or a ``(y, x)`` pair."""
    if np.isscalar(grid_size):
        return grid_size, grid_size
    size_y, size_x = grid_size
    return size_y, size_x


def _is_unset(value):
    """Return True when *value* is the scalar ``0`` used as the "not given" marker."""
    return bool(np.isscalar(value) and value == 0)


def _require_plotting():
    """Raise ImportError when matplotlib or IPython could not be imported."""
    if plt is None or HTML is None:
        raise ImportError(
            "animate() and animate_group() need matplotlib and IPython"
        )


# Environment and behaviour, without moving out of the wall.
class Grid():
    """Rectangular grid surrounded by a border of ``EDGE_VALUE`` cells.

    The array ``self.grid`` has shape
    ``(grid_size_y + 4 * VISIBLE_RADIUS, grid_size_x + 4 * VISIBLE_RADIUS)``;
    the outer ``2 * VISIBLE_RADIUS`` rows/columns on every side hold
    ``EDGE_VALUE`` and the interior starts out as zeros.

    Args:
        n_holes: Stored as ``self.n_holes``; no holes are placed here.
        grid_size: Scalar (square grid) or ``(y, x)`` pair.
        random_seed: Seed passed to ``random.seed`` (global Python RNG).
        set_reward: ``0`` to place one goal at a random free cell, otherwise
            an iterable of ``(row, col)`` array indices to mark as goals.
        ry, rx, train, direction: Accepted but unused by this class.
    """

    def __init__(self, n_holes = 4, grid_size = GRID_SIZE, random_seed = 0, set_reward = 0,
                 ry = 5, rx = 5, train = True, direction = None):
        random.seed(random_seed)
        # Accept Python/NumPy scalars as well as (y, x) sequences.
        self.grid_size_y, self.grid_size_x = _split_grid_size(grid_size)
        self.n_holes = n_holes

        # Define the surrounding using padding; note the (y, x) order.
        border = 2 * VISIBLE_RADIUS
        padded_size_y = self.grid_size_y + 4 * VISIBLE_RADIUS
        padded_size_x = self.grid_size_x + 4 * VISIBLE_RADIUS
        self.grid = np.zeros((padded_size_y, padded_size_x))

        # Initialise the border with the predefined negative value.
        self.grid[0:border, :] = EDGE_VALUE
        self.grid[-border:, :] = EDGE_VALUE
        self.grid[:, 0:border] = EDGE_VALUE
        self.grid[:, -border:] = EDGE_VALUE

        if _is_unset(set_reward):
            # randint is inclusive, so a draw can land on the border; any
            # non-free cell (edge or hole) is rejected and redrawn.
            gy = random.randint(0, self.grid_size_y) + border
            gx = random.randint(0, self.grid_size_x) + border
            while self.grid[gy, gx] != 0:
                gy = random.randint(0, self.grid_size_y) + border
                gx = random.randint(0, self.grid_size_x) + border
            self.grid[gy, gx] = GOAL_VALUE
        else:
            for goal_y, goal_x in set_reward:
                self.grid[goal_y:goal_y + 1, goal_x:goal_x + 1] = GOAL_VALUE

    def visible1(self, pos):
        """Return the window around *pos*, scaled when *pos* is on a wall line.

        The window is the ``(2 * VISIBLE_RADIUS + 1)`` square centred on
        *pos*. If its sum is non-zero and ``x`` is ``2`` or
        ``grid_size_x + 1`` the window is multiplied by
        ``y * 19 / (grid_size_y + 4)``; otherwise, if ``y`` is ``2`` or
        ``grid_size_y + 1`` it is multiplied by ``x * 19 / (grid_size_x + 4)``.
        In every other case the unscaled window (a view of ``self.grid``) is
        returned.
        """
        y, x = pos
        y_relative = y * 19. / (self.grid_size_y + 4)
        x_relative = x * 19. / (self.grid_size_x + 4)
        visible = self.grid[y - VISIBLE_RADIUS:y + VISIBLE_RADIUS + 1,
                            x - VISIBLE_RADIUS:x + VISIBLE_RADIUS + 1]
        if np.sum(visible) != 0:
            # Scalar multiply gives the same values as multiplying by a
            # constant 3x3 matrix, but does not depend on the window size.
            if x == 2 or x == self.grid_size_x + 1:
                return visible * y_relative
            if y == 2 or y == self.grid_size_y + 1:
                return visible * x_relative
        return visible

    def visible(self, pos):
        """Return the square window of ``self.grid`` centred on *pos*.

        The window has side ``2 * VISIBLE_RADIUS + 1`` and is a view of the
        grid array, not a copy.
        """
        y, x = pos
        return self.grid[y - VISIBLE_RADIUS:y + VISIBLE_RADIUS + 1,
                         x - VISIBLE_RADIUS:x + VISIBLE_RADIUS + 1]


class Agent():
    """Holds the agent position ``self.pos`` as ``(row, col)`` grid indices."""

    def reset(self, grid, grid_size, set_agent = 0):
        """Place the agent on *grid*.

        Args:
            grid: Object exposing a ``grid`` array (e.g. a ``Grid``).
            grid_size: Scalar or ``(y, x)`` pair, stored as
                ``grid_size_y`` / ``grid_size_x``.
            set_agent: ``0`` to pick a uniformly random cell whose value is
                0, otherwise the position to use as-is.

        Raises:
            ValueError: If a random position is requested but the grid has
                no cell equal to 0.
        """
        self.grid_size_y, self.grid_size_x = _split_grid_size(grid_size)
        if not _is_unset(set_agent):
            self.pos = set_agent
            return
        # Re-seeds Python's global `random` from system entropy (kept from
        # the original behaviour); the draw itself uses NumPy's global RNG.
        random.seed()
        free_cells = np.argwhere(grid.grid == 0)
        n_free = len(free_cells)
        if n_free == 0:
            raise ValueError("grid has no free cell (value 0) to place the agent on")
        index = np.random.choice(n_free, p = np.full(n_free, 1 / n_free))
        # NOTE: this is a length-2 NumPy row, whereas act() stores a tuple.
        self.pos = free_cells[index]

    # Moves in four directions. No wall handling is done here: the position
    # is updated unconditionally.
    def act(self, action):
        """Move one cell: 0=UP, 1=RIGHT, 2=DOWN, 3=LEFT; other values do not move."""
        y, x = self.pos
        if action == 0:      # up
            y -= 1
        elif action == 1:    # right
            x += 1
        elif action == 2:    # down
            y += 1
        elif action == 3:    # left
            x -= 1
        self.pos = (y, x)


# The setting of the environment basically keeps everything moving inside;
# there is little regularity.
class GameScale_y():
    """Grid-world episode manager whose grid height is scaled by a factor ``k``.

    ``reset()`` calls ``self.net.initHidden()`` and ``self.net.initAction()``;
    ``net`` is not assigned anywhere in this class, so it has to be attached
    by a subclass or by the caller before ``reset()`` is used.
    """

    # Initialise: build the initial grid and agent.
    def __init__(self, grid_size = 8, holes = 4, discount = 0.99, time_limit = 200,
                 random_seed = 0, set_reward = 0, input_type = 0):
        """Create the initial grid and an un-placed agent.

        Args:
            grid_size: Scalar or ``(y, x)`` pair, stored unchanged.
            holes: Forwarded to ``Grid`` as ``n_holes``.
            discount: Stored as ``self.discount``.
            time_limit: Stored as ``self.time_limit`` (overwritten by reset
                when fixed rewards are configured).
            random_seed: Stored as ``self.seed`` and forwarded to ``Grid``.
            set_reward: ``0`` for "no fixed rewards" (the grid then places one
                random goal), otherwise an iterable of ``(y, x)`` pairs that
                are multiplied by the grid size to obtain cell offsets.
            input_type: ``0`` to observe via ``Grid.visible``, ``1`` via
                ``Grid.visible1``.
        """
        self.discount = discount
        self.time_limit = time_limit
        self.grid_size = grid_size
        self.seed = random_seed
        self.seed_range = 2
        self.holes = holes
        self.input_type = input_type

        # The scalar 0 default used to be iterated directly (TypeError);
        # treat it as "no fixed reward positions".
        no_fixed_rewards = _is_unset(set_reward)
        self.set_reward = [] if no_fixed_rewards else set_reward

        # An int grid_size used to be subscripted (TypeError).
        size_y, size_x = _split_grid_size(grid_size)
        offset = 2 * VISIBLE_RADIUS
        # Reward positions converted to array indices, for drawing.
        Set_reward = [(offset + int(size_y * y), offset + int(size_x * x))
                      for y, x in self.set_reward]

        self.grid = Grid(n_holes = holes, grid_size = grid_size, random_seed = self.seed,
                         set_reward = 0 if no_fixed_rewards else Set_reward, train = False)
        self.agent = Agent()
        self.History = []
        self.values = np.zeros_like(self.grid.grid)
        self.t = 0

    # NOTE: the list/array defaults below are only read, never mutated.
    def reset(self, set_agent = 0, action = True, reward_control = 0, size = None,
              size_range = np.arange(10, 51, 10), prob = 5 * [0.2], limit_set = 8,
              test = None, context = (0.5, 0.25), train = True, map_set = [], scale = None):
        """Start a new episode by resetting grid and agent.

        Args:
            set_agent: Forwarded to ``Agent.reset``.
            action: When ``True``, ``self.action`` is re-initialised from
                ``self.net.initAction()``.
            reward_control: Index into the scaled reward list selecting the
                target stored in ``self.pos_reward``.
            size: Grid width; drawn from *size_range* with probabilities
                *prob* when ``None``.
            limit_set: ``time_limit`` becomes ``int(k * size * limit_set)``.
            test: If given, overrides the goal radius.
            context: ``(y, x)`` fractions used for ``pos_reward`` when no
                fixed rewards are configured.
            train: With ``train == True`` the height factor ``k`` is drawn
                from ``np.random.randint(1, 4)``.
            map_set: Optional array used (as a copy) instead of the freshly
                built grid array.
            scale: Height factor ``k`` used when ``train`` is not ``True``;
                a random ``k`` is drawn if it is ``None``.
        """
        # Set size.
        if size is None:
            size = size_range[np.random.choice(len(size_range), p = prob)]
        self.size = size
        offset = 2 * VISIBLE_RADIUS

        # Set reward. len() is kept (rather than truthiness) so that an
        # ndarray of reward positions also works.
        if len(self.set_reward) != 0:
            # Clamp: for size < 10 the formula gives -1, which would make
            # the goal slice empty and leave the grid without a goal.
            radius = max(self.size // 10 - 1, 0)
            if test is not None:
                radius = test

            if train != True and scale is not None:
                k = scale
            else:
                k = np.random.randint(1, 4)
            self.time_limit = int(k * self.size * limit_set)

            self.Set_reward = [(offset + int(k * size * y), offset + int(size * x))
                               for y, x in self.set_reward]
            self.grid = Grid(n_holes = 0, grid_size = (k * self.size, self.size),
                             random_seed = 0, set_reward = self.Set_reward, train = train)
            self.grid_size = (self.grid.grid_size_y, self.grid.grid_size_x)
            if len(map_set) != 0:
                # Copy so that the caller's map is not modified below.
                self.grid.grid = np.array(map_set)

            # Erase every existing goal, then paint only the selected one.
            self.grid.grid[self.grid.grid == 1] = 0
            self.reward_control = reward_control
            self.pos_reward = self.Set_reward[reward_control]
            goal_y, goal_x = self.pos_reward
            self.grid.grid[goal_y - radius:goal_y + 1 + radius,
                           goal_x - radius:goal_x + 1 + radius] = 1
        else:
            # No fixed rewards: the current grid is kept, its goals are
            # erased and only the target position is recorded.
            y, x = context
            self.grid.grid[self.grid.grid == 1] = 0
            self.pos_reward = (offset + int(self.size * y), offset + int(self.size * x))

        # Set position.
        self.agent.reset(self.grid, self.grid_size, set_agent = set_agent)
        self.hidden = self.net.initHidden()
        if action == True:
            self.action = self.net.initAction()
        self.t = 0
        self.reward = 0

    @property
    def visible_state(self):
        """Return the flattened window surrounding the agent.

        Raises:
            ValueError: If ``self.input_type`` is neither 0 nor 1.
        """
        if self.input_type == 0:
            visible = self.grid.visible(self.agent.pos)
        elif self.input_type == 1:
            visible = self.grid.visible1(self.agent.pos)
        else:
            raise ValueError("input_type must be 0 or 1, got %r" % (self.input_type,))
        return visible.flatten()

    def stimulus(self, pos):
        """Return the flattened ``Grid.visible1`` window centred on *pos*."""
        return self.grid.visible1(pos).flatten()

    @staticmethod
    def sample():
        """Return a random action in {0, 1, 2, 3}.

        NumPy's global RNG is re-seeded (without a fixed seed) on every call.
        """
        np.random.seed()
        return np.random.randint(0, 4)


def animate(history):
    """Render *history* as an HTML5 video and display it.

    Args:
        history: Sequence of ``(grid, time)`` pairs; one frame per entry.

    Raises:
        ImportError: If matplotlib or IPython is unavailable.
    """
    _require_plotting()
    frames = len(history)
    print("Rendering %d frames..." % frames)
    fig = plt.figure(figsize=(6, 2))
    fig_grid = fig.add_subplot(111)

    def render_frame(i):
        grid, _time = history[i]
        # Clear first so images do not pile up on the axes frame by frame.
        fig_grid.clear()
        fig_grid.matshow(grid, vmin=-1, vmax=1, cmap='jet')

    anim = matplotlib.animation.FuncAnimation(
        fig, render_frame, frames=frames, interval=100
    )
    plt.close(fig)
    display(HTML(anim.to_html5_video()))


def animate_group(History):
    """Render several histories side by side in one HTML5 video.

    Each history gets its own panel in a single row (at least five columns
    wide). One animation drives every panel; the video runs for the length
    of the longest history and shorter ones hold their last frame. Nothing
    is rendered for an empty *History*.

    Args:
        History: Iterable of histories, each a sequence of ``(grid, time)``.

    Raises:
        ImportError: If matplotlib or IPython is unavailable.
    """
    histories = list(History)
    if not histories:
        return
    _require_plotting()

    fig = plt.figure(figsize=(6, 6))
    n_cols = max(5, len(histories))
    panels = []
    for k, history in enumerate(histories):
        print("Rendering %d frames..." % len(history))
        panels.append((fig.add_subplot(1, n_cols, k + 1), history))
    frames = max(len(history) for history in histories)

    def render_frame(i):
        for axis, history in panels:
            if len(history) == 0:
                continue
            grid, _time = history[min(i, len(history) - 1)]
            axis.clear()
            axis.matshow(grid, vmin=-1, vmax=1, cmap='jet')

    anim = matplotlib.animation.FuncAnimation(
        fig, render_frame, frames=frames, interval=100
    )
    plt.close(fig)
    display(HTML(anim.to_html5_video()))


def move(pos, act):
    """Return the neighbour of *pos* for action 0=UP, 1=RIGHT, 2=DOWN, 3=LEFT."""
    i, j = pos
    pos_possible = [(i - 1, j), (i, j + 1), (i + 1, j), (i, j - 1)]
    return pos_possible[act]