env.py (forked from udacity/Value-based-methods)
from collections import deque
import sys
import time

import numpy as np
import torch
import matplotlib.pyplot as plt

from unityagents import UnityEnvironment

sys.path.append('util/')
from agent import Agent
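# Usage: python env.py 1   -> train from scratch and save a checkpoint
#        python env.py 0   -> load artifact/checkpoint.pth and watch the agent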
TRAIN = int(sys.argv[1])
print("Train: {}".format(TRAIN))
env = UnityEnvironment(file_name="Banana.app", base_port=64738, worker_id=3, seed=1)
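# "Banana.app" is the macOS build of the environment. worker_id is added to
# base_port to pick the socket used to talk to Unity, which lets several
# environment instances run side by side.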
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
# reset the environment
env_info = env.reset(train_mode=True)[brain_name]
# number of agents in the environment
print('Number of agents:', len(env_info.agents))
# number of actions
action_size = brain.vector_action_space_size
print('Number of actions:', action_size)
# examine the state space
state = env_info.vector_observations[0]
print('States look like:', state)
state_size = len(state)
print('States have length:', state_size)
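# For the Banana environment this is typically a 37-dimensional observation
# (ray-based perception of objects around the agent plus its velocity) and
# 4 discrete actions (forward, backward, turn left, turn right).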
agent = Agent(state_size=state_size, action_size=action_size, seed=0)
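# Agent is the DQN implementation under util/; this script only relies on it
# exposing act(state, eps), step(state, action, reward, next_state, done),
# and a qnetwork_local whose weights are saved/loaded below.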
if TRAIN == 1:
    print("TRAINING")
    n_episodes = 1600
    max_t = 1000
    eps_start = 1.0
    eps_end = 0.01
    eps_decay = 0.998
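    # Epsilon decays multiplicatively per episode: after all 1600 episodes it
    # is roughly 0.998 ** 1600 ≈ 0.04, so the eps_end floor of 0.01 acts only
    # as a safety net here.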
    scores = []                        # scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]  # get the next state
            reward = env_info.rewards[0]                  # get the reward
            done = env_info.local_done[0]                 # see if episode finished
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)          # save most recent score
        scores.append(score)                 # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
        if np.mean(scores_window) >= 17.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode - 100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
            break
    # Plot the score per episode, then save the figure and the final weights.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(np.arange(len(scores)), scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.show()
    fig.savefig('artifact/scores.png', dpi=fig.dpi)
    torch.save(agent.qnetwork_local.state_dict(), 'artifact/checkpoint.pth')
else:
    print("Smart Agent")
    # Visualize a trained agent: load the saved weights and act greedily
    # (the original eps of 0.8 would make 80% of the actions random).
    eps = 0.0  # no exploration when demonstrating the trained policy
    agent.qnetwork_local.load_state_dict(torch.load('artifact/checkpoint.pth'))
    for i in range(3):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        for j in range(200):
            action = agent.act(state, eps)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]  # get the next state
            done = env_info.local_done[0]                 # see if episode finished
            state = next_state       # advance to the next state (missing in the original)
            time.sleep(0.1)          # slow the loop down so the run is watchable
            if done:
                break

env.close()