forked from cogsci-modeling-19/rl-twoStepTask
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsimulate.py
More file actions
120 lines (107 loc) · 5.28 KB
/
simulate.py
File metadata and controls
120 lines (107 loc) · 5.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
from agents.model_based import AgentModelBased
from agents.model_free import AgentModelFree
from agents.hybrid import HybridAgent
from agents.random_agent import RandomAgent
from environment import TwoStepEnv
from utils import random_walk_gaussian
import pandas as pd
import numpy as np
def simulate(agent_type='random', trials=200, seed=None, verbose=False, params:dict={}, from_data:pd.DataFrame=None, use_reward_distribution=False):
"""
Simulate the two-step task using the environment and the given agent
:param agent_type: ['random', 'model_free', 'model_based', 'hybrid']
:param trials: number of trials to simulate
:param seed: random seed
:param verbose: print details if True
:param params: parameters for the agent
:param from_data: use reward probabilities from the given data
:param use_reward_distribution: use the same sampled reward distribution if True
:return: simulated task data as a dataframe and the agent
"""
if verbose:
print(f"Simulating {agent_type} agent, {trials} trials.")
print(f"Agent parameters: {params if params else 'default'}")
# set a random seed
np.random.seed(seed)
# simulate the task
action_space = TwoStepEnv.action_space
state_space = TwoStepEnv.state_space
if agent_type == 'model_based':
agent = AgentModelBased(action_space, state_space, **params)
elif agent_type == 'model_free':
agent = AgentModelFree(action_space, state_space, **params)
elif agent_type == 'hybrid' or agent_type.startswith('hybrid'):
agent = HybridAgent(action_space, state_space, **params)
else:
agent = RandomAgent(action_space, state_space, **params)
env = TwoStepEnv()
task_data = simulate_two_step_task(env, agent, trials=trials, from_data=from_data, use_reward_distribution=use_reward_distribution)
# convert the data to a dataframe
task_df = pd.DataFrame.from_dict(task_data, orient='index')
# unset the random seed
np.random.seed(None)
return task_df, agent
def simulate_two_step_task(env: TwoStepEnv, agent, trials=200,
from_data:pd.DataFrame=None, use_reward_distribution=True):
"""
Simulate with given agent
:param env: environment
:param agent: given agent
:param trials: number of trials
:param policy_method: method for action selection ['softmax', 'epsilon-greedy']
:param from_data: use reward probabilities from the given data
:param use_reward_distribution: use reward distribution if True
:return: simulated task data
"""
if from_data is not None:
if use_reward_distribution:
reward_distribution = from_data['rewardDistribution'].iloc[0]
# convert the reward distribution to float
reward_probabilities = reward_distribution.astype(float)
else:
reward_probabilities = from_data['rewardProbabilities'].iloc[0]
# reshape the reward probabilities to the correct shape, with zeros for the first stage
reward_probabilities = np.array([0, 0, *reward_probabilities])
# reshape
reward_probabilities = reward_probabilities.reshape((3, 2))
env.set_reward_probabilities(reward_probabilities)
task_data = {}
sd_for_random_walk = 0.025
time_step = 0
while time_step < trials:
# first stage choice
terminal = False
while not terminal:
current_state = env.state
action = agent.policy(env.state)
next_state, reward, terminal, info = env.step(action)
if agent:
agent.update_beliefs(current_state, action, reward, next_state,
terminal)
info['trial_index'] = int(time_step)
task_data[time_step] = info
env.reset()
# update the reward probabilities for the next trial
# 3 cases:
# 1. free simulation -> simulate a random walk in the reward probabilities
# 2. simulate according to the reward probabilities from data and sample reward distribution in the environment
# -> use the same reward probabilities as the data
# 3. simulate according to the reward distribution from data -> use the same reward distribution as the data
if from_data is not None and time_step < trials - 1:
if use_reward_distribution:
reward_distribution = from_data['rewardDistribution'].iloc[time_step + 1]
# convert the reward distribution to float
new_reward_prob_matrix = reward_distribution.astype(float)
else:
new_reward_prob_matrix = from_data['rewardProbabilities'].iloc[time_step + 1]
# include zeros for the first stage
new_reward_prob_matrix = np.array([0, 0, *new_reward_prob_matrix])
# reshape to the epxpected shape (3, 2): state-action
new_reward_prob_matrix = new_reward_prob_matrix.reshape((3, 2))
else:
# simulate a random walk in the reward probabilities
new_reward_prob_matrix = random_walk_gaussian(env.reward_prob_matrix,
sd_for_random_walk)
env.set_reward_probabilities(new_reward_prob_matrix)
time_step += 1
return task_data