I am new to TensorFlow and am trying to define an RL (DQN) algorithm:
```python
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim


class DQN(object):
    def __init__(self, state_shape, action_shape, lr=0.01):
        '''
        Deep Q-Network TensorFlow model.

        Args:
            - state_shape: Input state shape
            - action_shape: Output action shape
        '''
        self.states_input = tf.placeholder(tf.float32, shape=state_shape)
        self.Q_target = tf.placeholder(tf.float32, shape=action_shape)
        self.inputs1 = tf.placeholder(tf.float32, shape=state_shape)
        self.Q_w = self.define_network(self.inputs1, [80, 60, 4])
        self.Q_w_t = self.define_network(self.inputs1, [80, 60, 4])
        self.predict = self.predict_output(self.Q_w)
        self.nextQ = self.Q_target
        self.loss = self.compute_loss(self.nextQ, self.Q_w)
        self.trainer = self.optimize_network(0.001)
        self.updateModel = self.trainer.minimize(self.loss)
        self.init = tf.global_variables_initializer()

    def define_network(self, input_layer, layer_shape):
        Q_out = slim.stack(input_layer, slim.fully_connected, layer_shape)
        return Q_out

    def predict_output(self, Q_out):
        best_action = tf.argmax(Q_out, 1)
        return best_action

    def optimize_network(self, lr):
        opt = tf.train.AdamOptimizer(learning_rate=lr)
        return opt

    def compute_loss(self, Q_target, Q_estimate):
        loss = tf.losses.huber_loss(Q_target, Q_estimate)
        return loss
```
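As far as I can tell, each call to `define_network` makes `slim.stack` build a fresh stack of fully connected layers under a new scope (`Stack`, `Stack_1`, and so on), so `Q_w` and `Q_w_t` end up as two independent networks with their own weights. A small snippet I can run to see which variable scopes exist at a given point (just my own debugging helper, not part of the model):

```python
# List every trainable variable currently in the graph, e.g.
# Stack/fully_connected_1/weights:0, Stack_1/fully_connected_1/weights:0, ...
for v in tf.trainable_variables():
    print(v.name)
```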
Now there is another function that calls these objects and placeholders, shown below:
```python
def optimize_model(session, policy_net, target_net, batch, gamma):
    '''
    Calculates the target Q-values for the given batch and uses them to
    update the model.

    Args:
        - session: TensorFlow session
        - policy_net: Policy DQN model
        - target_net: DQN model used to generate target Q-values
        - batch: Batch of experiences used to optimize model
        - gamma: Discount factor
    '''
    policyQw = policy_net.define_network(policy_net.inputs1, [80, 60, 4])
    for b in batch:
        Qcurr = session.run(policyQw,
                            feed_dict={policy_net.inputs1: np.identity(16)[b[0]:b[0] + 1]})
        Q1 = session.run(target_net.Q_w_t,
                         feed_dict={target_net.inputs1: np.identity(16)[b[2]:b[2] + 1]})
        maxQ1 = np.max(Q1)
        targetQ = Qcurr
        if b[4] == 1:
            targetQ[0, b[1]] = b[3]
        else:
            targetQ[0, b[1]] = b[3] + gamma * maxQ1
        session.run([policy_net.updateModel],
                    feed_dict={policy_net.inputs1: np.identity(16)[b[0]:b[0] + 1],
                               policy_net.nextQ: targetQ})
```
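To keep the indexing in that loop straight: each experience `b` comes from `memory.add(state, action, next_state, reward, is_done)` in the training loop further down, so (assuming `memory.sample` keeps the same field order) the fields unpack as in the sketch below, and the `if/else` implements the usual per-experience DQN target. The numbers here are made up purely for illustration:

```python
# Tuple layout I assume for each experience:
#   b = (state, action, next_state, reward, is_done)
#
# Target computed by the if/else above:
#   target = reward                                            if is_done
#   target = reward + gamma * max_a Q_target(next_state, a)    otherwise
#
# Worked example with gamma = 0.99:
b = (3, 2, 7, 0.0, 0)         # state 3, action 2, next state 7, reward 0, not done
maxQ1 = 0.5                   # suppose max_a Q_target(7, a) == 0.5
target = b[3] + 0.99 * maxQ1  # = 0.495, written into targetQ[0, b[1]]
```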
I call these functions from the main training function below:
```python
def train(env, num_episodes=500, gamma=0.99, batch_size=64, annealing_steps=1000,
          s_epsilon=1.0, f_epsilon=0.1, max_episode_steps=200):
    '''
    DQN algorithm

    Args:
        - env: The environment to train the agent on
        - num_episodes: The number of episodes to train the agent for
        - gamma: The discount factor
        - batch_size: Number of experiences in a batch
        - annealing_steps: The number of steps to anneal epsilon over
        - s_epsilon: The initial epsilon value for e-greedy action selection
        - f_epsilon: The final epsilon value for the e-greedy action selection

    Returns: (policy_net, episode_rewards)
        - policy_net: Trained DQN model
        - episode_rewards: Numpy array containing the reward of each episode during training
    '''
    tf.reset_default_graph()
    policy_net = DQN([1, env.nS], [1, env.action_space.n])
    target_net = DQN([1, env.nS], [1, env.action_space.n])
    target_ops = update_target_graph_op(tf.trainable_variables(), 0.7)
    memory = ReplayMemory(800)
    epsilon = LinearSchedule(annealing_steps, f_epsilon, s_epsilon)
    total_steps = 0
    episode_rewards = list()

    ## CODE STARTS:
    inputs1 = policy_net.states_input
    # Q_w = policy_net.define_network(inputs1, [32, 16, 4])
    # Q_w_t = target_net.define_network(inputs1, [32, 16, 4])
    # predict = policy_net.predict_output(Q_w)
    # nextQ = target_net.Q_target
    # loss = policy_net.compute_loss(nextQ, Q_w)
    # trainer = policy_net.optimize_network(0.001)
    # updateModel = trainer.minimize(loss)
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)
        for i in range(num_episodes):
            eps = epsilon.value(i)
            is_done = False
            state = env.reset()
            total_reward = 0
            total_steps = 0
            # print(sess.run(tf.trainable_variables()))

            # The Q-Network
            while total_steps < max_episode_steps:
                Qcurr = sess.run([policy_net.Q_w],
                                 feed_dict={policy_net.inputs1: np.identity(16)[state:state + 1]})
                action = eGreedyActionSelection(Qcurr[0], eps, env)
                # print(action)
                next_state, reward, is_done, _ = env.step(action)
                memory.add(state, action, next_state, reward, is_done)
                total_reward += reward
                if total_steps % batch_size == batch_size - 1:
                    batch = memory.sample(batch_size)
                    optimize_model(sess, policy_net, target_net, batch, gamma)
                    update_target(sess, target_ops)
                if is_done:
                    break
                state = next_state
                total_steps += 1
            episode_rewards.append(total_reward)

    print("Percent of successful episodes: " + str(sum(episode_rewards) / num_episodes) + "%")
    return policy_net, episode_rewards
```
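The helpers used above (`ReplayMemory`, `LinearSchedule`, `update_target_graph_op`, `update_target`, `eGreedyActionSelection`) are defined elsewhere in my code and not shown here. For completeness, this is roughly what I assume `eGreedyActionSelection` does; it is my own sketch, not the actual helper:

```python
import numpy as np

def eGreedyActionSelection(q_values, eps, env):
    # With probability eps take a random action, otherwise act greedily
    # with respect to the estimated Q-values.
    if np.random.rand() < eps:
        return env.action_space.sample()
    return int(np.argmax(q_values))
```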
I am trying to move some of the placeholders and network-building calls into the second part of the code, but I am getting the following error:
```
FailedPreconditionError: Attempting to use uninitialized value Stack_4/fully_connected_1/weights
	 [[node Stack_4/fully_connected_1/weights/read (defined at /usr/local/lib/python2.7/dist-packages/tensorflow/contrib/framework/python/ops/variables.py:277) = Identity[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]]]
```
which happens on this line of the second code block: `policyQw = policy_net.define_network(policy_net.inputs1, [80, 60, 4])`.
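My current understanding (which may well be wrong) is that `tf.global_variables_initializer()` only covers the variables that exist at the moment the op is created, so the extra network that `define_network` builds inside `optimize_model`, after `sess.run(init)` has already run, has weights (`Stack_4/...`) that were never initialized. A minimal standalone sketch of what I think is the same failure mode (the variable names `v1` and `v2` are just for illustration):

```python
import tensorflow as tf

v1 = tf.get_variable("v1", shape=[1])
init = tf.global_variables_initializer()  # only knows about v1
v2 = tf.get_variable("v2", shape=[1])     # created after the init op was built

with tf.Session() as sess:
    sess.run(init)
    sess.run(v1)  # fine
    sess.run(v2)  # raises FailedPreconditionError (uninitialized value v2)
```

If that is right, I suppose I should either build everything the training code needs before creating and running the initializer, or reuse the `policy_net.Q_w` tensor already built in `__init__` instead of calling `define_network` again, but I am not sure which is the intended approach.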
Any idea how I can do this or get rid of the error? Thanks in advance for your help.