Low GPU utilisation when running Tensorflow


Problem description

I've been doing deep reinforcement learning using Tensorflow and OpenAI Gym. My problem is low GPU utilisation. Googling this issue, I understood that it's wrong to expect much GPU utilisation when training small networks (e.g. when training on MNIST). But my neural network is not so small, I think. The architecture is similar (more or less) to the one given in the original DeepMind paper. My network architecture is summarized below:

  1. Convolution layer 1 (filters=32, kernel_size=8x8, strides=4)

  2. Convolution layer 2 (filters=64, kernel_size=4x4, strides=2)

  3. Convolution layer 3 (filters=64, kernel_size=3x3, strides=1)

  4. Dense layer (units=512)

  5. Output layer (units=9)
I am training on a Tesla P100 16GB GPU. My learning algorithm is simple DQN (again, from the DeepMind paper), and the hyper-parameters are all as given in the paper. Still, GPU utilisation is well below 10% (as shown by nvidia-smi). What could be the possible issue?
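One quick sanity check (a minimal sketch for illustration, separate from the training script below; it only uses the standard TF 1.x session options) is to confirm that ops are actually being placed on the GPU:

import tensorflow as tf

# Log where each op is placed; GPU ops are reported as /device:GPU:0 on stderr.
config = tf.ConfigProto(log_device_placement=True)
with tf.Session(config=config) as sess:
    a = tf.constant([1.0, 2.0], name="a")
    b = tf.constant([3.0, 4.0], name="b")
    print(sess.run(a + b))

If everything already shows up on /device:GPU:0, placement itself is not the issue. The full script follows.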

import tensorflow as tf
import numpy as np
import os, sys
import gym
from collections import deque
from time import sleep


os.environ['CUDA_VISIBLE_DEVICES'] = '1'


def reset_graph(seed=142):
    # Reset the default graph and seed TF/NumPy for reproducibility.
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)


def preprocess_observation(obs):
    img = obs[34:210:2, ::2] # crop and downsize
    return np.mean(img, axis=2).reshape(88, 80) / 255.0


def combine_observations_multichannel(preprocessed_observations):
    return np.array(preprocessed_observations).transpose([1, 2, 0])


n_observations_per_state = 3
preprocessed_observations = deque([], maxlen=n_observations_per_state)
env = gym.make("Breakout-v0")
obs = env.reset()


input_height = 88
input_width = 80
input_channels = 3
conv_n_maps = [32, 64, 64]
conv_kernel_sizes = [(8,8), (4,4), (3,3)]
conv_strides = [4, 2, 1]
conv_paddings = ["SAME"] * 3 
conv_activation = [tf.nn.relu] * 3
n_hidden_in = 64 * 11 * 10  # conv3 has 64 maps of 11x10 each
n_hidden = 512
hidden_activation = tf.nn.relu
n_outputs = env.action_space.n  # Number of available discrete actions
initializer = tf.variance_scaling_initializer()


def q_network(X_state, name):
    prev_layer = X_state
    with tf.variable_scope(name) as scope:
        for n_maps, kernel_size, strides, padding, activation in zip(
                conv_n_maps, conv_kernel_sizes, conv_strides,
                conv_paddings, conv_activation):
            prev_layer = tf.layers.conv2d(
                prev_layer, filters=n_maps, kernel_size=kernel_size,
                strides=strides, padding=padding, activation=activation,
                kernel_initializer=initializer)
        last_conv_layer_flat = tf.reshape(prev_layer, shape=[-1, n_hidden_in])
        hidden = tf.layers.dense(last_conv_layer_flat, n_hidden,
                                 activation=hidden_activation,
                                 kernel_initializer=initializer)
        outputs = tf.layers.dense(hidden, n_outputs,
                                  kernel_initializer=initializer)
    trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       scope=scope.name)
    trainable_vars_by_name = {var.name[len(scope.name):]: var
                              for var in trainable_vars}
    return outputs, trainable_vars_by_name


X_state = tf.placeholder(tf.float32, shape=[None, input_height, input_width,
                                            input_channels])
online_q_values, online_vars = q_network(X_state, name="q_networks/online")
target_q_values, target_vars = q_network(X_state, name="q_networks/target")

copy_ops = [target_var.assign(online_vars[var_name])
            for var_name, target_var in target_vars.items()]
copy_online_to_target = tf.group(*copy_ops)


learning_rate = 0.001
momentum = 0.95

with tf.variable_scope("train"):
    X_action = tf.placeholder(tf.int32, shape=[None])
    y = tf.placeholder(tf.float32, shape=[None, 1])
    q_value = tf.reduce_sum(online_q_values * tf.one_hot(X_action, n_outputs),
                            axis=1, keep_dims=True)
    loss = tf.reduce_mean((y - q_value) ** 2) 

    global_step = tf.Variable(0, trainable=False, name='global_step')
    optimizer = tf.train.MomentumOptimizer(learning_rate, momentum, use_nesterov=True)
    training_op = optimizer.minimize(loss, global_step=global_step)


replay_memory_size = 500000
replay_memory = deque([], maxlen=replay_memory_size)

def sample_memories(batch_size):
    indices = np.random.permutation(len(replay_memory))[:batch_size]
    cols = [[], [], [], [], []] # state, action, reward, next_state, continue
    for idx in indices:
        memory = replay_memory[idx]
        for col, value in zip(cols, memory):
            col.append(value)
    cols = [np.array(col) for col in cols]
    return cols[0], cols[1], cols[2].reshape(-1, 1), cols[3], cols[4].reshape(-1, 1)


eps_min = 0.1
eps_max = 1.0
eps_decay_steps = 2000000


def epsilon_greedy(q_values, step):
    epsilon = max(eps_min, eps_max - (eps_max-eps_min) * step/eps_decay_steps)
    if np.random.rand() < epsilon:
        return np.random.randint(n_outputs) # random action
    else:
        return np.argmax(q_values) # optimal action

n_steps = 4000000  # total number of training steps
training_start = 10000  # start training after 10,000 game iterations
training_interval = 4  # run a training step every 4 game iterations
save_steps = 1000  # save the model every 1,000 training steps
copy_steps = 10000  # copy online DQN to target DQN every 10,000 training steps
discount_rate = 0.99
skip_start = 5  # Skip the start of every game (it's just waiting time).
batch_size = 64
iteration = 0  # game iterations
checkpoint_dir = './saved_networks'
checkpoint_path = "./saved_networks/dqn_breakout.cpkt"
summary_path = "./summary/"
done = True # env needs to be reset

# Summary variables
svar_reward = tf.Variable(tf.zeros([1], dtype=tf.int32)) # Episode reward
svar_mmq = tf.Variable(tf.zeros([1], dtype=tf.float32)) # Episode Mean-Max-Q
svar_loss = tf.Variable(tf.zeros([1], dtype=tf.float64))
all_svars = [svar_reward, svar_mmq, svar_loss]
tf.summary.scalar("Episode Reward", tf.squeeze(svar_reward))
tf.summary.scalar("Episode Mean-Max-Q", tf.squeeze(svar_mmq))
tf.summary.scalar("Episode MSE", tf.squeeze(svar_loss))
# Placeholders
svar_reward_p, svar_mmq_p =  tf.placeholder(tf.int32, [1]), tf.placeholder(tf.float32, [1])
svar_loss_p = tf.placeholder(tf.float64, [1])
svars_placeholders = [svar_reward_p,  svar_mmq_p, svar_loss_p]

# Assign operation
summary_assign_op = [all_svars[i].assign(svars_placeholders[i]) for i in range(len(svars_placeholders))]
writer = tf.summary.FileWriter(summary_path)
summary_op = tf.summary.merge_all()
# For keeping track of no. of episodes played.
episode_step = tf.Variable(tf.zeros([1], dtype=tf.int64), trainable=False)
inc_episode_count = episode_step.assign_add([1])


init = tf.global_variables_initializer()
saver = tf.train.Saver()


loss_val = np.infty
game_length = 0
total_max_q = 0
mean_max_q = 0.0
ep_reward = 0
ep_loss = 0.

with tf.Session() as sess:
    if os.path.isfile(checkpoint_path + ".index"):
        saver.restore(sess, checkpoint_path)
        print("<--------------------- Graph restored! -------------------------->")
    else:
        print("<--------- No checkpoints found! Starting over.. ---------------->")
        init.run()
        copy_online_to_target.run()
    while True:
        step = global_step.eval()
        if step >= n_steps:
            break
        iteration += 1
        print("\rIteration {}\tTraining step {}/{} ({:.1f})%\tLoss {:5f}\tMean Max-Q {:5f}   ".format(
            iteration, step, n_steps, step * 100 / n_steps, loss_val, mean_max_q), end="")
        if done: # game over, start again
            obs = env.reset()
            # Clear observations from the past episode
            preprocessed_observations.clear()
            for skip in range(skip_start): # skip the start of each game
                obs, reward, done, info = env.step(0) # Do nothing
                preprocessed_observations.append(preprocess_observation(obs))
            state = combine_observations_multichannel(preprocessed_observations)
        # Online DQN evaluates what to do
        q_values = online_q_values.eval(feed_dict={X_state: [state]})
        action = epsilon_greedy(q_values, step)

        # Online DQN plays
        obs, reward, done, info = env.step(action)
        ep_reward += reward
        preprocessed_observations.append(preprocess_observation(obs))
        next_state = combine_observations_multichannel(preprocessed_observations)

        # Let's memorize what happened
        replay_memory.append((state, action, reward, next_state, 1.0 - done))
        state = next_state

        # Compute statistics for tracking progress
        total_max_q += q_values.max()
        game_length += 1
        if done:
            mean_max_q = total_max_q / game_length
            # Write summary -- start
            if iteration >= training_start:
                sess.run(summary_assign_op, feed_dict={
                    svar_reward_p: [ep_reward],
                    svar_mmq_p: [mean_max_q],
                    svar_loss_p: [ep_loss],
                })
                summaries_str = sess.run(summary_op)
                writer.add_summary(summaries_str, sess.run(episode_step))
                sess.run(inc_episode_count)
            # Write summary -- end
            total_max_q = 0.0
            game_length = ep_reward = ep_loss = 0

        if iteration < training_start or iteration % training_interval != 0:
            continue # only train after warmup period and at regular intervals

        # Sample memories and use the target DQN to produce the target Q-Value
        X_state_val, X_action_val, rewards, X_next_state_val, continues = (
            sample_memories(batch_size))
        next_q_values = target_q_values.eval(
            feed_dict={X_state: X_next_state_val})
        max_next_q_values = np.max(next_q_values, axis=1, keepdims=True)
        y_val = rewards + continues * discount_rate * max_next_q_values

        # Train the online DQN
        _, loss_val = sess.run([training_op, loss], feed_dict={
            X_state: X_state_val, X_action: X_action_val, y: y_val})
        ep_loss += loss_val
        # Regularly copy the online DQN to the target DQN
        if step % copy_steps == 0:
            copy_online_to_target.run()

        # And save regularly
        if step % save_steps == 0:
            saver.save(sess, checkpoint_path)

Answer

Both the original DQN network and the network you are using are very small for a Tesla P100 GPU. If you want to utilise it more, you can run multiple experiments on the same GPU.
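A minimal sketch of one way to do that (added for illustration; the memory fraction is an assumption, not something specified here) is to cap each process's share of GPU memory so several runs fit on the card side by side:

import tensorflow as tf

# Let several training processes share one GPU by capping per-process memory.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.25)  # roughly 4 GB of the 16 GB P100
# Alternative: tf.GPUOptions(allow_growth=True) to allocate memory only as needed.
config = tf.ConfigProto(gpu_options=gpu_options)

with tf.Session(config=config) as sess:
    ...  # build the graph and run one experiment per process

Each experiment then runs as its own process with its own environment and replay buffer, and nvidia-smi shows the processes sharing the GPU.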
