From 0ed84d358d555328f966b2d584e17db749685ff9 Mon Sep 17 00:00:00 2001
From: Deven Desai
Date: Wed, 10 Jun 2020 00:22:36 +0000
Subject: [PATCH 1/3] Add a --num_report_steps option to specify reporting
 frequency.

Currently, the following information

* `global_step/sec`
* `examples/sec`

gets displayed (and recorded via the summary writer) after every step.

The `--num_report_steps=N` option allows the user to specify the frequency
(i.e. every N steps) with which this information should be displayed and
recorded.
---
 run_pretraining.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/run_pretraining.py b/run_pretraining.py
index 4e7ac43b1..c7673ebe4 100644
--- a/run_pretraining.py
+++ b/run_pretraining.py
@@ -116,6 +116,9 @@
 flags.DEFINE_string("optimizer_type", "adam",
                     "Optimizer used for training - adam (default), lamb, nadam and nlamb")
 
+flags.DEFINE_integer(
+    "num_report_steps", 10,
+    "How frequently should summary information be reported and recorded.")
 
 def model_fn_builder(bert_config, init_checkpoint, learning_rate,
                      num_train_steps, num_warmup_steps, use_tpu,
@@ -468,7 +471,7 @@ def main(_):
           iterations_per_loop=FLAGS.iterations_per_loop,
           num_shards=FLAGS.num_tpu_cores,
           per_host_input_for_training=is_per_host),
-      log_step_count_steps=25,
+      log_step_count_steps=FLAGS.num_report_steps * FLAGS.iterations_per_loop,
       session_config=config)
 
   model_fn = model_fn_builder(

From 7e192c943e16ae5e91678df205dd9e9c0496fbaf Mon Sep 17 00:00:00 2001
From: Deven Desai
Date: Wed, 10 Jun 2020 03:57:41 +0000
Subject: [PATCH 2/3] Enable printing and recording of throughput + loss on a
 periodic basis

This commit adds the ability to report (i.e. display to stdout) the following
information on a periodic basis:

* step number
* throughput
* total_loss
* mlm_loss
* nsp_loss
* learning_rate

The frequency of the reporting is specified via the `--num_report_steps`
option.

Currently only the `throughput` and `total_loss` values get recorded (to the
trace events file meant for TensorBoard consumption). Note that `throughput`
is the same as `examples/sec` and `total_loss` is the same as `loss`, both of
which are already reported and recorded via the `TPUEstimator` implementation.

The `LogSessionRunHook` class is based on a similar class in the NVBERT
implementation. It can easily be enhanced to report and record other/more
variables.
---
 optimization.py    |  8 +++---
 run_pretraining.py | 69 ++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 71 insertions(+), 6 deletions(-)

diff --git a/optimization.py b/optimization.py
index 2bb80964e..72f486f6f 100644
--- a/optimization.py
+++ b/optimization.py
@@ -153,7 +153,7 @@ def __init__(self,
     """Constructs a AdamWeightDecayOptimizer."""
     super(AdamWeightDecayOptimizer, self).__init__(False, name)
 
-    self.learning_rate = learning_rate
+    self.learning_rate = tf.identity(learning_rate, name='learning_rate')
     self.weight_decay_rate = weight_decay_rate
     self.beta_1 = beta_1
     self.beta_2 = beta_2
@@ -253,7 +253,7 @@ def __init__(self,
     """Constructs a LAMBOptimizer."""
     super(LAMBOptimizer, self).__init__(False, name)
 
-    self.learning_rate = learning_rate
+    self.learning_rate = tf.identity(learning_rate, name='learning_rate')
     self.weight_decay_rate = weight_decay_rate
     self.beta_1 = beta_1
     self.beta_2 = beta_2
@@ -369,7 +369,7 @@ def __init__(self,
     """Constructs a NadamWeightDecayOptimizer."""
     super(NadamWeightDecayOptimizer, self).__init__(False, name)
 
-    self.learning_rate = learning_rate
+    self.learning_rate = tf.identity(learning_rate, name='learning_rate')
     self.weight_decay_rate = weight_decay_rate
     self.beta_1 = beta_1
     self.beta_2 = beta_2
@@ -481,7 +481,7 @@ def __init__(self,
     """Constructs a NlamOptimizer."""
     super(NlambOptimizer, self).__init__(False, name)
 
-    self.learning_rate = learning_rate
+    self.learning_rate = tf.identity(learning_rate, name='learning_rate')
     self.weight_decay_rate = weight_decay_rate
     self.beta_1 = beta_1
     self.beta_2 = beta_2

diff --git a/run_pretraining.py b/run_pretraining.py
index c7673ebe4..5e4cbfc6e 100644
--- a/run_pretraining.py
+++ b/run_pretraining.py
@@ -24,6 +24,10 @@
 import tensorflow as tf
 tf.compat.v1.disable_resource_variables()
 
+import time
+from tensorflow.python.training.summary_io import SummaryWriterCache
+from tensorflow.core.framework.summary_pb2 import Summary
+
 # Add Horovod to run_pretraining
 try:
   import horovod.tensorflow as hvd
@@ -120,6 +124,60 @@
     "num_report_steps", 10,
     "How frequently should summary information be reported and recorded.")
 
+
+class LogSessionRunHook(tf.estimator.SessionRunHook):
+
+  def __init__(self,
+               global_batch_size,
+               num_report_steps=10,
+               output_dir=None):
+    self.global_batch_size = global_batch_size
+    self.num_report_steps = num_report_steps
+    self.output_dir = output_dir
+    self.summary_writer = None
+
+  def begin(self):
+    if self.summary_writer is None and self.output_dir:
+      self.summary_writer = SummaryWriterCache.get(self.output_dir)
+
+  def after_create_session(self, session, coord):
+    self.elapsed_secs = 0.
+    self.count = 0
+
+  def before_run(self, run_context):
+    self.t0 = time.time()
+    global_step = tf.compat.v1.train.get_global_step()
+    fetches = [global_step, 'learning_rate:0', 'total_loss:0', 'mlm_loss:0', 'nsp_loss:0']
+    return tf.estimator.SessionRunArgs(fetches=fetches)
+
+  def _log_and_record(self, global_step, learning_rate, total_loss, mlm_loss, nsp_loss):
+    time_per_step = self.elapsed_secs / self.count
+    throughput = self.global_batch_size / time_per_step
+    log_string = ' '
+    log_string += 'Step = %6i'%(global_step)
+    log_string += ', throughput = %6.1f'%(throughput)
+    log_string += ', total_loss = %6.3f'%(total_loss)
+    log_string += ', mlm_loss = %6.4e'%(mlm_loss)
+    log_string += ', nsp_loss = %6.4e'%(nsp_loss)
+    log_string += ', learning_rate = %6.4e'%(learning_rate)
+    tf.compat.v1.logging.info(log_string)
+
+    if self.summary_writer is not None:
+      throughput_summary = Summary(value=[Summary.Value(tag='throughput', simple_value=throughput)])
+      self.summary_writer.add_summary(throughput_summary, global_step)
+      total_loss_summary = Summary(value=[Summary.Value(tag='total_loss', simple_value=total_loss)])
+      self.summary_writer.add_summary(total_loss_summary, global_step)
+
+  def after_run(self, run_context, run_values):
+    self.elapsed_secs += time.time() - self.t0
+    self.count += 1
+    global_step, learning_rate, total_loss, mlm_loss, nsp_loss = run_values.results[0:5]
+    if (global_step % self.num_report_steps) == 0:
+      self._log_and_record(global_step, learning_rate, total_loss, mlm_loss, nsp_loss)
+      self.elapsed_secs = 0.
+      self.count = 0
+
+
 def model_fn_builder(bert_config, init_checkpoint, learning_rate,
                      num_train_steps, num_warmup_steps, use_tpu,
                      use_one_hot_embeddings, use_hvd):
@@ -161,6 +219,10 @@ def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
 
     total_loss = masked_lm_loss + next_sentence_loss
 
+    masked_lm_loss = tf.identity(masked_lm_loss, name='mlm_loss')
+    next_sentence_loss = tf.identity(next_sentence_loss, name='nsp_loss')
+    total_loss = tf.identity(total_loss, name='total_loss')
+
     tvars = tf.compat.v1.trainable_variables()
 
     initialized_variable_names = {}
@@ -502,10 +564,13 @@ def main(_):
         max_predictions_per_seq=FLAGS.max_predictions_per_seq,
         is_training=True)
 
-    hooks = None
+    hooks = []
+    if (not use_hvd) or (hvd.rank() == 0):
+      global_batch_size = FLAGS.train_batch_size if not use_hvd else FLAGS.train_batch_size * hvd.size()
+      hooks.append(LogSessionRunHook(global_batch_size, FLAGS.num_report_steps, FLAGS.output_dir))
     if use_hvd:
       # [HVD] Ensure all GPU's start with the same weights.
-      hooks = [hvd.BroadcastGlobalVariablesHook(0)]
+      hooks.append(hvd.BroadcastGlobalVariablesHook(0))
     estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps, hooks=hooks)
 
   if FLAGS.do_eval:

From a122adda8583772369d159e78d869884ae0626c4 Mon Sep 17 00:00:00 2001
From: Deven Desai
Date: Wed, 10 Jun 2020 12:46:20 +0000
Subject: [PATCH 3/3] Disable the log messages from being printed twice.

Currently all the messages output via `tf.compat.v1.logging.info` get printed
twice. For example

```
INFO:tensorflow:**** Trainable Variables ****
I0610 12:40:23.553335 139787876316928 run_pretraining.py:256] **** Trainable Variables ****
```

Setting the `propagate` flag in the logger to `False` prevents this. For the
above example, only one line will be printed

```
INFO:tensorflow:**** Trainable Variables ****
```

This makes the output log file more readable.
---
 run_pretraining.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/run_pretraining.py b/run_pretraining.py
index 5e4cbfc6e..46086c281 100644
--- a/run_pretraining.py
+++ b/run_pretraining.py
@@ -481,7 +481,10 @@ def _decode_record(record, name_to_features):
 
 def main(_):
   tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
-
+
+  # disable the log messages from being printed twice
+  tf.compat.v1.get_logger().propagate = False
+
   use_hvd = False
   if FLAGS.use_horovod and hvd != None:
     use_hvd = True
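
Reviewer note (not part of the patches above): for anyone who wants to try the
reporting mechanism outside of `run_pretraining.py`, the following is a minimal
sketch of the same `tf.estimator.SessionRunHook` pattern that `LogSessionRunHook`
uses, reduced to throughput logging only. The class name `ThroughputHook` and its
constructor arguments are illustrative assumptions, not code from this series; it
assumes the TF1-style `tf.compat.v1` / `tf.estimator` APIs already used in this
repository.

```
# Illustrative sketch only; not part of the patch series.
import time

import tensorflow as tf


class ThroughputHook(tf.estimator.SessionRunHook):
  """Logs examples/sec every `every_n_steps` training steps."""

  def __init__(self, batch_size, every_n_steps=10):
    self.batch_size = batch_size
    self.every_n_steps = every_n_steps
    self.elapsed_secs = 0.
    self.count = 0

  def before_run(self, run_context):
    # Start the per-step timer and fetch the global step tensor.
    self.t0 = time.time()
    return tf.estimator.SessionRunArgs(
        fetches=[tf.compat.v1.train.get_global_step()])

  def after_run(self, run_context, run_values):
    # Accumulate elapsed time and report once every `every_n_steps` steps.
    self.elapsed_secs += time.time() - self.t0
    self.count += 1
    global_step = run_values.results[0]
    if global_step % self.every_n_steps == 0:
      throughput = self.batch_size * self.count / self.elapsed_secs
      tf.compat.v1.logging.info(
          'Step = %d, throughput = %.1f examples/sec', global_step, throughput)
      self.elapsed_secs = 0.
      self.count = 0
```

A hook built this way is passed to `estimator.train(..., hooks=[...])`, which is
exactly where the second patch appends `LogSessionRunHook` (and, under Horovod,
`BroadcastGlobalVariablesHook`).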