From 0ed84d358d555328f966b2d584e17db749685ff9 Mon Sep 17 00:00:00 2001
From: Deven Desai
Date: Wed, 10 Jun 2020 00:22:36 +0000
Subject: [PATCH 1/3] Add a --num_report_steps option to specify reporting
 frequency.

Currently, the following information

* `global_step/sec`
* `examples/sec`

gets displayed (and recorded via the summary writer) after every step.

The `--num_report_steps=N` option allows the user to specify the frequency
(i.e. every N steps) with which this information should be displayed and
recorded.
---
 run_pretraining.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/run_pretraining.py b/run_pretraining.py
index 4e7ac43b1..c7673ebe4 100644
--- a/run_pretraining.py
+++ b/run_pretraining.py
@@ -116,6 +116,9 @@
 flags.DEFINE_string("optimizer_type", "adam",
                     "Optimizer used for training - adam (default), lamb, nadam and nlamb")
 
+flags.DEFINE_integer(
+    "num_report_steps", 10,
+    "How frequently should summary information be reported and recorded.")
 
 def model_fn_builder(bert_config, init_checkpoint, learning_rate,
                      num_train_steps, num_warmup_steps, use_tpu,
@@ -468,7 +471,7 @@ def main(_):
           iterations_per_loop=FLAGS.iterations_per_loop,
           num_shards=FLAGS.num_tpu_cores,
           per_host_input_for_training=is_per_host),
-      log_step_count_steps=25,
+      log_step_count_steps=FLAGS.num_report_steps * FLAGS.iterations_per_loop,
       session_config=config)
 
   model_fn = model_fn_builder(

From 7e192c943e16ae5e91678df205dd9e9c0496fbaf Mon Sep 17 00:00:00 2001
From: Deven Desai
Date: Wed, 10 Jun 2020 03:57:41 +0000
Subject: [PATCH 2/3] Enable printing and recording of throughput + loss on a
 periodic basis

This commit adds the ability to report (i.e. display to stdout) the following
information on a periodic basis:

* step number
* throughput
* total_loss
* mlm_loss
* nsp_loss
* learning_rate

The frequency of the reporting is specified via the `--num_report_steps`
option.

Currently only the `throughput` and `total_loss` values get recorded (to the
trace events file meant for TensorBoard consumption). Note that `throughput`
is the same as `examples/sec` and `total_loss` is the same as `loss`, both of
which are already reported and recorded via the `TPUEstimator` implementation.

The `LogSessionRunHook` class is based on a similar class in the NVBERT
implementation. It can easily be enhanced to report and record other/more
variables.
---
 optimization.py    |  8 +++---
 run_pretraining.py | 69 ++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 71 insertions(+), 6 deletions(-)

diff --git a/optimization.py b/optimization.py
index 2bb80964e..72f486f6f 100644
--- a/optimization.py
+++ b/optimization.py
@@ -153,7 +153,7 @@ def __init__(self,
     """Constructs a AdamWeightDecayOptimizer."""
     super(AdamWeightDecayOptimizer, self).__init__(False, name)
 
-    self.learning_rate = learning_rate
+    self.learning_rate = tf.identity(learning_rate, name='learning_rate')
     self.weight_decay_rate = weight_decay_rate
     self.beta_1 = beta_1
     self.beta_2 = beta_2
@@ -253,7 +253,7 @@ def __init__(self,
     """Constructs a LAMBOptimizer."""
     super(LAMBOptimizer, self).__init__(False, name)
 
-    self.learning_rate = learning_rate
+    self.learning_rate = tf.identity(learning_rate, name='learning_rate')
     self.weight_decay_rate = weight_decay_rate
     self.beta_1 = beta_1
     self.beta_2 = beta_2
@@ -369,7 +369,7 @@ def __init__(self,
     """Constructs a NadamWeightDecayOptimizer."""
     super(NadamWeightDecayOptimizer, self).__init__(False, name)
 
-    self.learning_rate = learning_rate
+    self.learning_rate = tf.identity(learning_rate, name='learning_rate')
     self.weight_decay_rate = weight_decay_rate
     self.beta_1 = beta_1
     self.beta_2 = beta_2
@@ -481,7 +481,7 @@ def __init__(self,
     """Constructs a NlamOptimizer."""
     super(NlambOptimizer, self).__init__(False, name)
 
-    self.learning_rate = learning_rate
+    self.learning_rate = tf.identity(learning_rate, name='learning_rate')
     self.weight_decay_rate = weight_decay_rate
     self.beta_1 = beta_1
     self.beta_2 = beta_2

diff --git a/run_pretraining.py b/run_pretraining.py
index c7673ebe4..5e4cbfc6e 100644
--- a/run_pretraining.py
+++ b/run_pretraining.py
@@ -24,6 +24,10 @@
 import tensorflow as tf
 tf.compat.v1.disable_resource_variables()
 
+import time
+from tensorflow.python.training.summary_io import SummaryWriterCache
+from tensorflow.core.framework.summary_pb2 import Summary
+
 # Add Horovod to run_pretraining
 try:
   import horovod.tensorflow as hvd
@@ -120,6 +124,60 @@
     "num_report_steps", 10,
     "How frequently should summary information be reported and recorded.")
 
+
+class LogSessionRunHook(tf.estimator.SessionRunHook):
+
+  def __init__(self,
+               global_batch_size,
+               num_report_steps=10,
+               output_dir=None):
+    self.global_batch_size = global_batch_size
+    self.num_report_steps = num_report_steps
+    self.output_dir = output_dir
+    self.summary_writer = None
+
+  def begin(self):
+    if self.summary_writer is None and self.output_dir:
+      self.summary_writer = SummaryWriterCache.get(self.output_dir)
+
+  def after_create_session(self, session, coord):
+    self.elapsed_secs = 0.
+    self.count = 0
+
+  def before_run(self, run_context):
+    self.t0 = time.time()
+    global_step = tf.compat.v1.train.get_global_step()
+    fetches = [global_step, 'learning_rate:0', 'total_loss:0', 'mlm_loss:0', 'nsp_loss:0']
+    return tf.estimator.SessionRunArgs(fetches=fetches)
+
+  def _log_and_record(self, global_step, learning_rate, total_loss, mlm_loss, nsp_loss):
+    time_per_step = self.elapsed_secs / self.count
+    throughput = self.global_batch_size / time_per_step
+    log_string = ' '
+    log_string += 'Step = %6i'%(global_step)
+    log_string += ', throughput = %6.1f'%(throughput)
+    log_string += ', total_loss = %6.3f'%(total_loss)
+    log_string += ', mlm_loss = %6.4e'%(mlm_loss)
+    log_string += ', nsp_loss = %6.4e'%(nsp_loss)
+    log_string += ', learning_rate = %6.4e'%(learning_rate)
+    tf.compat.v1.logging.info(log_string)
+
+    if self.summary_writer is not None:
+      throughput_summary = Summary(value=[Summary.Value(tag='throughput', simple_value=throughput)])
+      self.summary_writer.add_summary(throughput_summary, global_step)
+      total_loss_summary = Summary(value=[Summary.Value(tag='total_loss', simple_value=total_loss)])
+      self.summary_writer.add_summary(total_loss_summary, global_step)
+
+  def after_run(self, run_context, run_values):
+    self.elapsed_secs += time.time() - self.t0
+    self.count += 1
+    global_step, learning_rate, total_loss, mlm_loss, nsp_loss = run_values.results[0:5]
+    if (global_step % self.num_report_steps) == 0:
+      self._log_and_record(global_step, learning_rate, total_loss, mlm_loss, nsp_loss)
+      self.elapsed_secs = 0.
+      self.count = 0
+
+
 def model_fn_builder(bert_config, init_checkpoint, learning_rate,
                      num_train_steps, num_warmup_steps, use_tpu,
                      use_one_hot_embeddings, use_hvd):
@@ -161,6 +219,10 @@ def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
 
     total_loss = masked_lm_loss + next_sentence_loss
 
+    masked_lm_loss = tf.identity(masked_lm_loss, name='mlm_loss')
+    next_sentence_loss = tf.identity(next_sentence_loss, name='nsp_loss')
+    total_loss = tf.identity(total_loss, name='total_loss')
+
     tvars = tf.compat.v1.trainable_variables()
 
     initialized_variable_names = {}
@@ -502,10 +564,13 @@ def main(_):
         max_predictions_per_seq=FLAGS.max_predictions_per_seq,
         is_training=True)
 
-    hooks = None
+    hooks = []
+    if (not use_hvd) or (hvd.rank() == 0):
+      global_batch_size = FLAGS.train_batch_size if not use_hvd else FLAGS.train_batch_size * hvd.size()
+      hooks.append(LogSessionRunHook(global_batch_size, FLAGS.num_report_steps, FLAGS.output_dir))
     if use_hvd:
       # [HVD] Ensure all GPU's start with the same weights.
-      hooks = [hvd.BroadcastGlobalVariablesHook(0)]
+      hooks.append(hvd.BroadcastGlobalVariablesHook(0))
     estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps, hooks=hooks)
 
   if FLAGS.do_eval:

From a122adda8583772369d159e78d869884ae0626c4 Mon Sep 17 00:00:00 2001
From: Deven Desai
Date: Wed, 10 Jun 2020 12:46:20 +0000
Subject: [PATCH 3/3] Disable the log messages from being printed twice.

Currently all the messages output via `tf.compat.v1.logging.info` get printed
twice. For example

```
INFO:tensorflow:**** Trainable Variables ****
I0610 12:40:23.553335 139787876316928 run_pretraining.py:256] **** Trainable Variables ****
```

Setting the `propagate` flag in the logger to `False` prevents this. For the
above example, only one line will be printed

```
INFO:tensorflow:**** Trainable Variables ****
```

This makes the output log file more readable.
---
 run_pretraining.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/run_pretraining.py b/run_pretraining.py
index 5e4cbfc6e..46086c281 100644
--- a/run_pretraining.py
+++ b/run_pretraining.py
@@ -481,7 +481,10 @@ def _decode_record(record, name_to_features):
 
 def main(_):
   tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
-
+
+  # disable the log messages from being printed twice
+  tf.compat.v1.get_logger().propagate = False
+
   use_hvd = False
   if FLAGS.use_horovod and hvd != None:
     use_hvd = True
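
Reviewer note (not part of the patches above): for anyone who wants to try the
reporting mechanism outside of `run_pretraining.py`, the following is a minimal
sketch of the same `tf.estimator.SessionRunHook` pattern that `LogSessionRunHook`
uses, reduced to throughput logging only. The class name `ThroughputHook` and its
constructor arguments are illustrative assumptions, not code from this series; it
assumes the TF1-style `tf.compat.v1` / `tf.estimator` APIs already used in this
repository.

```
# Illustrative sketch only; not part of the patch series.
import time

import tensorflow as tf


class ThroughputHook(tf.estimator.SessionRunHook):
  """Logs examples/sec every `every_n_steps` training steps."""

  def __init__(self, batch_size, every_n_steps=10):
    self.batch_size = batch_size
    self.every_n_steps = every_n_steps
    self.elapsed_secs = 0.
    self.count = 0

  def before_run(self, run_context):
    # Start the per-step timer and fetch the global step tensor.
    self.t0 = time.time()
    return tf.estimator.SessionRunArgs(
        fetches=[tf.compat.v1.train.get_global_step()])

  def after_run(self, run_context, run_values):
    # Accumulate elapsed time and report once every `every_n_steps` steps.
    self.elapsed_secs += time.time() - self.t0
    self.count += 1
    global_step = run_values.results[0]
    if global_step % self.every_n_steps == 0:
      throughput = self.batch_size * self.count / self.elapsed_secs
      tf.compat.v1.logging.info(
          'Step = %d, throughput = %.1f examples/sec', global_step, throughput)
      self.elapsed_secs = 0.
      self.count = 0
```

A hook built this way is passed to `estimator.train(..., hooks=[...])`, which is
exactly where the second patch appends `LogSessionRunHook` (and, under Horovod,
`BroadcastGlobalVariablesHook`).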