From 1723a0789efc0650f1e41429571a715d320d79f0 Mon Sep 17 00:00:00 2001
From: XC-Xinze
Date: Wed, 19 Mar 2025 12:30:41 -0400
Subject: [PATCH 1/3] Move timing.

---
 gpt2_model.py | 23 -----------------------
 train_gpt2.py | 10 ++++++++++
 2 files changed, 10 insertions(+), 23 deletions(-)

diff --git a/gpt2_model.py b/gpt2_model.py
index b8e5060..3bbee30 100644
--- a/gpt2_model.py
+++ b/gpt2_model.py
@@ -8,9 +8,6 @@
 from layers.layer_norm import LayerNormalization
 from utils.tf_utils import *
 
-from scripts.utils import write_csv
-import timeit
-
 _ROOT = os.path.abspath(os.path.dirname(__file__))
 LOG_DIR = _ROOT + "/log"
 
@@ -21,8 +18,6 @@
 
 
 class Gpt2(tf.keras.Model):
-    start_time = timeit.default_timer()
-    skipped_time = 0
     def __init__(self,
                  num_layers,
                  d_model,
@@ -154,13 +149,9 @@ def create_checkpoint_manager(self, checkpoint_path, max_to_keep=5, load_model=T
 
         if load_model:  # If want to load trained weights
            ckpt.restore(self.ckpt_manager.latest_checkpoint)
-            print_time = timeit.default_timer()
            print('Latest checkpoint restored...............')
-            Gpt2.skipped_time += timeit.default_timer() - print_time
        else:
-            print_time = timeit.default_timer()
            print("Initializing model from scratch..........")
-            Gpt2.skipped_time += timeit.default_timer() - print_time
 
     def load_model(self, filepath):
         ckpt = tf.train.Checkpoint(model=self)
@@ -264,15 +255,11 @@ def distributed_test_step(self, inputs, targets):
 
     def get_train_test_function(self, graph_mode=False):
         if graph_mode:
-            print_time = timeit.default_timer()
             print("Running in graph mode.............")
-            Gpt2.skipped_time += timeit.default_timer() - print_time
             train_fuc = self.train_step
             test_fuc = self.test_step
         else:
-            print_time = timeit.default_timer()
             print("Running in eager mode.............")
-            Gpt2.skipped_time += timeit.default_timer() - print_time
             train_fuc = self._train_step
             test_fuc = self._test_step
         return train_fuc, test_fuc
@@ -349,16 +336,8 @@ def fit(self, train_dataset, graph_mode):
                                      result_type="Test")
 
                     ckpt_save_path = self.ckpt_manager.save()
-                    print_time = timeit.default_timer()
                     print('Saving checkpoint for step {} at {}'.format(step.numpy(), ckpt_save_path))
-                    Gpt2.skipped_time += timeit.default_timer() - print_time
-
-                    time = timeit.default_timer() - Gpt2.start_time - Gpt2.skipped_time
-                    avg_loss = float(total_loss) / float(loss_count)
-                    avg_accuracy = float(total_accuracy)/ float(accuracy_count)
-
-                    write_csv(__file__, count, float(avg_accuracy), float(avg_loss), time)
 
         else:
             with self.mirrored_strategy.scope():
                 train_dataset, test_dataset = train_dataset
@@ -406,13 +385,11 @@ def fit(self, train_dataset, graph_mode):
 
     @staticmethod
     def log_summary(tf_writer, step, loss, perplexity, result_type="Train"):
-        print_time = timeit.default_timer()
         print(result_type + ':- Step {}, Loss {:.4f}, Perplexity {:.4f}'.format(
             step, loss, perplexity))
         with tf_writer.as_default():
             tf.summary.scalar("loss", loss, step=step)
             tf.summary.scalar("perplexity", perplexity, step=step)
-        Gpt2.skipped_time += timeit.default_timer() - print_time
 
 
 class OutputLayer(tf.keras.layers.Layer):
diff --git a/train_gpt2.py b/train_gpt2.py
index 06443c9..b7c4d10 100644
--- a/train_gpt2.py
+++ b/train_gpt2.py
@@ -6,6 +6,9 @@
 from data_pipeline import input_fn
 from gpt2_model import *
 
+from scripts.utils import write_csv
+import timeit
+
 _ROOT = os.path.abspath(os.path.dirname(__file__))
 LOG_DIR = _ROOT + "/log"
 MODEL_DIR = _ROOT + "/model"
@@ -44,6 +47,9 @@ def train(num_layers, embedding_size, num_heads, dff, max_seq_len, vocab_size,
     train_tf_records = tf_records[:train_percent]
     test_tf_records = tf_records[train_percent:]
 
+    start_time = timeit.default_timer()
+    skipped_time = 0
+
     train_dataset = input_fn(train_tf_records, batch_size=batch_size)
     test_dataset = input_fn(test_tf_records, batch_size=batch_size)
 
@@ -70,6 +76,10 @@ def train(num_layers, embedding_size, num_heads, dff, max_seq_len, vocab_size,
 
         model.create_summary_writer(LOG_DIR)
 
     model.fit([train_dataset, test_dataset], graph_mode)
+
+    time = timeit.default_timer() - start_time - skipped_time
+    write_csv(__file__, time=time)
+
     print("Training Done................")
 
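Patch 1 moves the timing bookkeeping out of gpt2_model.py and into train() in train_gpt2.py, so the clock starts when a training run starts rather than when the Gpt2 class is first imported, and the moved write_csv call records only elapsed time (write_csv(__file__, time=time)) since the loss and accuracy tallies are no longer in scope. The pattern itself is unchanged: take a start reading from timeit.default_timer(), accumulate the cost of each print into skipped_time, and report total minus skipped. Below is a minimal runnable sketch of that pattern; train_and_log() is a hypothetical stand-in for the real training loop, and the repo's write_csv helper is replaced by a print.

import timeit


def train_and_log():
    # Hypothetical stand-in for the training loop in train_gpt2.py.
    start_time = timeit.default_timer()
    skipped_time = 0.0

    for step in range(3):
        # ... training work would happen here ...

        # Bracket each print so logging I/O is excluded from the total.
        print_time = timeit.default_timer()
        print("step", step)
        skipped_time += timeit.default_timer() - print_time

    # Net training time: total wall clock minus time spent logging.
    return timeit.default_timer() - start_time - skipped_time


print("training seconds (logging excluded):", train_and_log())

On Python 3, timeit.default_timer() is time.perf_counter(), a monotonic high-resolution clock, which makes it a safer choice for interval measurement than time.time().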
From c9fbe636d9b7b0f0c7455568d8278aaacdd2b070 Mon Sep 17 00:00:00 2001
From: XC-Xinze
Date: Fri, 11 Apr 2025 22:37:32 -0400
Subject: [PATCH 2/3] time test for py_func --parallel

---
 data_pipeline.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/data_pipeline.py b/data_pipeline.py
index 388f459..fd08c53 100644
--- a/data_pipeline.py
+++ b/data_pipeline.py
@@ -58,7 +58,8 @@ def input_fn(tf_records,
 
     dataset = tf.data.TFRecordDataset(tf_records, buffer_size=10000)
     dataset = dataset.shuffle(buffer_size=buffer_size)
-    dataset = dataset.map(parse_example,num_parallel_calls=tf.data.experimental.AUTOTUNE)
+    dataset = dataset.map(lambda x: tf.py_function(
+        func=parse_example, inp=[x], Tout=(tf.int32, tf.int32)), num_parallel_calls=tf.data.AUTOTUNE)
     dataset = dataset.padded_batch(batch_size, padded_shapes=padded_shapes)
     dataset = dataset.repeat(epoch)
     dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
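Patch 2 swaps the traced parse_example map for a tf.py_function wrapper, presumably to time the py_func path against the graph-native one ("--parallel" in the subject most likely refers to the map's num_parallel_calls). Two properties of the wrapper matter when reading such timings: the wrapped function executes as eager Python under the interpreter lock, so num_parallel_calls=tf.data.AUTOTUNE cannot fan it out the way it can a traced function, and its outputs lose their static shapes, which the explicit padded_shapes in the following padded_batch compensates for. The hunk also mixes tf.data.AUTOTUNE with the older alias tf.data.experimental.AUTOTUNE on the prefetch line; both resolve to the same constant in TF 2.x. A self-contained toy version of the wrapped map, with parse_py standing in for the repo's parse_example and Dataset.range standing in for the TFRecord source:

import tensorflow as tf


def parse_py(x):
    # Toy "parser": inputs are the value, targets are the value plus one.
    # Hypothetical stand-in for parse_example, which decodes a TFRecord.
    return tf.cast(x, tf.int32), tf.cast(x + 1, tf.int32)


dataset = tf.data.Dataset.range(8)  # replaces the TFRecordDataset source
dataset = dataset.map(
    lambda x: tf.py_function(func=parse_py, inp=[x], Tout=(tf.int32, tf.int32)),
    num_parallel_calls=tf.data.AUTOTUNE)

for inputs, targets in dataset.take(2):
    print(inputs.numpy(), targets.numpy())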
From 20a14de8ec406835d99a0fae5bc0d743ab9fea37 Mon Sep 17 00:00:00 2001
From: XC-Xinze
Date: Fri, 11 Apr 2025 23:02:32 -0400
Subject: [PATCH 3/3] sync file

---
 gpt2_model.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/gpt2_model.py b/gpt2_model.py
index 3bbee30..b8e5060 100644
--- a/gpt2_model.py
+++ b/gpt2_model.py
@@ -8,6 +8,9 @@
 from layers.layer_norm import LayerNormalization
 from utils.tf_utils import *
 
+from scripts.utils import write_csv
+import timeit
+
 _ROOT = os.path.abspath(os.path.dirname(__file__))
 LOG_DIR = _ROOT + "/log"
 
@@ -18,6 +21,8 @@
 
 
 class Gpt2(tf.keras.Model):
+    start_time = timeit.default_timer()
+    skipped_time = 0
     def __init__(self,
                  num_layers,
                  d_model,
@@ -149,9 +154,13 @@ def create_checkpoint_manager(self, checkpoint_path, max_to_keep=5, load_model=T
 
         if load_model:  # If want to load trained weights
            ckpt.restore(self.ckpt_manager.latest_checkpoint)
+            print_time = timeit.default_timer()
            print('Latest checkpoint restored...............')
+            Gpt2.skipped_time += timeit.default_timer() - print_time
        else:
+            print_time = timeit.default_timer()
            print("Initializing model from scratch..........")
+            Gpt2.skipped_time += timeit.default_timer() - print_time
 
     def load_model(self, filepath):
         ckpt = tf.train.Checkpoint(model=self)
@@ -255,11 +264,15 @@ def distributed_test_step(self, inputs, targets):
 
     def get_train_test_function(self, graph_mode=False):
         if graph_mode:
+            print_time = timeit.default_timer()
             print("Running in graph mode.............")
+            Gpt2.skipped_time += timeit.default_timer() - print_time
             train_fuc = self.train_step
             test_fuc = self.test_step
         else:
+            print_time = timeit.default_timer()
             print("Running in eager mode.............")
+            Gpt2.skipped_time += timeit.default_timer() - print_time
             train_fuc = self._train_step
             test_fuc = self._test_step
         return train_fuc, test_fuc
@@ -336,8 +349,16 @@ def fit(self, train_dataset, graph_mode):
                                      result_type="Test")
 
                     ckpt_save_path = self.ckpt_manager.save()
+                    print_time = timeit.default_timer()
                     print('Saving checkpoint for step {} at {}'.format(step.numpy(), ckpt_save_path))
+                    Gpt2.skipped_time += timeit.default_timer() - print_time
+
+                    time = timeit.default_timer() - Gpt2.start_time - Gpt2.skipped_time
+                    avg_loss = float(total_loss) / float(loss_count)
+                    avg_accuracy = float(total_accuracy)/ float(accuracy_count)
+
+                    write_csv(__file__, count, float(avg_accuracy), float(avg_loss), time)
 
         else:
             with self.mirrored_strategy.scope():
                 train_dataset, test_dataset = train_dataset
@@ -385,11 +406,13 @@ def fit(self, train_dataset, graph_mode):
 
     @staticmethod
     def log_summary(tf_writer, step, loss, perplexity, result_type="Train"):
+        print_time = timeit.default_timer()
         print(result_type + ':- Step {}, Loss {:.4f}, Perplexity {:.4f}'.format(
             step, loss, perplexity))
         with tf_writer.as_default():
             tf.summary.scalar("loss", loss, step=step)
             tf.summary.scalar("perplexity", perplexity, step=step)
+        Gpt2.skipped_time += timeit.default_timer() - print_time
 
 
 class OutputLayer(tf.keras.layers.Layer):
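The blob hashes in patch 3 (3bbee30..b8e5060) run exactly opposite to patch 1's (b8e5060..3bbee30), so "sync file" restores gpt2_model.py to its pre-series state; after the full series the class-level timer and the trainer-level timer from patch 1 coexist, each writing its own write_csv row. The restored class attributes keep the quirk that patch 1 appeared to be working around: start_time is evaluated once, when the class body executes at import time, so everything between import and the measurement is silently counted as training time unless it is explicitly added to skipped_time. A toy illustration of that quirk; Timed is a hypothetical stand-in for Gpt2:

import time
import timeit


class Timed:
    # Evaluated once, when the class body runs at import time.
    start_time = timeit.default_timer()
    skipped_time = 0.0


time.sleep(0.2)  # setup that has nothing to do with training

elapsed = timeit.default_timer() - Timed.start_time - Timed.skipped_time
print("elapsed includes the 0.2 s of setup: {:.2f}".format(elapsed))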