@@ -83,7 +83,7 @@ def register_layer(
     default_args (list): The default parameters to add to the function.
     default_kwargs (dict): The default parameters to add to the function.
       Those arguments can be overwritten when calling the function.
-    use_dp (bool): Wrap the function call within a dataparalellism object if
+    use_dp (bool): Wrap the function call within a dataparallelism object if
       dp is available. Some layers (like MOE) must be called without dp.
     recompute_grad (bool): If True, recompute the function during the
       backward pass to save memory
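The recompute_grad flag documented here trades compute for memory: the wrapped function's activations are not kept for backpropagation but are recomputed when gradients are needed. A minimal sketch of that idea, assuming TF2's tf.recompute_grad rather than the registry machinery above (the names block, x, and w are illustrative):

import tensorflow as tf

x = tf.random.normal([8, 128])
w = tf.random.normal([128, 128])

@tf.recompute_grad
def block(inputs, weights):
  # Intermediate activations of this block are recomputed during the
  # backward pass instead of being stored, saving memory at the cost of
  # an extra forward pass.
  return tf.nn.relu(tf.matmul(inputs, weights))

with tf.GradientTape() as tape:
  tape.watch([x, w])
  loss = tf.reduce_sum(block(x, w))
dx, dw = tape.gradient(loss, [x, w])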
@@ -1378,7 +1378,7 @@ def _relative_attention_inner(x, y, z, transpose):
     x: Tensor with shape [batch_size, heads, length, length or depth].
     y: Tensor with shape [batch_size, heads, length, depth].
     z: Tensor with shape [length, length, depth].
-    transpose: Whether to tranpose inner matrices of y and z. Should be true if
+    transpose: Whether to transpose inner matrices of y and z. Should be true if
       last dimension of x is depth, not length.

   Returns:
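Following the shapes listed above, here is a hedged einsum sketch of what this inner computation does (the library uses batched matmuls and reshapes rather than einsum, so treat this as an illustration of the semantics only): when transpose is true, x carries queries and the contraction is over depth; otherwise x carries attention weights and the contraction is over length.

import tensorflow as tf

def relative_attention_inner_sketch(x, y, z, transpose):
  # x: [batch, heads, length, length or depth]
  # y: [batch, heads, length, depth]; z: [length, length, depth]
  if transpose:
    # x holds queries (last dim is depth): result is [batch, heads, length, length].
    return (tf.einsum("bhid,bhjd->bhij", x, y) +
            tf.einsum("bhid,ijd->bhij", x, z))
  # x holds attention weights (last dim is length): result's last dim is depth.
  return (tf.einsum("bhij,bhjd->bhid", x, y) +
          tf.einsum("bhij,ijd->bhid", x, z))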
@@ -1422,7 +1422,7 @@ def dot_product_attention_relative(q,
     k: a Tensor with shape [batch, heads, length, depth].
     v: a Tensor with shape [batch, heads, length, depth].
     bias: bias Tensor.
-    max_relative_position: an integer specifying the maxmimum distance between
+    max_relative_position: an integer specifying the maximum distance between
       inputs that unique position embeddings should be learned for.
     dropout_rate: a floating point number.
     image_shapes: optional tuple of integer scalars.
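To make max_relative_position concrete: pairwise distances are clipped to that window, so only 2 * max_relative_position + 1 distinct relative embeddings need to be learned. A small sketch of the index computation under that assumption (relative_position_indices is a hypothetical helper, not the library's function):

import tensorflow as tf

def relative_position_indices(length, max_relative_position):
  # Signed distance j - i for every (query i, key j) pair: [length, length].
  positions = tf.range(length)
  distance = positions[None, :] - positions[:, None]
  # Clip distances outside the window, then shift so indices start at 0 and
  # address an embedding table of size 2 * max_relative_position + 1.
  clipped = tf.clip_by_value(distance, -max_relative_position,
                             max_relative_position)
  return clipped + max_relative_position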
@@ -2141,7 +2141,7 @@ def gather_indices_2d(x, block_shape, block_stride):


 def make_2d_block_raster_mask(query_shape, memory_flange):
-  """creates a mask for 2d block raster scany.
+  """creates a mask for 2d block raster scan.

   The query mask can look to the left, top left, top, and top right, but
   not to the right. Inside the query, we have the standard raster scan
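As a toy illustration of the "standard raster scan" masking mentioned here (an assumption-laden sketch of just that part, not the full 2d block mask, which also has to handle the memory flange): positions flattened in raster order are masked causally, so each position may only attend to positions at or before it in that order.

import numpy as np

query_height, query_width = 2, 4
block_size = query_height * query_width
# 1.0 where attention is allowed in this toy convention (the library may use
# the opposite convention); row i is the query at raster position i.
raster_scan_mask = np.tril(np.ones((block_size, block_size), dtype=np.float32))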
@@ -2661,7 +2661,7 @@ def ffn_self_attention_layer(x,
   We use self-attention to do feedforward computations. We apply this function
   positionwise where for each position, we linearly transform the output to have
   depth filter_depth, and break up the result depth-wise into num_parts
-  contiguous parts. The parts self-attentd, we concatenate the results
+  contiguous parts. The parts self-attend, we concatenate the results
   depth-wise, and we linearly transform to a depth of output_depth. The
   goal is to get multiplicative interactions between components of a
   representation.
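Because the description above is dense, here is a rough self-contained sketch of the idea, assuming TF2/Keras layers rather than the library's own conv1d helpers: project each position to filter_depth, split that depth into num_parts parts, let the parts attend to one another within the position, concatenate them, and project to output_depth.

import tensorflow as tf

def ffn_self_attention_sketch(x, filter_depth, num_parts, output_depth):
  # x: [batch, length, input_depth]
  part_depth = filter_depth // num_parts
  batch, length = tf.shape(x)[0], tf.shape(x)[1]

  # Positionwise linear transforms to depth filter_depth, split into parts.
  q = tf.keras.layers.Dense(filter_depth)(x)
  kv = tf.keras.layers.Dense(filter_depth)(x)
  q = tf.reshape(q, [batch, length, num_parts, part_depth])
  kv = tf.reshape(kv, [batch, length, num_parts, part_depth])

  # Within each position, the num_parts parts attend to one another.
  logits = tf.einsum("blpd,blqd->blpq", q, kv)
  logits /= tf.sqrt(tf.cast(part_depth, logits.dtype))
  weights = tf.nn.softmax(logits, axis=-1)
  attended = tf.einsum("blpq,blqd->blpd", weights, kv)

  # Concatenate the parts depth-wise and project to output_depth.
  attended = tf.reshape(attended, [batch, length, filter_depth])
  return tf.keras.layers.Dense(output_depth)(attended)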
@@ -2764,7 +2764,7 @@ def parameter_attention(x,
         x, total_key_depth, use_bias=False, name="q_transform")
     if dropout_rate:
       # This is a cheaper form of attention dropout where we use to use
-      # the same dropout decisions across batch elemets and query positions,
+      # the same dropout decisions across batch elements and query positions,
       # but different decisions across heads and memory positions.
       v = tf.nn.dropout(
           v, 1.0 - dropout_rate, noise_shape=[num_heads, memory_rows, 1])
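The noise_shape in the quoted call is what implements the comment: one dropout decision is drawn per (head, memory row) and broadcast over the trailing value-depth axis, and since v has no batch or query axis the same decisions apply to every batch element and query position. A tiny illustration using TF2's rate-based signature (the TF1 code above passes keep_prob = 1.0 - dropout_rate):

import tensorflow as tf

num_heads, memory_rows, depth = 4, 6, 8
v = tf.random.normal([num_heads, memory_rows, depth])
# One Bernoulli draw per (head, memory row), broadcast across the depth axis.
v_dropped = tf.nn.dropout(v, rate=0.3, noise_shape=[num_heads, memory_rows, 1])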