From c326265c586f0b4a781b7d9516e6da5b7d5c0286 Mon Sep 17 00:00:00 2001
From: Gideon Wenniger <gideonwenniger@domain.com>
Date: Tue, 9 Oct 2018 20:07:32 +0100
Subject: [PATCH 1/3] Changed the CTC decoder to accept an additional argument
 that specifies the space (word-separator) symbol. The default space symbol is
 " ". This was previously hard-coded in the decoder, causing problems when
 using it for handwriting recognition, where the space symbol may be
 different, for example "|".

	modified:   ctcdecode/__init__.py
	modified:   ctcdecode/src/binding.cpp
	modified:   ctcdecode/src/binding.h
	modified:   ctcdecode/src/ctc_beam_search_decoder.cpp
	modified:   ctcdecode/src/ctc_beam_search_decoder.h
	modified:   ctcdecode/src/decoder_utils.cpp
	modified:   ctcdecode/src/scorer.cpp
	modified:   ctcdecode/src/scorer.h
---
 ctcdecode/__init__.py                     |  9 ++++----
 ctcdecode/src/binding.cpp                 | 18 +++++++++++-----
 ctcdecode/src/binding.h                   |  5 ++++-
 ctcdecode/src/ctc_beam_search_decoder.cpp |  9 ++++++--
 ctcdecode/src/ctc_beam_search_decoder.h   |  5 +++++
 ctcdecode/src/decoder_utils.cpp           |  5 +++--
 ctcdecode/src/scorer.cpp                  | 26 +++++++++++++++--------
 ctcdecode/src/scorer.h                    | 10 +++++----
 8 files changed, 60 insertions(+), 27 deletions(-)

diff --git a/ctcdecode/__init__.py b/ctcdecode/__init__.py
index 72394bea..14b33280 100644
--- a/ctcdecode/__init__.py
+++ b/ctcdecode/__init__.py
@@ -4,7 +4,7 @@
 
 class CTCBeamDecoder(object):
     def __init__(self, labels, model_path=None, alpha=0, beta=0, cutoff_top_n=40, cutoff_prob=1.0, beam_width=100,
-                 num_processes=4, blank_id=0):
+                 num_processes=4, blank_id=0, space_symbol=" "):
         self.cutoff_top_n = cutoff_top_n
         self._beam_width = beam_width
         self._scorer = None
@@ -12,9 +12,10 @@ def __init__(self, labels, model_path=None, alpha=0, beta=0, cutoff_top_n=40, cu
         self._labels = ''.join(labels).encode()
         self._num_labels = len(labels)
         self._blank_id = blank_id
+	self._space_symbol=space_symbol
         if model_path:
             self._scorer = ctc_decode.paddle_get_scorer(alpha, beta, model_path.encode(), self._labels,
-                                                        self._num_labels)
+                                                        self._num_labels,space_symbol)
         self._cutoff_prob = cutoff_prob
 
     def decode(self, probs, seq_lens=None):
@@ -31,11 +32,11 @@ def decode(self, probs, seq_lens=None):
         out_seq_len = torch.IntTensor(batch_size, self._beam_width).cpu().int()
         if self._scorer:
             ctc_decode.paddle_beam_decode_lm(probs, seq_lens, self._labels, self._num_labels, self._beam_width,
-                                             self._num_processes, self._cutoff_prob, self.cutoff_top_n, self._blank_id,
+                                             self._num_processes, self._cutoff_prob, self.cutoff_top_n, self._blank_id,self._space_symbol,	
                                              self._scorer, output, timesteps, scores, out_seq_len)
         else:
             ctc_decode.paddle_beam_decode(probs, seq_lens, self._labels, self._num_labels, self._beam_width, self._num_processes,
-                                          self._cutoff_prob, self.cutoff_top_n, self._blank_id, output, timesteps,
+                                          self._cutoff_prob, self.cutoff_top_n, self._blank_id, self._space_symbol, output, timesteps,
                                           scores, out_seq_len)
 
         return output, scores, timesteps, out_seq_len
diff --git a/ctcdecode/src/binding.cpp b/ctcdecode/src/binding.cpp
index fe7db3b9..4d155303 100644
--- a/ctcdecode/src/binding.cpp
+++ b/ctcdecode/src/binding.cpp
@@ -31,6 +31,7 @@ int beam_decode(THFloatTensor *th_probs,
                 double cutoff_prob,
                 size_t cutoff_top_n,
                 size_t blank_id,
+		const std::string &space_symbol,	
                 void *scorer,
                 THIntTensor *th_output,
                 THIntTensor *th_timesteps,
@@ -62,7 +63,7 @@ int beam_decode(THFloatTensor *th_probs,
     }
 
     std::vector<std::vector<std::pair<double, Output>>> batch_results =
-    ctc_beam_search_decoder_batch(inputs, new_vocab, beam_size, num_processes, cutoff_prob, cutoff_top_n, blank_id, ext_scorer);
+    ctc_beam_search_decoder_batch(inputs, new_vocab, beam_size, num_processes, cutoff_prob, cutoff_top_n, blank_id, space_symbol, ext_scorer);
 
     for (int b = 0; b < batch_results.size(); ++b){
         std::vector<std::pair<double, Output>> results = batch_results[b];
@@ -95,13 +96,15 @@ extern "C"
                                double cutoff_prob,
                                size_t cutoff_top_n,
                                size_t blank_id,
+			       const char* space_symbol,
                                THIntTensor *th_output,
                                THIntTensor *th_timesteps,
                                THFloatTensor *th_scores,
                                THIntTensor *th_out_length){
 
+	    std::string space_symbol_string(space_symbol);	
             return beam_decode(th_probs, th_seq_lens, labels, vocab_size, beam_size, num_processes,
-                        cutoff_prob, cutoff_top_n, blank_id,NULL, th_output, th_timesteps, th_scores, th_out_length);
+                        cutoff_prob, cutoff_top_n, blank_id, space_symbol_string, NULL, th_output, th_timesteps, th_scores, th_out_length);
         }
 
         int paddle_beam_decode_lm(THFloatTensor *th_probs,
@@ -113,14 +116,16 @@ extern "C"
                                   double cutoff_prob,
                                   size_t cutoff_top_n,
                                   size_t blank_id,
+				  const char* space_symbol,
                                   void *scorer,
                                   THIntTensor *th_output,
                                   THIntTensor *th_timesteps,
                                   THFloatTensor *th_scores,
                                   THIntTensor *th_out_length){
 
+	    std::string space_symbol_string(space_symbol);	
             return beam_decode(th_probs, th_seq_lens, labels, vocab_size, beam_size, num_processes,
-                        cutoff_prob, cutoff_top_n, blank_id,scorer, th_output, th_timesteps, th_scores, th_out_length);
+                        cutoff_prob, cutoff_top_n, blank_id, space_symbol_string, scorer, th_output, th_timesteps, th_scores, th_out_length);
         }
 
 
@@ -128,10 +133,13 @@ extern "C"
                             double beta,
                             const char* lm_path,
                             const char* labels,
-                            int vocab_size) {
+                            int vocab_size,
+                            const char* space_symbol) {
         std::vector<std::string> new_vocab;
         utf8_to_utf8_char_vec(labels, new_vocab);
-        Scorer* scorer = new Scorer(alpha, beta, lm_path, new_vocab);
+	// Create a string object from the char* space_symbol
+	std::string space_symbol_string(space_symbol);		
+        Scorer* scorer = new Scorer(alpha, beta, lm_path, new_vocab, space_symbol_string);
         return static_cast<void*>(scorer);
     }
 
diff --git a/ctcdecode/src/binding.h b/ctcdecode/src/binding.h
index 7018b464..7b4691e9 100644
--- a/ctcdecode/src/binding.h
+++ b/ctcdecode/src/binding.h
@@ -7,6 +7,7 @@ int paddle_beam_decode(THFloatTensor *th_probs,
                        double cutoff_prob,
                        size_t cutoff_top_n,
                        size_t blank_id,
+		       const char* space_symbol,
                        THIntTensor *th_output,
                        THIntTensor *th_timesteps,
                        THFloatTensor *th_scores,
@@ -21,6 +22,7 @@ int paddle_beam_decode_lm(THFloatTensor *th_probs,
                           double cutoff_prob,
                           size_t cutoff_top_n,
                           size_t blank_id,
+			  const char* space_symbol,
                           void *scorer,
                           THIntTensor *th_output,
                           THIntTensor *th_timesteps,
@@ -31,7 +33,8 @@ void* paddle_get_scorer(double alpha,
                         double beta,
                         const char* lm_path,
                         const char* labels,
-                        int vocab_size);
+                        int vocab_size, 
+			const char* space_symbol);
 
 int is_character_based(void *scorer);
 size_t get_max_order(void *scorer);
diff --git a/ctcdecode/src/ctc_beam_search_decoder.cpp b/ctcdecode/src/ctc_beam_search_decoder.cpp
index 713d4a50..c97c4359 100644
--- a/ctcdecode/src/ctc_beam_search_decoder.cpp
+++ b/ctcdecode/src/ctc_beam_search_decoder.cpp
@@ -21,6 +21,7 @@ std::vector<std::pair<double, Output>> ctc_beam_search_decoder(
     double cutoff_prob,
     size_t cutoff_top_n,
     size_t blank_id,
+    const std::string &space_symbol,
     Scorer *ext_scorer) {
   // dimension check
   size_t num_time_steps = probs_seq.size();
@@ -35,7 +36,9 @@ std::vector<std::pair<double, Output>> ctc_beam_search_decoder(
   // size_t blank_id = vocabulary.size();
 
   // assign space id
-  auto it = std::find(vocabulary.begin(), vocabulary.end(), " ");
+  // Changed by Gideon: look up the custom space symbol passed as an argument instead of the hard-coded " "
+  auto it = std::find(vocabulary.begin(), vocabulary.end(), space_symbol);
+  //auto it = std::find(vocabulary.begin(), vocabulary.end(), " ");
   int space_id = it - vocabulary.begin();
   // if no space in vocabulary
   if ((size_t)space_id >= vocabulary.size()) {
@@ -174,7 +177,7 @@ std::vector<std::pair<double, Output>> ctc_beam_search_decoder(
       std::vector<int> timesteps;
       prefixes[i]->get_path_vec(output, timesteps);
       auto prefix_length = output.size();
-      auto words = ext_scorer->split_labels(output);
+      auto words = ext_scorer->split_labels(output, space_symbol);
       // remove word insert
       approx_ctc = approx_ctc - prefix_length * ext_scorer->beta;
       // remove language model weight:
@@ -196,6 +199,7 @@ ctc_beam_search_decoder_batch(
     double cutoff_prob,
     size_t cutoff_top_n,
     size_t blank_id,
+    const std::string &space_symbol,
     Scorer *ext_scorer) {
   VALID_CHECK_GT(num_processes, 0, "num_processes must be nonnegative!");
   // thread pool
@@ -213,6 +217,7 @@ ctc_beam_search_decoder_batch(
                                   cutoff_prob,
                                   cutoff_top_n,
                                   blank_id,
+				  space_symbol,
                                   ext_scorer));
   }
 
diff --git a/ctcdecode/src/ctc_beam_search_decoder.h b/ctcdecode/src/ctc_beam_search_decoder.h
index 4bd43eed..be8c9c5b 100644
--- a/ctcdecode/src/ctc_beam_search_decoder.h
+++ b/ctcdecode/src/ctc_beam_search_decoder.h
@@ -25,6 +25,8 @@
  *     in desending order.
 */
 
+const std::string DEFAULT_SPACE_SYMBOL =  std::string(" ");
+
 std::vector<std::pair<double, Output>> ctc_beam_search_decoder(
     const std::vector<std::vector<double>> &probs_seq,
     const std::vector<std::string> &vocabulary,
@@ -32,6 +34,7 @@ std::vector<std::pair<double, Output>> ctc_beam_search_decoder(
     double cutoff_prob = 1.0,
     size_t cutoff_top_n = 40,
     size_t blank_id = 0,
+    const std::string &space_symbol = DEFAULT_SPACE_SYMBOL,	 
     Scorer *ext_scorer = nullptr);
 
 /* CTC Beam Search Decoder for batch data
@@ -44,6 +47,7 @@ std::vector<std::pair<double, Output>> ctc_beam_search_decoder(
  *     num_processes: Number of threads for beam search.
  *     cutoff_prob: Cutoff probability for pruning.
  *     cutoff_top_n: Cutoff number for pruning.
+ *     space_symbol: The symbol used to indicate spaces, default is " ".
  *     ext_scorer: External scorer to evaluate a prefix, which consists of
  *                 n-gram language model scoring and word insertion term.
  *                 Default null, decoding the input sample without scorer.
@@ -60,6 +64,7 @@ ctc_beam_search_decoder_batch(
     double cutoff_prob = 1.0,
     size_t cutoff_top_n = 40,
     size_t blank_id = 0,
+    const std::string &space_symbol = DEFAULT_SPACE_SYMBOL,
     Scorer *ext_scorer = nullptr);
 
 #endif  // CTC_BEAM_SEARCH_DECODER_H_
diff --git a/ctcdecode/src/decoder_utils.cpp b/ctcdecode/src/decoder_utils.cpp
index 375a06fe..5812a996 100644
--- a/ctcdecode/src/decoder_utils.cpp
+++ b/ctcdecode/src/decoder_utils.cpp
@@ -153,7 +153,8 @@ bool add_word_to_dictionary(
   std::vector<int> int_word;
 
   for (auto &c : characters) {
-    if (c == " ") {
+    // if (c == " ") {
+    if (c == "|") {   // Gideon: replaced the space symbol " " => "|"
       int_word.push_back(SPACE_ID);
     } else {
       auto int_c = char_map.find(c);
@@ -171,4 +172,4 @@ bool add_word_to_dictionary(
 
   add_word_to_fst(int_word, dictionary);
   return true;  // return with successful adding
-}
\ No newline at end of file
+}
diff --git a/ctcdecode/src/scorer.cpp b/ctcdecode/src/scorer.cpp
index f55e7a0b..df545153 100644
--- a/ctcdecode/src/scorer.cpp
+++ b/ctcdecode/src/scorer.cpp
@@ -16,7 +16,8 @@ using namespace lm::ngram;
 Scorer::Scorer(double alpha,
                double beta,
                const std::string& lm_path,
-               const std::vector<std::string>& vocab_list) {
+               const std::vector<std::string>& vocab_list,
+               const std::string &space_symbol) {
   this->alpha = alpha;
   this->beta = beta;
 
@@ -28,7 +29,7 @@ Scorer::Scorer(double alpha,
   dict_size_ = 0;
   SPACE_ID_ = -1;
 
-  setup(lm_path, vocab_list);
+  setup(lm_path, vocab_list, space_symbol);
 }
 
 Scorer::~Scorer() {
@@ -41,11 +42,12 @@ Scorer::~Scorer() {
 }
 
 void Scorer::setup(const std::string& lm_path,
-                   const std::vector<std::string>& vocab_list) {
+                   const std::vector<std::string>& vocab_list,
+                   const std::string &space_symbol) {
   // load language model
   load_lm(lm_path);
   // set char map for scorer
-  set_char_map(vocab_list);
+  set_char_map(vocab_list, space_symbol);
   // fill the dictionary for FST
   if (!is_character_based()) {
     fill_dictionary(true);
@@ -79,10 +81,14 @@ double Scorer::get_log_cond_prob(const std::vector<std::string>& words) {
   model->NullContextWrite(&state);
   for (size_t i = 0; i < words.size(); ++i) {
     lm::WordIndex word_index = model->BaseVocabulary().Index(words[i]);
+    
     // encounter OOV
     if (word_index == 0) {
       return OOV_SCORE;
     }
+    // Gideon: Alternatively, comment out the OOV check above (although in practice it does not seem to work better):
+    // rather than using the hard-coded OOV score, assign the language model's <UNK> probability to OOV words.
+    // See: https://github.com/parlance/ctcdecode/issues/62
     cond_prob = model->BaseScore(&state, word_index, &out_state);
     tmp_state = state;
     state = out_state;
@@ -132,7 +138,7 @@ std::string Scorer::vec2str(const std::vector<int>& input) {
   return word;
 }
 
-std::vector<std::string> Scorer::split_labels(const std::vector<int>& labels) {
+std::vector<std::string> Scorer::split_labels(const std::vector<int>& labels, const std::string &space_symbol) {
   if (labels.empty()) return {};
 
   std::string s = vec2str(labels);
@@ -140,18 +146,20 @@ std::vector<std::string> Scorer::split_labels(const std::vector<int>& labels) {
   if (is_character_based_) {
     words = split_utf8_str(s);
   } else {
-    words = split_str(s, " ");
+    // words = split_str(s, " ");
+    words = split_str(s, space_symbol);  //Gideon: replaced the space character from " " to a custom string
   }
   return words;
 }
 
-void Scorer::set_char_map(const std::vector<std::string>& char_list) {
+void Scorer::set_char_map(const std::vector<std::string>& char_list, const std::string &space_symbol) {
   char_list_ = char_list;
   char_map_.clear();
 
   for (size_t i = 0; i < char_list_.size(); i++) {
-    if (char_list_[i] == " ") {
-      SPACE_ID_ = i;
+    //if (char_list_[i] == " ") {
+    if (char_list_[i] == space_symbol) { //Gideon: replaced the space character from " " to a custom string
+     SPACE_ID_ = i;
     }
     // The initial state of FST is state 0, hence the index of chars in
     // the FST should start from 1 to avoid the conflict with the initial
diff --git a/ctcdecode/src/scorer.h b/ctcdecode/src/scorer.h
index 5ebc719c..84c62815 100644
--- a/ctcdecode/src/scorer.h
+++ b/ctcdecode/src/scorer.h
@@ -43,7 +43,8 @@ class Scorer {
   Scorer(double alpha,
          double beta,
          const std::string &lm_path,
-         const std::vector<std::string> &vocabulary);
+         const std::vector<std::string> &vocabulary,
+         const std::string &space_symbol);
   ~Scorer();
 
   double get_log_cond_prob(const std::vector<std::string> &words);
@@ -67,7 +68,7 @@ class Scorer {
 
   // trransform the labels in index to the vector of words (word based lm) or
   // the vector of characters (character based lm)
-  std::vector<std::string> split_labels(const std::vector<int> &labels);
+  std::vector<std::string> split_labels(const std::vector<int> &labels, const std::string &space_symbol);
 
   // language model weight
   double alpha;
@@ -80,7 +81,8 @@ class Scorer {
 protected:
   // necessary setup: load language model, set char map, fill FST's dictionary
   void setup(const std::string &lm_path,
-             const std::vector<std::string> &vocab_list);
+             const std::vector<std::string> &vocab_list,
+             const std::string &space_symbo);
 
   // load language model from given path
   void load_lm(const std::string &lm_path);
@@ -89,7 +91,7 @@ class Scorer {
   void fill_dictionary(bool add_space);
 
   // set char map
-  void set_char_map(const std::vector<std::string> &char_list);
+  void set_char_map(const std::vector<std::string> &char_list, const std::string &space_symbol);
 
   double get_log_prob(const std::vector<std::string> &words);
 

From ddf4da8b9ba59157b5ebb8c03563da6262464f94 Mon Sep 17 00:00:00 2001
From: Gideon Wenniger <gideonwenniger@domain.com>
Date: Tue, 9 Oct 2018 20:38:33 +0100
Subject: [PATCH 2/3] Fixed a minor bug in ctcdecode/__init__.py: the method
 "encode" must be called on the space symbol before passing it as an argument
 to "ctc_decode.paddle_get_scorer", "ctc_decode.paddle_beam_decode" and
 "ctc_decode.paddle_beam_decode_lm". Otherwise it will not be in the
 const char* format expected by the C++ interface for these parameters.

	modified:   ctcdecode/__init__.py
---
 ctcdecode/__init__.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/ctcdecode/__init__.py b/ctcdecode/__init__.py
index 14b33280..3e8f0688 100644
--- a/ctcdecode/__init__.py
+++ b/ctcdecode/__init__.py
@@ -12,10 +12,11 @@ def __init__(self, labels, model_path=None, alpha=0, beta=0, cutoff_top_n=40, cu
         self._labels = ''.join(labels).encode()
         self._num_labels = len(labels)
         self._blank_id = blank_id
-	self._space_symbol=space_symbol
+        self._space_symbol = space_symbol
+        
         if model_path:
             self._scorer = ctc_decode.paddle_get_scorer(alpha, beta, model_path.encode(), self._labels,
-                                                        self._num_labels,space_symbol)
+                                                        self._num_labels, self._space_symbol.encode())
         self._cutoff_prob = cutoff_prob
 
     def decode(self, probs, seq_lens=None):
@@ -32,11 +33,12 @@ def decode(self, probs, seq_lens=None):
         out_seq_len = torch.IntTensor(batch_size, self._beam_width).cpu().int()
         if self._scorer:
             ctc_decode.paddle_beam_decode_lm(probs, seq_lens, self._labels, self._num_labels, self._beam_width,
-                                             self._num_processes, self._cutoff_prob, self.cutoff_top_n, self._blank_id,self._space_symbol,	
+                                             self._num_processes, self._cutoff_prob, self.cutoff_top_n, self._blank_id,
+                                             self._space_symbol.encode(),	
                                              self._scorer, output, timesteps, scores, out_seq_len)
         else:
             ctc_decode.paddle_beam_decode(probs, seq_lens, self._labels, self._num_labels, self._beam_width, self._num_processes,
-                                          self._cutoff_prob, self.cutoff_top_n, self._blank_id, self._space_symbol, output, timesteps,
+                                          self._cutoff_prob, self.cutoff_top_n, self._blank_id, self._space_symbol.encode(), output, timesteps,
                                           scores, out_seq_len)
 
         return output, scores, timesteps, out_seq_len

From b1294a66c7736f1d91a32ef911ff92cb754a6650 Mon Sep 17 00:00:00 2001
From: "G.E. Maillette de Buij Wenniger" <gideonwenniger@domain.com>
Date: Mon, 17 Jun 2019 11:20:58 +0200
Subject: [PATCH 3/3] Fixed an error in __init__.py. The order of
 "self._log_probs" and "self._space_symbol.encode()" was swapped during
 merging of the code.

-                                          self._cutoff_prob, self.cutoff_top_n, self._blank_id,self._log_probs, self._space_symbol.encode(), output, timesteps,
-                                          scores, out_seq_len)
+                                          self._cutoff_prob, self.cutoff_top_n, self._blank_id,  self._space_symbol.encode(), self._log_probs,
+                                          output, timesteps, scores, out_seq_len)

	modified:   ctcdecode/__init__.py
---
 ctcdecode/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ctcdecode/__init__.py b/ctcdecode/__init__.py
index 57ff0c29..9f554e1d 100644
--- a/ctcdecode/__init__.py
+++ b/ctcdecode/__init__.py
@@ -41,8 +41,8 @@ def decode(self, probs, seq_lens=None):
                                              self._log_probs, self._scorer, output, timesteps, scores, out_seq_len)
         else:
             ctc_decode.paddle_beam_decode(probs, seq_lens, self._labels, self._num_labels, self._beam_width, self._num_processes,
-                                          self._cutoff_prob, self.cutoff_top_n, self._blank_id,self._log_probs, self._space_symbol.encode(), output, timesteps,
-                                          scores, out_seq_len)
+                                          self._cutoff_prob, self.cutoff_top_n, self._blank_id,  self._space_symbol.encode(), self._log_probs,
+                                          output, timesteps, scores, out_seq_len)
 
         return output, scores, timesteps, out_seq_len