From c326265c586f0b4a781b7d9516e6da5b7d5c0286 Mon Sep 17 00:00:00 2001 From: Gideon Wenniger Date: Tue, 9 Oct 2018 20:07:32 +0100 Subject: [PATCH 1/3] Changed the ctc decoder to accept an additional argument that specifies the space symbol (the word separator, distinct from the CTC blank selected by blank_id). The default space symbol is " ". This was previously hard-coded in the decoder, causing problems when using it for handwriting recognition, where the space symbol may be different, for example "|". modified: ctcdecode/__init__.py modified: ctcdecode/src/binding.cpp modified: ctcdecode/src/binding.h modified: ctcdecode/src/ctc_beam_search_decoder.cpp modified: ctcdecode/src/ctc_beam_search_decoder.h modified: ctcdecode/src/decoder_utils.cpp modified: ctcdecode/src/scorer.cpp modified: ctcdecode/src/scorer.h --- ctcdecode/__init__.py | 9 ++++---- ctcdecode/src/binding.cpp | 18 +++++++++++----- ctcdecode/src/binding.h | 5 ++++- ctcdecode/src/ctc_beam_search_decoder.cpp | 9 ++++++-- ctcdecode/src/ctc_beam_search_decoder.h | 5 +++++ ctcdecode/src/decoder_utils.cpp | 5 +++-- ctcdecode/src/scorer.cpp | 26 +++++++++++++++-------- ctcdecode/src/scorer.h | 10 +++++---- 8 files changed, 60 insertions(+), 27 deletions(-) diff --git a/ctcdecode/__init__.py b/ctcdecode/__init__.py index 72394bea..14b33280 100644 --- a/ctcdecode/__init__.py +++ b/ctcdecode/__init__.py @@ -4,7 +4,7 @@ class CTCBeamDecoder(object): def __init__(self, labels, model_path=None, alpha=0, beta=0, cutoff_top_n=40, cutoff_prob=1.0, beam_width=100, - num_processes=4, blank_id=0): + num_processes=4, blank_id=0, space_symbol=" "): self.cutoff_top_n = cutoff_top_n self._beam_width = beam_width self._scorer = None @@ -12,9 +12,10 @@ def __init__(self, labels, model_path=None, alpha=0, beta=0, cutoff_top_n=40, cu self._labels = ''.join(labels).encode() self._num_labels = len(labels) self._blank_id = blank_id + self._space_symbol=space_symbol if model_path: self._scorer = ctc_decode.paddle_get_scorer(alpha, beta, model_path.encode(), self._labels, - self._num_labels) + 
self._num_labels,space_symbol) self._cutoff_prob = cutoff_prob def decode(self, probs, seq_lens=None): @@ -31,11 +32,11 @@ def decode(self, probs, seq_lens=None): out_seq_len = torch.IntTensor(batch_size, self._beam_width).cpu().int() if self._scorer: ctc_decode.paddle_beam_decode_lm(probs, seq_lens, self._labels, self._num_labels, self._beam_width, - self._num_processes, self._cutoff_prob, self.cutoff_top_n, self._blank_id, + self._num_processes, self._cutoff_prob, self.cutoff_top_n, self._blank_id,self._space_symbol, self._scorer, output, timesteps, scores, out_seq_len) else: ctc_decode.paddle_beam_decode(probs, seq_lens, self._labels, self._num_labels, self._beam_width, self._num_processes, - self._cutoff_prob, self.cutoff_top_n, self._blank_id, output, timesteps, + self._cutoff_prob, self.cutoff_top_n, self._blank_id, self._space_symbol, output, timesteps, scores, out_seq_len) return output, scores, timesteps, out_seq_len diff --git a/ctcdecode/src/binding.cpp b/ctcdecode/src/binding.cpp index fe7db3b9..4d155303 100644 --- a/ctcdecode/src/binding.cpp +++ b/ctcdecode/src/binding.cpp @@ -31,6 +31,7 @@ int beam_decode(THFloatTensor *th_probs, double cutoff_prob, size_t cutoff_top_n, size_t blank_id, + const std::string &space_symbol, void *scorer, THIntTensor *th_output, THIntTensor *th_timesteps, @@ -62,7 +63,7 @@ int beam_decode(THFloatTensor *th_probs, } std::vector>> batch_results = - ctc_beam_search_decoder_batch(inputs, new_vocab, beam_size, num_processes, cutoff_prob, cutoff_top_n, blank_id, ext_scorer); + ctc_beam_search_decoder_batch(inputs, new_vocab, beam_size, num_processes, cutoff_prob, cutoff_top_n, blank_id, space_symbol, ext_scorer); for (int b = 0; b < batch_results.size(); ++b){ std::vector> results = batch_results[b]; @@ -95,13 +96,15 @@ extern "C" double cutoff_prob, size_t cutoff_top_n, size_t blank_id, + const char* space_symbol, THIntTensor *th_output, THIntTensor *th_timesteps, THFloatTensor *th_scores, THIntTensor *th_out_length){ + 
std::string space_symbol_string(space_symbol); return beam_decode(th_probs, th_seq_lens, labels, vocab_size, beam_size, num_processes, - cutoff_prob, cutoff_top_n, blank_id,NULL, th_output, th_timesteps, th_scores, th_out_length); + cutoff_prob, cutoff_top_n, blank_id, space_symbol_string, NULL, th_output, th_timesteps, th_scores, th_out_length); } int paddle_beam_decode_lm(THFloatTensor *th_probs, @@ -113,14 +116,16 @@ extern "C" double cutoff_prob, size_t cutoff_top_n, size_t blank_id, + const char* space_symbol, void *scorer, THIntTensor *th_output, THIntTensor *th_timesteps, THFloatTensor *th_scores, THIntTensor *th_out_length){ + std::string space_symbol_string(space_symbol); return beam_decode(th_probs, th_seq_lens, labels, vocab_size, beam_size, num_processes, - cutoff_prob, cutoff_top_n, blank_id,scorer, th_output, th_timesteps, th_scores, th_out_length); + cutoff_prob, cutoff_top_n, blank_id, space_symbol_string, scorer, th_output, th_timesteps, th_scores, th_out_length); } @@ -128,10 +133,13 @@ extern "C" double beta, const char* lm_path, const char* labels, - int vocab_size) { + int vocab_size, + const char* space_symbol) { std::vector new_vocab; utf8_to_utf8_char_vec(labels, new_vocab); - Scorer* scorer = new Scorer(alpha, beta, lm_path, new_vocab); + // Create a string object from the char* space_symbol + std::string space_symbol_string(space_symbol); + Scorer* scorer = new Scorer(alpha, beta, lm_path, new_vocab, space_symbol_string); return static_cast(scorer); } diff --git a/ctcdecode/src/binding.h b/ctcdecode/src/binding.h index 7018b464..7b4691e9 100644 --- a/ctcdecode/src/binding.h +++ b/ctcdecode/src/binding.h @@ -7,6 +7,7 @@ int paddle_beam_decode(THFloatTensor *th_probs, double cutoff_prob, size_t cutoff_top_n, size_t blank_id, + const char* space_symbol, THIntTensor *th_output, THIntTensor *th_timesteps, THFloatTensor *th_scores, @@ -21,6 +22,7 @@ int paddle_beam_decode_lm(THFloatTensor *th_probs, double cutoff_prob, size_t cutoff_top_n, 
size_t blank_id, + const char* space_symbol, void *scorer, THIntTensor *th_output, THIntTensor *th_timesteps, @@ -31,7 +33,8 @@ void* paddle_get_scorer(double alpha, double beta, const char* lm_path, const char* labels, - int vocab_size); + int vocab_size, + const char* space_symbol); int is_character_based(void *scorer); size_t get_max_order(void *scorer); diff --git a/ctcdecode/src/ctc_beam_search_decoder.cpp b/ctcdecode/src/ctc_beam_search_decoder.cpp index 713d4a50..c97c4359 100644 --- a/ctcdecode/src/ctc_beam_search_decoder.cpp +++ b/ctcdecode/src/ctc_beam_search_decoder.cpp @@ -21,6 +21,7 @@ std::vector> ctc_beam_search_decoder( double cutoff_prob, size_t cutoff_top_n, size_t blank_id, + const std::string &space_symbol, Scorer *ext_scorer) { // dimension check size_t num_time_steps = probs_seq.size(); @@ -35,7 +36,9 @@ std::vector> ctc_beam_search_decoder( // size_t blank_id = vocabulary.size(); // assign space id - auto it = std::find(vocabulary.begin(), vocabulary.end(), " "); + // Changed by Gideon from the blank symbol " " to a custom symbol specified as argument + auto it = std::find(vocabulary.begin(), vocabulary.end(), space_symbol); + //auto it = std::find(vocabulary.begin(), vocabulary.end(), " "); int space_id = it - vocabulary.begin(); // if no space in vocabulary if ((size_t)space_id >= vocabulary.size()) { @@ -174,7 +177,7 @@ std::vector> ctc_beam_search_decoder( std::vector timesteps; prefixes[i]->get_path_vec(output, timesteps); auto prefix_length = output.size(); - auto words = ext_scorer->split_labels(output); + auto words = ext_scorer->split_labels(output, space_symbol); // remove word insert approx_ctc = approx_ctc - prefix_length * ext_scorer->beta; // remove language model weight: @@ -196,6 +199,7 @@ ctc_beam_search_decoder_batch( double cutoff_prob, size_t cutoff_top_n, size_t blank_id, + const std::string &space_symbol, Scorer *ext_scorer) { VALID_CHECK_GT(num_processes, 0, "num_processes must be nonnegative!"); // thread pool @@ -213,6 
+217,7 @@ ctc_beam_search_decoder_batch( cutoff_prob, cutoff_top_n, blank_id, + space_symbol, ext_scorer)); } diff --git a/ctcdecode/src/ctc_beam_search_decoder.h b/ctcdecode/src/ctc_beam_search_decoder.h index 4bd43eed..be8c9c5b 100644 --- a/ctcdecode/src/ctc_beam_search_decoder.h +++ b/ctcdecode/src/ctc_beam_search_decoder.h @@ -25,6 +25,8 @@ * in desending order. */ +const std::string DEFAULT_SPACE_SYMBOL = std::string(" "); + std::vector> ctc_beam_search_decoder( const std::vector> &probs_seq, const std::vector &vocabulary, @@ -32,6 +34,7 @@ std::vector> ctc_beam_search_decoder( double cutoff_prob = 1.0, size_t cutoff_top_n = 40, size_t blank_id = 0, + const std::string &space_symbol = DEFAULT_SPACE_SYMBOL, Scorer *ext_scorer = nullptr); /* CTC Beam Search Decoder for batch data @@ -44,6 +47,7 @@ std::vector> ctc_beam_search_decoder( * num_processes: Number of threads for beam search. * cutoff_prob: Cutoff probability for pruning. * cutoff_top_n: Cutoff number for pruning. + * space_symbol: The symbol used to indicate spaces, default is " ". * ext_scorer: External scorer to evaluate a prefix, which consists of * n-gram language model scoring and word insertion term. * Default null, decoding the input sample without scorer. 
@@ -60,6 +64,7 @@ ctc_beam_search_decoder_batch( double cutoff_prob = 1.0, size_t cutoff_top_n = 40, size_t blank_id = 0, + const std::string &space_symbol = DEFAULT_SPACE_SYMBOL, Scorer *ext_scorer = nullptr); #endif // CTC_BEAM_SEARCH_DECODER_H_ diff --git a/ctcdecode/src/decoder_utils.cpp b/ctcdecode/src/decoder_utils.cpp index 375a06fe..5812a996 100644 --- a/ctcdecode/src/decoder_utils.cpp +++ b/ctcdecode/src/decoder_utils.cpp @@ -153,7 +153,8 @@ bool add_word_to_dictionary( std::vector int_word; for (auto &c : characters) { - if (c == " ") { + // if (c == " ") { + if (c == "|") { // Gideon: replaced the space symbol " " => "|" int_word.push_back(SPACE_ID); } else { auto int_c = char_map.find(c); @@ -171,4 +172,4 @@ bool add_word_to_dictionary( add_word_to_fst(int_word, dictionary); return true; // return with successful adding -} \ No newline at end of file +} diff --git a/ctcdecode/src/scorer.cpp b/ctcdecode/src/scorer.cpp index f55e7a0b..df545153 100644 --- a/ctcdecode/src/scorer.cpp +++ b/ctcdecode/src/scorer.cpp @@ -16,7 +16,8 @@ using namespace lm::ngram; Scorer::Scorer(double alpha, double beta, const std::string& lm_path, - const std::vector& vocab_list) { + const std::vector& vocab_list, + const std::string &space_symbol) { this->alpha = alpha; this->beta = beta; @@ -28,7 +29,7 @@ Scorer::Scorer(double alpha, dict_size_ = 0; SPACE_ID_ = -1; - setup(lm_path, vocab_list); + setup(lm_path, vocab_list, space_symbol); } Scorer::~Scorer() { @@ -41,11 +42,12 @@ Scorer::~Scorer() { } void Scorer::setup(const std::string& lm_path, - const std::vector& vocab_list) { + const std::vector& vocab_list, + const std::string &space_symbol) { // load language model load_lm(lm_path); // set char map for scorer - set_char_map(vocab_list); + set_char_map(vocab_list, space_symbol); // fill the dictionary for FST if (!is_character_based()) { fill_dictionary(true); @@ -79,10 +81,14 @@ double Scorer::get_log_cond_prob(const std::vector& words) { 
model->NullContextWrite(&state); for (size_t i = 0; i < words.size(); ++i) { lm::WordIndex word_index = model->BaseVocabulary().Index(words[i]); + // encounter OOV if (word_index == 0) { return OOV_SCORE; } + // Gideon: Alternatively, comment out above (but in fact, it doesn't seem to work better) + // Rather than using hard-code OOV score, assign the language model probability to the OOV words. + // See: https://github.com/parlance/ctcdecode/issues/62 cond_prob = model->BaseScore(&state, word_index, &out_state); tmp_state = state; state = out_state; @@ -132,7 +138,7 @@ std::string Scorer::vec2str(const std::vector& input) { return word; } -std::vector Scorer::split_labels(const std::vector& labels) { +std::vector Scorer::split_labels(const std::vector& labels, const std::string &space_symbol) { if (labels.empty()) return {}; std::string s = vec2str(labels); @@ -140,18 +146,20 @@ std::vector Scorer::split_labels(const std::vector& labels) { if (is_character_based_) { words = split_utf8_str(s); } else { - words = split_str(s, " "); + // words = split_str(s, " "); + words = split_str(s, space_symbol); //Gideon: replaced the space character from " " to a custom string } return words; } -void Scorer::set_char_map(const std::vector& char_list) { +void Scorer::set_char_map(const std::vector& char_list, const std::string &space_symbol) { char_list_ = char_list; char_map_.clear(); for (size_t i = 0; i < char_list_.size(); i++) { - if (char_list_[i] == " ") { - SPACE_ID_ = i; + //if (char_list_[i] == " ") { + if (char_list_[i] == space_symbol) { //Gideon: replaced the space character from " " to a custom string + SPACE_ID_ = i; } // The initial state of FST is state 0, hence the index of chars in // the FST should start from 1 to avoid the conflict with the initial diff --git a/ctcdecode/src/scorer.h b/ctcdecode/src/scorer.h index 5ebc719c..84c62815 100644 --- a/ctcdecode/src/scorer.h +++ b/ctcdecode/src/scorer.h @@ -43,7 +43,8 @@ class Scorer { Scorer(double alpha, double 
beta, const std::string &lm_path, - const std::vector &vocabulary); + const std::vector &vocabulary, + const std::string &space_symbol); ~Scorer(); double get_log_cond_prob(const std::vector &words); @@ -67,7 +68,7 @@ class Scorer { // trransform the labels in index to the vector of words (word based lm) or // the vector of characters (character based lm) - std::vector split_labels(const std::vector &labels); + std::vector split_labels(const std::vector &labels, const std::string &space_symbol); // language model weight double alpha; @@ -80,7 +81,8 @@ class Scorer { protected: // necessary setup: load language model, set char map, fill FST's dictionary void setup(const std::string &lm_path, - const std::vector &vocab_list); + const std::vector &vocab_list, + const std::string &space_symbo); // load language model from given path void load_lm(const std::string &lm_path); @@ -89,7 +91,7 @@ class Scorer { void fill_dictionary(bool add_space); // set char map - void set_char_map(const std::vector &char_list); + void set_char_map(const std::vector &char_list, const std::string &space_symbol); double get_log_prob(const std::vector &words); From ddf4da8b9ba59157b5ebb8c03563da6262464f94 Mon Sep 17 00:00:00 2001 From: Gideon Wenniger Date: Tue, 9 Oct 2018 20:38:33 +0100 Subject: [PATCH 2/3] Fixed a minor bug in ctcdecode/__init__.py: the method "encode" must be called on the space symbol before passing it as an argument to "ctc_decode.paddle_get_scorer", "ctc_decode.paddle_beam_decode" and "ctc_decode.paddle_beam_decode_lm". Otherwise it will not be in the "const char*" format expected by the C++ interface for these parameters. 
modified: ctcdecode/__init__.py --- ctcdecode/__init__.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/ctcdecode/__init__.py b/ctcdecode/__init__.py index 14b33280..3e8f0688 100644 --- a/ctcdecode/__init__.py +++ b/ctcdecode/__init__.py @@ -12,10 +12,11 @@ def __init__(self, labels, model_path=None, alpha=0, beta=0, cutoff_top_n=40, cu self._labels = ''.join(labels).encode() self._num_labels = len(labels) self._blank_id = blank_id - self._space_symbol=space_symbol + self._space_symbol = space_symbol + if model_path: self._scorer = ctc_decode.paddle_get_scorer(alpha, beta, model_path.encode(), self._labels, - self._num_labels,space_symbol) + self._num_labels, self._space_symbol.encode()) self._cutoff_prob = cutoff_prob def decode(self, probs, seq_lens=None): @@ -32,11 +33,12 @@ def decode(self, probs, seq_lens=None): out_seq_len = torch.IntTensor(batch_size, self._beam_width).cpu().int() if self._scorer: ctc_decode.paddle_beam_decode_lm(probs, seq_lens, self._labels, self._num_labels, self._beam_width, - self._num_processes, self._cutoff_prob, self.cutoff_top_n, self._blank_id,self._space_symbol, + self._num_processes, self._cutoff_prob, self.cutoff_top_n, self._blank_id, + self._space_symbol.encode(), self._scorer, output, timesteps, scores, out_seq_len) else: ctc_decode.paddle_beam_decode(probs, seq_lens, self._labels, self._num_labels, self._beam_width, self._num_processes, - self._cutoff_prob, self.cutoff_top_n, self._blank_id, self._space_symbol, output, timesteps, + self._cutoff_prob, self.cutoff_top_n, self._blank_id, self._space_symbol.encode(), output, timesteps, scores, out_seq_len) return output, scores, timesteps, out_seq_len From b1294a66c7736f1d91a32ef911ff92cb754a6650 Mon Sep 17 00:00:00 2001 From: "G.E. Maillette de Buij Wenniger" Date: Mon, 17 Jun 2019 11:20:58 +0200 Subject: [PATCH 3/3] Fixed an error in __init__.py. 
The order of "self._log_probs" and "self._space_symbol.encode()" was swapped during merging of the code. - self._cutoff_prob, self.cutoff_top_n, self._blank_id,self._log_probs, self._space_symbol.encode(), output, timesteps, - scores, out_seq_len) + self._cutoff_prob, self.cutoff_top_n, self._blank_id, self._space_symbol.encode(), self._log_probs, + output, timesteps, scores, out_seq_len) modified: ctcdecode/__init__.py --- ctcdecode/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ctcdecode/__init__.py b/ctcdecode/__init__.py index 57ff0c29..9f554e1d 100644 --- a/ctcdecode/__init__.py +++ b/ctcdecode/__init__.py @@ -41,8 +41,8 @@ def decode(self, probs, seq_lens=None): self._log_probs, self._scorer, output, timesteps, scores, out_seq_len) else: ctc_decode.paddle_beam_decode(probs, seq_lens, self._labels, self._num_labels, self._beam_width, self._num_processes, - self._cutoff_prob, self.cutoff_top_n, self._blank_id,self._log_probs, self._space_symbol.encode(), output, timesteps, - scores, out_seq_len) + self._cutoff_prob, self.cutoff_top_n, self._blank_id, self._space_symbol.encode(), self._log_probs, + output, timesteps, scores, out_seq_len) return output, scores, timesteps, out_seq_len