Merge branch 'master' of https://github.com/k2-fsa/sherpa

shaynemei · Aug 31, 2023 · e62b912
2 parents 57d7b94 + ac59664
commit e62b912
Show file tree

Hide file tree

Showing 23 changed files with 519 additions and 38 deletions.
diff --git a/sherpa/bin/offline_transducer_asr.py b/sherpa/bin/offline_transducer_asr.py
@@ -155,6 +155,13 @@ def add_model_args(parser: argparse.ArgumentParser):
         help="Feature dimension of the model",
     )
 
+    parser.add_argument(
+        "--use-bbpe",
+        type=str2bool,
+        default=False,
+        help="Whether the model to be used is trained with bbpe",
+    )
+
 
 def add_decoding_args(parser: argparse.ArgumentParser):
     parser.add_argument(
@@ -413,6 +420,7 @@ def create_recognizer(args) -> sherpa.OfflineRecognizer:
         use_gpu=args.use_gpu,
         num_active_paths=args.num_active_paths,
         context_score=args.context_score,
+        use_bbpe=args.use_bbpe,
         feat_config=feat_config,
         decoding_method=args.decoding_method,
         fast_beam_search_config=fast_beam_search_config,

diff --git a/sherpa/bin/offline_transducer_server.py b/sherpa/bin/offline_transducer_server.py
@@ -91,6 +91,13 @@ def add_model_args(parser: argparse.ArgumentParser):
         help="Feature dimension of the model",
     )
 
+    parser.add_argument(
+        "--use-bbpe",
+        type=sherpa.str2bool,
+        default=False,
+        help="Whether the model to be used is trained with bbpe",
+    )
+
 
 def add_decoding_args(parser: argparse.ArgumentParser):
     parser.add_argument(
@@ -645,6 +652,7 @@ def create_recognizer(args) -> sherpa.OfflineRecognizer:
         tokens=args.tokens,
         use_gpu=args.use_gpu,
         num_active_paths=args.num_active_paths,
+        use_bbpe=args.use_bbpe,
         feat_config=feat_config,
         decoding_method=args.decoding_method,
         fast_beam_search_config=fast_beam_search_config,

diff --git a/sherpa/bin/online_transducer_asr.py b/sherpa/bin/online_transducer_asr.py
@@ -144,6 +144,13 @@ def add_model_args(parser: argparse.ArgumentParser):
         help="Feature dimension of the model",
     )
 
+    parser.add_argument(
+        "--use-bbpe",
+        type=str2bool,
+        default=False,
+        help="Whether the model to be used is trained with bbpe",
+    )
+
 
 def add_decoding_args(parser: argparse.ArgumentParser):
     parser.add_argument(
@@ -402,6 +409,7 @@ def create_recognizer(args) -> sherpa.OnlineRecognizer:
         use_gpu=args.use_gpu,
         num_active_paths=args.num_active_paths,
         context_score=args.context_score,
+        use_bbpe=args.use_bbpe,
         feat_config=feat_config,
         decoding_method=args.decoding_method,
         fast_beam_search_config=fast_beam_search_config,

diff --git a/sherpa/bin/streaming_server.py b/sherpa/bin/streaming_server.py
@@ -151,6 +151,13 @@ def add_model_args(parser: argparse.ArgumentParser):
         help="Feature dimension of the model",
     )
 
+    parser.add_argument(
+        "--use-bbpe",
+        type=sherpa.str2bool,
+        default=False,
+        help="Whether the model to be used is trained with bbpe",
+    )
+
 
 def add_decoding_args(parser: argparse.ArgumentParser):
     parser.add_argument(
@@ -413,6 +420,7 @@ def create_recognizer(args) -> sherpa.OnlineRecognizer:
         tokens=args.tokens,
         use_gpu=args.use_gpu,
         num_active_paths=args.num_active_paths,
+        use_bbpe=args.use_bbpe,
         temperature=args.temperature,
         feat_config=feat_config,
         decoding_method=args.decoding_method,

diff --git a/sherpa/cpp_api/feature-config.cc b/sherpa/cpp_api/feature-config.cc
@@ -40,6 +40,12 @@ void FeatureConfig::Register(ParseOptions *po) {
   fbank_opts.mel_opts.num_bins = 80;
   RegisterMelBanksOptions(po, &fbank_opts.mel_opts);
 
+  fbank_opts.mel_opts.high_freq = -400;
+  fbank_opts.frame_opts.remove_dc_offset = true;
+  fbank_opts.frame_opts.round_to_power_of_two = true;
+  fbank_opts.energy_floor = 1e-10;
+  fbank_opts.frame_opts.snip_edges = false;
+  fbank_opts.frame_opts.samp_freq = 16000;
   po->Register("normalize-samples", &normalize_samples,
                "true to use samples in the range [-1, 1]. "
                "false to use samples in the range [-32768, 32767]. "

diff --git a/sherpa/cpp_api/offline-recognizer-transducer-impl.h b/sherpa/cpp_api/offline-recognizer-transducer-impl.h
@@ -12,6 +12,7 @@
 
 #include "sherpa/cpp_api/feature-config.h"
 #include "sherpa/cpp_api/offline-recognizer-impl.h"
+#include "sherpa/csrc/byte_util.h"
 #include "sherpa/csrc/context-graph.h"
 #include "sherpa/csrc/offline-conformer-transducer-model.h"
 #include "sherpa/csrc/offline-transducer-decoder.h"
@@ -25,7 +26,7 @@ namespace sherpa {
 
 static OfflineRecognitionResult Convert(
     const OfflineTransducerDecoderResult &src, const SymbolTable &sym_table,
-    int32_t frame_shift_ms, int32_t subsampling_factor) {
+    int32_t frame_shift_ms, int32_t subsampling_factor, bool use_bbpe) {
   OfflineRecognitionResult r;
   r.tokens.reserve(src.tokens.size());
   r.timestamps.reserve(src.timestamps.size());
@@ -37,6 +38,12 @@ static OfflineRecognitionResult Convert(
 
     r.tokens.push_back(std::move(sym));
   }
+
+  if (use_bbpe) {
+    auto bu = GetByteUtil();
+    text = bu->Decode(text);
+  }
+
   r.text = std::move(text);
 
   float frame_shift_s = frame_shift_ms / 1000. * subsampling_factor;
@@ -69,7 +76,7 @@ class OfflineRecognizerTransducerImpl : public OfflineRecognizerImpl {
           std::make_unique<OfflineTransducerGreedySearchDecoder>(model_.get());
     } else if (config.decoding_method == "modified_beam_search") {
       decoder_ = std::make_unique<OfflineTransducerModifiedBeamSearchDecoder>(
-          model_.get(), config.num_active_paths);
+          model_.get(), config.num_active_paths, config.temperature);
     } else if (config.decoding_method == "fast_beam_search") {
       config.fast_beam_search_config.Validate();
 
@@ -133,7 +140,7 @@ class OfflineRecognizerTransducerImpl : public OfflineRecognizerImpl {
       auto ans =
           Convert(results[i], symbol_table_,
                   config_.feat_config.fbank_opts.frame_opts.frame_shift_ms,
-                  model_->SubsamplingFactor());
+                  model_->SubsamplingFactor(), config_.use_bbpe);
 
       ss[i]->SetResult(ans);
     }

diff --git a/sherpa/cpp_api/offline-recognizer.cc b/sherpa/cpp_api/offline-recognizer.cc
@@ -110,6 +110,13 @@ void OfflineRecognizerConfig::Register(ParseOptions *po) {
                "The bonus score for each token in context word/phrase. "
                "Used only when decoding_method is modified_beam_search");
 
+  po->Register("use-bbpe", &use_bbpe,
+               "true if the model to use is trained with byte level bpe, "
+               "The byte level bpe modeling unit is mainly used on CJK "
+               "languages or multilingual datasets, it can further break "
+               "the multi-byte unicode characters into byte sequence and "
+               "then train some kind of sub-char bpes.");
+
   po->Register("temperature", &temperature,
                "Softmax temperature,. "
                "Used only when decoding_method is modified_beam_search.");
@@ -155,6 +162,7 @@ std::string OfflineRecognizerConfig::ToString() const {
   os << "decoding_method=\"" << decoding_method << "\", ";
   os << "num_active_paths=" << num_active_paths << ", ";
   os << "context_score=" << context_score << ", ";
+  os << "use_bbpe=" << (use_bbpe ? "True" : "False") << ", ";
   os << "temperature=" << temperature << ")";
 
   return os.str();

diff --git a/sherpa/cpp_api/offline-recognizer.h b/sherpa/cpp_api/offline-recognizer.h
@@ -67,6 +67,9 @@ struct OfflineRecognizerConfig {
   /// used only for modified_beam_search
   float context_score = 1.5;
 
+  // True if the model used is trained with byte level bpe.
+  bool use_bbpe = false;
+
   // temperature for the softmax in the joiner
   float temperature = 1.0;
 

diff --git a/sherpa/cpp_api/online-recognizer.cc b/sherpa/cpp_api/online-recognizer.cc
@@ -9,6 +9,7 @@
 #include <utility>
 
 #include "nlohmann/json.hpp"
+#include "sherpa/csrc/byte_util.h"
 #include "sherpa/csrc/file-utils.h"
 #include "sherpa/csrc/log.h"
 #include "sherpa/csrc/online-conformer-transducer-model.h"
@@ -114,6 +115,13 @@ void OnlineRecognizerConfig::Register(ParseOptions *po) {
                "pruned_transducer_stateless7_streaming in icefall."
                "Number of frames before subsampling during decoding.");
 
+  po->Register("use-bbpe", &use_bbpe,
+               "true if the model to use is trained with byte level bpe, "
+               "The byte level bpe modeling unit is mainly used on CJK "
+               "languages or multilingual datasets, it can further break "
+               "the multi-byte unicode characters into byte sequence and "
+               "then train some kind of sub-char bpes.");
+
   po->Register("temperature", &temperature,
                "Softmax temperature,. "
                "Used only when decoding_method is modified_beam_search.");
@@ -177,14 +185,16 @@ std::string OnlineRecognizerConfig::ToString() const {
   os << "left_context=" << left_context << ", ";
   os << "right_context=" << right_context << ", ";
   os << "chunk_size=" << chunk_size << ", ";
+  os << "use_bbpe=" << (use_bbpe ? "True" : "False") << ", ";
   os << "temperature=" << temperature << ")";
   return os.str();
 }
 
 static OnlineRecognitionResult Convert(const OnlineTransducerDecoderResult &src,
                                        const SymbolTable &sym_table,
                                        int32_t frame_shift_ms,
-                                       int32_t subsampling_factor) {
+                                       int32_t subsampling_factor,
+                                       bool use_bbpe) {
   OnlineRecognitionResult r;
   r.tokens.reserve(src.tokens.size());
   r.timestamps.reserve(src.timestamps.size());
@@ -196,6 +206,12 @@ static OnlineRecognitionResult Convert(const OnlineTransducerDecoderResult &src,
 
     r.tokens.push_back(std::move(sym));
   }
+
+  if (use_bbpe) {
+    auto bu = GetByteUtil();
+    text = bu->Decode(text);
+  }
+
   r.text = std::move(text);
 
   float frame_shift_s = frame_shift_ms / 1000. * subsampling_factor;
@@ -440,7 +456,7 @@ class OnlineRecognizer::OnlineRecognizerImpl {
 
     auto ans = Convert(r, symbol_table_,
                        config_.feat_config.fbank_opts.frame_opts.frame_shift_ms,
-                       model_->SubsamplingFactor());
+                       model_->SubsamplingFactor(), config_.use_bbpe);
 
     ans.is_final = is_final;
     ans.segment = s->GetWavSegment();

diff --git a/sherpa/cpp_api/online-recognizer.h b/sherpa/cpp_api/online-recognizer.h
@@ -69,6 +69,9 @@ struct OnlineRecognizerConfig {
   // In number of frames after subsampling
   int32_t chunk_size = 12;
 
+  // True if the model used is trained with byte level bpe.
+  bool use_bbpe = false;
+
   // temperature for the softmax in the joiner
   float temperature = 1.0;
 

diff --git a/sherpa/csrc/CMakeLists.txt b/sherpa/csrc/CMakeLists.txt
@@ -1,5 +1,6 @@
 # Please sort the filenames alphabetically
 set(sherpa_srcs
+  byte_util.cc
   context-graph.cc
   fbank-features.cc
   file-utils.cc
@@ -66,6 +67,7 @@ if(SHERPA_ENABLE_TESTS)
     # test-offline-conformer-transducer-model.cc
     # test-online-conv-emformer-transducer-model.cc
 
+    test-byte-util.cc
     test-context-graph.cc
     test-hypothesis.cc
     test-log.cc