diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-gigaspeech-2023-12-12-int8.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-gigaspeech-2023-12-12-int8.txt new file mode 100644 index 000000000..a14dfa330 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-gigaspeech-2023-12-12-int8.txt @@ -0,0 +1,20 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/tokens.txt --encoder=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/encoder-epoch-30-avg-1.int8.onnx --decoder=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/decoder-epoch-30-avg-1.onnx --joiner=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/joiner-epoch-30-avg-1.int8.onnx ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1089-134686-0001.wav ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1221-135766-0001.wav ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1221-135766-0002.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-zipformer-gigaspeech-2023-12-12/encoder-epoch-30-avg-1.int8.onnx", decoder_filename="./sherpa-onnx-zipformer-gigaspeech-2023-12-12/decoder-epoch-30-avg-1.onnx", joiner_filename="./sherpa-onnx-zipformer-gigaspeech-2023-12-12/joiner-epoch-30-avg-1.int8.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), tokens="./sherpa-onnx-zipformer-gigaspeech-2023-12-12/tokens.txt", num_threads=2, debug=False, provider="cpu", model_type=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5) +Creating recognizer ... +Started +Done! 
+ +./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1089-134686-0001.wav +{"text": " AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS", "timestamps": [0.00, 0.36, 0.52, 0.68, 0.96, 1.00, 1.08, 1.28, 1.40, 1.48, 1.60, 1.76, 1.80, 1.88, 1.92, 2.00, 2.20, 2.32, 2.36, 2.48, 2.60, 2.80, 2.84, 2.92, 3.12, 3.32, 3.56, 3.76, 4.04, 4.24, 4.32, 4.40, 4.56, 4.80, 4.92, 5.08, 5.36, 5.48, 5.64, 5.72, 5.88, 6.04, 6.24], "tokens":[" AFTER", " E", "AR", "LY", " ", "N", "IGHT", "F", "AL", "L", " THE", " ", "Y", "E", "LL", "OW", " LA", "M", "P", "S", " WOULD", " ", "L", "IGHT", " UP", " HERE", " AND", " THERE", " THE", " S", "QU", "AL", "ID", " QU", "AR", "TER", " OF", " THE", " B", "RO", "TH", "EL", "S"]} +---- +./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1221-135766-0001.wav +{"text": " GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN", "timestamps": [0.00, 0.16, 0.40, 0.68, 0.84, 0.96, 1.08, 1.12, 1.32, 1.52, 1.68, 1.76, 2.00, 2.12, 2.28, 2.40, 2.64, 2.92, 3.20, 3.32, 3.52, 3.64, 3.76, 3.96, 4.12, 4.36, 4.52, 4.72, 4.92, 5.16, 5.40, 5.64, 5.76, 5.88, 6.12, 6.28, 6.52, 6.84, 7.08, 7.32, 7.60, 7.92, 8.12, 8.24, 8.36, 8.48, 8.64, 8.76, 8.88, 9.12, 9.32, 9.48, 9.56, 9.60, 9.76, 10.00, 10.12, 10.20, 10.44, 10.68, 10.80, 11.00, 11.20, 11.36, 11.52, 11.76, 12.00, 12.12, 12.24, 12.28, 12.52, 12.72, 12.84, 12.96, 13.04, 13.24, 13.44, 13.64, 13.76, 14.00, 14.08, 14.24, 14.52, 14.68, 14.80, 15.00, 15.04, 15.28, 15.48, 15.76, 16.00, 16.12, 16.16, 16.32], "tokens":[" GO", "D", " AS", " A", " DI", "RE", "C", "T", " CON", "SE", "QU", "ENCE", " OF", " THE", " S", "IN", " WHICH", " MAN", " TH", "US", " P", "UN", "ISH", "ED", " HAD", " GIVE", "N", " HER", " A", " LOVE", "LY", " CHI", "L", "D", " WHO", "SE", " PLACE", " WAS", " ON", " THAT", " SAME", " DIS", "HO", "N", "OR", "ED", " BO", "S", "OM", " TO", " CON", "NE", "C", "T", " HER", " PA", "R", "ENT", " FOR", " E", "VER", " WITH", " THE", " RA", "CE", " AND", " DE", "S", "C", "ENT", " OF", " MO", "R", "T", "AL", "S", " AND", " TO", " BE", " F", "IN", "ALLY", " A", " B", "LES", "S", "ED", " SO", "UL", " IN", " HE", "A", "VE", "N"]} +---- +./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1221-135766-0002.wav +{"text": " YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION", "timestamps": [0.00, 0.04, 0.12, 0.40, 0.68, 0.88, 0.96, 1.12, 1.24, 1.32, 1.44, 1.48, 1.64, 1.76, 1.88, 2.04, 2.16, 2.28, 2.32, 2.52, 2.68, 2.72, 2.88, 3.12, 3.32, 3.52, 3.80, 4.00, 4.16, 4.24, 4.40, 4.48], "tokens":[" ", "Y", "ET", " THESE", " THOUGH", "T", "S", " A", "FF", "E", "C", "TED", " HE", "S", "TER", " P", "RY", "N", "NE", " LE", "S", "S", " WITH", " HO", "PE", " THAN", " APP", "RE", "HE", "N", "S", "ION"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 1.101 s +Real time factor (RTF): 1.101 / 28.165 = 0.039 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-gigaspeech-2023-12-12.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-gigaspeech-2023-12-12.txt new file mode 100644 index 000000000..dc01d36e0 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-gigaspeech-2023-12-12.txt @@ -0,0 +1,20 @@ 
+/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/tokens.txt --encoder=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/encoder-epoch-30-avg-1.onnx --decoder=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/decoder-epoch-30-avg-1.onnx --joiner=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/joiner-epoch-30-avg-1.onnx ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1089-134686-0001.wav ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1221-135766-0001.wav ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1221-135766-0002.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-zipformer-gigaspeech-2023-12-12/encoder-epoch-30-avg-1.onnx", decoder_filename="./sherpa-onnx-zipformer-gigaspeech-2023-12-12/decoder-epoch-30-avg-1.onnx", joiner_filename="./sherpa-onnx-zipformer-gigaspeech-2023-12-12/joiner-epoch-30-avg-1.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), tokens="./sherpa-onnx-zipformer-gigaspeech-2023-12-12/tokens.txt", num_threads=2, debug=False, provider="cpu", model_type=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5) +Creating recognizer ... +Started +Done! 
+ +./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1089-134686-0001.wav +{"text": " AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS", "timestamps": [0.00, 0.36, 0.52, 0.68, 0.96, 1.00, 1.08, 1.28, 1.40, 1.48, 1.60, 1.76, 1.80, 1.88, 1.92, 2.00, 2.20, 2.32, 2.36, 2.48, 2.60, 2.80, 2.84, 2.92, 3.12, 3.32, 3.56, 3.76, 4.04, 4.20, 4.32, 4.40, 4.56, 4.80, 4.92, 5.08, 5.36, 5.48, 5.64, 5.72, 5.88, 6.04, 6.24], "tokens":[" AFTER", " E", "AR", "LY", " ", "N", "IGHT", "F", "AL", "L", " THE", " ", "Y", "E", "LL", "OW", " LA", "M", "P", "S", " WOULD", " ", "L", "IGHT", " UP", " HERE", " AND", " THERE", " THE", " S", "QU", "AL", "ID", " QU", "AR", "TER", " OF", " THE", " B", "RO", "TH", "EL", "S"]} +---- +./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1221-135766-0001.wav +{"text": " GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN", "timestamps": [0.00, 0.16, 0.40, 0.68, 0.84, 0.96, 1.04, 1.12, 1.32, 1.52, 1.68, 1.76, 2.00, 2.12, 2.28, 2.40, 2.64, 2.92, 3.20, 3.32, 3.52, 3.64, 3.76, 3.96, 4.12, 4.36, 4.52, 4.72, 4.92, 5.16, 5.40, 5.64, 5.76, 5.88, 6.12, 6.28, 6.48, 6.84, 7.08, 7.32, 7.60, 7.92, 8.12, 8.24, 8.36, 8.48, 8.64, 8.76, 8.88, 9.12, 9.32, 9.48, 9.56, 9.60, 9.76, 10.00, 10.12, 10.20, 10.44, 10.68, 10.80, 11.00, 11.20, 11.36, 11.52, 11.76, 12.00, 12.12, 12.24, 12.28, 12.52, 12.72, 12.84, 12.96, 13.04, 13.24, 13.40, 13.64, 13.76, 14.00, 14.08, 14.24, 14.52, 14.68, 14.80, 15.00, 15.04, 15.28, 15.52, 15.76, 16.00, 16.12, 16.20, 16.32], "tokens":[" GO", "D", " AS", " A", " DI", "RE", "C", "T", " CON", "SE", "QU", "ENCE", " OF", " THE", " S", "IN", " WHICH", " MAN", " TH", "US", " P", "UN", "ISH", "ED", " HAD", " GIVE", "N", " HER", " A", " LOVE", "LY", " CHI", "L", "D", " WHO", "SE", " PLACE", " WAS", " ON", " THAT", " SAME", " DIS", "HO", "N", "OR", "ED", " BO", "S", "OM", " TO", " CON", "NE", "C", "T", " HER", " PA", "R", "ENT", " FOR", " E", "VER", " WITH", " THE", " RA", "CE", " AND", " DE", "S", "C", "ENT", " OF", " MO", "R", "T", "AL", "S", " AND", " TO", " BE", " F", "IN", "ALLY", " A", " B", "LES", "S", "ED", " SO", "UL", " IN", " HE", "A", "VE", "N"]} +---- +./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1221-135766-0002.wav +{"text": " YET THESE THOUGHTS AFFECTED HESTER PRYNE LESS WITH HOPE THAN APPREHENSION", "timestamps": [0.00, 0.04, 0.12, 0.40, 0.68, 0.88, 0.96, 1.12, 1.20, 1.32, 1.44, 1.48, 1.64, 1.76, 1.88, 2.04, 2.16, 2.28, 2.52, 2.68, 2.72, 2.88, 3.12, 3.28, 3.52, 3.80, 4.00, 4.16, 4.24, 4.40, 4.48], "tokens":[" ", "Y", "ET", " THESE", " THOUGH", "T", "S", " A", "FF", "E", "C", "TED", " HE", "S", "TER", " P", "RY", "NE", " LE", "S", "S", " WITH", " HO", "PE", " THAN", " APP", "RE", "HE", "N", "S", "ION"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 1.407 s +Real time factor (RTF): 1.407 / 28.165 = 0.050 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/zipformer-transducer-models.rst b/docs/source/onnx/pretrained_models/offline-transducer/zipformer-transducer-models.rst index 5b24901e5..74f94fe24 100644 --- a/docs/source/onnx/pretrained_models/offline-transducer/zipformer-transducer-models.rst +++ b/docs/source/onnx/pretrained_models/offline-transducer/zipformer-transducer-models.rst @@ -8,6 +8,131 @@ Zipformer-transducer-based Models Please refer to 
:ref:`install_sherpa_onnx` to install `sherpa-onnx` before you read this section. + +sherpa-onnx-zipformer-gigaspeech-2023-12-12 (English) +----------------------------------------------------- + +Training code for this model is ``_. +It supports only English since it is trained on the `GigaSpeech`_ dataset. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-gigaspeech-2023-12-12.tar.bz2 + tar xf sherpa-onnx-zipformer-gigaspeech-2023-12-12.tar.bz2 + ls -lh sherpa-onnx-zipformer-gigaspeech-2023-12-12 + +You should see the following output: + +.. code-block:: bash + + $ ls -lh sherpa-onnx-zipformer-gigaspeech-2023-12-12 + total 656184 + -rw-r--r-- 1 fangjun staff 28B Dec 12 19:00 README.md + -rw-r--r-- 1 fangjun staff 239K Dec 12 19:00 bpe.model + -rw-r--r-- 1 fangjun staff 528K Dec 12 19:00 decoder-epoch-30-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 2.0M Dec 12 19:00 decoder-epoch-30-avg-1.onnx + -rw-r--r-- 1 fangjun staff 68M Dec 12 19:00 encoder-epoch-30-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 249M Dec 12 19:00 encoder-epoch-30-avg-1.onnx + -rw-r--r-- 1 fangjun staff 253K Dec 12 19:00 joiner-epoch-30-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 1.0M Dec 12 19:00 joiner-epoch-30-avg-1.onnx + drwxr-xr-x 5 fangjun staff 160B Dec 12 19:00 test_wavs + -rw-r--r-- 1 fangjun staff 4.9K Dec 12 19:00 tokens.txt + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/encoder-epoch-30-avg-1.onnx \ + --decoder=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/decoder-epoch-30-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/joiner-epoch-30-avg-1.onnx \ + ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1089-134686-0001.wav \ + ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1221-135766-0001.wav \ + ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1221-135766-0002.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-zipformer-gigaspeech-2023-12-12.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/encoder-epoch-30-avg-1.int8.onnx \ + --decoder=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/decoder-epoch-30-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/joiner-epoch-30-avg-1.int8.onnx \ + ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1089-134686-0001.wav \ + ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1221-135766-0001.wav \ + ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1221-135766-0002.wav + +.. 
note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-zipformer-gigaspeech-2023-12-12-int8.txt + +Speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone-offline \ + --tokens=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/encoder-epoch-30-avg-1.int8.onnx \ + --decoder=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/decoder-epoch-30-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/joiner-epoch-30-avg-1.int8.onnx + +Speech recognition from a microphone with VAD +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + + ./build/bin/sherpa-onnx-vad-microphone-offline-asr \ + --silero-vad-model=./silero_vad.onnx \ + --tokens=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/encoder-epoch-30-avg-1.int8.onnx \ + --decoder=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/decoder-epoch-30-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/joiner-epoch-30-avg-1.int8.onnx + zrjin/sherpa-onnx-zipformer-multi-zh-hans-2023-9-2 (Chinese) ------------------------------------------------------------ diff --git a/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12-int8.txt b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12-int8.txt new file mode 100644 index 000000000..da09a3e4f --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12-int8.txt @@ -0,0 +1,8 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx --tokens=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/tokens.txt --encoder=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/encoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx --decoder=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/decoder-epoch-20-avg-1-chunk-16-left-128.onnx --joiner=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/joiner-epoch-20-avg-1-chunk-16-left-128.int8.onnx ./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/test_wavs/DEV_T0000000000.wav + +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineModelConfig(transducer=OnlineTransducerModelConfig(encoder="./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/encoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx", decoder="./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/decoder-epoch-20-avg-1-chunk-16-left-128.onnx", joiner="./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/joiner-epoch-20-avg-1-chunk-16-left-128.int8.onnx"), paraformer=OnlineParaformerModelConfig(encoder="", decoder=""), wenet_ctc=OnlineWenetCtcModelConfig(model="", chunk_size=16, num_left_chunks=4), tokens="./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type=""), lm_config=OnlineLMConfig(model="", scale=0.5), 
endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, hotwords_score=1.5, hotwords_file="", decoding_method="greedy_search") +./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/test_wavs/DEV_T0000000000.wav +Elapsed seconds: 0.5, Real time factor (RTF): 0.088 + 对我做了介绍那么我想说的是大家如果对我的研究感兴趣 +{"is_final":false, "segment":0, "start_time":0.00, "text": " 对我做了介绍那么我想说的是大家如果对我的研究感兴趣", "timestamps": [0.32, 0.64, 0.76, 0.84, 1.04, 1.24, 1.96, 2.04, 2.24, 2.36, 2.56, 2.68, 2.88, 3.28, 3.40, 3.60, 3.72, 3.84, 3.96, 4.04, 4.16, 4.28, 4.36, 4.60, 4.72], "tokens":[" 对", "我", "做", "了", "介", "绍", "那", "么", "我", "想", "说", "的", "是", "大", "家", "如", "果", "对", "我", "的", "研", "究", "感", "兴", "趣"]} + diff --git a/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12.txt b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12.txt new file mode 100644 index 000000000..62e0520e9 --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12.txt @@ -0,0 +1,8 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx --tokens=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/tokens.txt --encoder=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/encoder-epoch-20-avg-1-chunk-16-left-128.onnx --decoder=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/decoder-epoch-20-avg-1-chunk-16-left-128.onnx --joiner=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/joiner-epoch-20-avg-1-chunk-16-left-128.onnx ./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/test_wavs/DEV_T0000000000.wav + +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineModelConfig(transducer=OnlineTransducerModelConfig(encoder="./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/encoder-epoch-20-avg-1-chunk-16-left-128.onnx", decoder="./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/decoder-epoch-20-avg-1-chunk-16-left-128.onnx", joiner="./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/joiner-epoch-20-avg-1-chunk-16-left-128.onnx"), paraformer=OnlineParaformerModelConfig(encoder="", decoder=""), wenet_ctc=OnlineWenetCtcModelConfig(model="", chunk_size=16, num_left_chunks=4), tokens="./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type=""), lm_config=OnlineLMConfig(model="", scale=0.5), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, hotwords_score=1.5, hotwords_file="", decoding_method="greedy_search") +./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/test_wavs/DEV_T0000000000.wav +Elapsed seconds: 0.65, Real time factor (RTF): 0.12 + 
对我做了介绍那么我想说的是大家如果对我的研究感兴趣 +{"is_final":false, "segment":0, "start_time":0.00, "text": " 对我做了介绍那么我想说的是大家如果对我的研究感兴趣", "timestamps": [0.32, 0.64, 0.76, 0.84, 1.08, 1.24, 1.96, 2.04, 2.24, 2.36, 2.56, 2.68, 2.80, 3.28, 3.40, 3.60, 3.72, 3.84, 3.96, 4.04, 4.16, 4.28, 4.36, 4.60, 4.72], "tokens":[" 对", "我", "做", "了", "介", "绍", "那", "么", "我", "想", "说", "的", "是", "大", "家", "如", "果", "对", "我", "的", "研", "究", "感", "兴", "趣"]} + diff --git a/docs/source/onnx/pretrained_models/online-transducer/zipformer-transducer-models.rst b/docs/source/onnx/pretrained_models/online-transducer/zipformer-transducer-models.rst index 9e4fff54e..b55127b5c 100644 --- a/docs/source/onnx/pretrained_models/online-transducer/zipformer-transducer-models.rst +++ b/docs/source/onnx/pretrained_models/online-transducer/zipformer-transducer-models.rst @@ -8,6 +8,142 @@ Zipformer-transducer-based Models Please refer to :ref:`install_sherpa_onnx` to install `sherpa-onnx`_ before you read this section. +sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12 (Chinese) +------------------------------------------------------------------ + +Training code for this model can be found at ``_. +It supports only Chinese. + +Please refer to ``_ +for the detailed information about the training data. In total, there are 14k hours of training data. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12.tar.bz2 + tar xf sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12.tar.bz2 + rm sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12.tar.bz2 + ls -lh sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12 + +The output is given below: + +.. code-block:: + + $ ls -lh sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12 + total 668864 + -rw-r--r-- 1 fangjun staff 28B Dec 12 18:59 README.md + -rw-r--r-- 1 fangjun staff 131B Dec 12 18:59 bpe.model + -rw-r--r-- 1 fangjun staff 1.2M Dec 12 18:59 decoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx + -rw-r--r-- 1 fangjun staff 4.9M Dec 12 18:59 decoder-epoch-20-avg-1-chunk-16-left-128.onnx + -rw-r--r-- 1 fangjun staff 67M Dec 12 18:59 encoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx + -rw-r--r-- 1 fangjun staff 249M Dec 12 18:59 encoder-epoch-20-avg-1-chunk-16-left-128.onnx + -rw-r--r-- 1 fangjun staff 1.0M Dec 12 18:59 joiner-epoch-20-avg-1-chunk-16-left-128.int8.onnx + -rw-r--r-- 1 fangjun staff 3.9M Dec 12 18:59 joiner-epoch-20-avg-1-chunk-16-left-128.onnx + drwxr-xr-x 8 fangjun staff 256B Dec 12 18:59 test_wavs + -rw-r--r-- 1 fangjun staff 18K Dec 12 18:59 tokens.txt + +Decode a single wave file +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode a wave file: + +.. 
code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/encoder-epoch-20-avg-1-chunk-16-left-128.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/decoder-epoch-20-avg-1-chunk-16-left-128.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/joiner-epoch-20-avg-1-chunk-16-left-128.onnx \ + ./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/test_wavs/DEV_T0000000000.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode a wave file: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/encoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/decoder-epoch-20-avg-1-chunk-16-left-128.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/joiner-epoch-20-avg-1-chunk-16-left-128.int8.onnx \ + ./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/test_wavs/DEV_T0000000000.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12-int8.txt + +Real-time speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone \ + --tokens=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/encoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/decoder-epoch-20-avg-1-chunk-16-left-128.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/joiner-epoch-20-avg-1-chunk-16-left-128.int8.onnx + +.. hint:: + + If your system is Linux (including embedded Linux), you can also use + :ref:`sherpa-onnx-alsa` to do real-time speech recognition with your + microphone if ``sherpa-onnx-microphone`` does not work for you. + .. _sherpa-onnx-wenetspeech-2023-06-15-streaming:
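
Both new model pages above note that decoding expects wave files with a single channel and 16-bit encoded samples (the sampling rate itself does not need to be 16 kHz). If your recordings are in some other format, convert them first. The following is a minimal sketch that assumes ``sox`` is installed; the input file name is a placeholder, and any tool that writes 16-bit single-channel WAV works just as well.

.. code-block:: bash

   # Convert a stereo recording to the 16-bit, single-channel WAV
   # format expected by sherpa-onnx-offline / sherpa-onnx.
   # "stereo-input.wav" is a placeholder for your own file.
   sox stereo-input.wav -c 1 -b 16 -e signed-integer mono-output.wav

   # Inspect the header of the converted file. sherpa-onnx resamples
   # internally, so the sample rate shown here need not be 16000 Hz.
   soxi mono-output.wav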
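
The decoding logs above report ``num threads: 2`` and ``decoding method: greedy_search``, which mirror the ``num_threads`` and ``decoding_method`` fields of the printed recognizer config. If you want to experiment with other values, the sketch below shows the idea; the flag names ``--num-threads`` and ``--decoding-method`` are assumptions derived from those config fields, so please verify them with ``--help`` before relying on them.

.. code-block:: bash

   cd /path/to/sherpa-onnx

   # Print all supported command-line options (assumed to be available
   # via --help, as with other kaldi-style ParseOptions binaries).
   ./build/bin/sherpa-onnx-offline --help

   # Hypothetical example: use 4 threads and modified_beam_search
   # instead of the defaults shown in the logs above.
   ./build/bin/sherpa-onnx-offline \
     --num-threads=4 \
     --decoding-method=modified_beam_search \
     --tokens=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/tokens.txt \
     --encoder=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/encoder-epoch-30-avg-1.onnx \
     --decoder=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/decoder-epoch-30-avg-1.onnx \
     --joiner=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/joiner-epoch-30-avg-1.onnx \
     ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1089-134686-0001.wav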