Skip to content

Commit

Permalink
Support text normalization (#55)
Browse files Browse the repository at this point in the history
* Support text normalization

* Release v1.7.7

* Fix typos

* minor fixes

* Add ccache to CI
  • Loading branch information
csukuangfj authored Nov 4, 2023
1 parent 6c2035f commit 0872dbc
Show file tree
Hide file tree
Showing 16 changed files with 221 additions and 4 deletions.
9 changes: 9 additions & 0 deletions .github/workflows/build-doc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ jobs:
with:
fetch-depth: 0

- name: ccache
uses: hendrikmuhs/ccache-action@v1.2
with:
key: ${{ matrix.os }}-${{ matrix.python-version }}

- name: Setup Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
Expand All @@ -58,6 +63,10 @@ jobs:
- name: Build doc
shell: bash
run: |
export CMAKE_CXX_COMPILER_LAUNCHER=ccache
export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH"
cmake --version
cd docs
python3 -m pip install -r ./requirements.txt
Expand Down
9 changes: 9 additions & 0 deletions .github/workflows/build-pip.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,11 @@ jobs:
steps:
- uses: actions/checkout@v2

- name: ccache
uses: hendrikmuhs/ccache-action@v1.2
with:
key: ${{ matrix.os }}-${{ matrix.python-version }}

# see https://github.com/microsoft/setup-msbuild
- name: Add msbuild to PATH
if: startsWith(matrix.os, 'windows')
Expand All @@ -52,6 +57,10 @@ jobs:
- name: Build
shell: bash
run: |
export CMAKE_CXX_COMPILER_LAUNCHER=ccache
export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH"
cmake --version
python3 -m pip install -U pip wheel numpy
python3 setup.py bdist_wheel
ls -lh dist
Expand Down
9 changes: 9 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,11 @@ jobs:
with:
fetch-depth: 0

- name: ccache
uses: hendrikmuhs/ccache-action@v1.2
with:
key: ${{ matrix.os }}-${{ matrix.python-version }}

- name: Setup Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
Expand All @@ -58,6 +63,10 @@ jobs:
- name: Build
shell: bash
run: |
export CMAKE_CXX_COMPILER_LAUNCHER=ccache
export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH"
cmake --version
mkdir build
cd build
cmake -DKALDIFST_BUILD_TESTS=ON ..
Expand Down
9 changes: 9 additions & 0 deletions .github/workflows/nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ jobs:
with:
fetch-depth: 0

- name: ccache
uses: hendrikmuhs/ccache-action@v1.2
with:
key: ${{ matrix.os }}-${{ matrix.python-version }}

# see https://github.com/microsoft/setup-msbuild
- name: Add msbuild to PATH
if: startsWith(matrix.os, 'windows')
Expand All @@ -57,6 +62,10 @@ jobs:
- name: Install kaldifst
shell: bash
run: |
export CMAKE_CXX_COMPILER_LAUNCHER=ccache
export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH"
cmake --version
pip3 install --verbose -U kaldifst
- name: Display kaldifst version
Expand Down
9 changes: 9 additions & 0 deletions .github/workflows/test-pip-install.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@ jobs:
with:
fetch-depth: 0

- name: ccache
uses: hendrikmuhs/ccache-action@v1.2
with:
key: ${{ matrix.os }}-${{ matrix.python-version }}

- name: Setup Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
Expand All @@ -47,6 +52,10 @@ jobs:
- name: Install kaldifst
shell: bash
run: |
export CMAKE_CXX_COMPILER_LAUNCHER=ccache
export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH"
cmake --version
pip3 install --verbose kaldifst
- name: Run test
Expand Down
10 changes: 7 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR)

project(kaldifst CXX)

set(KALDIFST_VERSION "1.7.6")
set(KALDIFST_VERSION "1.7.7")

if(NOT CMAKE_BUILD_TYPE)
message(STATUS "No CMAKE_BUILD_TYPE given, default to Release")
Expand Down Expand Up @@ -52,12 +52,16 @@ set(CMAKE_INSTALL_RPATH ${kaldifst_rpath_origin})
set(CMAKE_BUILD_RPATH ${kaldifst_rpath_origin})


set(CMAKE_CXX_STANDARD 14 CACHE STRING "The C++ version to be used.")
if(NOT CMAKE_CXX_STANDARD)
set(CMAKE_CXX_STANDARD 14 CACHE STRING "The C++ version to be used.")
endif()

message(STATUS "C++ Standard version: ${CMAKE_CXX_STANDARD}")
set(CMAKE_CXX_EXTENSIONS OFF)

list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules)
list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake)


if(WIN32)
set(disabled_warnings
/wd4018
Expand Down
1 change: 1 addition & 0 deletions kaldifst/csrc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ set(srcs
kaldi-semaphore.cc
kaldi-table.cc
parse-options.cc
text-normalizer.cc
text-utils.cc
)

Expand Down
15 changes: 15 additions & 0 deletions kaldifst/csrc/kaldi-fst-io.cc
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,21 @@ VectorFst<StdArc> *CastOrConvertToVectorFst(Fst<StdArc> *fst) {
}
}

// Returns 'fst' viewed as a ConstFst<StdArc>.
//
// Only FSTs whose Type() is "const" or "vector" are accepted; any other type
// trips the assertion. A "const" FST is returned via dynamic_cast (same
// object). A "vector" FST is copied into a newly allocated ConstFst<StdArc>
// and the input 'fst' is deleted, so in that case only the returned pointer
// may be used afterwards.
ConstFst<StdArc> *CastOrConvertToConstFst(Fst<StdArc> *fst) {
  const std::string real_type = fst->Type();
  KALDIFST_ASSERT(real_type == "vector" || real_type == "const");

  if (real_type != "const") {
    // 'fst' is not a ConstFst, so build a ConstFst<StdArc> from it and
    // release the input object.
    ConstFst<StdArc> *converted = new ConstFst<StdArc>(*fst);
    delete fst;
    return converted;
  }

  // Already a const FST: hand back the same object with its concrete type.
  return dynamic_cast<ConstFst<StdArc> *>(fst);
}

void ReadFstKaldi(std::string rxfilename, fst::StdVectorFst *ofst) {
fst::StdVectorFst *fst = ReadFstKaldi(rxfilename);
*ofst = *fst;
Expand Down
2 changes: 2 additions & 0 deletions kaldifst/csrc/kaldi-fst-io.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ Fst<StdArc> *ReadFstKaldiGeneric(std::string rxfilename,
// initialized by 'fst'), prints a warning, and deletes 'fst'.
VectorFst<StdArc> *CastOrConvertToVectorFst(Fst<StdArc> *fst);

// Counterpart of CastOrConvertToVectorFst() for ConstFst<StdArc>.
// The FST type must be "const" or "vector" (checked with an assertion).
// If 'fst' is of type "const", it is returned via dynamic_cast. Otherwise a
// new ConstFst<StdArc> initialized by 'fst' is returned and 'fst' is deleted,
// so callers must use only the returned pointer afterwards.
ConstFst<StdArc> *CastOrConvertToConstFst(Fst<StdArc> *fst);

// Version of ReadFstKaldi() that writes to a pointer. Assumes
// the FST is binary with no binary marker. Crashes on error.
void ReadFstKaldi(std::string rxfilename, VectorFst<StdArc> *ofst);
Expand Down
80 changes: 80 additions & 0 deletions kaldifst/csrc/text-normalizer.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// kaldifst/csrc/text-normalizer.cc
//
// Copyright (c) 2023 Xiaomi Corporation

#include "kaldifst/csrc/text-normalizer.h"

#include <memory>
#include <string>
#include <utility>

#include "fst/arcsort.h"
#include "kaldifst/csrc/kaldi-fst-io.h"
#include "kaldifst/csrc/table-matcher.h"

namespace fst {

// Bitwise OR of the FST property flags that StringToFst() (defined later in
// this file) passes to SetProperties() for the linear FST it builds from a
// string. The same value is used for both arguments of that call.
//
// This variable is copied from
// https://github.com/pzelasko/Pynini/blob/master/src/stringcompile.h#L81
constexpr uint64_t kCompiledStringProps =
kAcceptor | kIDeterministic | kODeterministic | kILabelSorted |
kOLabelSorted | kUnweighted | kAcyclic | kInitialAcyclic | kTopSorted |
kAccessible | kCoAccessible | kString | kUnweightedCycles;
}  // namespace fst

namespace kaldifst {

// Builds a linear FST from `text`: one arc per byte, in order, where each
// arc uses the byte value as both its input and output label and carries
// weight One(). The last state is final with weight One().
//
// We don't use StringCompiler<StdArc> here since it treats bytes as
// signed integers.
//
// @param text  The bytes to encode. An empty string yields a single state
//              that is both the start state and the final state.
// @return The resulting FST with fst::kCompiledStringProps set on it.
static fst::StdVectorFst StringToFst(const std::string &text) {
  using Weight = typename fst::StdArc::Weight;
  using Arc = fst::StdArc;

  fst::StdVectorFst ans;
  // The FST has one start state plus one state per byte, i.e.
  // text.size() + 1 states in total.
  ans.ReserveStates(text.size() + 1);

  auto s = ans.AddState();
  ans.SetStart(s);
  // CAUTION(fangjun): We need to use uint8_t here, so that every byte maps
  // to a non-negative label even on platforms where char is signed.
  for (const uint8_t label : text) {
    const auto nextstate = ans.AddState();
    ans.AddArc(s, Arc(label, label, Weight::One(), nextstate));
    s = nextstate;
  }

  ans.SetFinal(s, Weight::One());
  ans.SetProperties(fst::kCompiledStringProps, fst::kCompiledStringProps);

  return ans;
}

// Loads the rule FST from the file `rule` and stores it as a ConstFst.
TextNormalizer::TextNormalizer(const std::string &rule) {
  // Read whatever FST type is stored in the file, then cast/convert it to
  // the const representation kept by this class.
  fst::Fst<fst::StdArc> *generic_fst = fst::ReadFstKaldiGeneric(rule);
  rule_.reset(CastOrConvertToConstFst(generic_fst));
}

TextNormalizer::TextNormalizer(std::unique_ptr<fst::StdConstFst> rule)
: rule_(std::move(rule)) {}

std::string TextNormalizer::Normalize(const std::string &s) const {
// Step 1: Convert the input text into an FST
fst::StdVectorFst text = StringToFst(s);

// Step 2: Compose the input text with the rule FST
fst::StdVectorFst composed_fst;
fst::Compose(text, *rule_, &composed_fst);

// Step 3: Get the best path from the composed FST
fst::StdVectorFst one_best;
fst::ShortestPath(composed_fst, &one_best, 1);

// Step 4: Concatenate the output labels of the best path
fst::StringPrinter<fst::StdArc> string_printer(fst::StringTokenType::BYTE);

std::string normalized;
string_printer(one_best, &normalized);

return normalized;
}

} // namespace kaldifst
31 changes: 31 additions & 0 deletions kaldifst/csrc/text-normalizer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
// kaldifst/csrc/text-normalizer.h
//
// Copyright (c) 2023 Xiaomi Corporation

#ifndef KALDIFST_CSRC_TEXT_NORMALIZER_H_
#define KALDIFST_CSRC_TEXT_NORMALIZER_H_

#include <memory>
#include <string>

#include "fst/fst.h"
#include "fst/fstlib.h"

namespace kaldifst {

// Rewrites text by applying a rule FST to it: the input is composed with the
// rule and the output labels of the best path form the result.
class TextNormalizer {
public:
// Loads the rule FST from a file.
//
// @param rule  Path to rule.fst
explicit TextNormalizer(const std::string &rule);

// Uses an already loaded rule FST and takes ownership of it.
explicit TextNormalizer(std::unique_ptr<fst::StdConstFst> rule);

// Returns the normalized form of `s`. The rule FST is not modified.
std::string Normalize(const std::string &s) const;

private:
// The rule FST applied by Normalize().
std::unique_ptr<fst::StdConstFst> rule_;
};

} // namespace kaldifst

#endif // KALDIFST_CSRC_TEXT_NORMALIZER_H_
1 change: 1 addition & 0 deletions kaldifst/python/csrc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ pybind11_add_module(_kaldifst
shortest-path.cc
symbol-table.cc
table-matcher.cc
text-normalizer.cc
vector-fst.cc
)
target_link_libraries(_kaldifst PRIVATE kaldifst_core fstscript)
Expand Down
2 changes: 2 additions & 0 deletions kaldifst/python/csrc/kaldifst.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include "kaldifst/python/csrc/shortest-path.h"
#include "kaldifst/python/csrc/symbol-table.h"
#include "kaldifst/python/csrc/table-matcher.h"
#include "kaldifst/python/csrc/text-normalizer.h"
#include "kaldifst/python/csrc/vector-fst.h"

namespace kaldifst {
Expand Down Expand Up @@ -69,6 +70,7 @@ PYBIND11_MODULE(_kaldifst, m) {
PybindKaldiTable(m);
PybindLatticeUtils(&m);
PybindShortestPath(&m);
PybindTextNormalizer(&m);
}

} // namespace kaldifst
21 changes: 21 additions & 0 deletions kaldifst/python/csrc/text-normalizer.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// kaldifst/python/csrc/text-normalizer.cc
//
// Copyright (c) 2023 Xiaomi Corporation

#include "kaldifst/csrc/text-normalizer.h"

#include <string>

#include "kaldifst/python/csrc/text-normalizer.h"

namespace kaldifst {

// Exposes kaldifst::TextNormalizer to Python as the class "TextNormalizer"
// on module `*m`. Normalization is reachable both as `normalize(...)` and by
// calling the object directly.
void PybindTextNormalizer(py::module *m) {
  py::class_<TextNormalizer> normalizer(*m, "TextNormalizer");

  // Constructor taking the path of the rule FST.
  normalizer.def(py::init<const std::string &>(), py::arg("rule"));

  // Two Python names bound to the same C++ method.
  normalizer.def("normalize", &TextNormalizer::Normalize);
  normalizer.def("__call__", &TextNormalizer::Normalize);
}

} // namespace kaldifst
14 changes: 14 additions & 0 deletions kaldifst/python/csrc/text-normalizer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// kaldifst/python/csrc/text-normalizer.h
//
// Copyright (c) 2023 Xiaomi Corporation
#ifndef KALDIFST_PYTHON_CSRC_TEXT_NORMALIZER_H_
#define KALDIFST_PYTHON_CSRC_TEXT_NORMALIZER_H_

#include "kaldifst/python/csrc/kaldifst.h"
namespace kaldifst {

// Registers the Python binding of TextNormalizer on module `*m`.
void PybindTextNormalizer(py::module *m);

}  // namespace kaldifst

#endif // KALDIFST_PYTHON_CSRC_TEXT_NORMALIZER_H_
3 changes: 2 additions & 1 deletion kaldifst/python/kaldifst/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
StdFst,
StdVectorFst,
SymbolTable,
TextNormalizer,
TropicalWeight,
add_self_loops,
arcsort,
Expand All @@ -30,8 +31,8 @@
plus,
reverse,
rmepsilon,
times,
shortest_path,
times,
)

from .iterator import ArcIterator, StateIterator
Expand Down

0 comments on commit 0872dbc

Please sign in to comment.