From 1efa6c98c1c3975388161b9f8dc0e97110eb47eb Mon Sep 17 00:00:00 2001 From: flatsiedatsie Date: Sat, 3 Feb 2024 14:43:24 +0100 Subject: [PATCH 1/3] Add --server option This runs the core of the project in a loop until the process receives a termination signal. --- src/cpp/main.cpp | 253 ++++++++++++++++++++++++++--------------------- 1 file changed, 140 insertions(+), 113 deletions(-) diff --git a/src/cpp/main.cpp b/src/cpp/main.cpp index bd750066..aee4d83c 100644 --- a/src/cpp/main.cpp +++ b/src/cpp/main.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -36,6 +37,8 @@ using namespace std; using json = nlohmann::json; +bool running = true; + enum OutputType { OUTPUT_FILE, OUTPUT_DIRECTORY, OUTPUT_STDOUT, OUTPUT_RAW }; struct RunConfig { @@ -88,8 +91,16 @@ struct RunConfig { // true to use CUDA execution provider bool useCuda = false; + + // true to keep running continuously, parsing any input on demand + bool server = false; }; +void signalHandler( int signum ) { + cout << "Interrupt signal (" << signum << ") received.\n"; + running = false; + exit(signum); +} void parseArgs(int argc, char *argv[], RunConfig &runConfig); void rawOutputProc(vector &sharedAudioBuffer, mutex &mutAudio, condition_variable &cvAudio, bool &audioReady, @@ -100,6 +111,8 @@ void rawOutputProc(vector &sharedAudioBuffer, mutex &mutAudio, int main(int argc, char *argv[]) { spdlog::set_default_logger(spdlog::stderr_color_st("piper")); + signal(SIGINT, signalHandler); + RunConfig runConfig; parseArgs(argc, argv, runConfig); @@ -228,138 +241,148 @@ int main(int argc, char *argv[]) { string line; piper::SynthesisResult result; - while (getline(cin, line)) { - auto outputType = runConfig.outputType; - auto speakerId = voice.synthesisConfig.speakerId; - std::optional maybeOutputPath = runConfig.outputPath; - - if (runConfig.jsonInput) { - // Each line is a JSON object - json lineRoot = json::parse(line); - - // Text is required - line = lineRoot["text"].get(); - - if (lineRoot.contains("output_file")) { - // Override output WAV file path - outputType = OUTPUT_FILE; - maybeOutputPath = - filesystem::path(lineRoot["output_file"].get()); - } + while (running){ + + while (getline(cin, line)) { + auto outputType = runConfig.outputType; + auto speakerId = voice.synthesisConfig.speakerId; + std::optional maybeOutputPath = runConfig.outputPath; + + if (runConfig.jsonInput) { + // Each line is a JSON object + json lineRoot = json::parse(line); + + // Text is required + line = lineRoot["text"].get(); + + if (lineRoot.contains("output_file")) { + // Override output WAV file path + outputType = OUTPUT_FILE; + maybeOutputPath = + filesystem::path(lineRoot["output_file"].get()); + } - if (lineRoot.contains("speaker_id")) { - // Override speaker id - voice.synthesisConfig.speakerId = - lineRoot["speaker_id"].get(); - } else if (lineRoot.contains("speaker")) { - // Resolve to id using speaker id map - auto speakerName = lineRoot["speaker"].get(); - if ((voice.modelConfig.speakerIdMap) && - (voice.modelConfig.speakerIdMap->count(speakerName) > 0)) { + if (lineRoot.contains("speaker_id")) { + // Override speaker id voice.synthesisConfig.speakerId = - (*voice.modelConfig.speakerIdMap)[speakerName]; - } else { - spdlog::warn("No speaker named: {}", speakerName); + lineRoot["speaker_id"].get(); + } else if (lineRoot.contains("speaker")) { + // Resolve to id using speaker id map + auto speakerName = lineRoot["speaker"].get(); + if ((voice.modelConfig.speakerIdMap) && + 
(voice.modelConfig.speakerIdMap->count(speakerName) > 0)) { + voice.synthesisConfig.speakerId = + (*voice.modelConfig.speakerIdMap)[speakerName]; + } else { + spdlog::warn("No speaker named: {}", speakerName); + } } } - } - // Timestamp is used for path to output WAV file - const auto now = chrono::system_clock::now(); - const auto timestamp = - chrono::duration_cast(now.time_since_epoch()) - .count(); - - if (outputType == OUTPUT_DIRECTORY) { - // Generate path using timestamp - stringstream outputName; - outputName << timestamp << ".wav"; - filesystem::path outputPath = runConfig.outputPath.value(); - outputPath.append(outputName.str()); - - // Output audio to automatically-named WAV file in a directory - ofstream audioFile(outputPath.string(), ios::binary); - piper::textToWavFile(piperConfig, voice, line, audioFile, result); - cout << outputPath.string() << endl; - } else if (outputType == OUTPUT_FILE) { - if (!maybeOutputPath || maybeOutputPath->empty()) { - throw runtime_error("No output path provided"); - } - - filesystem::path outputPath = maybeOutputPath.value(); - - if (!runConfig.jsonInput) { - // Read all of standard input before synthesizing. - // Otherwise, we would overwrite the output file for each line. - stringstream text; - text << line; - while (getline(cin, line)) { - text << " " << line; + // Timestamp is used for path to output WAV file + const auto now = chrono::system_clock::now(); + const auto timestamp = + chrono::duration_cast(now.time_since_epoch()) + .count(); + + if (outputType == OUTPUT_DIRECTORY) { + // Generate path using timestamp + stringstream outputName; + outputName << timestamp << ".wav"; + filesystem::path outputPath = runConfig.outputPath.value(); + outputPath.append(outputName.str()); + + // Output audio to automatically-named WAV file in a directory + ofstream audioFile(outputPath.string(), ios::binary); + piper::textToWavFile(piperConfig, voice, line, audioFile, result); + cout << outputPath.string() << endl; + } else if (outputType == OUTPUT_FILE) { + if (!maybeOutputPath || maybeOutputPath->empty()) { + throw runtime_error("No output path provided"); } - line = text.str(); - } + filesystem::path outputPath = maybeOutputPath.value(); - // Output audio to WAV file - ofstream audioFile(outputPath.string(), ios::binary); - piper::textToWavFile(piperConfig, voice, line, audioFile, result); - cout << outputPath.string() << endl; - } else if (outputType == OUTPUT_STDOUT) { - // Output WAV to stdout - piper::textToWavFile(piperConfig, voice, line, cout, result); - } else if (outputType == OUTPUT_RAW) { - // Raw output to stdout - mutex mutAudio; - condition_variable cvAudio; - bool audioReady = false; - bool audioFinished = false; - vector audioBuffer; - vector sharedAudioBuffer; + if (!runConfig.jsonInput) { + // Read all of standard input before synthesizing. + // Otherwise, we would overwrite the output file for each line. 
+ stringstream text; + text << line; + while (getline(cin, line)) { + text << " " << line; + } -#ifdef _WIN32 - // Needed on Windows to avoid terminal conversions - setmode(fileno(stdout), O_BINARY); - setmode(fileno(stdin), O_BINARY); -#endif + line = text.str(); + } - thread rawOutputThread(rawOutputProc, ref(sharedAudioBuffer), - ref(mutAudio), ref(cvAudio), ref(audioReady), - ref(audioFinished)); - auto audioCallback = [&audioBuffer, &sharedAudioBuffer, &mutAudio, - &cvAudio, &audioReady]() { - // Signal thread that audio is ready + // Output audio to WAV file + ofstream audioFile(outputPath.string(), ios::binary); + piper::textToWavFile(piperConfig, voice, line, audioFile, result); + cout << outputPath.string() << endl; + } else if (outputType == OUTPUT_STDOUT) { + // Output WAV to stdout + piper::textToWavFile(piperConfig, voice, line, cout, result); + } else if (outputType == OUTPUT_RAW) { + // Raw output to stdout + mutex mutAudio; + condition_variable cvAudio; + bool audioReady = false; + bool audioFinished = false; + vector audioBuffer; + vector sharedAudioBuffer; + + #ifdef _WIN32 + // Needed on Windows to avoid terminal conversions + setmode(fileno(stdout), O_BINARY); + setmode(fileno(stdin), O_BINARY); + #endif + + thread rawOutputThread(rawOutputProc, ref(sharedAudioBuffer), + ref(mutAudio), ref(cvAudio), ref(audioReady), + ref(audioFinished)); + auto audioCallback = [&audioBuffer, &sharedAudioBuffer, &mutAudio, + &cvAudio, &audioReady]() { + // Signal thread that audio is ready + { + unique_lock lockAudio(mutAudio); + copy(audioBuffer.begin(), audioBuffer.end(), + back_inserter(sharedAudioBuffer)); + audioReady = true; + cvAudio.notify_one(); + } + }; + piper::textToAudio(piperConfig, voice, line, audioBuffer, result, + audioCallback); + + // Signal thread that there is no more audio { unique_lock lockAudio(mutAudio); - copy(audioBuffer.begin(), audioBuffer.end(), - back_inserter(sharedAudioBuffer)); audioReady = true; + audioFinished = true; cvAudio.notify_one(); } - }; - piper::textToAudio(piperConfig, voice, line, audioBuffer, result, - audioCallback); - - // Signal thread that there is no more audio - { - unique_lock lockAudio(mutAudio); - audioReady = true; - audioFinished = true; - cvAudio.notify_one(); - } - // Wait for audio output to finish - spdlog::info("Waiting for audio to finish playing..."); - rawOutputThread.join(); - } + // Wait for audio output to finish + spdlog::info("Waiting for audio to finish playing..."); + rawOutputThread.join(); + } - spdlog::info("Real-time factor: {} (infer={} sec, audio={} sec)", - result.realTimeFactor, result.inferSeconds, - result.audioSeconds); + spdlog::info("Real-time factor: {} (infer={} sec, audio={} sec)", + result.realTimeFactor, result.inferSeconds, + result.audioSeconds); - // Restore config (--json-input) - voice.synthesisConfig.speakerId = speakerId; + // Restore config (--json-input) + voice.synthesisConfig.speakerId = speakerId; - } // for each line + } // for each line + + if (!runConfig.server) { + break; + } + + sleep(0.001); + } + piper::terminate(piperConfig); @@ -440,6 +463,8 @@ void printUsage(char *argv[]) { << endl; cerr << " --use-cuda use CUDA execution provider" << endl; + cerr << " --server Keep running until closed" + << endl; cerr << " --debug print DEBUG messages to the console" << endl; cerr << " -q --quiet disable logging" << endl; @@ -525,6 +550,8 @@ void parseArgs(int argc, char *argv[], RunConfig &runConfig) { runConfig.jsonInput = true; } else if (arg == "--use_cuda" || arg == "--use-cuda") { 
runConfig.useCuda = true; + } else if (arg == "--server") { + runConfig.server = true; } else if (arg == "--version") { std::cout << piper::getVersion() << std::endl; exit(0); From 5c46e311ada662f8e8d0815d8b9323759ab04fdc Mon Sep 17 00:00:00 2001 From: flatsiedatsie Date: Thu, 15 Feb 2024 20:42:14 +0100 Subject: [PATCH 2/3] Add files via upload --- main.cpp | 607 ++++++++++++++++++++++++++++++++++++++++++++++++++ piper.cpp | 644 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ piper.hpp | 133 +++++++++++ 3 files changed, 1384 insertions(+) create mode 100644 main.cpp create mode 100644 piper.cpp create mode 100644 piper.hpp diff --git a/main.cpp b/main.cpp new file mode 100644 index 00000000..614853f8 --- /dev/null +++ b/main.cpp @@ -0,0 +1,607 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _MSC_VER +#define WIN32_LEAN_AND_MEAN +#define NOMINMAX +#include +#endif + +#ifdef _WIN32 +#include +#include +#endif + +#ifdef __APPLE__ +#include +#endif + +#include +#include + +#include "json.hpp" +#include "piper.hpp" + +using namespace std; +using json = nlohmann::json; + +bool running = true; + +enum OutputType { OUTPUT_FILE, OUTPUT_DIRECTORY, OUTPUT_STDOUT, OUTPUT_RAW }; + +struct RunConfig { + // Path to .onnx voice file + filesystem::path modelPath; + + // Path to JSON voice config file + filesystem::path modelConfigPath; + + // Type of output to produce. + // Default is to write a WAV file in the current directory. + OutputType outputType = OUTPUT_DIRECTORY; + + // Path for output + optional outputPath = filesystem::path("."); + + // Numerical id of the default speaker (multi-speaker voices) + optional speakerId; + + // Amount of noise to add during audio generation + optional noiseScale; + + // Audio output volume + //optional speakerId; + optional volumeLevel; + + // Speed of speaking (1 = normal, < 1 is faster, > 1 is slower) + optional lengthScale; + + // Variation in phoneme lengths + optional noiseW; + + // Seconds of silence to add after each sentence + optional sentenceSilenceSeconds; + + // Path to espeak-ng data directory (default is next to piper executable) + optional eSpeakDataPath; + + // Path to libtashkeel ort model + // https://github.com/mush42/libtashkeel/ + optional tashkeelModelPath; + + // stdin input is lines of JSON instead of text with format: + // { + // "text": str, (required) + // "speaker_id": int, (optional) + // "speaker": str, (optional) + // "output_file": str, (optional) + // } + bool jsonInput = false; + + // Seconds of extra silence to insert after a single phoneme + optional> phonemeSilenceSeconds; + + // true to use CUDA execution provider + bool useCuda = false; + + // true to keep running continuously, parsing any input on demand + bool server = false; +}; + +void signalHandler( int signum ) { + cout << "Interrupt signal (" << signum << ") received.\n"; + running = false; + exit(signum); +} +void parseArgs(int argc, char *argv[], RunConfig &runConfig); +void rawOutputProc(vector &sharedAudioBuffer, mutex &mutAudio, + condition_variable &cvAudio, bool &audioReady, + bool &audioFinished); + +// ---------------------------------------------------------------------------- + +int main(int argc, char *argv[]) { + spdlog::set_default_logger(spdlog::stderr_color_st("piper")); + + signal(SIGINT, signalHandler); + + RunConfig runConfig; + parseArgs(argc, argv, runConfig); + +#ifdef _WIN32 + // Required on Windows to show IPA symbols + 
SetConsoleOutputCP(CP_UTF8); +#endif + + piper::PiperConfig piperConfig; + piper::Voice voice; + + spdlog::debug("Loading voice from {} (config={})", + runConfig.modelPath.string(), + runConfig.modelConfigPath.string()); + + auto startTime = chrono::steady_clock::now(); + loadVoice(piperConfig, runConfig.modelPath.string(), + runConfig.modelConfigPath.string(), voice, runConfig.speakerId, + runConfig.useCuda); + auto endTime = chrono::steady_clock::now(); + spdlog::info("Loaded voice in {} second(s)", + chrono::duration(endTime - startTime).count()); + + // Get the path to the piper executable so we can locate espeak-ng-data, etc. + // next to it. +#ifdef _MSC_VER + auto exePath = []() { + wchar_t moduleFileName[MAX_PATH] = {0}; + GetModuleFileNameW(nullptr, moduleFileName, std::size(moduleFileName)); + return filesystem::path(moduleFileName); + }(); +#else +#ifdef __APPLE__ + auto exePath = []() { + char moduleFileName[PATH_MAX] = {0}; + uint32_t moduleFileNameSize = std::size(moduleFileName); + _NSGetExecutablePath(moduleFileName, &moduleFileNameSize); + return filesystem::path(moduleFileName); + }(); +#else + auto exePath = filesystem::canonical("/proc/self/exe"); +#endif +#endif + + if (voice.phonemizeConfig.phonemeType == piper::eSpeakPhonemes) { + spdlog::debug("Voice uses eSpeak phonemes ({})", + voice.phonemizeConfig.eSpeak.voice); + + if (runConfig.eSpeakDataPath) { + // User provided path + piperConfig.eSpeakDataPath = runConfig.eSpeakDataPath.value().string(); + } else { + // Assume next to piper executable + piperConfig.eSpeakDataPath = + std::filesystem::absolute( + exePath.parent_path().append("espeak-ng-data")) + .string(); + + spdlog::debug("espeak-ng-data directory is expected at {}", + piperConfig.eSpeakDataPath); + } + } else { + // Not using eSpeak + piperConfig.useESpeak = false; + } + + // Enable libtashkeel for Arabic + if (voice.phonemizeConfig.eSpeak.voice == "ar") { + piperConfig.useTashkeel = true; + if (runConfig.tashkeelModelPath) { + // User provided path + piperConfig.tashkeelModelPath = + runConfig.tashkeelModelPath.value().string(); + } else { + // Assume next to piper executable + piperConfig.tashkeelModelPath = + std::filesystem::absolute( + exePath.parent_path().append("libtashkeel_model.ort")) + .string(); + + spdlog::debug("libtashkeel model is expected at {}", + piperConfig.tashkeelModelPath.value()); + } + } + + piper::initialize(piperConfig); + + // Scales + if (runConfig.noiseScale) { + voice.synthesisConfig.noiseScale = runConfig.noiseScale.value(); + } + + if (runConfig.volumeLevel) { + voice.synthesisConfig.volumeLevel = runConfig.volumeLevel.value(); + } + + if (runConfig.lengthScale) { + voice.synthesisConfig.lengthScale = runConfig.lengthScale.value(); + } + + if (runConfig.noiseW) { + voice.synthesisConfig.noiseW = runConfig.noiseW.value(); + } + + if (runConfig.sentenceSilenceSeconds) { + voice.synthesisConfig.sentenceSilenceSeconds = + runConfig.sentenceSilenceSeconds.value(); + } + + if (runConfig.phonemeSilenceSeconds) { + if (!voice.synthesisConfig.phonemeSilenceSeconds) { + // Overwrite + voice.synthesisConfig.phonemeSilenceSeconds = + runConfig.phonemeSilenceSeconds; + } else { + // Merge + for (const auto &[phoneme, silenceSeconds] : + *runConfig.phonemeSilenceSeconds) { + voice.synthesisConfig.phonemeSilenceSeconds->try_emplace( + phoneme, silenceSeconds); + } + } + + } // if phonemeSilenceSeconds + + if (runConfig.outputType == OUTPUT_DIRECTORY) { + runConfig.outputPath = filesystem::absolute(runConfig.outputPath.value()); + 
spdlog::info("Output directory: {}", runConfig.outputPath.value().string()); + } + + string line; + piper::SynthesisResult result; + while (running){ + + while (getline(cin, line)) { + auto outputType = runConfig.outputType; + auto speakerId = voice.synthesisConfig.speakerId; + auto volumeLevel = voice.synthesisConfig.volumeLevel; + std::optional maybeOutputPath = runConfig.outputPath; + + if (runConfig.jsonInput) { + // Each line is a JSON object + json lineRoot = json::parse(line); + + // Text is required + line = lineRoot["text"].get(); + + if (lineRoot.contains("output_file")) { + // Override output WAV file path + outputType = OUTPUT_FILE; + maybeOutputPath = + filesystem::path(lineRoot["output_file"].get()); + } + + if (lineRoot.contains("speaker_id")) { + // Override speaker id + voice.synthesisConfig.speakerId = + lineRoot["speaker_id"].get(); + } else if (lineRoot.contains("volume_level")) { + // Override volume level + voice.synthesisConfig.volumeLevel = + std::stof(lineRoot["volume_level"].get()); + } else if (lineRoot.contains("speaker")) { + // Resolve to id using speaker id map + auto speakerName = lineRoot["speaker"].get(); + if ((voice.modelConfig.speakerIdMap) && + (voice.modelConfig.speakerIdMap->count(speakerName) > 0)) { + voice.synthesisConfig.speakerId = + (*voice.modelConfig.speakerIdMap)[speakerName]; + } else { + spdlog::warn("No speaker named: {}", speakerName); + } + } + } + + // Timestamp is used for path to output WAV file + const auto now = chrono::system_clock::now(); + const auto timestamp = + chrono::duration_cast(now.time_since_epoch()) + .count(); + + if (outputType == OUTPUT_DIRECTORY) { + // Generate path using timestamp + stringstream outputName; + outputName << timestamp << ".wav"; + filesystem::path outputPath = runConfig.outputPath.value(); + outputPath.append(outputName.str()); + + // Output audio to automatically-named WAV file in a directory + ofstream audioFile(outputPath.string(), ios::binary); + piper::textToWavFile(piperConfig, voice, line, audioFile, result); + cout << outputPath.string() << endl; + } else if (outputType == OUTPUT_FILE) { + if (!maybeOutputPath || maybeOutputPath->empty()) { + throw runtime_error("No output path provided"); + } + + filesystem::path outputPath = maybeOutputPath.value(); + + if (!runConfig.jsonInput) { + // Read all of standard input before synthesizing. + // Otherwise, we would overwrite the output file for each line. 
+ stringstream text; + text << line; + while (getline(cin, line)) { + text << " " << line; + } + + line = text.str(); + } + + // Output audio to WAV file + ofstream audioFile(outputPath.string(), ios::binary); + piper::textToWavFile(piperConfig, voice, line, audioFile, result); + cout << outputPath.string() << endl; + } else if (outputType == OUTPUT_STDOUT) { + // Output WAV to stdout + piper::textToWavFile(piperConfig, voice, line, cout, result); + } else if (outputType == OUTPUT_RAW) { + // Raw output to stdout + mutex mutAudio; + condition_variable cvAudio; + bool audioReady = false; + bool audioFinished = false; + vector audioBuffer; + vector sharedAudioBuffer; + + #ifdef _WIN32 + // Needed on Windows to avoid terminal conversions + setmode(fileno(stdout), O_BINARY); + setmode(fileno(stdin), O_BINARY); + #endif + + thread rawOutputThread(rawOutputProc, ref(sharedAudioBuffer), + ref(mutAudio), ref(cvAudio), ref(audioReady), + ref(audioFinished)); + auto audioCallback = [&audioBuffer, &sharedAudioBuffer, &mutAudio, + &cvAudio, &audioReady]() { + // Signal thread that audio is ready + { + unique_lock lockAudio(mutAudio); + copy(audioBuffer.begin(), audioBuffer.end(), + back_inserter(sharedAudioBuffer)); + audioReady = true; + cvAudio.notify_one(); + } + }; + piper::textToAudio(piperConfig, voice, line, audioBuffer, result, + audioCallback); + + // Signal thread that there is no more audio + { + unique_lock lockAudio(mutAudio); + audioReady = true; + audioFinished = true; + cvAudio.notify_one(); + } + + // Wait for audio output to finish + spdlog::info("Waiting for audio to finish playing..."); + rawOutputThread.join(); + } + + spdlog::info("Real-time factor: {} (infer={} sec, audio={} sec)", + result.realTimeFactor, result.inferSeconds, + result.audioSeconds); + + // Restore config (--json-input) + voice.synthesisConfig.speakerId = speakerId; + voice.synthesisConfig.volumeLevel = volumeLevel; + + } // for each line + + if (!runConfig.server) { + break; + } + + sleep(0.001); + } + + + piper::terminate(piperConfig); + + return EXIT_SUCCESS; +} + +// ---------------------------------------------------------------------------- + +void rawOutputProc(vector &sharedAudioBuffer, mutex &mutAudio, + condition_variable &cvAudio, bool &audioReady, + bool &audioFinished) { + vector internalAudioBuffer; + while (true) { + { + unique_lock lockAudio{mutAudio}; + cvAudio.wait(lockAudio, [&audioReady] { return audioReady; }); + + if (sharedAudioBuffer.empty() && audioFinished) { + break; + } + + copy(sharedAudioBuffer.begin(), sharedAudioBuffer.end(), + back_inserter(internalAudioBuffer)); + + sharedAudioBuffer.clear(); + + if (!audioFinished) { + audioReady = false; + } + } + + cout.write((const char *)internalAudioBuffer.data(), + sizeof(int16_t) * internalAudioBuffer.size()); + cout.flush(); + internalAudioBuffer.clear(); + } + +} // rawOutputProc + +// ---------------------------------------------------------------------------- + +void printUsage(char *argv[]) { + cerr << endl; + cerr << "usage: " << argv[0] << " [options]" << endl; + cerr << endl; + cerr << "options:" << endl; + cerr << " -h --help show this message and exit" << endl; + cerr << " -m FILE --model FILE path to onnx model file" << endl; + cerr << " -c FILE --config FILE path to model config file " + "(default: model path + .json)" + << endl; + cerr << " -f FILE --output_file FILE path to output WAV file ('-' for " + "stdout)" + << endl; + cerr << " -d DIR --output_dir DIR path to output directory (default: " + "cwd)" + << endl; + cerr << 
" --output_raw output raw audio to stdout as it " + "becomes available" + << endl; + cerr << " -s NUM --speaker NUM id of speaker (default: 0)" << endl; + cerr << " --noise_scale NUM generator noise (default: 0.667)" + << endl; + cerr << " -v NUM --volume NUM Volume level (default: 1.0)" + << endl; + cerr << " --length_scale NUM phoneme length (default: 1.0)" + << endl; + cerr << " --noise_w NUM phoneme width noise (default: 0.8)" + << endl; + cerr << " --sentence_silence NUM seconds of silence after each " + "sentence (default: 0.2)" + << endl; + cerr << " --espeak_data DIR path to espeak-ng data directory" + << endl; + cerr << " --tashkeel_model FILE path to libtashkeel onnx model " + "(arabic)" + << endl; + cerr << " --json-input stdin input is lines of JSON " + "instead of plain text" + << endl; + cerr << " --use-cuda use CUDA execution provider" + << endl; + cerr << " --server Keep running until closed" + << endl; + cerr << " --debug print DEBUG messages to the console" + << endl; + cerr << " -q --quiet disable logging" << endl; + cerr << endl; +} + +void ensureArg(int argc, char *argv[], int argi) { + if ((argi + 1) >= argc) { + printUsage(argv); + exit(0); + } +} + +// Parse command-line arguments +void parseArgs(int argc, char *argv[], RunConfig &runConfig) { + optional modelConfigPath; + + for (int i = 1; i < argc; i++) { + std::string arg = argv[i]; + + if (arg == "-m" || arg == "--model") { + ensureArg(argc, argv, i); + runConfig.modelPath = filesystem::path(argv[++i]); + } else if (arg == "-c" || arg == "--config") { + ensureArg(argc, argv, i); + modelConfigPath = filesystem::path(argv[++i]); + } else if (arg == "-f" || arg == "--output_file" || + arg == "--output-file") { + ensureArg(argc, argv, i); + std::string filePath = argv[++i]; + if (filePath == "-") { + runConfig.outputType = OUTPUT_STDOUT; + runConfig.outputPath = nullopt; + } else { + runConfig.outputType = OUTPUT_FILE; + runConfig.outputPath = filesystem::path(filePath); + } + } else if (arg == "-d" || arg == "--output_dir" || arg == "output-dir") { + ensureArg(argc, argv, i); + runConfig.outputType = OUTPUT_DIRECTORY; + runConfig.outputPath = filesystem::path(argv[++i]); + } else if (arg == "--output_raw" || arg == "--output-raw") { + runConfig.outputType = OUTPUT_RAW; + } else if (arg == "-s" || arg == "--speaker") { + ensureArg(argc, argv, i); + runConfig.speakerId = (piper::SpeakerId)stol(argv[++i]); + } else if (arg == "--noise_scale" || arg == "--noise-scale") { + ensureArg(argc, argv, i); + runConfig.noiseScale = stof(argv[++i]); + } else if (arg == "-v" || arg == "--volume") { + ensureArg(argc, argv, i); + runConfig.volumeLevel = stof(argv[++i]); + } else if (arg == "--length_scale" || arg == "--length-scale") { + ensureArg(argc, argv, i); + runConfig.lengthScale = stof(argv[++i]); + } else if (arg == "--noise_w" || arg == "--noise-w") { + ensureArg(argc, argv, i); + runConfig.noiseW = stof(argv[++i]); + } else if (arg == "--sentence_silence" || arg == "--sentence-silence") { + ensureArg(argc, argv, i); + runConfig.sentenceSilenceSeconds = stof(argv[++i]); + } else if (arg == "--phoneme_silence" || arg == "--phoneme-silence") { + ensureArg(argc, argv, i); + ensureArg(argc, argv, i + 1); + auto phonemeStr = std::string(argv[++i]); + if (!piper::isSingleCodepoint(phonemeStr)) { + std::cerr << "Phoneme '" << phonemeStr + << "' is not a single codepoint (--phoneme_silence)" + << std::endl; + exit(1); + } + + if (!runConfig.phonemeSilenceSeconds) { + runConfig.phonemeSilenceSeconds.emplace(); + } + + auto phoneme 
= piper::getCodepoint(phonemeStr); + (*runConfig.phonemeSilenceSeconds)[phoneme] = stof(argv[++i]); + } else if (arg == "--espeak_data" || arg == "--espeak-data") { + ensureArg(argc, argv, i); + runConfig.eSpeakDataPath = filesystem::path(argv[++i]); + } else if (arg == "--tashkeel_model" || arg == "--tashkeel-model") { + ensureArg(argc, argv, i); + runConfig.tashkeelModelPath = filesystem::path(argv[++i]); + } else if (arg == "--json_input" || arg == "--json-input") { + runConfig.jsonInput = true; + } else if (arg == "--use_cuda" || arg == "--use-cuda") { + runConfig.useCuda = true; + } else if (arg == "--server") { + runConfig.server = true; + } else if (arg == "--version") { + std::cout << piper::getVersion() << std::endl; + exit(0); + } else if (arg == "--debug") { + // Set DEBUG logging + spdlog::set_level(spdlog::level::debug); + } else if (arg == "-q" || arg == "--quiet") { + // diable logging + spdlog::set_level(spdlog::level::off); + } else if (arg == "-h" || arg == "--help") { + printUsage(argv); + exit(0); + } + } + + // Verify model file exists + ifstream modelFile(runConfig.modelPath.c_str(), ios::binary); + if (!modelFile.good()) { + throw runtime_error("Model file doesn't exist"); + } + + if (!modelConfigPath) { + runConfig.modelConfigPath = + filesystem::path(runConfig.modelPath.string() + ".json"); + } else { + runConfig.modelConfigPath = modelConfigPath.value(); + } + + // Verify model config exists + ifstream modelConfigFile(runConfig.modelConfigPath.c_str()); + if (!modelConfigFile.good()) { + throw runtime_error("Model config doesn't exist"); + } +} diff --git a/piper.cpp b/piper.cpp new file mode 100644 index 00000000..21e3e246 --- /dev/null +++ b/piper.cpp @@ -0,0 +1,644 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "json.hpp" +#include "piper.hpp" +#include "utf8.h" +#include "wavfile.hpp" + +namespace piper { + +#ifdef _PIPER_VERSION +// https://stackoverflow.com/questions/47346133/how-to-use-a-define-inside-a-format-string +#define _STR(x) #x +#define STR(x) _STR(x) +const std::string VERSION = STR(_PIPER_VERSION); +#else +const std::string VERSION = ""; +#endif + +// Maximum value for 16-bit signed WAV sample +const float MAX_WAV_VALUE = 32767.0f; + +const std::string instanceName{"piper"}; + +std::string getVersion() { return VERSION; } + +// True if the string is a single UTF-8 codepoint +bool isSingleCodepoint(std::string s) { + return utf8::distance(s.begin(), s.end()) == 1; +} + +// Get the first UTF-8 codepoint of a string +Phoneme getCodepoint(std::string s) { + utf8::iterator character_iter(s.begin(), s.begin(), s.end()); + return *character_iter; +} + +// Load JSON config information for phonemization +void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) { + // { + // "espeak": { + // "voice": "" + // }, + // "phoneme_type": "", + // "phoneme_map": { + // "": ["", "", ...] + // }, + // "phoneme_id_map": { + // "": [, , ...] + // } + // } + + if (configRoot.contains("espeak")) { + auto espeakValue = configRoot["espeak"]; + if (espeakValue.contains("voice")) { + phonemizeConfig.eSpeak.voice = espeakValue["voice"].get(); + } + } + + if (configRoot.contains("phoneme_type")) { + auto phonemeTypeStr = configRoot["phoneme_type"].get(); + if (phonemeTypeStr == "text") { + phonemizeConfig.phonemeType = TextPhonemes; + } + } + + // phoneme to [id] map + // Maps phonemes to one or more phoneme ids (required). 
+ if (configRoot.contains("phoneme_id_map")) { + auto phonemeIdMapValue = configRoot["phoneme_id_map"]; + for (auto &fromPhonemeItem : phonemeIdMapValue.items()) { + std::string fromPhoneme = fromPhonemeItem.key(); + if (!isSingleCodepoint(fromPhoneme)) { + std::stringstream idsStr; + for (auto &toIdValue : fromPhonemeItem.value()) { + PhonemeId toId = toIdValue.get(); + idsStr << toId << ","; + } + + spdlog::error("\"{}\" is not a single codepoint (ids={})", fromPhoneme, + idsStr.str()); + throw std::runtime_error( + "Phonemes must be one codepoint (phoneme id map)"); + } + + auto fromCodepoint = getCodepoint(fromPhoneme); + for (auto &toIdValue : fromPhonemeItem.value()) { + PhonemeId toId = toIdValue.get(); + phonemizeConfig.phonemeIdMap[fromCodepoint].push_back(toId); + } + } + } + + // phoneme to [phoneme] map + // Maps phonemes to one or more other phonemes (not normally used). + if (configRoot.contains("phoneme_map")) { + if (!phonemizeConfig.phonemeMap) { + phonemizeConfig.phonemeMap.emplace(); + } + + auto phonemeMapValue = configRoot["phoneme_map"]; + for (auto &fromPhonemeItem : phonemeMapValue.items()) { + std::string fromPhoneme = fromPhonemeItem.key(); + if (!isSingleCodepoint(fromPhoneme)) { + spdlog::error("\"{}\" is not a single codepoint", fromPhoneme); + throw std::runtime_error( + "Phonemes must be one codepoint (phoneme map)"); + } + + auto fromCodepoint = getCodepoint(fromPhoneme); + for (auto &toPhonemeValue : fromPhonemeItem.value()) { + std::string toPhoneme = toPhonemeValue.get(); + if (!isSingleCodepoint(toPhoneme)) { + throw std::runtime_error( + "Phonemes must be one codepoint (phoneme map)"); + } + + auto toCodepoint = getCodepoint(toPhoneme); + (*phonemizeConfig.phonemeMap)[fromCodepoint].push_back(toCodepoint); + } + } + } + +} /* parsePhonemizeConfig */ + +// Load JSON config for audio synthesis +void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) { + // { + // "audio": { + // "sample_rate": 22050 + // }, + // "inference": { + // "noise_scale": 0.667, + // "length_scale": 1, + // "noise_w": 0.8, + // "phoneme_silence": { + // "": , + // ... 
+ // } + // } + // } + + if (configRoot.contains("audio")) { + auto audioValue = configRoot["audio"]; + if (audioValue.contains("sample_rate")) { + // Default sample rate is 22050 Hz + synthesisConfig.sampleRate = audioValue.value("sample_rate", 22050); + } + } + + if (configRoot.contains("inference")) { + // Overrides default inference settings + auto inferenceValue = configRoot["inference"]; + if (inferenceValue.contains("noise_scale")) { + synthesisConfig.noiseScale = inferenceValue.value("noise_scale", 0.667f); + } + + if (inferenceValue.contains("volume_level")) { + synthesisConfig.volumeLevel = inferenceValue.value("volume_level", 1.0f); + } + + if (inferenceValue.contains("length_scale")) { + synthesisConfig.lengthScale = inferenceValue.value("length_scale", 1.0f); + } + + if (inferenceValue.contains("noise_w")) { + synthesisConfig.noiseW = inferenceValue.value("noise_w", 0.8f); + } + + if (inferenceValue.contains("phoneme_silence")) { + // phoneme -> seconds of silence to add after + synthesisConfig.phonemeSilenceSeconds.emplace(); + auto phonemeSilenceValue = inferenceValue["phoneme_silence"]; + for (auto &phonemeItem : phonemeSilenceValue.items()) { + std::string phonemeStr = phonemeItem.key(); + if (!isSingleCodepoint(phonemeStr)) { + spdlog::error("\"{}\" is not a single codepoint", phonemeStr); + throw std::runtime_error( + "Phonemes must be one codepoint (phoneme silence)"); + } + + auto phoneme = getCodepoint(phonemeStr); + (*synthesisConfig.phonemeSilenceSeconds)[phoneme] = + phonemeItem.value().get(); + } + + } // if phoneme_silence + + } // if inference + +} /* parseSynthesisConfig */ + +void parseModelConfig(json &configRoot, ModelConfig &modelConfig) { + + modelConfig.numSpeakers = configRoot["num_speakers"].get(); + + if (configRoot.contains("speaker_id_map")) { + if (!modelConfig.speakerIdMap) { + modelConfig.speakerIdMap.emplace(); + } + + auto speakerIdMapValue = configRoot["speaker_id_map"]; + for (auto &speakerItem : speakerIdMapValue.items()) { + std::string speakerName = speakerItem.key(); + (*modelConfig.speakerIdMap)[speakerName] = + speakerItem.value().get(); + } + } + +} /* parseModelConfig */ + +void initialize(PiperConfig &config) { + if (config.useESpeak) { + // Set up espeak-ng for calling espeak_TextToPhonemesWithTerminator + // See: https://github.com/rhasspy/espeak-ng + spdlog::debug("Initializing eSpeak"); + int result = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, + /*buflength*/ 0, + /*path*/ config.eSpeakDataPath.c_str(), + /*options*/ 0); + if (result < 0) { + throw std::runtime_error("Failed to initialize eSpeak-ng"); + } + + spdlog::debug("Initialized eSpeak"); + } + + // Load onnx model for libtashkeel + // https://github.com/mush42/libtashkeel/ + if (config.useTashkeel) { + spdlog::debug("Using libtashkeel for diacritization"); + if (!config.tashkeelModelPath) { + throw std::runtime_error("No path to libtashkeel model"); + } + + spdlog::debug("Loading libtashkeel model from {}", + config.tashkeelModelPath.value()); + config.tashkeelState = std::make_unique(); + tashkeel::tashkeel_load(config.tashkeelModelPath.value(), + *config.tashkeelState); + spdlog::debug("Initialized libtashkeel"); + } + + spdlog::info("Initialized piper"); +} + +void terminate(PiperConfig &config) { + if (config.useESpeak) { + // Clean up espeak-ng + spdlog::debug("Terminating eSpeak"); + espeak_Terminate(); + spdlog::debug("Terminated eSpeak"); + } + + spdlog::info("Terminated piper"); +} + +void loadModel(std::string modelPath, ModelSession &session, bool useCuda) { + 
spdlog::debug("Loading onnx model from {}", modelPath); + session.env = Ort::Env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING, + instanceName.c_str()); + session.env.DisableTelemetryEvents(); + + if (useCuda) { + // Use CUDA provider + OrtCUDAProviderOptions cuda_options{}; + cuda_options.cudnn_conv_algo_search = OrtCudnnConvAlgoSearchHeuristic; + session.options.AppendExecutionProvider_CUDA(cuda_options); + } + + // Slows down performance by ~2x + // session.options.SetIntraOpNumThreads(1); + + // Roughly doubles load time for no visible inference benefit + // session.options.SetGraphOptimizationLevel( + // GraphOptimizationLevel::ORT_ENABLE_EXTENDED); + + session.options.SetGraphOptimizationLevel( + GraphOptimizationLevel::ORT_DISABLE_ALL); + + // Slows down performance very slightly + // session.options.SetExecutionMode(ExecutionMode::ORT_PARALLEL); + + session.options.DisableCpuMemArena(); + session.options.DisableMemPattern(); + session.options.DisableProfiling(); + + auto startTime = std::chrono::steady_clock::now(); + +#ifdef _WIN32 + auto modelPathW = std::wstring(modelPath.begin(), modelPath.end()); + auto modelPathStr = modelPathW.c_str(); +#else + auto modelPathStr = modelPath.c_str(); +#endif + + session.onnx = Ort::Session(session.env, modelPathStr, session.options); + + auto endTime = std::chrono::steady_clock::now(); + spdlog::debug("Loaded onnx model in {} second(s)", + std::chrono::duration(endTime - startTime).count()); +} + +// Load Onnx model and JSON config file +void loadVoice(PiperConfig &config, std::string modelPath, + std::string modelConfigPath, Voice &voice, + std::optional &speakerId, bool useCuda) { + spdlog::debug("Parsing voice config at {}", modelConfigPath); + std::ifstream modelConfigFile(modelConfigPath); + voice.configRoot = json::parse(modelConfigFile); + + parsePhonemizeConfig(voice.configRoot, voice.phonemizeConfig); + parseSynthesisConfig(voice.configRoot, voice.synthesisConfig); + parseModelConfig(voice.configRoot, voice.modelConfig); + + if (voice.modelConfig.numSpeakers > 1) { + // Multi-speaker model + if (speakerId) { + voice.synthesisConfig.speakerId = speakerId; + } else { + // Default speaker + voice.synthesisConfig.speakerId = 0; + } + } + + spdlog::debug("Voice contains {} speaker(s)", voice.modelConfig.numSpeakers); + + loadModel(modelPath, voice.session, useCuda); + +} /* loadVoice */ + +// Phoneme ids to WAV audio +void synthesize(std::vector &phonemeIds, + SynthesisConfig &synthesisConfig, ModelSession &session, + std::vector &audioBuffer, SynthesisResult &result) { + spdlog::debug("Synthesizing audio for {} phoneme id(s)", phonemeIds.size()); + + auto memoryInfo = Ort::MemoryInfo::CreateCpu( + OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault); + + // Allocate + std::vector phonemeIdLengths{(int64_t)phonemeIds.size()}; + std::vector scales{synthesisConfig.noiseScale, + synthesisConfig.lengthScale, + synthesisConfig.noiseW}; + + std::vector inputTensors; + std::vector phonemeIdsShape{1, (int64_t)phonemeIds.size()}; + inputTensors.push_back(Ort::Value::CreateTensor( + memoryInfo, phonemeIds.data(), phonemeIds.size(), phonemeIdsShape.data(), + phonemeIdsShape.size())); + + std::vector phomemeIdLengthsShape{(int64_t)phonemeIdLengths.size()}; + inputTensors.push_back(Ort::Value::CreateTensor( + memoryInfo, phonemeIdLengths.data(), phonemeIdLengths.size(), + phomemeIdLengthsShape.data(), phomemeIdLengthsShape.size())); + + std::vector scalesShape{(int64_t)scales.size()}; + inputTensors.push_back( + 
Ort::Value::CreateTensor(memoryInfo, scales.data(), scales.size(), + scalesShape.data(), scalesShape.size())); + + // Add speaker id. + // NOTE: These must be kept outside the "if" below to avoid being deallocated. + std::vector speakerId{ + (int64_t)synthesisConfig.speakerId.value_or(0)}; + std::vector speakerIdShape{(int64_t)speakerId.size()}; + + if (synthesisConfig.speakerId) { + inputTensors.push_back(Ort::Value::CreateTensor( + memoryInfo, speakerId.data(), speakerId.size(), speakerIdShape.data(), + speakerIdShape.size())); + } + + // From export_onnx.py + std::array inputNames = {"input", "input_lengths", "scales", + "sid"}; + std::array outputNames = {"output"}; + + // Infer + auto startTime = std::chrono::steady_clock::now(); + auto outputTensors = session.onnx.Run( + Ort::RunOptions{nullptr}, inputNames.data(), inputTensors.data(), + inputTensors.size(), outputNames.data(), outputNames.size()); + auto endTime = std::chrono::steady_clock::now(); + + if ((outputTensors.size() != 1) || (!outputTensors.front().IsTensor())) { + throw std::runtime_error("Invalid output tensors"); + } + auto inferDuration = std::chrono::duration(endTime - startTime); + result.inferSeconds = inferDuration.count(); + + const float *audio = outputTensors.front().GetTensorData(); + auto audioShape = + outputTensors.front().GetTensorTypeAndShapeInfo().GetShape(); + int64_t audioCount = audioShape[audioShape.size() - 1]; + + result.audioSeconds = (double)audioCount / (double)synthesisConfig.sampleRate; + result.realTimeFactor = 0.0; + if (result.audioSeconds > 0) { + result.realTimeFactor = result.inferSeconds / result.audioSeconds; + } + spdlog::debug("Synthesized {} second(s) of audio in {} second(s)", + result.audioSeconds, result.inferSeconds); + + // Get max audio value for scaling + float maxAudioValue = 0.01f; + for (int64_t i = 0; i < audioCount; i++) { + float audioValue = abs(audio[i]); + if (audioValue > maxAudioValue) { + maxAudioValue = audioValue; + } + } + + // We know the size up front + audioBuffer.reserve(audioCount); + + // Scale audio to fill range and convert to int16 + float audioScale = (MAX_WAV_VALUE / std::max(0.01f, maxAudioValue)); + + // Scale to desired volume level + audioScale = audioScale * synthesisConfig.volumeLevel; + + for (int64_t i = 0; i < audioCount; i++) { + int16_t intAudioValue = static_cast( + std::clamp(audio[i] * audioScale, + static_cast(std::numeric_limits::min()), + static_cast(std::numeric_limits::max()))); + + audioBuffer.push_back(intAudioValue); + } + + // Clean up + for (std::size_t i = 0; i < outputTensors.size(); i++) { + Ort::detail::OrtRelease(outputTensors[i].release()); + } + + for (std::size_t i = 0; i < inputTensors.size(); i++) { + Ort::detail::OrtRelease(inputTensors[i].release()); + } +} + +// ---------------------------------------------------------------------------- + +// Phonemize text and synthesize audio +void textToAudio(PiperConfig &config, Voice &voice, std::string text, + std::vector &audioBuffer, SynthesisResult &result, + const std::function &audioCallback) { + + std::size_t sentenceSilenceSamples = 0; + if (voice.synthesisConfig.sentenceSilenceSeconds > 0) { + sentenceSilenceSamples = (std::size_t)( + voice.synthesisConfig.sentenceSilenceSeconds * + voice.synthesisConfig.sampleRate * voice.synthesisConfig.channels); + } + + if (config.useTashkeel) { + if (!config.tashkeelState) { + throw std::runtime_error("Tashkeel model is not loaded"); + } + + spdlog::debug("Diacritizing text with libtashkeel: {}", text); + text = 
tashkeel::tashkeel_run(text, *config.tashkeelState); + } + + // Phonemes for each sentence + spdlog::debug("Phonemizing text: {}", text); + std::vector> phonemes; + + if (voice.phonemizeConfig.phonemeType == eSpeakPhonemes) { + // Use espeak-ng for phonemization + eSpeakPhonemeConfig eSpeakConfig; + eSpeakConfig.voice = voice.phonemizeConfig.eSpeak.voice; + phonemize_eSpeak(text, eSpeakConfig, phonemes); + } else { + // Use UTF-8 codepoints as "phonemes" + CodepointsPhonemeConfig codepointsConfig; + phonemize_codepoints(text, codepointsConfig, phonemes); + } + + // Synthesize each sentence independently. + std::vector phonemeIds; + std::map missingPhonemes; + for (auto phonemesIter = phonemes.begin(); phonemesIter != phonemes.end(); + ++phonemesIter) { + std::vector &sentencePhonemes = *phonemesIter; + + if (spdlog::should_log(spdlog::level::debug)) { + // DEBUG log for phonemes + std::string phonemesStr; + for (auto phoneme : sentencePhonemes) { + utf8::append(phoneme, std::back_inserter(phonemesStr)); + } + + spdlog::debug("Converting {} phoneme(s) to ids: {}", + sentencePhonemes.size(), phonemesStr); + } + + std::vector>> phrasePhonemes; + std::vector phraseResults; + std::vector phraseSilenceSamples; + + // Use phoneme/id map from config + PhonemeIdConfig idConfig; + idConfig.phonemeIdMap = + std::make_shared(voice.phonemizeConfig.phonemeIdMap); + + if (voice.synthesisConfig.phonemeSilenceSeconds) { + // Split into phrases + std::map &phonemeSilenceSeconds = + *voice.synthesisConfig.phonemeSilenceSeconds; + + auto currentPhrasePhonemes = std::make_shared>(); + phrasePhonemes.push_back(currentPhrasePhonemes); + + for (auto sentencePhonemesIter = sentencePhonemes.begin(); + sentencePhonemesIter != sentencePhonemes.end(); + sentencePhonemesIter++) { + Phoneme ¤tPhoneme = *sentencePhonemesIter; + currentPhrasePhonemes->push_back(currentPhoneme); + + if (phonemeSilenceSeconds.count(currentPhoneme) > 0) { + // Split at phrase boundary + phraseSilenceSamples.push_back( + (std::size_t)(phonemeSilenceSeconds[currentPhoneme] * + voice.synthesisConfig.sampleRate * + voice.synthesisConfig.channels)); + + currentPhrasePhonemes = std::make_shared>(); + phrasePhonemes.push_back(currentPhrasePhonemes); + } + } + } else { + // Use all phonemes + phrasePhonemes.push_back( + std::make_shared>(sentencePhonemes)); + } + + // Ensure results/samples are the same size + while (phraseResults.size() < phrasePhonemes.size()) { + phraseResults.emplace_back(); + } + + while (phraseSilenceSamples.size() < phrasePhonemes.size()) { + phraseSilenceSamples.push_back(0); + } + + // phonemes -> ids -> audio + for (size_t phraseIdx = 0; phraseIdx < phrasePhonemes.size(); phraseIdx++) { + if (phrasePhonemes[phraseIdx]->size() <= 0) { + continue; + } + + // phonemes -> ids + phonemes_to_ids(*(phrasePhonemes[phraseIdx]), idConfig, phonemeIds, + missingPhonemes); + if (spdlog::should_log(spdlog::level::debug)) { + // DEBUG log for phoneme ids + std::stringstream phonemeIdsStr; + for (auto phonemeId : phonemeIds) { + phonemeIdsStr << phonemeId << ", "; + } + + spdlog::debug("Converted {} phoneme(s) to {} phoneme id(s): {}", + phrasePhonemes[phraseIdx]->size(), phonemeIds.size(), + phonemeIdsStr.str()); + } + + // ids -> audio + synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer, + phraseResults[phraseIdx]); + + // Add end of phrase silence + for (std::size_t i = 0; i < phraseSilenceSamples[phraseIdx]; i++) { + audioBuffer.push_back(0); + } + + result.audioSeconds += phraseResults[phraseIdx].audioSeconds; 
+ result.inferSeconds += phraseResults[phraseIdx].inferSeconds; + + phonemeIds.clear(); + } + + // Add end of sentence silence + if (sentenceSilenceSamples > 0) { + for (std::size_t i = 0; i < sentenceSilenceSamples; i++) { + audioBuffer.push_back(0); + } + } + + if (audioCallback) { + // Call back must copy audio since it is cleared afterwards. + audioCallback(); + audioBuffer.clear(); + } + + phonemeIds.clear(); + } + + if (missingPhonemes.size() > 0) { + spdlog::warn("Missing {} phoneme(s) from phoneme/id map!", + missingPhonemes.size()); + + for (auto phonemeCount : missingPhonemes) { + std::string phonemeStr; + utf8::append(phonemeCount.first, std::back_inserter(phonemeStr)); + spdlog::warn("Missing \"{}\" (\\u{:04X}): {} time(s)", phonemeStr, + (uint32_t)phonemeCount.first, phonemeCount.second); + } + } + + if (result.audioSeconds > 0) { + result.realTimeFactor = result.inferSeconds / result.audioSeconds; + } + +} /* textToAudio */ + +// Phonemize text and synthesize audio to WAV file +void textToWavFile(PiperConfig &config, Voice &voice, std::string text, + std::ostream &audioFile, SynthesisResult &result) { + + std::vector audioBuffer; + textToAudio(config, voice, text, audioBuffer, result, NULL); + + // Write WAV + auto synthesisConfig = voice.synthesisConfig; + writeWavHeader(synthesisConfig.sampleRate, synthesisConfig.sampleWidth, + synthesisConfig.channels, (int32_t)audioBuffer.size(), + audioFile); + + audioFile.write((const char *)audioBuffer.data(), + sizeof(int16_t) * audioBuffer.size()); + +} /* textToWavFile */ + +} // namespace piper diff --git a/piper.hpp b/piper.hpp new file mode 100644 index 00000000..60c5b4b5 --- /dev/null +++ b/piper.hpp @@ -0,0 +1,133 @@ +#ifndef PIPER_H_ +#define PIPER_H_ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "json.hpp" + +using json = nlohmann::json; + +namespace piper { + +typedef int64_t SpeakerId; + +struct eSpeakConfig { + std::string voice = "en-us"; +}; + +struct PiperConfig { + std::string eSpeakDataPath; + bool useESpeak = true; + + bool useTashkeel = false; + std::optional tashkeelModelPath; + std::unique_ptr tashkeelState; +}; + +enum PhonemeType { eSpeakPhonemes, TextPhonemes }; + +struct PhonemizeConfig { + PhonemeType phonemeType = eSpeakPhonemes; + std::optional>> phonemeMap; + std::map> phonemeIdMap; + + PhonemeId idPad = 0; // padding (optionally interspersed) + PhonemeId idBos = 1; // beginning of sentence + PhonemeId idEos = 2; // end of sentence + bool interspersePad = true; + + eSpeakConfig eSpeak; +}; + +struct SynthesisConfig { + // VITS inference settings + float noiseScale = 0.667f; + float volumeLevel = 1.0f; + float lengthScale = 1.0f; + float noiseW = 0.8f; + + // Audio settings + int sampleRate = 22050; + int sampleWidth = 2; // 16-bit + int channels = 1; // mono + + // Speaker id from 0 to numSpeakers - 1 + std::optional speakerId; + + // Extra silence + float sentenceSilenceSeconds = 0.2f; + std::optional> phonemeSilenceSeconds; +}; + +struct ModelConfig { + int numSpeakers; + + // speaker name -> id + std::optional> speakerIdMap; +}; + +struct ModelSession { + Ort::Session onnx; + Ort::AllocatorWithDefaultOptions allocator; + Ort::SessionOptions options; + Ort::Env env; + + ModelSession() : onnx(nullptr){}; +}; + +struct SynthesisResult { + double inferSeconds; + double audioSeconds; + double realTimeFactor; +}; + +struct Voice { + json configRoot; + PhonemizeConfig phonemizeConfig; + SynthesisConfig synthesisConfig; + ModelConfig 
modelConfig; + ModelSession session; +}; + +// True if the string is a single UTF-8 codepoint +bool isSingleCodepoint(std::string s); + +// Get the first UTF-8 codepoint of a string +Phoneme getCodepoint(std::string s); + +// Get version of Piper +std::string getVersion(); + +// Must be called before using textTo* functions +void initialize(PiperConfig &config); + +// Clean up +void terminate(PiperConfig &config); + +// Load Onnx model and JSON config file +void loadVoice(PiperConfig &config, std::string modelPath, + std::string modelConfigPath, Voice &voice, + std::optional &speakerId, bool useCuda); + +// Phonemize text and synthesize audio +void textToAudio(PiperConfig &config, Voice &voice, std::string text, + std::vector &audioBuffer, SynthesisResult &result, + const std::function &audioCallback); + +// Phonemize text and synthesize audio to WAV file +void textToWavFile(PiperConfig &config, Voice &voice, std::string text, + std::ostream &audioFile, SynthesisResult &result); + +} // namespace piper + +#endif // PIPER_H_ From 73693b1e228ea3c20a5ca044ca17180aa58476de Mon Sep 17 00:00:00 2001 From: flatsiedatsie Date: Thu, 15 Feb 2024 21:21:58 +0100 Subject: [PATCH 3/3] Add files via upload --- src/cpp/main.cpp | 18 ++++++++++++++++++ src/cpp/piper.cpp | 8 ++++++++ src/cpp/piper.hpp | 1 + 3 files changed, 27 insertions(+) diff --git a/src/cpp/main.cpp b/src/cpp/main.cpp index aee4d83c..f5d4d9ba 100644 --- a/src/cpp/main.cpp +++ b/src/cpp/main.cpp @@ -60,6 +60,9 @@ struct RunConfig { // Amount of noise to add during audio generation optional noiseScale; + + // Audio output volume + optional volumeLevel; // Speed of speaking (1 = normal, < 1 is faster, > 1 is slower) optional lengthScale; @@ -204,6 +207,10 @@ int main(int argc, char *argv[]) { if (runConfig.noiseScale) { voice.synthesisConfig.noiseScale = runConfig.noiseScale.value(); } + + if (runConfig.volumeLevel) { + voice.synthesisConfig.volumeLevel = runConfig.volumeLevel.value(); + } if (runConfig.lengthScale) { voice.synthesisConfig.lengthScale = runConfig.lengthScale.value(); @@ -246,6 +253,7 @@ int main(int argc, char *argv[]) { while (getline(cin, line)) { auto outputType = runConfig.outputType; auto speakerId = voice.synthesisConfig.speakerId; + auto volumeLevel = voice.synthesisConfig.volumeLevel; std::optional maybeOutputPath = runConfig.outputPath; if (runConfig.jsonInput) { @@ -266,6 +274,10 @@ int main(int argc, char *argv[]) { // Override speaker id voice.synthesisConfig.speakerId = lineRoot["speaker_id"].get(); + } else if (lineRoot.contains("volume_level")) { + // Override volume level + voice.synthesisConfig.volumeLevel = + std::stof(lineRoot["volume_level"].get()); } else if (lineRoot.contains("speaker")) { // Resolve to id using speaker id map auto speakerName = lineRoot["speaker"].get(); @@ -373,6 +385,7 @@ int main(int argc, char *argv[]) { // Restore config (--json-input) voice.synthesisConfig.speakerId = speakerId; + voice.synthesisConfig.volumeLevel = volumeLevel; } // for each line @@ -446,6 +459,8 @@ void printUsage(char *argv[]) { cerr << " -s NUM --speaker NUM id of speaker (default: 0)" << endl; cerr << " --noise_scale NUM generator noise (default: 0.667)" << endl; + cerr << " -v NUM --volume NUM Volume level (default: 1.0)" + << endl; cerr << " --length_scale NUM phoneme length (default: 1.0)" << endl; cerr << " --noise_w NUM phoneme width noise (default: 0.8)" @@ -514,6 +529,9 @@ void parseArgs(int argc, char *argv[], RunConfig &runConfig) { } else if (arg == "--noise_scale" || arg == "--noise-scale") { 
ensureArg(argc, argv, i); runConfig.noiseScale = stof(argv[++i]); + } else if (arg == "-v" || arg == "--volume") { + ensureArg(argc, argv, i); + runConfig.volumeLevel = stof(argv[++i]); } else if (arg == "--length_scale" || arg == "--length-scale") { ensureArg(argc, argv, i); runConfig.lengthScale = stof(argv[++i]); diff --git a/src/cpp/piper.cpp b/src/cpp/piper.cpp index 00d4a47a..21e3e246 100644 --- a/src/cpp/piper.cpp +++ b/src/cpp/piper.cpp @@ -162,6 +162,10 @@ void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) { if (inferenceValue.contains("noise_scale")) { synthesisConfig.noiseScale = inferenceValue.value("noise_scale", 0.667f); } + + if (inferenceValue.contains("volume_level")) { + synthesisConfig.volumeLevel = inferenceValue.value("volume_level", 1.0f); + } if (inferenceValue.contains("length_scale")) { synthesisConfig.lengthScale = inferenceValue.value("length_scale", 1.0f); @@ -421,6 +425,10 @@ void synthesize(std::vector &phonemeIds, // Scale audio to fill range and convert to int16 float audioScale = (MAX_WAV_VALUE / std::max(0.01f, maxAudioValue)); + + // Scale to desired volume level + audioScale = audioScale * synthesisConfig.volumeLevel; + for (int64_t i = 0; i < audioCount; i++) { int16_t intAudioValue = static_cast( std::clamp(audio[i] * audioScale, diff --git a/src/cpp/piper.hpp b/src/cpp/piper.hpp index 7b956f79..60c5b4b5 100644 --- a/src/cpp/piper.hpp +++ b/src/cpp/piper.hpp @@ -52,6 +52,7 @@ struct PhonemizeConfig { struct SynthesisConfig { // VITS inference settings float noiseScale = 0.667f; + float volumeLevel = 1.0f; float lengthScale = 1.0f; float noiseW = 0.8f;
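
Example usage of the new --server mode (illustrative only; the voice model path and output paths below are placeholders, not files shipped with this patch):

    ./piper --model en_US-example.onnx --json-input --server

With --server, the process keeps running and continues to read lines from stdin until it receives a termination signal (SIGINT is caught by the new signal handler), instead of exiting once input is exhausted. Combined with --json-input, each input line is a JSON object, for example:

    {"text": "Hello world.", "output_file": "/tmp/hello.wav", "volume_level": "0.8"}

The path of the finished WAV file is printed to stdout. Note that "volume_level" is read from JSON as a string and converted with std::stof, while the new -v NUM / --volume NUM option sets the default output volume (1.0) on the command line.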