From 1efa6c98c1c3975388161b9f8dc0e97110eb47eb Mon Sep 17 00:00:00 2001 From: flatsiedatsie Date: Sat, 3 Feb 2024 14:43:24 +0100 Subject: [PATCH 1/3] Add --server option This runs the core of the project in a loop until the process receives a termination signal. --- src/cpp/main.cpp | 253 ++++++++++++++++++++++++++--------------------- 1 file changed, 140 insertions(+), 113 deletions(-) diff --git a/src/cpp/main.cpp b/src/cpp/main.cpp index bd750066..aee4d83c 100644 --- a/src/cpp/main.cpp +++ b/src/cpp/main.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -36,6 +37,8 @@ using namespace std; using json = nlohmann::json; +bool running = true; + enum OutputType { OUTPUT_FILE, OUTPUT_DIRECTORY, OUTPUT_STDOUT, OUTPUT_RAW }; struct RunConfig { @@ -88,8 +91,16 @@ struct RunConfig { // true to use CUDA execution provider bool useCuda = false; + + // true to keep running continuously, parsing any input on demand + bool server = false; }; +void signalHandler( int signum ) { + cout << "Interrupt signal (" << signum << ") received.\n"; + running = false; + exit(signum); +} void parseArgs(int argc, char *argv[], RunConfig &runConfig); void rawOutputProc(vector &sharedAudioBuffer, mutex &mutAudio, condition_variable &cvAudio, bool &audioReady, @@ -100,6 +111,8 @@ void rawOutputProc(vector &sharedAudioBuffer, mutex &mutAudio, int main(int argc, char *argv[]) { spdlog::set_default_logger(spdlog::stderr_color_st("piper")); + signal(SIGINT, signalHandler); + RunConfig runConfig; parseArgs(argc, argv, runConfig); @@ -228,138 +241,148 @@ int main(int argc, char *argv[]) { string line; piper::SynthesisResult result; - while (getline(cin, line)) { - auto outputType = runConfig.outputType; - auto speakerId = voice.synthesisConfig.speakerId; - std::optional maybeOutputPath = runConfig.outputPath; - - if (runConfig.jsonInput) { - // Each line is a JSON object - json lineRoot = json::parse(line); - - // Text is required - line = lineRoot["text"].get(); - - if (lineRoot.contains("output_file")) { - // Override output WAV file path - outputType = OUTPUT_FILE; - maybeOutputPath = - filesystem::path(lineRoot["output_file"].get()); - } + while (running){ + + while (getline(cin, line)) { + auto outputType = runConfig.outputType; + auto speakerId = voice.synthesisConfig.speakerId; + std::optional maybeOutputPath = runConfig.outputPath; + + if (runConfig.jsonInput) { + // Each line is a JSON object + json lineRoot = json::parse(line); + + // Text is required + line = lineRoot["text"].get(); + + if (lineRoot.contains("output_file")) { + // Override output WAV file path + outputType = OUTPUT_FILE; + maybeOutputPath = + filesystem::path(lineRoot["output_file"].get()); + } - if (lineRoot.contains("speaker_id")) { - // Override speaker id - voice.synthesisConfig.speakerId = - lineRoot["speaker_id"].get(); - } else if (lineRoot.contains("speaker")) { - // Resolve to id using speaker id map - auto speakerName = lineRoot["speaker"].get(); - if ((voice.modelConfig.speakerIdMap) && - (voice.modelConfig.speakerIdMap->count(speakerName) > 0)) { + if (lineRoot.contains("speaker_id")) { + // Override speaker id voice.synthesisConfig.speakerId = - (*voice.modelConfig.speakerIdMap)[speakerName]; - } else { - spdlog::warn("No speaker named: {}", speakerName); + lineRoot["speaker_id"].get(); + } else if (lineRoot.contains("speaker")) { + // Resolve to id using speaker id map + auto speakerName = lineRoot["speaker"].get(); + if ((voice.modelConfig.speakerIdMap) && + 
(voice.modelConfig.speakerIdMap->count(speakerName) > 0)) { + voice.synthesisConfig.speakerId = + (*voice.modelConfig.speakerIdMap)[speakerName]; + } else { + spdlog::warn("No speaker named: {}", speakerName); + } } } - } - // Timestamp is used for path to output WAV file - const auto now = chrono::system_clock::now(); - const auto timestamp = - chrono::duration_cast(now.time_since_epoch()) - .count(); - - if (outputType == OUTPUT_DIRECTORY) { - // Generate path using timestamp - stringstream outputName; - outputName << timestamp << ".wav"; - filesystem::path outputPath = runConfig.outputPath.value(); - outputPath.append(outputName.str()); - - // Output audio to automatically-named WAV file in a directory - ofstream audioFile(outputPath.string(), ios::binary); - piper::textToWavFile(piperConfig, voice, line, audioFile, result); - cout << outputPath.string() << endl; - } else if (outputType == OUTPUT_FILE) { - if (!maybeOutputPath || maybeOutputPath->empty()) { - throw runtime_error("No output path provided"); - } - - filesystem::path outputPath = maybeOutputPath.value(); - - if (!runConfig.jsonInput) { - // Read all of standard input before synthesizing. - // Otherwise, we would overwrite the output file for each line. - stringstream text; - text << line; - while (getline(cin, line)) { - text << " " << line; + // Timestamp is used for path to output WAV file + const auto now = chrono::system_clock::now(); + const auto timestamp = + chrono::duration_cast(now.time_since_epoch()) + .count(); + + if (outputType == OUTPUT_DIRECTORY) { + // Generate path using timestamp + stringstream outputName; + outputName << timestamp << ".wav"; + filesystem::path outputPath = runConfig.outputPath.value(); + outputPath.append(outputName.str()); + + // Output audio to automatically-named WAV file in a directory + ofstream audioFile(outputPath.string(), ios::binary); + piper::textToWavFile(piperConfig, voice, line, audioFile, result); + cout << outputPath.string() << endl; + } else if (outputType == OUTPUT_FILE) { + if (!maybeOutputPath || maybeOutputPath->empty()) { + throw runtime_error("No output path provided"); } - line = text.str(); - } + filesystem::path outputPath = maybeOutputPath.value(); - // Output audio to WAV file - ofstream audioFile(outputPath.string(), ios::binary); - piper::textToWavFile(piperConfig, voice, line, audioFile, result); - cout << outputPath.string() << endl; - } else if (outputType == OUTPUT_STDOUT) { - // Output WAV to stdout - piper::textToWavFile(piperConfig, voice, line, cout, result); - } else if (outputType == OUTPUT_RAW) { - // Raw output to stdout - mutex mutAudio; - condition_variable cvAudio; - bool audioReady = false; - bool audioFinished = false; - vector audioBuffer; - vector sharedAudioBuffer; + if (!runConfig.jsonInput) { + // Read all of standard input before synthesizing. + // Otherwise, we would overwrite the output file for each line. 
+ stringstream text; + text << line; + while (getline(cin, line)) { + text << " " << line; + } -#ifdef _WIN32 - // Needed on Windows to avoid terminal conversions - setmode(fileno(stdout), O_BINARY); - setmode(fileno(stdin), O_BINARY); -#endif + line = text.str(); + } - thread rawOutputThread(rawOutputProc, ref(sharedAudioBuffer), - ref(mutAudio), ref(cvAudio), ref(audioReady), - ref(audioFinished)); - auto audioCallback = [&audioBuffer, &sharedAudioBuffer, &mutAudio, - &cvAudio, &audioReady]() { - // Signal thread that audio is ready + // Output audio to WAV file + ofstream audioFile(outputPath.string(), ios::binary); + piper::textToWavFile(piperConfig, voice, line, audioFile, result); + cout << outputPath.string() << endl; + } else if (outputType == OUTPUT_STDOUT) { + // Output WAV to stdout + piper::textToWavFile(piperConfig, voice, line, cout, result); + } else if (outputType == OUTPUT_RAW) { + // Raw output to stdout + mutex mutAudio; + condition_variable cvAudio; + bool audioReady = false; + bool audioFinished = false; + vector audioBuffer; + vector sharedAudioBuffer; + + #ifdef _WIN32 + // Needed on Windows to avoid terminal conversions + setmode(fileno(stdout), O_BINARY); + setmode(fileno(stdin), O_BINARY); + #endif + + thread rawOutputThread(rawOutputProc, ref(sharedAudioBuffer), + ref(mutAudio), ref(cvAudio), ref(audioReady), + ref(audioFinished)); + auto audioCallback = [&audioBuffer, &sharedAudioBuffer, &mutAudio, + &cvAudio, &audioReady]() { + // Signal thread that audio is ready + { + unique_lock lockAudio(mutAudio); + copy(audioBuffer.begin(), audioBuffer.end(), + back_inserter(sharedAudioBuffer)); + audioReady = true; + cvAudio.notify_one(); + } + }; + piper::textToAudio(piperConfig, voice, line, audioBuffer, result, + audioCallback); + + // Signal thread that there is no more audio { unique_lock lockAudio(mutAudio); - copy(audioBuffer.begin(), audioBuffer.end(), - back_inserter(sharedAudioBuffer)); audioReady = true; + audioFinished = true; cvAudio.notify_one(); } - }; - piper::textToAudio(piperConfig, voice, line, audioBuffer, result, - audioCallback); - - // Signal thread that there is no more audio - { - unique_lock lockAudio(mutAudio); - audioReady = true; - audioFinished = true; - cvAudio.notify_one(); - } - // Wait for audio output to finish - spdlog::info("Waiting for audio to finish playing..."); - rawOutputThread.join(); - } + // Wait for audio output to finish + spdlog::info("Waiting for audio to finish playing..."); + rawOutputThread.join(); + } - spdlog::info("Real-time factor: {} (infer={} sec, audio={} sec)", - result.realTimeFactor, result.inferSeconds, - result.audioSeconds); + spdlog::info("Real-time factor: {} (infer={} sec, audio={} sec)", + result.realTimeFactor, result.inferSeconds, + result.audioSeconds); - // Restore config (--json-input) - voice.synthesisConfig.speakerId = speakerId; + // Restore config (--json-input) + voice.synthesisConfig.speakerId = speakerId; - } // for each line + } // for each line + + if (!runConfig.server) { + break; + } + + sleep(0.001); + } + piper::terminate(piperConfig); @@ -440,6 +463,8 @@ void printUsage(char *argv[]) { << endl; cerr << " --use-cuda use CUDA execution provider" << endl; + cerr << " --server Keep running until closed" + << endl; cerr << " --debug print DEBUG messages to the console" << endl; cerr << " -q --quiet disable logging" << endl; @@ -525,6 +550,8 @@ void parseArgs(int argc, char *argv[], RunConfig &runConfig) { runConfig.jsonInput = true; } else if (arg == "--use_cuda" || arg == "--use-cuda") { 
runConfig.useCuda = true; + } else if (arg == "--server") { + runConfig.server = true; } else if (arg == "--version") { std::cout << piper::getVersion() << std::endl; exit(0); From 5c46e311ada662f8e8d0815d8b9323759ab04fdc Mon Sep 17 00:00:00 2001 From: flatsiedatsie Date: Thu, 15 Feb 2024 20:42:14 +0100 Subject: [PATCH 2/3] Add files via upload --- main.cpp | 607 ++++++++++++++++++++++++++++++++++++++++++++++++++ piper.cpp | 644 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ piper.hpp | 133 +++++++++++ 3 files changed, 1384 insertions(+) create mode 100644 main.cpp create mode 100644 piper.cpp create mode 100644 piper.hpp diff --git a/main.cpp b/main.cpp new file mode 100644 index 00000000..614853f8 --- /dev/null +++ b/main.cpp @@ -0,0 +1,607 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _MSC_VER +#define WIN32_LEAN_AND_MEAN +#define NOMINMAX +#include +#endif + +#ifdef _WIN32 +#include +#include +#endif + +#ifdef __APPLE__ +#include +#endif + +#include +#include + +#include "json.hpp" +#include "piper.hpp" + +using namespace std; +using json = nlohmann::json; + +bool running = true; + +enum OutputType { OUTPUT_FILE, OUTPUT_DIRECTORY, OUTPUT_STDOUT, OUTPUT_RAW }; + +struct RunConfig { + // Path to .onnx voice file + filesystem::path modelPath; + + // Path to JSON voice config file + filesystem::path modelConfigPath; + + // Type of output to produce. + // Default is to write a WAV file in the current directory. + OutputType outputType = OUTPUT_DIRECTORY; + + // Path for output + optional outputPath = filesystem::path("."); + + // Numerical id of the default speaker (multi-speaker voices) + optional speakerId; + + // Amount of noise to add during audio generation + optional noiseScale; + + // Audio output volume + //optional speakerId; + optional volumeLevel; + + // Speed of speaking (1 = normal, < 1 is faster, > 1 is slower) + optional lengthScale; + + // Variation in phoneme lengths + optional noiseW; + + // Seconds of silence to add after each sentence + optional sentenceSilenceSeconds; + + // Path to espeak-ng data directory (default is next to piper executable) + optional eSpeakDataPath; + + // Path to libtashkeel ort model + // https://github.com/mush42/libtashkeel/ + optional tashkeelModelPath; + + // stdin input is lines of JSON instead of text with format: + // { + // "text": str, (required) + // "speaker_id": int, (optional) + // "speaker": str, (optional) + // "output_file": str, (optional) + // } + bool jsonInput = false; + + // Seconds of extra silence to insert after a single phoneme + optional> phonemeSilenceSeconds; + + // true to use CUDA execution provider + bool useCuda = false; + + // true to keep running continuously, parsing any input on demand + bool server = false; +}; + +void signalHandler( int signum ) { + cout << "Interrupt signal (" << signum << ") received.\n"; + running = false; + exit(signum); +} +void parseArgs(int argc, char *argv[], RunConfig &runConfig); +void rawOutputProc(vector &sharedAudioBuffer, mutex &mutAudio, + condition_variable &cvAudio, bool &audioReady, + bool &audioFinished); + +// ---------------------------------------------------------------------------- + +int main(int argc, char *argv[]) { + spdlog::set_default_logger(spdlog::stderr_color_st("piper")); + + signal(SIGINT, signalHandler); + + RunConfig runConfig; + parseArgs(argc, argv, runConfig); + +#ifdef _WIN32 + // Required on Windows to show IPA symbols + 
SetConsoleOutputCP(CP_UTF8); +#endif + + piper::PiperConfig piperConfig; + piper::Voice voice; + + spdlog::debug("Loading voice from {} (config={})", + runConfig.modelPath.string(), + runConfig.modelConfigPath.string()); + + auto startTime = chrono::steady_clock::now(); + loadVoice(piperConfig, runConfig.modelPath.string(), + runConfig.modelConfigPath.string(), voice, runConfig.speakerId, + runConfig.useCuda); + auto endTime = chrono::steady_clock::now(); + spdlog::info("Loaded voice in {} second(s)", + chrono::duration(endTime - startTime).count()); + + // Get the path to the piper executable so we can locate espeak-ng-data, etc. + // next to it. +#ifdef _MSC_VER + auto exePath = []() { + wchar_t moduleFileName[MAX_PATH] = {0}; + GetModuleFileNameW(nullptr, moduleFileName, std::size(moduleFileName)); + return filesystem::path(moduleFileName); + }(); +#else +#ifdef __APPLE__ + auto exePath = []() { + char moduleFileName[PATH_MAX] = {0}; + uint32_t moduleFileNameSize = std::size(moduleFileName); + _NSGetExecutablePath(moduleFileName, &moduleFileNameSize); + return filesystem::path(moduleFileName); + }(); +#else + auto exePath = filesystem::canonical("/proc/self/exe"); +#endif +#endif + + if (voice.phonemizeConfig.phonemeType == piper::eSpeakPhonemes) { + spdlog::debug("Voice uses eSpeak phonemes ({})", + voice.phonemizeConfig.eSpeak.voice); + + if (runConfig.eSpeakDataPath) { + // User provided path + piperConfig.eSpeakDataPath = runConfig.eSpeakDataPath.value().string(); + } else { + // Assume next to piper executable + piperConfig.eSpeakDataPath = + std::filesystem::absolute( + exePath.parent_path().append("espeak-ng-data")) + .string(); + + spdlog::debug("espeak-ng-data directory is expected at {}", + piperConfig.eSpeakDataPath); + } + } else { + // Not using eSpeak + piperConfig.useESpeak = false; + } + + // Enable libtashkeel for Arabic + if (voice.phonemizeConfig.eSpeak.voice == "ar") { + piperConfig.useTashkeel = true; + if (runConfig.tashkeelModelPath) { + // User provided path + piperConfig.tashkeelModelPath = + runConfig.tashkeelModelPath.value().string(); + } else { + // Assume next to piper executable + piperConfig.tashkeelModelPath = + std::filesystem::absolute( + exePath.parent_path().append("libtashkeel_model.ort")) + .string(); + + spdlog::debug("libtashkeel model is expected at {}", + piperConfig.tashkeelModelPath.value()); + } + } + + piper::initialize(piperConfig); + + // Scales + if (runConfig.noiseScale) { + voice.synthesisConfig.noiseScale = runConfig.noiseScale.value(); + } + + if (runConfig.volumeLevel) { + voice.synthesisConfig.volumeLevel = runConfig.volumeLevel.value(); + } + + if (runConfig.lengthScale) { + voice.synthesisConfig.lengthScale = runConfig.lengthScale.value(); + } + + if (runConfig.noiseW) { + voice.synthesisConfig.noiseW = runConfig.noiseW.value(); + } + + if (runConfig.sentenceSilenceSeconds) { + voice.synthesisConfig.sentenceSilenceSeconds = + runConfig.sentenceSilenceSeconds.value(); + } + + if (runConfig.phonemeSilenceSeconds) { + if (!voice.synthesisConfig.phonemeSilenceSeconds) { + // Overwrite + voice.synthesisConfig.phonemeSilenceSeconds = + runConfig.phonemeSilenceSeconds; + } else { + // Merge + for (const auto &[phoneme, silenceSeconds] : + *runConfig.phonemeSilenceSeconds) { + voice.synthesisConfig.phonemeSilenceSeconds->try_emplace( + phoneme, silenceSeconds); + } + } + + } // if phonemeSilenceSeconds + + if (runConfig.outputType == OUTPUT_DIRECTORY) { + runConfig.outputPath = filesystem::absolute(runConfig.outputPath.value()); + 
spdlog::info("Output directory: {}", runConfig.outputPath.value().string()); + } + + string line; + piper::SynthesisResult result; + while (running){ + + while (getline(cin, line)) { + auto outputType = runConfig.outputType; + auto speakerId = voice.synthesisConfig.speakerId; + auto volumeLevel = voice.synthesisConfig.volumeLevel; + std::optional maybeOutputPath = runConfig.outputPath; + + if (runConfig.jsonInput) { + // Each line is a JSON object + json lineRoot = json::parse(line); + + // Text is required + line = lineRoot["text"].get(); + + if (lineRoot.contains("output_file")) { + // Override output WAV file path + outputType = OUTPUT_FILE; + maybeOutputPath = + filesystem::path(lineRoot["output_file"].get()); + } + + if (lineRoot.contains("speaker_id")) { + // Override speaker id + voice.synthesisConfig.speakerId = + lineRoot["speaker_id"].get(); + } else if (lineRoot.contains("volume_level")) { + // Override volume level + voice.synthesisConfig.volumeLevel = + std::stof(lineRoot["volume_level"].get()); + } else if (lineRoot.contains("speaker")) { + // Resolve to id using speaker id map + auto speakerName = lineRoot["speaker"].get(); + if ((voice.modelConfig.speakerIdMap) && + (voice.modelConfig.speakerIdMap->count(speakerName) > 0)) { + voice.synthesisConfig.speakerId = + (*voice.modelConfig.speakerIdMap)[speakerName]; + } else { + spdlog::warn("No speaker named: {}", speakerName); + } + } + } + + // Timestamp is used for path to output WAV file + const auto now = chrono::system_clock::now(); + const auto timestamp = + chrono::duration_cast(now.time_since_epoch()) + .count(); + + if (outputType == OUTPUT_DIRECTORY) { + // Generate path using timestamp + stringstream outputName; + outputName << timestamp << ".wav"; + filesystem::path outputPath = runConfig.outputPath.value(); + outputPath.append(outputName.str()); + + // Output audio to automatically-named WAV file in a directory + ofstream audioFile(outputPath.string(), ios::binary); + piper::textToWavFile(piperConfig, voice, line, audioFile, result); + cout << outputPath.string() << endl; + } else if (outputType == OUTPUT_FILE) { + if (!maybeOutputPath || maybeOutputPath->empty()) { + throw runtime_error("No output path provided"); + } + + filesystem::path outputPath = maybeOutputPath.value(); + + if (!runConfig.jsonInput) { + // Read all of standard input before synthesizing. + // Otherwise, we would overwrite the output file for each line. 
+ stringstream text; + text << line; + while (getline(cin, line)) { + text << " " << line; + } + + line = text.str(); + } + + // Output audio to WAV file + ofstream audioFile(outputPath.string(), ios::binary); + piper::textToWavFile(piperConfig, voice, line, audioFile, result); + cout << outputPath.string() << endl; + } else if (outputType == OUTPUT_STDOUT) { + // Output WAV to stdout + piper::textToWavFile(piperConfig, voice, line, cout, result); + } else if (outputType == OUTPUT_RAW) { + // Raw output to stdout + mutex mutAudio; + condition_variable cvAudio; + bool audioReady = false; + bool audioFinished = false; + vector audioBuffer; + vector sharedAudioBuffer; + + #ifdef _WIN32 + // Needed on Windows to avoid terminal conversions + setmode(fileno(stdout), O_BINARY); + setmode(fileno(stdin), O_BINARY); + #endif + + thread rawOutputThread(rawOutputProc, ref(sharedAudioBuffer), + ref(mutAudio), ref(cvAudio), ref(audioReady), + ref(audioFinished)); + auto audioCallback = [&audioBuffer, &sharedAudioBuffer, &mutAudio, + &cvAudio, &audioReady]() { + // Signal thread that audio is ready + { + unique_lock lockAudio(mutAudio); + copy(audioBuffer.begin(), audioBuffer.end(), + back_inserter(sharedAudioBuffer)); + audioReady = true; + cvAudio.notify_one(); + } + }; + piper::textToAudio(piperConfig, voice, line, audioBuffer, result, + audioCallback); + + // Signal thread that there is no more audio + { + unique_lock lockAudio(mutAudio); + audioReady = true; + audioFinished = true; + cvAudio.notify_one(); + } + + // Wait for audio output to finish + spdlog::info("Waiting for audio to finish playing..."); + rawOutputThread.join(); + } + + spdlog::info("Real-time factor: {} (infer={} sec, audio={} sec)", + result.realTimeFactor, result.inferSeconds, + result.audioSeconds); + + // Restore config (--json-input) + voice.synthesisConfig.speakerId = speakerId; + voice.synthesisConfig.volumeLevel = volumeLevel; + + } // for each line + + if (!runConfig.server) { + break; + } + + sleep(0.001); + } + + + piper::terminate(piperConfig); + + return EXIT_SUCCESS; +} + +// ---------------------------------------------------------------------------- + +void rawOutputProc(vector &sharedAudioBuffer, mutex &mutAudio, + condition_variable &cvAudio, bool &audioReady, + bool &audioFinished) { + vector internalAudioBuffer; + while (true) { + { + unique_lock lockAudio{mutAudio}; + cvAudio.wait(lockAudio, [&audioReady] { return audioReady; }); + + if (sharedAudioBuffer.empty() && audioFinished) { + break; + } + + copy(sharedAudioBuffer.begin(), sharedAudioBuffer.end(), + back_inserter(internalAudioBuffer)); + + sharedAudioBuffer.clear(); + + if (!audioFinished) { + audioReady = false; + } + } + + cout.write((const char *)internalAudioBuffer.data(), + sizeof(int16_t) * internalAudioBuffer.size()); + cout.flush(); + internalAudioBuffer.clear(); + } + +} // rawOutputProc + +// ---------------------------------------------------------------------------- + +void printUsage(char *argv[]) { + cerr << endl; + cerr << "usage: " << argv[0] << " [options]" << endl; + cerr << endl; + cerr << "options:" << endl; + cerr << " -h --help show this message and exit" << endl; + cerr << " -m FILE --model FILE path to onnx model file" << endl; + cerr << " -c FILE --config FILE path to model config file " + "(default: model path + .json)" + << endl; + cerr << " -f FILE --output_file FILE path to output WAV file ('-' for " + "stdout)" + << endl; + cerr << " -d DIR --output_dir DIR path to output directory (default: " + "cwd)" + << endl; + cerr << 
" --output_raw output raw audio to stdout as it " + "becomes available" + << endl; + cerr << " -s NUM --speaker NUM id of speaker (default: 0)" << endl; + cerr << " --noise_scale NUM generator noise (default: 0.667)" + << endl; + cerr << " -v NUM --volume NUM Volume level (default: 1.0)" + << endl; + cerr << " --length_scale NUM phoneme length (default: 1.0)" + << endl; + cerr << " --noise_w NUM phoneme width noise (default: 0.8)" + << endl; + cerr << " --sentence_silence NUM seconds of silence after each " + "sentence (default: 0.2)" + << endl; + cerr << " --espeak_data DIR path to espeak-ng data directory" + << endl; + cerr << " --tashkeel_model FILE path to libtashkeel onnx model " + "(arabic)" + << endl; + cerr << " --json-input stdin input is lines of JSON " + "instead of plain text" + << endl; + cerr << " --use-cuda use CUDA execution provider" + << endl; + cerr << " --server Keep running until closed" + << endl; + cerr << " --debug print DEBUG messages to the console" + << endl; + cerr << " -q --quiet disable logging" << endl; + cerr << endl; +} + +void ensureArg(int argc, char *argv[], int argi) { + if ((argi + 1) >= argc) { + printUsage(argv); + exit(0); + } +} + +// Parse command-line arguments +void parseArgs(int argc, char *argv[], RunConfig &runConfig) { + optional modelConfigPath; + + for (int i = 1; i < argc; i++) { + std::string arg = argv[i]; + + if (arg == "-m" || arg == "--model") { + ensureArg(argc, argv, i); + runConfig.modelPath = filesystem::path(argv[++i]); + } else if (arg == "-c" || arg == "--config") { + ensureArg(argc, argv, i); + modelConfigPath = filesystem::path(argv[++i]); + } else if (arg == "-f" || arg == "--output_file" || + arg == "--output-file") { + ensureArg(argc, argv, i); + std::string filePath = argv[++i]; + if (filePath == "-") { + runConfig.outputType = OUTPUT_STDOUT; + runConfig.outputPath = nullopt; + } else { + runConfig.outputType = OUTPUT_FILE; + runConfig.outputPath = filesystem::path(filePath); + } + } else if (arg == "-d" || arg == "--output_dir" || arg == "output-dir") { + ensureArg(argc, argv, i); + runConfig.outputType = OUTPUT_DIRECTORY; + runConfig.outputPath = filesystem::path(argv[++i]); + } else if (arg == "--output_raw" || arg == "--output-raw") { + runConfig.outputType = OUTPUT_RAW; + } else if (arg == "-s" || arg == "--speaker") { + ensureArg(argc, argv, i); + runConfig.speakerId = (piper::SpeakerId)stol(argv[++i]); + } else if (arg == "--noise_scale" || arg == "--noise-scale") { + ensureArg(argc, argv, i); + runConfig.noiseScale = stof(argv[++i]); + } else if (arg == "-v" || arg == "--volume") { + ensureArg(argc, argv, i); + runConfig.volumeLevel = stof(argv[++i]); + } else if (arg == "--length_scale" || arg == "--length-scale") { + ensureArg(argc, argv, i); + runConfig.lengthScale = stof(argv[++i]); + } else if (arg == "--noise_w" || arg == "--noise-w") { + ensureArg(argc, argv, i); + runConfig.noiseW = stof(argv[++i]); + } else if (arg == "--sentence_silence" || arg == "--sentence-silence") { + ensureArg(argc, argv, i); + runConfig.sentenceSilenceSeconds = stof(argv[++i]); + } else if (arg == "--phoneme_silence" || arg == "--phoneme-silence") { + ensureArg(argc, argv, i); + ensureArg(argc, argv, i + 1); + auto phonemeStr = std::string(argv[++i]); + if (!piper::isSingleCodepoint(phonemeStr)) { + std::cerr << "Phoneme '" << phonemeStr + << "' is not a single codepoint (--phoneme_silence)" + << std::endl; + exit(1); + } + + if (!runConfig.phonemeSilenceSeconds) { + runConfig.phonemeSilenceSeconds.emplace(); + } + + auto phoneme 
= piper::getCodepoint(phonemeStr); + (*runConfig.phonemeSilenceSeconds)[phoneme] = stof(argv[++i]); + } else if (arg == "--espeak_data" || arg == "--espeak-data") { + ensureArg(argc, argv, i); + runConfig.eSpeakDataPath = filesystem::path(argv[++i]); + } else if (arg == "--tashkeel_model" || arg == "--tashkeel-model") { + ensureArg(argc, argv, i); + runConfig.tashkeelModelPath = filesystem::path(argv[++i]); + } else if (arg == "--json_input" || arg == "--json-input") { + runConfig.jsonInput = true; + } else if (arg == "--use_cuda" || arg == "--use-cuda") { + runConfig.useCuda = true; + } else if (arg == "--server") { + runConfig.server = true; + } else if (arg == "--version") { + std::cout << piper::getVersion() << std::endl; + exit(0); + } else if (arg == "--debug") { + // Set DEBUG logging + spdlog::set_level(spdlog::level::debug); + } else if (arg == "-q" || arg == "--quiet") { + // diable logging + spdlog::set_level(spdlog::level::off); + } else if (arg == "-h" || arg == "--help") { + printUsage(argv); + exit(0); + } + } + + // Verify model file exists + ifstream modelFile(runConfig.modelPath.c_str(), ios::binary); + if (!modelFile.good()) { + throw runtime_error("Model file doesn't exist"); + } + + if (!modelConfigPath) { + runConfig.modelConfigPath = + filesystem::path(runConfig.modelPath.string() + ".json"); + } else { + runConfig.modelConfigPath = modelConfigPath.value(); + } + + // Verify model config exists + ifstream modelConfigFile(runConfig.modelConfigPath.c_str()); + if (!modelConfigFile.good()) { + throw runtime_error("Model config doesn't exist"); + } +} diff --git a/piper.cpp b/piper.cpp new file mode 100644 index 00000000..21e3e246 --- /dev/null +++ b/piper.cpp @@ -0,0 +1,644 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "json.hpp" +#include "piper.hpp" +#include "utf8.h" +#include "wavfile.hpp" + +namespace piper { + +#ifdef _PIPER_VERSION +// https://stackoverflow.com/questions/47346133/how-to-use-a-define-inside-a-format-string +#define _STR(x) #x +#define STR(x) _STR(x) +const std::string VERSION = STR(_PIPER_VERSION); +#else +const std::string VERSION = ""; +#endif + +// Maximum value for 16-bit signed WAV sample +const float MAX_WAV_VALUE = 32767.0f; + +const std::string instanceName{"piper"}; + +std::string getVersion() { return VERSION; } + +// True if the string is a single UTF-8 codepoint +bool isSingleCodepoint(std::string s) { + return utf8::distance(s.begin(), s.end()) == 1; +} + +// Get the first UTF-8 codepoint of a string +Phoneme getCodepoint(std::string s) { + utf8::iterator character_iter(s.begin(), s.begin(), s.end()); + return *character_iter; +} + +// Load JSON config information for phonemization +void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) { + // { + // "espeak": { + // "voice": "" + // }, + // "phoneme_type": "", + // "phoneme_map": { + // "": ["", "", ...] + // }, + // "phoneme_id_map": { + // "": [, , ...] + // } + // } + + if (configRoot.contains("espeak")) { + auto espeakValue = configRoot["espeak"]; + if (espeakValue.contains("voice")) { + phonemizeConfig.eSpeak.voice = espeakValue["voice"].get(); + } + } + + if (configRoot.contains("phoneme_type")) { + auto phonemeTypeStr = configRoot["phoneme_type"].get(); + if (phonemeTypeStr == "text") { + phonemizeConfig.phonemeType = TextPhonemes; + } + } + + // phoneme to [id] map + // Maps phonemes to one or more phoneme ids (required). 
+ if (configRoot.contains("phoneme_id_map")) { + auto phonemeIdMapValue = configRoot["phoneme_id_map"]; + for (auto &fromPhonemeItem : phonemeIdMapValue.items()) { + std::string fromPhoneme = fromPhonemeItem.key(); + if (!isSingleCodepoint(fromPhoneme)) { + std::stringstream idsStr; + for (auto &toIdValue : fromPhonemeItem.value()) { + PhonemeId toId = toIdValue.get(); + idsStr << toId << ","; + } + + spdlog::error("\"{}\" is not a single codepoint (ids={})", fromPhoneme, + idsStr.str()); + throw std::runtime_error( + "Phonemes must be one codepoint (phoneme id map)"); + } + + auto fromCodepoint = getCodepoint(fromPhoneme); + for (auto &toIdValue : fromPhonemeItem.value()) { + PhonemeId toId = toIdValue.get(); + phonemizeConfig.phonemeIdMap[fromCodepoint].push_back(toId); + } + } + } + + // phoneme to [phoneme] map + // Maps phonemes to one or more other phonemes (not normally used). + if (configRoot.contains("phoneme_map")) { + if (!phonemizeConfig.phonemeMap) { + phonemizeConfig.phonemeMap.emplace(); + } + + auto phonemeMapValue = configRoot["phoneme_map"]; + for (auto &fromPhonemeItem : phonemeMapValue.items()) { + std::string fromPhoneme = fromPhonemeItem.key(); + if (!isSingleCodepoint(fromPhoneme)) { + spdlog::error("\"{}\" is not a single codepoint", fromPhoneme); + throw std::runtime_error( + "Phonemes must be one codepoint (phoneme map)"); + } + + auto fromCodepoint = getCodepoint(fromPhoneme); + for (auto &toPhonemeValue : fromPhonemeItem.value()) { + std::string toPhoneme = toPhonemeValue.get(); + if (!isSingleCodepoint(toPhoneme)) { + throw std::runtime_error( + "Phonemes must be one codepoint (phoneme map)"); + } + + auto toCodepoint = getCodepoint(toPhoneme); + (*phonemizeConfig.phonemeMap)[fromCodepoint].push_back(toCodepoint); + } + } + } + +} /* parsePhonemizeConfig */ + +// Load JSON config for audio synthesis +void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) { + // { + // "audio": { + // "sample_rate": 22050 + // }, + // "inference": { + // "noise_scale": 0.667, + // "length_scale": 1, + // "noise_w": 0.8, + // "phoneme_silence": { + // "": , + // ... 
+ // } + // } + // } + + if (configRoot.contains("audio")) { + auto audioValue = configRoot["audio"]; + if (audioValue.contains("sample_rate")) { + // Default sample rate is 22050 Hz + synthesisConfig.sampleRate = audioValue.value("sample_rate", 22050); + } + } + + if (configRoot.contains("inference")) { + // Overrides default inference settings + auto inferenceValue = configRoot["inference"]; + if (inferenceValue.contains("noise_scale")) { + synthesisConfig.noiseScale = inferenceValue.value("noise_scale", 0.667f); + } + + if (inferenceValue.contains("volume_level")) { + synthesisConfig.volumeLevel = inferenceValue.value("volume_level", 1.0f); + } + + if (inferenceValue.contains("length_scale")) { + synthesisConfig.lengthScale = inferenceValue.value("length_scale", 1.0f); + } + + if (inferenceValue.contains("noise_w")) { + synthesisConfig.noiseW = inferenceValue.value("noise_w", 0.8f); + } + + if (inferenceValue.contains("phoneme_silence")) { + // phoneme -> seconds of silence to add after + synthesisConfig.phonemeSilenceSeconds.emplace(); + auto phonemeSilenceValue = inferenceValue["phoneme_silence"]; + for (auto &phonemeItem : phonemeSilenceValue.items()) { + std::string phonemeStr = phonemeItem.key(); + if (!isSingleCodepoint(phonemeStr)) { + spdlog::error("\"{}\" is not a single codepoint", phonemeStr); + throw std::runtime_error( + "Phonemes must be one codepoint (phoneme silence)"); + } + + auto phoneme = getCodepoint(phonemeStr); + (*synthesisConfig.phonemeSilenceSeconds)[phoneme] = + phonemeItem.value().get(); + } + + } // if phoneme_silence + + } // if inference + +} /* parseSynthesisConfig */ + +void parseModelConfig(json &configRoot, ModelConfig &modelConfig) { + + modelConfig.numSpeakers = configRoot["num_speakers"].get(); + + if (configRoot.contains("speaker_id_map")) { + if (!modelConfig.speakerIdMap) { + modelConfig.speakerIdMap.emplace(); + } + + auto speakerIdMapValue = configRoot["speaker_id_map"]; + for (auto &speakerItem : speakerIdMapValue.items()) { + std::string speakerName = speakerItem.key(); + (*modelConfig.speakerIdMap)[speakerName] = + speakerItem.value().get(); + } + } + +} /* parseModelConfig */ + +void initialize(PiperConfig &config) { + if (config.useESpeak) { + // Set up espeak-ng for calling espeak_TextToPhonemesWithTerminator + // See: https://github.com/rhasspy/espeak-ng + spdlog::debug("Initializing eSpeak"); + int result = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, + /*buflength*/ 0, + /*path*/ config.eSpeakDataPath.c_str(), + /*options*/ 0); + if (result < 0) { + throw std::runtime_error("Failed to initialize eSpeak-ng"); + } + + spdlog::debug("Initialized eSpeak"); + } + + // Load onnx model for libtashkeel + // https://github.com/mush42/libtashkeel/ + if (config.useTashkeel) { + spdlog::debug("Using libtashkeel for diacritization"); + if (!config.tashkeelModelPath) { + throw std::runtime_error("No path to libtashkeel model"); + } + + spdlog::debug("Loading libtashkeel model from {}", + config.tashkeelModelPath.value()); + config.tashkeelState = std::make_unique(); + tashkeel::tashkeel_load(config.tashkeelModelPath.value(), + *config.tashkeelState); + spdlog::debug("Initialized libtashkeel"); + } + + spdlog::info("Initialized piper"); +} + +void terminate(PiperConfig &config) { + if (config.useESpeak) { + // Clean up espeak-ng + spdlog::debug("Terminating eSpeak"); + espeak_Terminate(); + spdlog::debug("Terminated eSpeak"); + } + + spdlog::info("Terminated piper"); +} + +void loadModel(std::string modelPath, ModelSession &session, bool useCuda) { + 
spdlog::debug("Loading onnx model from {}", modelPath); + session.env = Ort::Env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING, + instanceName.c_str()); + session.env.DisableTelemetryEvents(); + + if (useCuda) { + // Use CUDA provider + OrtCUDAProviderOptions cuda_options{}; + cuda_options.cudnn_conv_algo_search = OrtCudnnConvAlgoSearchHeuristic; + session.options.AppendExecutionProvider_CUDA(cuda_options); + } + + // Slows down performance by ~2x + // session.options.SetIntraOpNumThreads(1); + + // Roughly doubles load time for no visible inference benefit + // session.options.SetGraphOptimizationLevel( + // GraphOptimizationLevel::ORT_ENABLE_EXTENDED); + + session.options.SetGraphOptimizationLevel( + GraphOptimizationLevel::ORT_DISABLE_ALL); + + // Slows down performance very slightly + // session.options.SetExecutionMode(ExecutionMode::ORT_PARALLEL); + + session.options.DisableCpuMemArena(); + session.options.DisableMemPattern(); + session.options.DisableProfiling(); + + auto startTime = std::chrono::steady_clock::now(); + +#ifdef _WIN32 + auto modelPathW = std::wstring(modelPath.begin(), modelPath.end()); + auto modelPathStr = modelPathW.c_str(); +#else + auto modelPathStr = modelPath.c_str(); +#endif + + session.onnx = Ort::Session(session.env, modelPathStr, session.options); + + auto endTime = std::chrono::steady_clock::now(); + spdlog::debug("Loaded onnx model in {} second(s)", + std::chrono::duration(endTime - startTime).count()); +} + +// Load Onnx model and JSON config file +void loadVoice(PiperConfig &config, std::string modelPath, + std::string modelConfigPath, Voice &voice, + std::optional &speakerId, bool useCuda) { + spdlog::debug("Parsing voice config at {}", modelConfigPath); + std::ifstream modelConfigFile(modelConfigPath); + voice.configRoot = json::parse(modelConfigFile); + + parsePhonemizeConfig(voice.configRoot, voice.phonemizeConfig); + parseSynthesisConfig(voice.configRoot, voice.synthesisConfig); + parseModelConfig(voice.configRoot, voice.modelConfig); + + if (voice.modelConfig.numSpeakers > 1) { + // Multi-speaker model + if (speakerId) { + voice.synthesisConfig.speakerId = speakerId; + } else { + // Default speaker + voice.synthesisConfig.speakerId = 0; + } + } + + spdlog::debug("Voice contains {} speaker(s)", voice.modelConfig.numSpeakers); + + loadModel(modelPath, voice.session, useCuda); + +} /* loadVoice */ + +// Phoneme ids to WAV audio +void synthesize(std::vector &phonemeIds, + SynthesisConfig &synthesisConfig, ModelSession &session, + std::vector &audioBuffer, SynthesisResult &result) { + spdlog::debug("Synthesizing audio for {} phoneme id(s)", phonemeIds.size()); + + auto memoryInfo = Ort::MemoryInfo::CreateCpu( + OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault); + + // Allocate + std::vector phonemeIdLengths{(int64_t)phonemeIds.size()}; + std::vector scales{synthesisConfig.noiseScale, + synthesisConfig.lengthScale, + synthesisConfig.noiseW}; + + std::vector inputTensors; + std::vector phonemeIdsShape{1, (int64_t)phonemeIds.size()}; + inputTensors.push_back(Ort::Value::CreateTensor( + memoryInfo, phonemeIds.data(), phonemeIds.size(), phonemeIdsShape.data(), + phonemeIdsShape.size())); + + std::vector phomemeIdLengthsShape{(int64_t)phonemeIdLengths.size()}; + inputTensors.push_back(Ort::Value::CreateTensor( + memoryInfo, phonemeIdLengths.data(), phonemeIdLengths.size(), + phomemeIdLengthsShape.data(), phomemeIdLengthsShape.size())); + + std::vector scalesShape{(int64_t)scales.size()}; + inputTensors.push_back( + 
Ort::Value::CreateTensor(memoryInfo, scales.data(), scales.size(), + scalesShape.data(), scalesShape.size())); + + // Add speaker id. + // NOTE: These must be kept outside the "if" below to avoid being deallocated. + std::vector speakerId{ + (int64_t)synthesisConfig.speakerId.value_or(0)}; + std::vector speakerIdShape{(int64_t)speakerId.size()}; + + if (synthesisConfig.speakerId) { + inputTensors.push_back(Ort::Value::CreateTensor( + memoryInfo, speakerId.data(), speakerId.size(), speakerIdShape.data(), + speakerIdShape.size())); + } + + // From export_onnx.py + std::array inputNames = {"input", "input_lengths", "scales", + "sid"}; + std::array outputNames = {"output"}; + + // Infer + auto startTime = std::chrono::steady_clock::now(); + auto outputTensors = session.onnx.Run( + Ort::RunOptions{nullptr}, inputNames.data(), inputTensors.data(), + inputTensors.size(), outputNames.data(), outputNames.size()); + auto endTime = std::chrono::steady_clock::now(); + + if ((outputTensors.size() != 1) || (!outputTensors.front().IsTensor())) { + throw std::runtime_error("Invalid output tensors"); + } + auto inferDuration = std::chrono::duration(endTime - startTime); + result.inferSeconds = inferDuration.count(); + + const float *audio = outputTensors.front().GetTensorData(); + auto audioShape = + outputTensors.front().GetTensorTypeAndShapeInfo().GetShape(); + int64_t audioCount = audioShape[audioShape.size() - 1]; + + result.audioSeconds = (double)audioCount / (double)synthesisConfig.sampleRate; + result.realTimeFactor = 0.0; + if (result.audioSeconds > 0) { + result.realTimeFactor = result.inferSeconds / result.audioSeconds; + } + spdlog::debug("Synthesized {} second(s) of audio in {} second(s)", + result.audioSeconds, result.inferSeconds); + + // Get max audio value for scaling + float maxAudioValue = 0.01f; + for (int64_t i = 0; i < audioCount; i++) { + float audioValue = abs(audio[i]); + if (audioValue > maxAudioValue) { + maxAudioValue = audioValue; + } + } + + // We know the size up front + audioBuffer.reserve(audioCount); + + // Scale audio to fill range and convert to int16 + float audioScale = (MAX_WAV_VALUE / std::max(0.01f, maxAudioValue)); + + // Scale to desired volume level + audioScale = audioScale * synthesisConfig.volumeLevel; + + for (int64_t i = 0; i < audioCount; i++) { + int16_t intAudioValue = static_cast( + std::clamp(audio[i] * audioScale, + static_cast(std::numeric_limits::min()), + static_cast(std::numeric_limits::max()))); + + audioBuffer.push_back(intAudioValue); + } + + // Clean up + for (std::size_t i = 0; i < outputTensors.size(); i++) { + Ort::detail::OrtRelease(outputTensors[i].release()); + } + + for (std::size_t i = 0; i < inputTensors.size(); i++) { + Ort::detail::OrtRelease(inputTensors[i].release()); + } +} + +// ---------------------------------------------------------------------------- + +// Phonemize text and synthesize audio +void textToAudio(PiperConfig &config, Voice &voice, std::string text, + std::vector &audioBuffer, SynthesisResult &result, + const std::function &audioCallback) { + + std::size_t sentenceSilenceSamples = 0; + if (voice.synthesisConfig.sentenceSilenceSeconds > 0) { + sentenceSilenceSamples = (std::size_t)( + voice.synthesisConfig.sentenceSilenceSeconds * + voice.synthesisConfig.sampleRate * voice.synthesisConfig.channels); + } + + if (config.useTashkeel) { + if (!config.tashkeelState) { + throw std::runtime_error("Tashkeel model is not loaded"); + } + + spdlog::debug("Diacritizing text with libtashkeel: {}", text); + text = 
tashkeel::tashkeel_run(text, *config.tashkeelState); + } + + // Phonemes for each sentence + spdlog::debug("Phonemizing text: {}", text); + std::vector> phonemes; + + if (voice.phonemizeConfig.phonemeType == eSpeakPhonemes) { + // Use espeak-ng for phonemization + eSpeakPhonemeConfig eSpeakConfig; + eSpeakConfig.voice = voice.phonemizeConfig.eSpeak.voice; + phonemize_eSpeak(text, eSpeakConfig, phonemes); + } else { + // Use UTF-8 codepoints as "phonemes" + CodepointsPhonemeConfig codepointsConfig; + phonemize_codepoints(text, codepointsConfig, phonemes); + } + + // Synthesize each sentence independently. + std::vector phonemeIds; + std::map missingPhonemes; + for (auto phonemesIter = phonemes.begin(); phonemesIter != phonemes.end(); + ++phonemesIter) { + std::vector &sentencePhonemes = *phonemesIter; + + if (spdlog::should_log(spdlog::level::debug)) { + // DEBUG log for phonemes + std::string phonemesStr; + for (auto phoneme : sentencePhonemes) { + utf8::append(phoneme, std::back_inserter(phonemesStr)); + } + + spdlog::debug("Converting {} phoneme(s) to ids: {}", + sentencePhonemes.size(), phonemesStr); + } + + std::vector>> phrasePhonemes; + std::vector phraseResults; + std::vector phraseSilenceSamples; + + // Use phoneme/id map from config + PhonemeIdConfig idConfig; + idConfig.phonemeIdMap = + std::make_shared(voice.phonemizeConfig.phonemeIdMap); + + if (voice.synthesisConfig.phonemeSilenceSeconds) { + // Split into phrases + std::map &phonemeSilenceSeconds = + *voice.synthesisConfig.phonemeSilenceSeconds; + + auto currentPhrasePhonemes = std::make_shared>(); + phrasePhonemes.push_back(currentPhrasePhonemes); + + for (auto sentencePhonemesIter = sentencePhonemes.begin(); + sentencePhonemesIter != sentencePhonemes.end(); + sentencePhonemesIter++) { + Phoneme ¤tPhoneme = *sentencePhonemesIter; + currentPhrasePhonemes->push_back(currentPhoneme); + + if (phonemeSilenceSeconds.count(currentPhoneme) > 0) { + // Split at phrase boundary + phraseSilenceSamples.push_back( + (std::size_t)(phonemeSilenceSeconds[currentPhoneme] * + voice.synthesisConfig.sampleRate * + voice.synthesisConfig.channels)); + + currentPhrasePhonemes = std::make_shared>(); + phrasePhonemes.push_back(currentPhrasePhonemes); + } + } + } else { + // Use all phonemes + phrasePhonemes.push_back( + std::make_shared>(sentencePhonemes)); + } + + // Ensure results/samples are the same size + while (phraseResults.size() < phrasePhonemes.size()) { + phraseResults.emplace_back(); + } + + while (phraseSilenceSamples.size() < phrasePhonemes.size()) { + phraseSilenceSamples.push_back(0); + } + + // phonemes -> ids -> audio + for (size_t phraseIdx = 0; phraseIdx < phrasePhonemes.size(); phraseIdx++) { + if (phrasePhonemes[phraseIdx]->size() <= 0) { + continue; + } + + // phonemes -> ids + phonemes_to_ids(*(phrasePhonemes[phraseIdx]), idConfig, phonemeIds, + missingPhonemes); + if (spdlog::should_log(spdlog::level::debug)) { + // DEBUG log for phoneme ids + std::stringstream phonemeIdsStr; + for (auto phonemeId : phonemeIds) { + phonemeIdsStr << phonemeId << ", "; + } + + spdlog::debug("Converted {} phoneme(s) to {} phoneme id(s): {}", + phrasePhonemes[phraseIdx]->size(), phonemeIds.size(), + phonemeIdsStr.str()); + } + + // ids -> audio + synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer, + phraseResults[phraseIdx]); + + // Add end of phrase silence + for (std::size_t i = 0; i < phraseSilenceSamples[phraseIdx]; i++) { + audioBuffer.push_back(0); + } + + result.audioSeconds += phraseResults[phraseIdx].audioSeconds; 
+ result.inferSeconds += phraseResults[phraseIdx].inferSeconds; + + phonemeIds.clear(); + } + + // Add end of sentence silence + if (sentenceSilenceSamples > 0) { + for (std::size_t i = 0; i < sentenceSilenceSamples; i++) { + audioBuffer.push_back(0); + } + } + + if (audioCallback) { + // Call back must copy audio since it is cleared afterwards. + audioCallback(); + audioBuffer.clear(); + } + + phonemeIds.clear(); + } + + if (missingPhonemes.size() > 0) { + spdlog::warn("Missing {} phoneme(s) from phoneme/id map!", + missingPhonemes.size()); + + for (auto phonemeCount : missingPhonemes) { + std::string phonemeStr; + utf8::append(phonemeCount.first, std::back_inserter(phonemeStr)); + spdlog::warn("Missing \"{}\" (\\u{:04X}): {} time(s)", phonemeStr, + (uint32_t)phonemeCount.first, phonemeCount.second); + } + } + + if (result.audioSeconds > 0) { + result.realTimeFactor = result.inferSeconds / result.audioSeconds; + } + +} /* textToAudio */ + +// Phonemize text and synthesize audio to WAV file +void textToWavFile(PiperConfig &config, Voice &voice, std::string text, + std::ostream &audioFile, SynthesisResult &result) { + + std::vector audioBuffer; + textToAudio(config, voice, text, audioBuffer, result, NULL); + + // Write WAV + auto synthesisConfig = voice.synthesisConfig; + writeWavHeader(synthesisConfig.sampleRate, synthesisConfig.sampleWidth, + synthesisConfig.channels, (int32_t)audioBuffer.size(), + audioFile); + + audioFile.write((const char *)audioBuffer.data(), + sizeof(int16_t) * audioBuffer.size()); + +} /* textToWavFile */ + +} // namespace piper diff --git a/piper.hpp b/piper.hpp new file mode 100644 index 00000000..60c5b4b5 --- /dev/null +++ b/piper.hpp @@ -0,0 +1,133 @@ +#ifndef PIPER_H_ +#define PIPER_H_ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "json.hpp" + +using json = nlohmann::json; + +namespace piper { + +typedef int64_t SpeakerId; + +struct eSpeakConfig { + std::string voice = "en-us"; +}; + +struct PiperConfig { + std::string eSpeakDataPath; + bool useESpeak = true; + + bool useTashkeel = false; + std::optional tashkeelModelPath; + std::unique_ptr tashkeelState; +}; + +enum PhonemeType { eSpeakPhonemes, TextPhonemes }; + +struct PhonemizeConfig { + PhonemeType phonemeType = eSpeakPhonemes; + std::optional>> phonemeMap; + std::map> phonemeIdMap; + + PhonemeId idPad = 0; // padding (optionally interspersed) + PhonemeId idBos = 1; // beginning of sentence + PhonemeId idEos = 2; // end of sentence + bool interspersePad = true; + + eSpeakConfig eSpeak; +}; + +struct SynthesisConfig { + // VITS inference settings + float noiseScale = 0.667f; + float volumeLevel = 1.0f; + float lengthScale = 1.0f; + float noiseW = 0.8f; + + // Audio settings + int sampleRate = 22050; + int sampleWidth = 2; // 16-bit + int channels = 1; // mono + + // Speaker id from 0 to numSpeakers - 1 + std::optional speakerId; + + // Extra silence + float sentenceSilenceSeconds = 0.2f; + std::optional> phonemeSilenceSeconds; +}; + +struct ModelConfig { + int numSpeakers; + + // speaker name -> id + std::optional> speakerIdMap; +}; + +struct ModelSession { + Ort::Session onnx; + Ort::AllocatorWithDefaultOptions allocator; + Ort::SessionOptions options; + Ort::Env env; + + ModelSession() : onnx(nullptr){}; +}; + +struct SynthesisResult { + double inferSeconds; + double audioSeconds; + double realTimeFactor; +}; + +struct Voice { + json configRoot; + PhonemizeConfig phonemizeConfig; + SynthesisConfig synthesisConfig; + ModelConfig 
modelConfig; + ModelSession session; +}; + +// True if the string is a single UTF-8 codepoint +bool isSingleCodepoint(std::string s); + +// Get the first UTF-8 codepoint of a string +Phoneme getCodepoint(std::string s); + +// Get version of Piper +std::string getVersion(); + +// Must be called before using textTo* functions +void initialize(PiperConfig &config); + +// Clean up +void terminate(PiperConfig &config); + +// Load Onnx model and JSON config file +void loadVoice(PiperConfig &config, std::string modelPath, + std::string modelConfigPath, Voice &voice, + std::optional &speakerId, bool useCuda); + +// Phonemize text and synthesize audio +void textToAudio(PiperConfig &config, Voice &voice, std::string text, + std::vector &audioBuffer, SynthesisResult &result, + const std::function &audioCallback); + +// Phonemize text and synthesize audio to WAV file +void textToWavFile(PiperConfig &config, Voice &voice, std::string text, + std::ostream &audioFile, SynthesisResult &result); + +} // namespace piper + +#endif // PIPER_H_ From 73693b1e228ea3c20a5ca044ca17180aa58476de Mon Sep 17 00:00:00 2001 From: flatsiedatsie Date: Thu, 15 Feb 2024 21:21:58 +0100 Subject: [PATCH 3/3] Add files via upload --- src/cpp/main.cpp | 18 ++++++++++++++++++ src/cpp/piper.cpp | 8 ++++++++ src/cpp/piper.hpp | 1 + 3 files changed, 27 insertions(+) diff --git a/src/cpp/main.cpp b/src/cpp/main.cpp index aee4d83c..f5d4d9ba 100644 --- a/src/cpp/main.cpp +++ b/src/cpp/main.cpp @@ -60,6 +60,9 @@ struct RunConfig { // Amount of noise to add during audio generation optional noiseScale; + + // Audio output volume + optional volumeLevel; // Speed of speaking (1 = normal, < 1 is faster, > 1 is slower) optional lengthScale; @@ -204,6 +207,10 @@ int main(int argc, char *argv[]) { if (runConfig.noiseScale) { voice.synthesisConfig.noiseScale = runConfig.noiseScale.value(); } + + if (runConfig.volumeLevel) { + voice.synthesisConfig.volumeLevel = runConfig.volumeLevel.value(); + } if (runConfig.lengthScale) { voice.synthesisConfig.lengthScale = runConfig.lengthScale.value(); @@ -246,6 +253,7 @@ int main(int argc, char *argv[]) { while (getline(cin, line)) { auto outputType = runConfig.outputType; auto speakerId = voice.synthesisConfig.speakerId; + auto volumeLevel = voice.synthesisConfig.volumeLevel; std::optional maybeOutputPath = runConfig.outputPath; if (runConfig.jsonInput) { @@ -266,6 +274,10 @@ int main(int argc, char *argv[]) { // Override speaker id voice.synthesisConfig.speakerId = lineRoot["speaker_id"].get(); + } else if (lineRoot.contains("volume_level")) { + // Override volume level + voice.synthesisConfig.volumeLevel = + std::stof(lineRoot["volume_level"].get()); } else if (lineRoot.contains("speaker")) { // Resolve to id using speaker id map auto speakerName = lineRoot["speaker"].get(); @@ -373,6 +385,7 @@ int main(int argc, char *argv[]) { // Restore config (--json-input) voice.synthesisConfig.speakerId = speakerId; + voice.synthesisConfig.volumeLevel = volumeLevel; } // for each line @@ -446,6 +459,8 @@ void printUsage(char *argv[]) { cerr << " -s NUM --speaker NUM id of speaker (default: 0)" << endl; cerr << " --noise_scale NUM generator noise (default: 0.667)" << endl; + cerr << " -v NUM --volume NUM Volume level (default: 1.0)" + << endl; cerr << " --length_scale NUM phoneme length (default: 1.0)" << endl; cerr << " --noise_w NUM phoneme width noise (default: 0.8)" @@ -514,6 +529,9 @@ void parseArgs(int argc, char *argv[], RunConfig &runConfig) { } else if (arg == "--noise_scale" || arg == "--noise-scale") { 
ensureArg(argc, argv, i); runConfig.noiseScale = stof(argv[++i]); + } else if (arg == "-v" || arg == "--volume") { + ensureArg(argc, argv, i); + runConfig.volumeLevel = stof(argv[++i]); } else if (arg == "--length_scale" || arg == "--length-scale") { ensureArg(argc, argv, i); runConfig.lengthScale = stof(argv[++i]); diff --git a/src/cpp/piper.cpp b/src/cpp/piper.cpp index 00d4a47a..21e3e246 100644 --- a/src/cpp/piper.cpp +++ b/src/cpp/piper.cpp @@ -162,6 +162,10 @@ void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) { if (inferenceValue.contains("noise_scale")) { synthesisConfig.noiseScale = inferenceValue.value("noise_scale", 0.667f); } + + if (inferenceValue.contains("volume_level")) { + synthesisConfig.volumeLevel = inferenceValue.value("volume_level", 1.0f); + } if (inferenceValue.contains("length_scale")) { synthesisConfig.lengthScale = inferenceValue.value("length_scale", 1.0f); @@ -421,6 +425,10 @@ void synthesize(std::vector &phonemeIds, // Scale audio to fill range and convert to int16 float audioScale = (MAX_WAV_VALUE / std::max(0.01f, maxAudioValue)); + + // Scale to desired volume level + audioScale = audioScale * synthesisConfig.volumeLevel; + for (int64_t i = 0; i < audioCount; i++) { int16_t intAudioValue = static_cast( std::clamp(audio[i] * audioScale, diff --git a/src/cpp/piper.hpp b/src/cpp/piper.hpp index 7b956f79..60c5b4b5 100644 --- a/src/cpp/piper.hpp +++ b/src/cpp/piper.hpp @@ -52,6 +52,7 @@ struct PhonemizeConfig { struct SynthesisConfig { // VITS inference settings float noiseScale = 0.667f; + float volumeLevel = 1.0f; float lengthScale = 1.0f; float noiseW = 0.8f;
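
Example usage of the new --server mode (illustrative only; the voice model path and output paths below are placeholders, not files shipped with this patch):

    ./piper --model en_US-example.onnx --json-input --server

With --server, the process keeps running and continues to read lines from stdin until it receives a termination signal (SIGINT is caught by the new signal handler), instead of exiting once input is exhausted. Combined with --json-input, each input line is a JSON object, for example:

    {"text": "Hello world.", "output_file": "/tmp/hello.wav", "volume_level": "0.8"}

The path of the finished WAV file is printed to stdout. Note that "volume_level" is read from JSON as a string and converted with std::stof, while the new -v NUM / --volume NUM option sets the default output volume (1.0) on the command line.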