Use different Japanese trained-data files for Tesseract

They seem to work better, as suggested in tesseract-ocr/tessdata#119. Refs: #973
eikek · Aug 13, 2021 · 326cf1c · 326cf1c
1 parent f79aa44
commit 326cf1c
Showing 1 changed file with 6 additions and 0 deletions.
diff --git a/docker/dockerfiles/joex.dockerfile b/docker/dockerfiles/joex.dockerfile
@@ -63,6 +63,12 @@ RUN wget ${joex_url:-https://github.com/eikek/docspell/releases/download/v$versi
   rm docspell-joex-*.zip && \
   ln -snf docspell-joex-* docspell-joex
 
+# Use the tessdata_fast data files for Japanese, because they work better. See #973
+RUN \
+  wget https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/master/jpn_vert.traineddata && \
+  wget https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/master/jpn.traineddata && \
+  mv jpn*.traineddata /usr/share/tessdata
+
 COPY joex-entrypoint.sh /opt/joex-entrypoint.sh
 
 ENTRYPOINT ["/opt/joex-entrypoint.sh", "-J-XX:+UseG1GC"]