Skip to content

Commit

Permalink
Hack: add audio transcription
Browse files (browse the repository at this point in the history)
  • Loading branch information
stchris committed Aug 3, 2023
1 parent f226886 commit 01cf142
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 3 deletions.
6 changes: 5 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,8 @@ RUN apt-get -qq -y update \
tesseract-ocr-aze \
tesseract-ocr-bel \
tesseract-ocr-uzb \
### pdf convert: libreoffice + a bunch of fonts
ffmpeg \
### pdf convert: libreoffice + a bunch of fonts
libreoffice fonts-opensymbol hyphen-fr hyphen-de \
hyphen-en-us hyphen-it hyphen-ru fonts-dejavu fonts-dejavu-core fonts-dejavu-extra \
fonts-droid-fallback fonts-dustin fonts-f500 fonts-fanwood fonts-freefont-ttf \
Expand Down Expand Up @@ -144,6 +145,9 @@ RUN python3 -m spacy download el_core_news_sm \
&& python3 -m spacy download da_core_news_sm
# RUN python3 -m spacy download zh_core_web_sm

# Install the openai-whisper package (imported by ingestors/media/audio.py).
RUN pip3 install --no-cache-dir openai-whisper
# Pre-download the "base" model so it is cached in the image rather than being
# fetched the first time an audio file is ingested. Loading it through the
# Python API, instead of running the CLI against /dev/null and ignoring the
# exit status, makes the build fail loudly if the download does not succeed.
RUN python3 -c "import whisper; whisper.load_model('base')"

COPY . /ingestors
WORKDIR /ingestors
RUN pip3 install --no-cache-dir -e /ingestors
Expand Down
7 changes: 5 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ COMPOSE=docker-compose
DOCKER=$(COMPOSE) run --rm ingest-file

.PHONY: build

all: build shell

build:
Expand Down Expand Up @@ -36,7 +35,11 @@ format-check:
black --check .

test: services
$(DOCKER) pytest --cov=ingestors --cov-report html --cov-report term
$(DOCKER) pytest --cov=ingestors --cov-report html --cov-report term tests/test_doc.py

# Bring up redis in the background, then open an interactive bash shell in a
# throwaway mini-ingest-file container.
# FTM_STORE_URI is set by the mini-ingest-file service definition in
# docker-compose.yml, so it is not repeated on the command line here (a
# host-side VAR=value prefix never reached the container anyway).
.PHONY: minialeph
minialeph:
	$(COMPOSE) up -d --remove-orphans redis
	$(COMPOSE) run --rm mini-ingest-file /bin/bash

restart: build
$(COMPOSE) up --force-recreate --no-deps --detach ingest-file
Expand Down
22 changes: 22 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,25 @@ services:
depends_on:
- postgres
- redis

mini-ingest-file:
  # Ingest container built from the local checkout; it depends on redis only.
  build: .
  # Alternative: use the published image instead of building locally.
  # image: ghcr.io/alephdata/ingest-file
  hostname: ingest
  # Scratch directories kept in memory and writable by any user.
  tmpfs:
    - /tmp:mode=777
    - /data:mode=777
  environment:
    # Entity store is the SQLite file bind-mounted below.
    - FTM_STORE_URI=sqlite:///hack.db
    - LOG_FORMAT=TEXT  # TEXT or JSON
  # Bind-mount the sources so edits on the host are visible in the container.
  volumes:
    - "./ingestors:/ingestors/ingestors"
    - "./tests:/ingestors/tests"
    - "./data:/ingestors/data"
    - "./requirements.txt:/ingestors/requirements.txt"
    - "./setup.py:/ingestors/setup.py"
    - "./hack.db:/ingestors/hack.db"
    # Exposes the host user's home directory inside the container at /host.
    - "~:/host"
  depends_on:
    - redis
Binary file added hack.db
Binary file not shown.
4 changes: 4 additions & 0 deletions ingestors/media/audio.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging
from followthemoney import model
from pymediainfo import MediaInfo
import whisper

from ingestors.ingestor import Ingestor
from ingestors.support.timestamp import TimestampSupport
Expand Down Expand Up @@ -41,6 +42,9 @@ def ingest(self, file_path, entity):
try:
entity.schema = model.get("Audio")
metadata = MediaInfo.parse(file_path)
whisper_model = whisper.load_model("base")
result = whisper_model.transcribe(audio=str(file_path))
entity.add("bodyText", result["text"])
for track in metadata.tracks:
entity.add("title", track.title)
entity.add("generator", track.writing_application)
Expand Down

0 comments on commit 01cf142

Please sign in to comment.