From 6f2e544798056dc383604ce004ed223ea9fb7e14 Mon Sep 17 00:00:00 2001 From: Peter Mangiafico Date: Mon, 30 Sep 2024 16:37:46 -0700 Subject: [PATCH] cleanup bucket with files added to s3 --- lib/dor/text_extraction/speech_to_text.rb | 25 +++++++++++++++++++ .../speech_to_text/stt_workspace_cleanup.rb | 3 +-- .../text_extraction/speech_to_text_spec.rb | 20 +++++++++++++++ .../stt_workspace_cleanup_spec.rb | 20 ++++++++++++--- 4 files changed, 62 insertions(+), 6 deletions(-) diff --git a/lib/dor/text_extraction/speech_to_text.rb b/lib/dor/text_extraction/speech_to_text.rb index ce799832..1c92b31f 100644 --- a/lib/dor/text_extraction/speech_to_text.rb +++ b/lib/dor/text_extraction/speech_to_text.rb @@ -32,6 +32,27 @@ def required? workflow_context['runSpeechToText'] || false end + # remove any files in S3 workspace that are no longer needed; returns true once the input and output folder cleanups have run + def cleanup + cleanup_input_folder + cleanup_output_folder + true + end + + # remove all files in the s3 input folder + def cleanup_input_folder + # Iterate over the list of filenames to be deleted (which is based on the job_id prefix for this druid) + s3_objects = aws_provider.client.list_objects(bucket: aws_provider.bucket_name, prefix: job_id) + s3_objects.contents.each do |object| + aws_provider.client.delete_object(bucket: aws_provider.bucket_name, key: object.key) + end.size + end + + # remove all files in the s3 output folder + def cleanup_output_folder + # TODO: implement + end + # return a list of filenames that should be stt'd # iterate over all files in cocina_object.structural.contains, looking at mimetypes # return a list of filenames that are correct mimetype @@ -85,6 +106,10 @@ def allowed_resource_types def allowed_object_types [Cocina::Models::ObjectType.media] end + + def aws_provider + @aws_provider ||= Dor::TextExtraction::AwsProvider.new(region: Settings.aws.region, access_key_id: Settings.aws.access_key_id, secret_access_key: Settings.aws.secret_access_key) + end end
end end diff --git a/lib/robots/dor_repo/speech_to_text/stt_workspace_cleanup.rb b/lib/robots/dor_repo/speech_to_text/stt_workspace_cleanup.rb index ea90cb5d..12fe9722 100644 --- a/lib/robots/dor_repo/speech_to_text/stt_workspace_cleanup.rb +++ b/lib/robots/dor_repo/speech_to_text/stt_workspace_cleanup.rb @@ -11,8 +11,7 @@ def initialize # available from LyberCore::Robot: druid, bare_druid, workflow_service, object_client, cocina_object, logger def perform_work - # TODO: cleanup any speech to text workspace files - true + Dor::TextExtraction::SpeechToText.new(cocina_object:, logger:).cleanup end end end diff --git a/spec/lib/dor/text_extraction/speech_to_text_spec.rb b/spec/lib/dor/text_extraction/speech_to_text_spec.rb index f8ba9408..ceac9d80 100644 --- a/spec/lib/dor/text_extraction/speech_to_text_spec.rb +++ b/spec/lib/dor/text_extraction/speech_to_text_spec.rb @@ -96,6 +96,26 @@ def build_file(sdr_preserve, shelve, filename) end end + describe '#cleanup' do + let(:cocina_object) { instance_double(Cocina::Models::DRO, externalIdentifier: druid, dro?: true, type: object_type, version:) } + let(:client) { instance_double(Aws::S3::Client, list_objects:) } + let(:list_objects) { instance_double(Aws::S3::Types::ListObjectsOutput, contents: [m4a_object, mp4_object]) } + let(:m4a_object) { instance_double(Aws::S3::Types::Object, key: "#{bare_druid}-v#{version}/file1.m4a") } + let(:mp4_object) { instance_double(Aws::S3::Types::Object, key: "#{bare_druid}-v#{version}/file1.mp4") } + let(:version) { 2 } + + before do + allow(Aws::S3::Client).to receive(:new).and_return(client) + allow(client).to receive(:delete_object).and_return(instance_double(Aws::S3::Types::Object)) + end + + it 'removes all files from s3' do + expect(stt.cleanup).to be true + expect(client).to have_received(:delete_object).with(bucket: 'sul-speech-to-text-dev', key: "#{bare_druid}-v#{version}/file1.m4a").once + expect(client).to have_received(:delete_object).with(bucket: 'sul-speech-to-text-dev', 
key: "#{bare_druid}-v#{version}/file1.mp4").once + end + end + describe '#filenames_to_stt' do let(:cocina_object) { instance_double(Cocina::Models::DRO, externalIdentifier: druid, structural:, type: object_type) } diff --git a/spec/robots/dor_repo/speech_to_text/stt_workspace_cleanup_spec.rb b/spec/robots/dor_repo/speech_to_text/stt_workspace_cleanup_spec.rb index 36a6364d..aa86f314 100644 --- a/spec/robots/dor_repo/speech_to_text/stt_workspace_cleanup_spec.rb +++ b/spec/robots/dor_repo/speech_to_text/stt_workspace_cleanup_spec.rb @@ -3,12 +3,24 @@ require 'spec_helper' describe Robots::DorRepo::SpeechToText::SttWorkspaceCleanup do - subject(:perform) { test_perform(robot, druid) } - let(:druid) { 'druid:bb222cc3333' } let(:robot) { described_class.new } - it 'runs the stt-workspace-cleanup robot' do - expect(perform).to be true + let(:object) { build(:dro, id: druid) } + let(:stt) do + instance_double(Dor::TextExtraction::SpeechToText, cleanup: true) + end + let(:object_client) do + instance_double(Dor::Services::Client::Object, find: object) + end + + before do + allow(Dor::Services::Client).to receive(:object).and_return(object_client) + allow(Dor::TextExtraction::SpeechToText).to receive(:new).and_return(stt) + end + + it 'calls the cleanup method' do + test_perform(robot, druid) + expect(stt).to have_received(:cleanup) end end