Skip to content

Commit

Permalink
cleanup bucket with files added to s3
Browse files Browse the repository at this point in the history
  • Loading branch information
peetucket committed Oct 7, 2024
1 parent 87f03e8 commit 6f2e544
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 6 deletions.
25 changes: 25 additions & 0 deletions lib/dor/text_extraction/speech_to_text.rb
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,27 @@ def required?
workflow_context['runSpeechToText'] || false
end

# Remove any files in the S3 workspace that are no longer needed.
#
# Runs the input-folder cleanup first, then the output-folder cleanup. The
# values those two steps return (for example the count produced by
# cleanup_input_folder) are discarded.
#
# @return [true] always, once both cleanup steps have run without raising
def cleanup
  cleanup_input_folder
  cleanup_output_folder
  true
end

# Delete every object in the S3 bucket whose key starts with job_id.
#
# NOTE(review): only the objects in a single list_objects response are
# deleted; confirm a job can never have more keys than one listing returns.
#
# @return [Integer] how many listed objects a delete was issued for
def cleanup_input_folder
  listing = aws_provider.client.list_objects(bucket: aws_provider.bucket_name, prefix: job_id)
  doomed = listing.contents
  doomed.each { |entry| aws_provider.client.delete_object(bucket: aws_provider.bucket_name, key: entry.key) }
  doomed.size
end

# Placeholder for removing all files in the s3 output folder.
# Not implemented yet: the body is empty, so calling it deletes nothing and returns nil.
def cleanup_output_folder
# TODO: implement
end

# return a list of filenames that should be stt'd
# iterate over all files in cocina_object.structural.contains, looking at mimetypes
# return a list of filenames that are correct mimetype
Expand Down Expand Up @@ -85,6 +106,10 @@ def allowed_resource_types
# The object types accepted here: a one-element list holding the Cocina media object type.
def allowed_object_types
  media_type = Cocina::Models::ObjectType.media
  [media_type]
end

# Build the AWS provider from the region and credentials found under Settings.aws.
# The instance is cached in @aws_provider, so later calls reuse it instead of building a new one.
def aws_provider
  @aws_provider ||= begin
    aws_settings = Settings.aws
    Dor::TextExtraction::AwsProvider.new(
      region: aws_settings.region,
      access_key_id: aws_settings.access_key_id,
      secret_access_key: aws_settings.secret_access_key
    )
  end
end
end
end
end
3 changes: 1 addition & 2 deletions lib/robots/dor_repo/speech_to_text/stt_workspace_cleanup.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,7 @@ def initialize

# available from LyberCore::Robot: druid, bare_druid, workflow_service, object_client, cocina_object, logger
def perform_work
# TODO: cleanup any speech to text workspace files
true
# Hand the work to SpeechToText#cleanup; as the last expression, its result is what this method returns.
Dor::TextExtraction::SpeechToText.new(cocina_object:, logger:).cleanup
end
end
end
Expand Down
20 changes: 20 additions & 0 deletions spec/lib/dor/text_extraction/speech_to_text_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,26 @@ def build_file(sdr_preserve, shelve, filename)
end
end

# Verifies that #cleanup issues one delete per object returned by the S3 listing and returns true.
describe '#cleanup' do
  let(:cocina_object) { instance_double(Cocina::Models::DRO, externalIdentifier: druid, dro?: true, type: object_type, version:) }
  # S3 client double whose listing contains the two objects defined below
  let(:client) { instance_double(Aws::S3::Client, list_objects:) }
  let(:list_objects) { instance_double(Aws::S3::Types::ListObjectsOutput, contents: [m4a_object, mp4_object]) }
  let(:m4a_object) { instance_double(Aws::S3::Types::Object, key: "#{bare_druid}-v#{version}/file1.m4a") }
  let(:mp4_object) { instance_double(Aws::S3::Types::Object, key: "#{bare_druid}-v#{version}/file1.mp4") }
  let(:version) { 2 }

  before do
    allow(Aws::S3::Client).to receive(:new).and_return(client)
    # The SDK's delete_object responds with a DeleteObjectOutput (not an S3 Object), so stub that type.
    allow(client).to receive(:delete_object).and_return(instance_double(Aws::S3::Types::DeleteObjectOutput))
  end

  it 'removes all files from s3' do
    expect(stt.cleanup).to be true
    expect(client).to have_received(:delete_object).with(bucket: 'sul-speech-to-text-dev', key: "#{bare_druid}-v#{version}/file1.m4a").once
    expect(client).to have_received(:delete_object).with(bucket: 'sul-speech-to-text-dev', key: "#{bare_druid}-v#{version}/file1.mp4").once
  end
end

describe '#filenames_to_stt' do
let(:cocina_object) { instance_double(Cocina::Models::DRO, externalIdentifier: druid, structural:, type: object_type) }

Expand Down
20 changes: 16 additions & 4 deletions spec/robots/dor_repo/speech_to_text/stt_workspace_cleanup_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,24 @@
require 'spec_helper'

# Spec for the stt-workspace-cleanup robot: SpeechToText and the object client are replaced
# with doubles, and the example checks that running the robot invokes SpeechToText#cleanup.
describe Robots::DorRepo::SpeechToText::SttWorkspaceCleanup do
subject(:perform) { test_perform(robot, druid) }

let(:druid) { 'druid:bb222cc3333' }
let(:robot) { described_class.new }

it 'runs the stt-workspace-cleanup robot' do
expect(perform).to be true
let(:object) { build(:dro, id: druid) }
# SpeechToText double whose cleanup is stubbed to return true
let(:stt) do
instance_double(Dor::TextExtraction::SpeechToText, cleanup: true)
end
let(:object_client) do
instance_double(Dor::Services::Client::Object, find: object)
end

# Route object lookups and SpeechToText construction to the doubles above
before do
allow(Dor::Services::Client).to receive(:object).and_return(object_client)
allow(Dor::TextExtraction::SpeechToText).to receive(:new).and_return(stt)
end

it 'calls the cleanup method' do
test_perform(robot, druid)
expect(stt).to have_received(:cleanup)
end
end

0 comments on commit 6f2e544

Please sign in to comment.