Skip to content

Commit

Permalink
Merge pull request #500 from psu-libraries/incrementals
Browse files Browse the repository at this point in the history
incrementals
  • Loading branch information
whereismyjetpack authored Jan 22, 2024
2 parents 4ef7611 + 00216fe commit 96eb866
Show file tree
Hide file tree
Showing 16 changed files with 36 additions and 130 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
FROM jruby:9.4.2.0
WORKDIR /app
ARG UID=3000
ARG UID=1000

ENV BUNDLE_PATH=/app/vendor/bundle

Expand Down
4 changes: 1 addition & 3 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,9 @@ gem 'library_stdnums'
gem 'mail'
gem 'marc'
gem 'rake'
gem 'redis'
gem 'rsolr'
gem 'shelvit'
gem 'sidekiq', '~> 6.5'
gem 'sidekiq-scheduler', '~> 4.0'
gem 'traject'
gem 'traject-marc4j_reader', platform: :jruby
gem 'whenever', require: false
Expand All @@ -26,7 +25,6 @@ group :development, :test do
gem 'pry-debugger-jruby', platform: :jruby
gem 'rspec'
gem 'rspec-its'
gem 'rspec-sidekiq'
gem 'rubocop', '~> 1.5'
gem 'rubocop-performance', '~> 1.1'
gem 'rubocop-rspec', '~> 2'
Expand Down
40 changes: 9 additions & 31 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,6 @@ GEM
dry-equalizer (~> 0.2)
dry-initializer (~> 3.0)
dry-schema (~> 1.5, >= 1.5.2)
et-orbi (1.2.7)
tzinfo
faker (3.2.0)
i18n (>= 1.8.11, < 2)
faraday (1.4.2)
Expand All @@ -77,9 +75,6 @@ GEM
ffi-compiler (1.0.1)
ffi (>= 1.0.0)
rake
fugit (1.9.0)
et-orbi (~> 1, >= 1.2.7)
raabro (~> 1.4)
hashdiff (1.0.1)
hashie (4.1.0)
http (4.4.1)
Expand Down Expand Up @@ -130,20 +125,21 @@ GEM
coderay (~> 1.1)
method_source (~> 1.0)
spoon (~> 0.0)
pry-byebug (3.9.0)
pry-byebug (3.10.1)
byebug (~> 11.0)
pry (~> 0.13.0)
pry (>= 0.13, < 0.15)
pry-debugger-jruby (2.0.0-java)
pry (>= 0.13, < 0.14)
ruby-debug-base (>= 0.10.4, < 0.12)
public_suffix (4.0.6)
raabro (1.4.0)
racc (1.7.2)
racc (1.7.2-java)
rack (2.2.6.4)
rainbow (3.1.1)
rake (13.0.3)
redis (4.8.1)
redis (5.0.8)
redis-client (>= 0.17.0)
redis-client (0.19.1)
connection_pool
regexp_parser (2.8.0)
rexml (3.2.5)
rsolr (2.3.0)
Expand All @@ -164,9 +160,6 @@ GEM
rspec-mocks (3.10.2)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.10.0)
rspec-sidekiq (3.1.0)
rspec-core (~> 3.0, >= 3.0.0)
sidekiq (>= 2.4.0)
rspec-support (3.10.2)
rubocop (1.52.0)
json (~> 2.3)
Expand Down Expand Up @@ -194,20 +187,9 @@ GEM
ruby-debug-base (0.11.0-java)
ruby-progressbar (1.13.0)
ruby2_keywords (0.0.4)
rufus-scheduler (3.9.1)
fugit (~> 1.1, >= 1.1.6)
scrub_rb (1.0.1)
shelvit (0.1.2)
lcsort (~> 0.9)
sidekiq (6.5.12)
connection_pool (>= 2.2.5, < 3)
rack (~> 2.0)
redis (>= 4.5.0, < 5)
sidekiq-scheduler (4.0.3)
redis (>= 4.2.0)
rufus-scheduler (~> 3.2)
sidekiq (>= 4, < 7)
tilt (>= 1.4.0)
simplecov (0.17.1)
docile (~> 1.1)
json (>= 1.8, < 3)
Expand All @@ -216,7 +198,6 @@ GEM
slop (4.9.1)
spoon (0.0.6)
ffi
tilt (2.3.0)
traject (3.5.0)
concurrent-ruby (>= 0.8.0)
dot-properties (>= 0.1.1)
Expand All @@ -231,12 +212,10 @@ GEM
traject-marc4j_reader (1.1.0-java)
marc (~> 1.0)
marc-marc4j (~> 1.0)
tzinfo (2.0.6)
concurrent-ruby (~> 1.0)
unf (0.1.4)
unf_ext
unf (0.1.4-java)
unf_ext (0.0.7.7)
unf_ext (0.0.9.1)
unicode-display_width (2.4.2)
webmock (3.13.0)
addressable (>= 2.3.6)
Expand All @@ -247,6 +226,7 @@ GEM
yell (2.2.2)

PLATFORMS
universal-java-1.8
universal-java-10
universal-java-11
universal-java-14
Expand All @@ -266,16 +246,14 @@ DEPENDENCIES
pry-byebug
pry-debugger-jruby
rake
redis
rsolr
rspec
rspec-its
rspec-sidekiq
rubocop (~> 1.5)
rubocop-performance (~> 1.1)
rubocop-rspec (~> 2)
shelvit
sidekiq (~> 6.5)
sidekiq-scheduler (~> 4.0)
simplecov (< 0.18)
traject
traject-marc4j_reader
Expand Down
13 changes: 0 additions & 13 deletions config.ru

This file was deleted.

10 changes: 5 additions & 5 deletions config/settings.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@ solr_writer:
reader_class_name: PsulibTraject::MarcCombiningReader
commit_timeout: 10000
symphony_data_path: "<%= ENV.fetch('SYMPHONY_DATA_PATH', '/data/symphony_data') %>"
# Folder within `symphony_data_path` where hourlies are stored
symphony_hourlies_subdir: hourlies
# How long to keep the hourlies skip keys
# we keep hourlies for 7 days, so we keep the lock for 10 to overlap
hourlies_skip_expire_seconds: 864000
# Folder within `symphony_data_path` where incrementals are stored
symphony_incremental_subdir: incrementals
# How long, in seconds, to keep the incrementals skip keys.
# We keep incrementals for 7 days, so we keep the skip keys for 10 days (864000 seconds) to overlap.
incremental_skip_expire_seconds: 864000
hathi_etas: false
hathi_overlap_path: "<%= ENV.fetch('HATHI_OVERLAP_PATH', '/data/hathitrust_data/overlap.tsv') %>"
marc4j_reader:
Expand Down
10 changes: 0 additions & 10 deletions config/sidekiq.yml

This file was deleted.

4 changes: 1 addition & 3 deletions lib/psulib_traject.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
require 'library_stdnums'
require 'redis'
require 'shelvit'
require 'sidekiq'
require 'sidekiq-scheduler'
require 'traject'
require 'traject/macros/marc21_semantics'
require 'yaml'
Expand Down Expand Up @@ -40,7 +38,7 @@ module PsulibTraject
require 'psulib_traject/solr_manager'
require 'psulib_traject/subject_heading'
require 'psulib_traject/workers/base'
require 'psulib_traject/workers/hourly_indexer'
require 'psulib_traject/workers/incremental_indexer'
require 'psulib_traject/workers/indexer'

Config.setup do |config|
Expand Down
2 changes: 0 additions & 2 deletions lib/psulib_traject/workers/base.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
module PsulibTraject
module Workers
class Base
include Sidekiq::Worker

# Runs the worker synchronously in the calling process: builds a new
# instance and invokes its `perform` with the given arguments, returning
# whatever `perform` returns.
def self.perform_now(*args)
new.perform(*args)
end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,22 @@

module PsulibTraject
module Workers
class HourlyIndexer < Base
class IncrementalIndexer < Base
# Entry point for an incremental run: calls `perform_indexes` first and
# then `perform_deletes`.
def perform
perform_indexes
perform_deletes
end

def hourlies_directory
@hourlies_directory ||= Pathname
def incremental_directory
@incremental_directory ||= Pathname
.new(ConfigSettings.symphony_data_path)
.join(ConfigSettings.symphony_hourlies_subdir)
.join(ConfigSettings.symphony_incremental_subdir)
end

def perform_deletes
current_collection = PsulibTraject::SolrManager.new.current_collection

target_deletes = Dir.glob("#{hourlies_directory}/**/*_deletes_*.txt").sort
target_deletes = Dir.glob("#{incremental_directory}/**/*_deletes_*.txt").sort

processed_deletes = redis.keys("#{current_collection}:*").map { |e| e.gsub("#{current_collection}:", '') }
files_to_process = target_deletes - processed_deletes
Expand All @@ -30,7 +30,7 @@ def perform_deletes
.split("\n")
.map { |id| delete(id) }
redis.set("#{current_collection}:#{file_name}", true)
redis.expire("#{current_collection}:#{file_name}", ConfigSettings.hourlies_skip_expire_seconds.to_i)
redis.expire("#{current_collection}:#{file_name}", ConfigSettings.incremental_skip_expire_seconds.to_i)
end
end

Expand All @@ -41,7 +41,7 @@ def delete(id)

def perform_indexes
current_collection = PsulibTraject::SolrManager.new.current_collection
target_files = Dir.glob("#{hourlies_directory}/**/*.m*rc").sort
target_files = Dir.glob("#{incremental_directory}/**/*.m*rc").sort

indexed_files = redis.keys("#{current_collection}:*").map { |e| e.gsub("#{current_collection}:", '') }
files_to_index = target_files - indexed_files
Expand All @@ -54,7 +54,7 @@ def perform_indexes
files_to_index.each do |file_name|
indexer.logger.info "marking #{file_name} as done"
redis.set("#{current_collection}:#{file_name}", true)
redis.expire("#{current_collection}:#{file_name}", ConfigSettings.hourlies_skip_expire_seconds.to_i)
redis.expire("#{current_collection}:#{file_name}", ConfigSettings.incremental_skip_expire_seconds.to_i)
end
end
end
Expand Down
20 changes: 10 additions & 10 deletions lib/tasks/traject.rake
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@
# frozen_string_literal: true

namespace :traject do
desc 'Index a file or folder of files async with sidekiq'
task :index_async, [:path, :collection] do |_task, args|
PsulibTraject::Workers::Indexer.perform_async(args.path, args.collection)
end

desc 'Index a file or folder of files without sidekiq'
desc 'Index a file or folder of files'
# Indexes the file or folder at `path` in-process via Indexer.perform_now.
# `:collection` is declared as an optional second task argument so that
# `args.collection` can actually be supplied on the command line
# (e.g. `rake "traject:index[/path/to/file.mrc,my_collection]"`); when it is
# omitted, `args.collection` is nil, exactly as before.
task :index, [:path, :collection] do |_task, args|
PsulibTraject::Workers::Indexer.perform_now(args.path, args.collection)
end

desc 'Run Hourlies'
desc 'Run incrementals'
task :incrementals do
PsulibTraject::Workers::IncrementalIndexer.perform_now
end

desc 'Run incrementals as hourlies'
task :hourlies do
PsulibTraject::Workers::HourlyIndexer.perform_now
PsulibTraject::Workers::IncrementalIndexer.perform_now
end

desc 'Clear redis of hourly skip list'
task :clear_hourlies do
desc 'Clear redis of incremental skip list'
task :clear_incrementals do
current_collection = PsulibTraject::SolrManager.new.current_collection
redis = Redis.new
redis.keys("#{current_collection}:*").map { |key| redis.del(key) }
Expand Down
7 changes: 0 additions & 7 deletions sidekiq.rb

This file was deleted.

File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

require 'spec_helper'

RSpec.describe PsulibTraject::Workers::HourlyIndexer do
RSpec.describe PsulibTraject::Workers::IncrementalIndexer do
let(:indexer) { described_class }

before(:all) do
Expand Down Expand Up @@ -42,17 +42,6 @@
)
end

it 'submits jobs for each hourly file' do
indexer.perform_async
expect(indexer).to have_enqueued_sidekiq_job
end

it 'increases the size of the job queue' do
expect {
indexer.perform_async
}.to change(indexer.jobs, :size).by(1)
end

it 'performs Indexer jobs' do
indexer.perform_now
expect(WebMock).to have_requested(
Expand All @@ -67,15 +56,6 @@
:post, "http://#{ConfigSettings.solr.host}:#{ConfigSettings.solr.port}/solr/psul_catalog/update/json"
)
.with(body: '{"delete":"1235"}').times(1)
expect(PsulibTraject::Workers::Indexer.jobs.size).to eq(0)
end

it 'does not perform the job a second time' do
indexer.perform_now
expect(WebMock).to have_requested(
:post, "http://#{ConfigSettings.solr.host}:#{ConfigSettings.solr.port}/solr/psul_catalog/update/json"
).times(0)
expect(PsulibTraject::Workers::Indexer.jobs.size).to eq(0)
end
end
end
16 changes: 0 additions & 16 deletions spec/support/sidekiq.rb

This file was deleted.

0 comments on commit 96eb866

Please sign in to comment.