Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

A proposal for cache validation #16

Merged
merged 2 commits into from
Dec 5, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions lidbox/data/steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"""
import collections
import io
import json
import logging
import os
import shutil
Expand Down Expand Up @@ -391,6 +392,28 @@ def cache(ds, directory=None, batch_size=1, cache_key=None):
.unbatch())


def validate_cache(dataframe, path, cache_key):
    """
    Validate an existing cache against the given dataframe, or record metadata for a new one.

    The column keys and the shape of the dataframe are stored as JSON in the file
    ``{path}/{cache_key}_meta.json``. If that file already exists, its contents are
    compared to the keys and shape of the given dataframe. Otherwise the file is
    created so that later calls have something to validate against.

    Args:
        dataframe: Object exposing ``columns.to_list()`` and ``shape``.
        path: Directory in which the metadata file is read or written.
        cache_key: Prefix of the metadata file name.

    Raises:
        AssertionError: If the stored keys or shape differ from those of ``dataframe``.
    """
    cache_file = f"{path}/{cache_key}_meta.json"
    new_keys = dataframe.columns.to_list()
    # JSON has no tuple type, so a stored shape is always read back as a list.
    new_shape = list(dataframe.shape)

    if not os.path.exists(cache_file):
        logger.info(f"Previous cache does not exist. Saving dataframe keys and shape to {cache_file} for validation.")
        with open(cache_file, 'w', encoding='utf-8') as outfile:
            json.dump({"keys": new_keys, "shape": new_shape}, outfile, indent=2)
        return

    with open(cache_file, 'r', encoding='utf-8') as infile:
        existing_values = json.load(infile)

    # Raise explicitly instead of using ``assert``: assert statements are removed
    # when Python runs with -O, which would let a stale cache pass validation.
    # AssertionError is kept so callers see the same exception type as before.
    if existing_values["keys"] != new_keys:
        raise AssertionError(
            f"Cache validation failed, old keys {existing_values['keys']} vs. new {new_keys}"
        )
    if existing_values["shape"] != new_shape:
        raise AssertionError(
            f"Cache validation failed, old shape {existing_values['shape']} vs. new {dataframe.shape}"
        )
    logger.info("Cache validation passed.")


def compute_rms_vad(ds, strength, vad_frame_length_ms, min_non_speech_length_ms=0):
"""
Compute root mean square based voice activity detection.
Expand Down