Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement single file save/load for index. #492

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion include/abstract_data_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,14 @@ template <typename data_t> class AbstractDataStore
virtual ~AbstractDataStore() = default;

// Return number of points returned
virtual location_t load(const std::string &filename) = 0;
virtual location_t load(const std::string &filename, size_t offset) = 0;

// Why does store take num_pts? Since store only has capacity, but we allow
// resizing we can end up in a situation where the store has spare capacity.
// To optimize disk utilization, we pass the number of points that are "true"
// points, so that the store can discard the empty locations before saving.
virtual size_t save(const std::string &filename, const location_t num_pts) = 0;
virtual size_t save(std::ofstream &writer, const location_t num_pts, size_t offset) = 0;

DISKANN_DLLEXPORT virtual location_t capacity() const;

Expand Down
7 changes: 5 additions & 2 deletions include/abstract_graph_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,14 @@ class AbstractGraphStore
virtual ~AbstractGraphStore() = default;

// returns tuple of <nodes_read, start, num_frozen_points>
virtual std::tuple<uint32_t, uint32_t, size_t> load(const std::string &index_path_prefix,
const size_t num_points) = 0;
virtual std::tuple<uint32_t, uint32_t, size_t> load(const std::string &index_path_prefix, const size_t num_points,
size_t offset) = 0;
virtual int store(const std::string &index_path_prefix, const size_t num_points, const size_t num_fz_points,
const uint32_t start) = 0;

virtual int store(std::ofstream &writer, const size_t num_points, const size_t num_fz_points, const uint32_t start,
size_t offset) = 0;

// not synchronised, user should use lock when necvessary.
virtual const std::vector<location_t> &get_neighbours(const location_t i) const = 0;
virtual void add_neighbour(const location_t i, location_t neighbour_id) = 0;
Expand Down
1 change: 1 addition & 0 deletions include/distance.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once
#include "windows_customizations.h"
#include <cstdint>
#include <cstring>

namespace diskann
Expand Down
9 changes: 5 additions & 4 deletions include/in_mem_data_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,9 @@ template <typename data_t> class InMemDataStore : public AbstractDataStore<data_
InMemDataStore(const location_t capacity, const size_t dim, std::unique_ptr<Distance<data_t>> distance_fn);
virtual ~InMemDataStore();

virtual location_t load(const std::string &filename) override;
virtual size_t save(const std::string &filename, const location_t num_points) override;
virtual location_t load(const std::string &filename, size_t offset = 0) override;
virtual size_t save(const std::string &filename, const location_t num_pts) override;
virtual size_t save(std::ofstream &writer, const location_t num_pts, size_t offset) override;

virtual size_t get_aligned_dim() const override;

Expand Down Expand Up @@ -59,9 +60,9 @@ template <typename data_t> class InMemDataStore : public AbstractDataStore<data_
virtual location_t expand(const location_t new_size) override;
virtual location_t shrink(const location_t new_size) override;

virtual location_t load_impl(const std::string &filename);
virtual location_t load_impl(const std::string &filename, size_t offset);
#ifdef EXEC_ENV_OLS
virtual location_t load_impl(AlignedFileReader &reader);
virtual location_t load_impl(AlignedFileReader &reader, size_t offset);
#endif

private:
Expand Down
17 changes: 10 additions & 7 deletions include/in_mem_graph_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,12 @@ class InMemGraphStore : public AbstractGraphStore
InMemGraphStore(const size_t total_pts, const size_t reserve_graph_degree);

// returns tuple of <nodes_read, start, num_frozen_points>
virtual std::tuple<uint32_t, uint32_t, size_t> load(const std::string &index_path_prefix,
const size_t num_points) override;
virtual std::tuple<uint32_t, uint32_t, size_t> load(const std::string &index_path_prefix, const size_t num_points,
size_t offset) override;
virtual int store(const std::string &index_path_prefix, const size_t num_points, const size_t num_frozen_points,
const uint32_t start) override;

virtual int store(std::ofstream &writer, const size_t num_points, const size_t num_fz_points, const uint32_t start,
size_t offset) override;
virtual const std::vector<location_t> &get_neighbours(const location_t i) const override;
virtual void add_neighbour(const location_t i, location_t neighbour_id) override;
virtual void clear_neighbours(const location_t i) override;
Expand All @@ -33,13 +34,15 @@ class InMemGraphStore : public AbstractGraphStore
virtual uint32_t get_max_observed_degree() override;

protected:
virtual std::tuple<uint32_t, uint32_t, size_t> load_impl(const std::string &filename, size_t expected_num_points);
virtual std::tuple<uint32_t, uint32_t, size_t> load_impl(const std::string &filename, size_t expected_num_points,
size_t offset);
#ifdef EXEC_ENV_OLS
virtual std::tuple<uint32_t, uint32_t, size_t> load_impl(AlignedFileReader &reader, size_t expected_num_points);
virtual std::tuple<uint32_t, uint32_t, size_t> load_impl(AlignedFileReader &reader, size_t expected_num_points,
size_t offset);
#endif

int save_graph(const std::string &index_path_prefix, const size_t active_points, const size_t num_frozen_points,
const uint32_t start);
int save_graph(std::ofstream &writer, const size_t active_points, const size_t num_frozen_points,
const uint32_t start, size_t offset);

private:
size_t _max_range_of_graph = 0;
Expand Down
33 changes: 23 additions & 10 deletions include/index.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,14 @@

namespace diskann
{
// This struct is used for storing metadata for save_as_one_file version 1.
struct SaveLoadMetaDataV1
{
uint64_t data_offset;
uint64_t delete_list_offset;
uint64_t tags_offset;
uint64_t graph_offset;
};

inline double estimate_ram_usage(size_t size, uint32_t dim, uint32_t datasize, uint32_t degree)
{
Expand Down Expand Up @@ -57,7 +65,9 @@ template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t> clas
const size_t num_frozen_pts = 0, const bool dynamic_index = false,
const bool enable_tags = false, const bool concurrent_consolidate = false,
const bool pq_dist_build = false, const size_t num_pq_chunks = 0,
const bool use_opq = false, const bool filtered_index = false);
const bool use_opq = false, const bool filtered_index = false,
bool save_as_one_file = false, uint64_t save_as_one_file_version = 1,
bool load_from_one_file = false, uint64_t load_from_one_file_version = 1);

DISKANN_DLLEXPORT Index(const IndexConfig &index_config, std::unique_ptr<AbstractDataStore<T>> data_store,
std::unique_ptr<AbstractGraphStore> graph_store);
Expand Down Expand Up @@ -313,15 +323,15 @@ template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t> clas
DISKANN_DLLEXPORT size_t save_tags(std::string filename);
DISKANN_DLLEXPORT size_t save_delete_list(const std::string &filename);
#ifdef EXEC_ENV_OLS
DISKANN_DLLEXPORT size_t load_graph(AlignedFileReader &reader, size_t expected_num_points);
DISKANN_DLLEXPORT size_t load_data(AlignedFileReader &reader);
DISKANN_DLLEXPORT size_t load_tags(AlignedFileReader &reader);
DISKANN_DLLEXPORT size_t load_delete_set(AlignedFileReader &reader);
DISKANN_DLLEXPORT size_t load_graph(AlignedFileReader &reader, size_t expected_num_points, size_t offset = 0);
DISKANN_DLLEXPORT size_t load_data(AlignedFileReader &reader, size_t offset = 0);
DISKANN_DLLEXPORT size_t load_tags(AlignedFileReader &reader, size_t offset = 0);
DISKANN_DLLEXPORT size_t load_delete_set(AlignedFileReader &reader, size_t offset = 0);
#else
DISKANN_DLLEXPORT size_t load_graph(const std::string filename, size_t expected_num_points);
DISKANN_DLLEXPORT size_t load_data(std::string filename0);
DISKANN_DLLEXPORT size_t load_tags(const std::string tag_file_name);
DISKANN_DLLEXPORT size_t load_delete_set(const std::string &filename);
DISKANN_DLLEXPORT size_t load_graph(const std::string filename, size_t expected_num_points, size_t offset = 0);
DISKANN_DLLEXPORT size_t load_data(std::string filename, size_t offset = 0);
DISKANN_DLLEXPORT size_t load_tags(const std::string &filename, size_t offset = 0);
DISKANN_DLLEXPORT size_t load_delete_set(const std::string &filename, size_t offset = 0);
#endif

private:
Expand Down Expand Up @@ -360,7 +370,10 @@ template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t> clas

bool _has_built = false;
bool _saturate_graph = false;
bool _save_as_one_file = false; // plan to support in next version
bool _save_as_one_file; // plan to support filtered index in next version.
uint64_t _save_as_one_file_version; // Version used for save index to single file.
bool _load_from_one_file; // Whether to load index from single file.
uint64_t _load_from_one_file_version; // Version used for save index to single file.
bool _dynamic_index = false;
bool _enable_tags = false;
bool _normalize_vecs = false; // Using normalied L2 for cosine.
Expand Down
42 changes: 39 additions & 3 deletions include/index_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ struct IndexConfig
bool concurrent_consolidate;
bool use_opq;
bool filtered_index;
bool save_as_one_file;
uint64_t save_as_one_file_version;
bool load_from_one_file;
uint64_t load_from_one_file_version;

size_t num_pq_chunks;
size_t num_frozen_pts;
Expand All @@ -45,12 +49,15 @@ struct IndexConfig
IndexConfig(DataStoreStrategy data_strategy, GraphStoreStrategy graph_strategy, Metric metric, size_t dimension,
size_t max_points, size_t num_pq_chunks, size_t num_frozen_points, bool dynamic_index, bool enable_tags,
bool pq_dist_build, bool concurrent_consolidate, bool use_opq, bool filtered_index,
std::string &data_type, const std::string &tag_type, const std::string &label_type,
std::shared_ptr<IndexWriteParameters> index_write_params,
bool save_as_one_file, uint64_t save_as_one_file_version, bool load_from_one_file,
uint64_t load_from_one_file_version, std::string &data_type, const std::string &tag_type,
const std::string &label_type, std::shared_ptr<IndexWriteParameters> index_write_params,
std::shared_ptr<IndexSearchParams> index_search_params)
: data_strategy(data_strategy), graph_strategy(graph_strategy), metric(metric), dimension(dimension),
max_points(max_points), dynamic_index(dynamic_index), enable_tags(enable_tags), pq_dist_build(pq_dist_build),
concurrent_consolidate(concurrent_consolidate), use_opq(use_opq), filtered_index(filtered_index),
save_as_one_file(save_as_one_file), save_as_one_file_version(save_as_one_file_version),
load_from_one_file(load_from_one_file), load_from_one_file_version(load_from_one_file_version),
num_pq_chunks(num_pq_chunks), num_frozen_pts(num_frozen_points), label_type(label_type), tag_type(tag_type),
data_type(data_type), index_write_params(index_write_params), index_search_params(index_search_params)
{
Expand Down Expand Up @@ -194,6 +201,30 @@ class IndexConfigBuilder
return *this;
}

IndexConfigBuilder &with_save_as_single_file(bool save_as_one_file)
{
this->_save_as_one_file = save_as_one_file;
return *this;
}

IndexConfigBuilder &with_save_as_single_file_version(uint64_t save_as_one_file_version)
{
this->_save_as_one_file_version = save_as_one_file_version;
return *this;
}

IndexConfigBuilder &with_load_from_single_file(bool load_from_one_file)
{
this->_load_from_one_file = load_from_one_file;
return *this;
}

IndexConfigBuilder &with_load_from_single_file_version(uint64_t load_from_one_file_version)
{
this->_save_as_one_file_version = load_from_one_file_version;
return *this;
}

IndexConfig build()
{
if (_data_type == "" || _data_type.empty())
Expand All @@ -219,7 +250,8 @@ class IndexConfigBuilder

return IndexConfig(_data_strategy, _graph_strategy, _metric, _dimension, _max_points, _num_pq_chunks,
_num_frozen_pts, _dynamic_index, _enable_tags, _pq_dist_build, _concurrent_consolidate,
_use_opq, _filtered_index, _data_type, _tag_type, _label_type, _index_write_params,
_use_opq, _filtered_index, _save_as_one_file, _save_as_one_file_version, _load_from_one_file,
_load_from_one_file_version, _data_type, _tag_type, _label_type, _index_write_params,
_index_search_params);
}

Expand All @@ -240,6 +272,10 @@ class IndexConfigBuilder
bool _concurrent_consolidate = false;
bool _use_opq = false;
bool _filtered_index{defaults::HAS_LABELS};
bool _save_as_one_file;
uint64_t _save_as_one_file_version;
bool _load_from_one_file;
uint64_t _load_from_one_file_version;

size_t _num_pq_chunks = 0;
size_t _num_frozen_pts{defaults::NUM_FROZEN_POINTS_STATIC};
Expand Down
43 changes: 30 additions & 13 deletions include/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -714,13 +714,8 @@ inline void open_file_to_write(std::ofstream &writer, const std::string &filenam
}
}

template <typename T>
inline size_t save_bin(const std::string &filename, T *data, size_t npts, size_t ndims, size_t offset = 0)
template <typename T> inline size_t save_bin(std::ofstream &writer, T *data, size_t npts, size_t ndims, size_t offset)
{
std::ofstream writer;
open_file_to_write(writer, filename);

diskann::cout << "Writing bin: " << filename.c_str() << std::endl;
writer.seekp(offset, writer.beg);
int npts_i32 = (int)npts, ndims_i32 = (int)ndims;
size_t bytes_written = npts * ndims * sizeof(T) + 2 * sizeof(uint32_t);
Expand All @@ -730,11 +725,22 @@ inline size_t save_bin(const std::string &filename, T *data, size_t npts, size_t
<< std::endl;

writer.write((char *)data, npts * ndims * sizeof(T));
writer.close();
diskann::cout << "Finished writing bin." << std::endl;
return bytes_written;
}

template <typename T>
inline size_t save_bin(const std::string &filename, T *data, size_t npts, size_t ndims, size_t offset = 0)
{
std::ofstream writer;
open_file_to_write(writer, filename);
diskann::cout << "Writing bin file: " << filename.c_str() << std::endl;
size_t bytes_written = save_bin<T>(writer, data, npts, ndims, offset);
writer.close();
diskann::cout << "Close file " << filename << "." << std::endl;
return bytes_written;
}

inline void print_progress(double percentage)
{
int val = (int)(percentage * 100);
Expand Down Expand Up @@ -938,12 +944,11 @@ template <typename T> void save_Tvecs(const char *filename, T *data, size_t npts
writer.write((char *)cur_pt, ndims * sizeof(T));
}
}

template <typename T>
inline size_t save_data_in_base_dimensions(const std::string &filename, T *data, size_t npts, size_t ndims,
size_t aligned_dim, size_t offset = 0)
inline size_t save_data_in_base_dimensions(std::ofstream &writer, T *data, size_t npts, size_t ndims,
size_t aligned_dim, size_t offset)
{
std::ofstream writer; //(filename, std::ios::binary | std::ios::out);
open_file_to_write(writer, filename);
int npts_i32 = (int)npts, ndims_i32 = (int)ndims;
size_t bytes_written = 2 * sizeof(uint32_t) + npts * ndims * sizeof(T);
writer.seekp(offset, writer.beg);
Expand All @@ -953,10 +958,21 @@ inline size_t save_data_in_base_dimensions(const std::string &filename, T *data,
{
writer.write((char *)(data + i * aligned_dim), ndims * sizeof(T));
}
writer.close();
return bytes_written;
}

template <typename T>
inline size_t save_data_in_base_dimensions(const std::string &filename, T *data, size_t npts, size_t ndims,
size_t aligned_dim, size_t offset = 0)
{
std::ofstream writer; //(filename, std::ios::binary | std::ios::out);
open_file_to_write(writer, filename);
size_t file_size = save_data_in_base_dimensions(writer, data, npts, ndims, aligned_dim, offset);
writer.close();

return file_size;
}

template <typename T>
inline void copy_aligned_data_from_file(const char *bin_file, T *&data, size_t &npts, size_t &dim,
const size_t &rounded_dim, size_t offset = 0)
Expand All @@ -968,11 +984,12 @@ inline void copy_aligned_data_from_file(const char *bin_file, T *&data, size_t &
throw diskann::ANNException("Null pointer passed to copy_aligned_data_from_file function", -1, __FUNCSIG__,
__FILE__, __LINE__);
}

std::ifstream reader;
reader.exceptions(std::ios::badbit | std::ios::failbit);
reader.open(bin_file, std::ios::binary);
reader.seekg(offset, reader.beg);

reader.seekg(offset, reader.beg);
int npts_i32, dim_i32;
reader.read((char *)&npts_i32, sizeof(int));
reader.read((char *)&dim_i32, sizeof(int));
Expand Down
3 changes: 3 additions & 0 deletions src/distance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -730,4 +730,7 @@ template DISKANN_DLLEXPORT class SlowDistanceL2<float>;
template DISKANN_DLLEXPORT class SlowDistanceL2<int8_t>;
template DISKANN_DLLEXPORT class SlowDistanceL2<uint8_t>;

template DISKANN_DLLEXPORT Distance<float> *get_distance_function(Metric m);
template DISKANN_DLLEXPORT Distance<int8_t> *get_distance_function(Metric m);
template DISKANN_DLLEXPORT Distance<uint8_t> *get_distance_function(Metric m);
} // namespace diskann
Loading