# 编译
cmake -DCMAKE_BUILD_TYPE=Debug -B build | |
cmake --build build -j 8 |
出现报错 [CMake is not able to find BOOST libraries](https://stackoverflow.com/questions/24173330/cmake-is-not-able-to-find-boost-libraries)
, 解决 sudo apt-get install cmake libblkid-dev e2fslibs-dev libboost-all-dev libaudit-dev
。
# 基于内存方案
构建索引的参数:
1data_type、2dist_fn、3data_file、4index_path_prefix、5max_degree、6size_of_build_search_list、7search_DRAM_budget、8build_DRAM_budget、9 num_thread、10PQ_disk_bytes
# Index 构建函数
template<typename T, typename TagT> | |
Index<T, TagT>::Index(Metric m, const size_t dim, const size_t max_points, | |
const bool dynamic_index, | |
const Parameters &indexParams, | |
const Parameters &searchParams, const bool enable_tags, | |
const bool support_eager_delete) | |
: Index(m, dim, max_points, dynamic_index, enable_tags, support_eager_delete) { // Thank you C++ 11! | |
_indexingQueueSize = indexParams.Get<uint32_t>("L"); | |
_indexingRange = indexParams.Get<uint32_t>("R"); | |
_indexingMaxC = indexParams.Get<uint32_t>("C"); | |
_indexingAlpha = indexParams.Get<float>("alpha"); | |
uint32_t num_threads_srch = searchParams.Get<uint32_t>("num_threads"); | |
uint32_t num_threads_indx = indexParams.Get<uint32_t>("num_threads"); | |
uint32_t num_threads = diskann_max(num_threads_srch, num_threads_indx); | |
uint32_t search_l = searchParams.Get<uint32_t>("L"); | |
initialize_query_scratch(num_threads, search_l, _indexingQueueSize, | |
_indexingRange, dim); | |
} | |
template<typename T, typename TagT> | |
Index<T, TagT>::Index(Metric m, const size_t dim, const size_t max_points, | |
const bool dynamic_index, | |
const bool enable_tags, const bool support_eager_delete) | |
: _dist_metric(m), _dim(dim), _max_points(max_points), | |
_dynamic_index(dynamic_index), _enable_tags(enable_tags), | |
_support_eager_delete(support_eager_delete) { | |
if (dynamic_index && !enable_tags) { | |
throw diskann::ANNException( | |
"ERROR: Eager Deletes must have Dynamic Indexing enabled.", -1, | |
__FUNCSIG__, __FILE__, __LINE__); | |
diskann::cerr | |
<< "WARNING: Dynamic Indices must have tags enabled. Auto-enabling." | |
<< std::endl; | |
_enable_tags = true; | |
} | |
if (support_eager_delete && !dynamic_index) { | |
diskann::cerr << "ERROR: Eager Deletes must have Dynamic Indexing " | |
"enabled. Exitting." | |
<< std::endl; | |
throw diskann::ANNException( | |
"ERROR: Eager deletes are possible only if dynamic indexing is " | |
"enabled. Exiting.", | |
-1, __FUNCSIG__, __FILE__, __LINE__); | |
} | |
// data is stored to _nd * aligned_dim matrix with necessary | |
// zero-padding | |
_aligned_dim = ROUND_UP(_dim, 8); | |
if (dynamic_index) { | |
_num_frozen_pts = 1; | |
} | |
// Sanity check. While logically it is correct, max_points ==0 causes | |
// downstream problems. | |
if (_max_points == 0) { | |
_max_points = 1; | |
} | |
alloc_aligned(((void **) &_data), | |
(_max_points + _num_frozen_pts) * _aligned_dim * sizeof(T), | |
8 * sizeof(T)); | |
std::memset(_data, 0, | |
(_max_points + _num_frozen_pts) * _aligned_dim * sizeof(T)); | |
_ep = (unsigned) _max_points; | |
//_final_graph.reserve(_max_points + _num_frozen_pts); | |
_final_graph.resize(_max_points + _num_frozen_pts); | |
if (_support_eager_delete) { | |
_in_graph.reserve(_max_points + _num_frozen_pts); | |
_in_graph.resize(_max_points + _num_frozen_pts); | |
} | |
if (m == diskann::Metric::COSINE && std::is_floating_point<T>::value) { | |
// This is safe because T is float inside the if block. | |
this->_distance = (Distance<T> *) new AVXNormalizedCosineDistanceFloat(); | |
this->_normalize_vecs = true; | |
std::cout<<"Normalizing vectors and using L2 for cosine AVXNormalizedCosineDistanceFloat()." << std::endl; | |
// std::cout << "Need to add functionality for COSINE metric" << std::endl; | |
} else { | |
this->_distance = get_distance_function<T>(m); | |
} | |
_locks = std::vector<std::mutex>(_max_points + _num_frozen_pts); | |
if (_support_eager_delete) | |
_locks_in = std::vector<std::mutex>(_max_points + _num_frozen_pts); | |
_width = 0; | |
} | |
template<typename T, typename TagT> | |
Index<T, TagT>::~Index() { | |
// Ensure that no other activity is happening before dtor() | |
std::unique_lock<std::shared_timed_mutex> ul(_update_lock); | |
std::unique_lock<std::shared_timed_mutex> tul(_tag_lock); | |
std::unique_lock<std::shared_timed_mutex> tdl(_delete_lock); | |
for (auto &lock : _locks) { | |
LockGuard lg(lock); | |
} | |
for (auto &lock : _locks_in) { | |
LockGuard lg(lock); | |
} | |
if (this->_distance != nullptr) { | |
delete this->_distance; | |
this->_distance = nullptr; | |
} | |
if (this->_data != nullptr) { | |
aligned_free(this->_data); | |
this->_data = nullptr; | |
} | |
while (!_query_scratch.empty()) { | |
auto val = _query_scratch.pop(); | |
while (val.indices == nullptr) { | |
_query_scratch.wait_for_push_notify(); | |
val = _query_scratch.pop(); | |
} | |
val.destroy(); | |
} | |
} | |
_locks = std::vector<std::mutex>(_max_points + _num_frozen_pts); | |
if (_support_eager_delete) | |
_locks_in = std::vector<std::mutex>(_max_points + _num_frozen_pts); | |
_width = 0; | |
} |
search_invocation 是什么?_dynamic_index 是什么?
选边策略:
template<typename T, typename TagT> | |
void Index<T, TagT>::occlude_list(std::vector<Neighbor> &pool, | |
const float alpha, const unsigned degree, | |
const unsigned maxc, | |
std::vector<Neighbor> &result, | |
std::vector<float> & occlude_factor) { | |
if (pool.empty()) | |
return; | |
assert(std::is_sorted(pool.begin(), pool.end())); | |
assert(!pool.empty()); | |
float cur_alpha = 1; | |
while (cur_alpha <= alpha && result.size() < degree) { | |
unsigned start = 0; | |
float eps = cur_alpha + 0.01f; // used for MIPS, where we store a value | |
// of eps in cur_alpha to | |
// denote pruned out entries which we can skip in later rounds. | |
while (result.size() < degree && (start) < pool.size() && start < maxc) { | |
auto &p = pool[start]; | |
if (occlude_factor[start] > cur_alpha) { | |
start++; | |
continue; | |
} | |
occlude_factor[start] = std::numeric_limits<float>::max(); | |
result.push_back(p); | |
for (unsigned t = start + 1; t < pool.size() && t < maxc; t++) { | |
if (occlude_factor[t] > alpha) | |
continue; | |
float djk = _distance->compare( | |
_data + _aligned_dim * (size_t) pool[t].id, | |
_data + _aligned_dim * (size_t) p.id, (unsigned) _aligned_dim); | |
if (_dist_metric == diskann::Metric::L2 || | |
_dist_metric == diskann::Metric::COSINE) { | |
occlude_factor[t] = | |
//(std::max)(occlude_factor[t], pool[t].distance / djk); | |
diskann_max(occlude_factor[t], pool[t].distance / djk); | |
} else if (_dist_metric == | |
diskann::Metric::INNER_PRODUCT) { // stylized rules for | |
// inner product since | |
// we want max instead | |
// of min distance | |
float x = -pool[t].distance; | |
float y = -djk; | |
if (y > cur_alpha * x) { | |
occlude_factor[t] = | |
/* (std::max)*/ diskann_max(occlude_factor[t], eps); | |
} | |
} | |
} | |
start++; | |
} | |
cur_alpha *= 1.2; | |
} | |
} |
这里有一个很有意思的点,cur_alpha 从 1 开始递增迭代来计算。
# build_disk
建立索引入口:
int build_disk_index(const char *dataFilePath, const char *indexFilePath, | |
const char * indexBuildParameters, | |
diskann::Metric compareMetric) |
dataFilePath: base_bin;indexFilePath: 保存索引路径;indexBuildParameter:构建索引的参数;compareMetric:一般为 l2。
这里有一系列的路径,我先记录下来:
std::string base_file(dataFilePath); // base
std::string data_file_to_use = base_file; // base
std::string index_prefix_path(indexFilePath); // index 前缀
std::string pq_pivots_path = index_prefix_path + "_pq_pivots.bin"; // 保存pivots
std::string pq_compressed_vectors_path =
index_prefix_path + "_pq_compressed.bin"; // 保存压缩的pq
std::string mem_index_path = index_prefix_path + "_mem.index"; // 保存内存索引
std::string disk_index_path = index_prefix_path + "_disk.index";
std::string medoids_path = disk_index_path + "_medoids.bin";
std::string centroids_path = disk_index_path + "_centroids.bin";
std::string sample_base_prefix = index_prefix_path + "_sample";
划分 pq 块:
double final_index_ram_limit = get_memory_budget(param_list[2]); | |
size_t num_pq_chunks = | |
(size_t)(std::floor)(_u64(final_index_ram_limit / points_num)); |
随机采样训练数据(0,1)分布:
double p_val = ((double) MAX_PQ_TRAINING_SET_SIZE / (double) points_num); // 这里 max 是 256000L,也就是说最多训练这么多数据 | |
gen_random_slice<T>(data_file_to_use.c_str(), p_val, train_data, train_size, | |
train_dim); // 按照 p_val 进行采样,因为这里只是使用了 10000 个数据,所以 p>1 直接取全部数据进行训练 |
PQ 训练
generate_pq_pivots(train_data, train_size, (uint32_t) dim, 256, | |
(uint32_t) num_pq_chunks, NUM_KMEANS_REPS, | |
pq_pivots_path, make_zero_mean); //l2 make_zero_mean 为 true | |
int generate_pq_pivots(const float *passed_train_data, size_t num_train, | |
unsigned dim, unsigned num_centers, | |
unsigned num_pq_chunks, unsigned max_k_means_reps, | |
std::string pq_pivots_path, bool make_zero_mean) // 固定 center 为 256,max_k_means_reps 为 12 |
如果 pq_pivots_path 文件存在,那么直接读入,反之则需要进行训练。
l2 计算预处理,可以将 traindata 数据更加集中与 center,由于 sift 数据集向量为 128 维,使用 32 个 pq_chunks,可以被等分,每一个 chunks 负责 4 个维度,又因为 pq center 为 256,所以 8 位 1byte 就可以表示,4 维度会降维到 1 维,并且有 float 变为 uint8。
size_t cur_chunk_size = chunk_offsets[i + 1] - chunk_offsets[i]; // 这个直接为 4 | |
std::unique_ptr<float[]> cur_pivot_data = | |
std::make_unique<float[]>(num_centers * cur_chunk_size); | |
std::unique_ptr<float[]> cur_data = | |
std::make_unique<float[]>(num_train * cur_chunk_size); | |
std::unique_ptr<uint32_t[]> closest_center = | |
std::make_unique<uint32_t[]>(num_train); | |
// 从 traindata 中导入对应维度的数据到 cur_data,也就是 4 维的数据 |
keams 选择 pivots:
kmeans::kmeanspp_selecting_pivots(cur_data.get(), num_train, cur_chunk_size, | |
cur_pivot_data.get(), num_centers); | |
void kmeanspp_selecting_pivots(float* data, size_t num_points, size_t dim, | |
float* pivot_data, size_t num_centers) |
首先随机选择一个初始的点;
roll init_id | |
picked.push_back(init_id); | |
std::memcpy(pivot_data, data + init_id * dim, dim * sizeof(float)); // 拷贝到 cur_pivot_data 中,dim 这里是 4 |
计算所有点与该点的距离,然后用一个随机的迭代的方式获取其他 256 个随机点,说实话,这里我不是很明白,他为什么这么做。。。。
选择了 256 个 center 并保存在 cur_pivot_data 中之后,用 lloyds 算法进行训练:
kmeans::run_lloyds(cur_data.get(), num_train, cur_chunk_size, | |
cur_pivot_data.get(), num_centers, max_k_means_reps, | |
NULL, closest_center.get()); | |
float run_lloyds(float* data, size_t num_points, size_t dim, float* centers, | |
const size_t num_centers, const size_t max_reps, | |
std::vector<size_t>* closest_docs, | |
uint32_t* closest_center) //cloest_docs 就是说这个 center 内有哪些节点,类似于倒排索引 | |
float* docs_l2sq = new float[num_points]; | |
math_utils::compute_vecs_l2sq(docs_l2sq, data, num_points, dim); | |
// 迭代训练 | |
lloyds_iter(data, num_points, dim, centers, num_centers, | |
docs_l2sq, closest_docs, closest_center); | |
float lloyds_iter(float* data, size_t num_points, size_t dim, float* centers, | |
size_t num_centers, float* docs_l2sq, | |
std::vector<size_t>* closest_docs, | |
uint32_t*& closest_center) |
开始进行计算 closest_centers:
math_utils::compute_closest_centers(data, num_points, dim, centers, | |
num_centers, 1, closest_center, | |
closest_docs, docs_l2sq); | |
void compute_closest_centers(float* data, size_t num_points, size_t dim, | |
float* pivot_data, size_t num_centers, size_t k, | |
uint32_t* closest_centers_ivf, | |
std::vector<size_t>* inverted_index, | |
float* pts_norms_squared) | |
uint32_t* closest_centers = new uint32_t[PAR_BLOCK_SIZE * k]; | |
float* distance_matrix = new float[num_centers * PAR_BLOCK_SIZE]; | |
math_utils::compute_closest_centers_in_block( | |
data, num_ponts, dim, pivot_data, num_centers, | |
pts_norms_squared, pivs_norms_squared, closest_centers, distance_matrix, | |
k); | |
void compute_closest_centers_in_block( | |
const float* const data, const size_t num_points, const size_t dim, | |
const float* const centers, const size_t num_centers, | |
const float* const docs_l2sq, const float* const centers_l2sq, | |
uint32_t* center_index, float* const dist_matrix, size_t k) |
这里他用了 MKL 的函数进行矩阵计算,说实话这一块不是很懂,先记下来以后有需要再去看:https://murphypei.github.io/blog/2019/09/cblas-gemm-gemv
计算好所有 pivot,以及 cloest_center 之后就可以进行保存了:
for (uint64_t j = 0; j < num_centers; j++) { | |
std::memcpy(full_pivot_data.get() + j * dim + chunk_offsets[i], | |
cur_pivot_data.get() + j * cur_chunk_size, | |
cur_chunk_size * sizeof(float)); // 保存全精度码本 | |
} | |
diskann::save_bin<float>(pq_pivots_path.c_str(), full_pivot_data.get(), | |
(size_t) num_centers, dim); | |
std::string centroids_path = pq_pivots_path + "_centroid.bin"; | |
diskann::save_bin<float>(centroids_path.c_str(), centroid.get(), (size_t) dim, | |
1); // 点集的中心向量 | |
std::string rearrangement_path = pq_pivots_path + "_rearrangement_perm.bin"; | |
diskann::save_bin<uint32_t>(rearrangement_path.c_str(), rearrangement.data(), | |
rearrangement.size(), 1); // 就是维度是按什么顺序进行 rearrangement | |
std::string chunk_offsets_path = pq_pivots_path + "_chunk_offsets.bin"; | |
diskann::save_bin<uint32_t>(chunk_offsets_path.c_str(), chunk_offsets.data(), | |
chunk_offsets.size(), 1); // 每个 pq chunk 负责的原始维度数目 |
回到构建索引函数中:
generate_pq_pivots(train_data, train_size, (uint32_t) dim, 256, | |
(uint32_t) num_pq_chunks, NUM_KMEANS_REPS, | |
pq_pivots_path, make_zero_mean); | |
generate_pq_data_from_pivots<T>(data_file_to_use.c_str(), 256, | |
(uint32_t) num_pq_chunks, pq_pivots_path, | |
pq_compressed_vectors_path); |
这个 generate_pq_data_from_pivots 就是根据 pivot 把向量进行 qp 压缩。注意上面的 pivot 是用采样数据进行训练的。
构建索引:
diskann::build_merged_vamana_index<T>( | |
data_file_to_use.c_str(), diskann::Metric::L2, L, R, p_val, | |
indexing_ram_budget, mem_index_path, medoids_path, centroids_path); | |
int build_merged_vamana_index(std::string base_file, | |
diskann::Metric compareMetric, unsigned L, | |
unsigned R, double sampling_rate, | |
double ram_budget, std::string mem_index_path, | |
std::string medoids_file, | |
std::string centroids_file) | |
paras.Set<unsigned>("L", (unsigned) L); | |
paras.Set<unsigned>("R", (unsigned) R); | |
paras.Set<unsigned>("C", 750); // 这个 C 是真的奇奇妙妙的,这个参数确定的是最终裁边的时候侯选池的大小 | |
paras.Set<float>("alpha", 1.2f); // 在这里设置了 alpha | |
paras.Set<unsigned>("num_rnds", 2); | |
paras.Set<bool>("saturate_graph", 1); | |
paras.Set<std::string>("save_path", mem_index_path); |
这里就需要根据内存的预算进行划分,判断需要划分为多少个 part:
int num_parts = | |
partition_with_ram_budget<T>(base_file, sampling_rate, ram_budget, | |
2 * R / 3, merged_index_prefix, 2); | |
// 注意这里的 kbase 参数,这个参数实际上就是说把一个向量 assign 到 kbase 个 cluster 中 | |
template<typename T> | |
int partition_with_ram_budget(const std::string data_file, | |
const double sampling_rate, double ram_budget, | |
size_t graph_degree, | |
const std::string prefix_path, size_t k_base); |
这里划分不是随机进行划分的,代码首先以 part=3 为起始,进行 Kmeans 聚类,把对应的向量放到不同的 part 中。并且它还是生成一个 shard_idmap 这样的文件,记录每个 kmeans 聚类中的点 id *refix_path* + "_subshard-" + std::to_string(i) + "_ids_uint32.bin
。
我们根据每一个 part 进行数据划分:
std::string merged_index_prefix = mem_index_path + "_tempFiles"; | |
std::string shard_base_file = | |
merged_index_prefix + "_subshard-" + std::to_string(p) + ".bin"; | |
std::string shard_ids_file = merged_index_prefix + "_subshard-" + | |
std::to_string(p) + "_ids_uint32.bin"; | |
retrieve_shard_data_from_ids<T>(base_file, shard_ids_file, | |
shard_base_file); | |
template<typename T> | |
int retrieve_shard_data_from_ids(const std::string data_file, | |
std::string idmap_filename, | |
std::string data_filename); | |
std::string shard_index_file = | |
merged_index_prefix + "_subshard-" + std::to_string(p) + "_mem.index"; |
通过 retrieve_shard_data_from_ids 我们得到了部分重叠的采样数据,并把数据放到 shard_base_file。
我们进入到 Index 的类构造函数中,会发现一些参数比如 frozen_pts 这些都不需要考虑。一个需要注意的点就是 diskann 可以支持 dim 不被 8 整除,会有一个 aligned 操作, _aligned_dim = ROUND_UP(_dim, 8);
。
_u64 shard_base_dim, shard_base_pts; | |
get_bin_metadata(shard_base_file, shard_base_pts, shard_base_dim); | |
std::unique_ptr<diskann::Index<T>> _pvamanaIndex = | |
std::unique_ptr<diskann::Index<T>>( | |
new diskann::Index<T>(compareMetric, shard_base_dim, | |
shard_base_pts, false)); // TODO: Single? | |
_pvamanaIndex->build(shard_base_file.c_str(), shard_base_pts, paras); | |
_pvamanaIndex->save(shard_index_file.c_str()); | |
template<typename T, typename TagT> | |
Index<T, TagT>::Index(Metric m, const size_t dim, const size_t max_points, | |
const bool dynamic_index, | |
const bool enable_tags, const bool support_eager_delete) //bool 值全部默认 false | |
: _dist_metric(m), _dim(dim), _max_points(max_points), | |
_dynamic_index(dynamic_index), _enable_tags(enable_tags), | |
_support_eager_delete(support_eager_delete) |
这里 build 有两个重载,但是明明参数和这两个重载函数都不一致。。。最后 gdb 打断点发现进入的是这个 build 函数内,真的奇奇怪怪的:
void Index<T, TagT>::build(const char * filename, | |
const size_t num_points_to_load, | |
Parameters & parameters, | |
const std::vector<TagT> &tags) |
build 这个函数里面其实能看的还是比较少的,大部分代码都在做安全性检查。
我们进入到 link 这个函数里面:
link(parameters); | |
void Index<T, TagT>::link(Parameters ¶meters) | |
unsigned num_threads = parameters.Get<unsigned>("num_threads"); // 注意如果不指定 num_threads,会使用所有可用的 cpu | |
if (num_threads != 0) | |
omp_set_num_threads(num_threads); |
link 操作会执行两次,也就是跑两边 vanama,先把一些参数准备好:
_indexingQueueSize = parameters.Get<unsigned>("L"); // Search list size | |
_indexingRange = parameters.Get<unsigned>("R"); | |
_indexingMaxC = parameters.Get<unsigned>("C"); | |
const float last_round_alpha = parameters.Get<float>("alpha"); | |
unsigned L = _indexingQueueSize; | |
uint32_t num_syncs = | |
(unsigned) DIV_ROUND_UP(_nd + _num_frozen_pts, (64 * 64)); | |
if (num_syncs < 40) // 根据节点数量,可以把当前数据分成多块,虽然并不会并行,但是会让打印进度更加方便: | |
num_syncs = 40; | |
std::vector<unsigned> Lvec; | |
Lvec.push_back(L); | |
Lvec.push_back(L); | |
const unsigned NUM_RNDS = 2; // 两阶段 | |
_indexingAlpha = 1.0f; //index 的 alpha 参数 | |
std::vector<unsigned> visit_order; | |
std::vector<diskann::Neighbor> pool, tmp; | |
tsl::robin_set<unsigned> visited; |
计算 entry_point,并且设置一些时间参数:
_ep = calculate_entry_point(); | |
// 这里是返回离数据集中心最近的点(也是数据集上的点) | |
double sync_time = 0, total_sync_time = 0; | |
double inter_time = 0, total_inter_time = 0; | |
size_t inter_count = 0, total_inter_count = 0; | |
// 一些时间参数 |
这里有多个循环,很容易就绕晕了,最外层是 vanama 次数,然后是切分数据,最后是
for (uint32_t rnd_no = 0; rnd_no < NUM_RNDS; rnd_no++) { // 默认执行两轮 | |
for (uint32_t sync_num = 0; sync_num < num_syncs; sync_num++) { | |
// 这个 for 并行计算 | |
for (_s64 node_ctr = (_s64) start_id; node_ctr < (_s64) end_id; | |
++node_ctr) |
使用贪心检索查找候选邻居,这里有太多层了,每一层换一个参数。。。。:
pool.reserve(L * 2); | |
visited.reserve(L * 2); | |
get_expanded_nodes(node, L, init_ids, pool, visited); | |
void Index<T, TagT>::get_expanded_nodes( | |
const size_t node_id, const unsigned Lindex, | |
std::vector<unsigned> init_ids, | |
std::vector<Neighbor> & expanded_nodes_info, | |
tsl::robin_set<unsigned> &expanded_nodes_ids) { | |
const T *node_coords = _data + _aligned_dim * node_id; | |
if (init_ids.size() == 0) | |
init_ids.emplace_back(_ep); | |
std::vector<unsigned> des; | |
std::vector<Neighbor> best_L_nodes; | |
best_L_nodes.resize(Lindex + 1); | |
tsl::robin_set<unsigned> inserted_into_pool_rs; | |
boost::dynamic_bitset<> inserted_into_pool_bs; | |
iterate_to_fixed_point(node_coords, Lindex, init_ids, expanded_nodes_info, | |
expanded_nodes_ids, best_L_nodes, des, | |
inserted_into_pool_rs, inserted_into_pool_bs); | |
} | |
std::pair<uint32_t, uint32_t> Index<T, TagT>::iterate_to_fixed_point( | |
const T *node_coords, const unsigned Lsize, | |
const std::vector<unsigned> &init_ids, | |
std::vector<Neighbor> & expanded_nodes_info, | |
tsl::robin_set<unsigned> & expanded_nodes_ids, | |
std::vector<Neighbor> &best_L_nodes, std::vector<unsigned> &des, | |
tsl::robin_set<unsigned> &inserted_into_pool_rs, | |
boost::dynamic_bitset<> &inserted_into_pool_bs, bool ret_frozen, | |
bool search_invocation) |
这里代码首先把 init_ids(len=1,因为是去 center 作为 entrypoint 并且没有添加其他的元素)中 ep 给放到 best_L_nodes 中,并且设置为已经插入到 pool,inserted_into_pool_bs[id] = 1。
这里我 gdb 打断点后发现,vanama 根本没有使用初始化随机图,而是直接采用插入的方法。 鹅美静!
这里使用贪心算法,在图上进行检索,然后裁边:
get_expanded_nodes(node, L, init_ids, pool, visited); | |
// 添加该点现有的邻居节点 | |
prune_neighbors(node, pool, pruned_list); |
这一块太抽象了,很难用文字描述,简单来说就是论文算法的实现。
这里它用 alpha 因子进行 occlude 的时候不是直接完成的,而且反复迭代增大初始因子进行的,这点很有意思。
现在分块构建的索引已经保存好了,需要做的就是 merge:
diskann::merge_shards(merged_index_prefix + "_subshard-", "_mem.index", | |
merged_index_prefix + "_subshard-", "_ids_uint32.bin", | |
num_parts, R, mem_index_path, medoids_file); | |
int merge_shards(const std::string &vamana_prefix, | |
const std::string &vamana_suffix, | |
const std::string &idmaps_prefix, | |
const std::string &idmaps_suffix, const _u64 nshards, | |
unsigned max_degree, const std::string &output_vamana, | |
const std::string &medoids_file) |