-------------
| |
| magic |
| number |
| |
|-------------|
| | ------------------------------
| | | | | | | | |
| ivf |------------------>| D | ntotal| dummy|dummy |trained | type |
| header | | | | | | | |
| | ------------------------------
-------------
| |
| nlist |
| |
-------------
| |
| nprobe |
| |
-------------
| | ---------
| | | IxF2 |
| IVF |------------------>------------------------------------------------
| quantizer | | D | ntotal |dummy |dummy |trained | type |
| | ------------------------------------------------
| | |xb.size| xb.data| direct | direct_map |
| | ---------------------------------------
-------------
| |
| residual |
| |
-------------
| |
| code_size |
| |
-------------
| | -------------------------
| | | D | M | nbits |
| Product |------------------>|------------------------------------
| Quantizer | | centroids.size | centroids.data() |
| | -------------------------------------
-------------
| | ---------------------------
| | | ilar | nlist | code_size |
| Inverted |------------------>----------------------------------------
| List | | full | ids_list.size |ids_list.data |
| | ----------------------------------------
| | |codes| ids|
| | -----------
| | |codes| ids|
| | -----------
------------- .........
样本总量 256 * 8 * 1024
维度256
PQ成32段
每个code固定量化空间是256个,即nbits=8
nlist 1
生成文件大小 84149436 字节
index_io.cpp
else if(const IndexIVFPQ * ivpq =
dynamic_cast<const IndexIVFPQ *> (idx)) {
const IndexIVFPQR * ivfpqr = dynamic_cast<const IndexIVFPQR *> (idx);
uint32_t h = fourcc (ivfpqr ? "IwQR" : "IwPQ");
WRITE1 (h);
write_ivf_header (ivpq, f);
WRITE1 (ivpq->by_residual);
WRITE1 (ivpq->code_size);
write_ProductQuantizer (&ivpq->pq, f);
write_InvertedLists (ivpq->invlists, f);
if (ivfpqr) {
write_ProductQuantizer (&ivfpqr->refine_pq, f);
WRITEVECTOR (ivfpqr->refine_codes);
WRITE1 (ivfpqr->k_factor);
}
}
faiss 不同版本之间 magic number略有差异
uint32_t h = fourcc (ivfpqr ? "IwQR" : "IwPQ");
WRITE1 (h);
文件偏移 0-3字节
49 76 50 51 IwPQ
write_ivf_header (ivpq, f);
具体代码实现
static void write_ivf_header (const IndexIVF *ivf, IOWriter *f) {
write_index_header (ivf, f);
WRITE1 (ivf->nlist);
WRITE1 (ivf->nprobe);
write_index (ivf->quantizer, f);
WRITE1 (ivf->maintain_direct_map);
WRITEVECTOR (ivf->direct_map);
}
static void write_index_header (const Index *idx, IOWriter *f) {
WRITE1 (idx->d); # int 数据维度 256
WRITE1 (idx->ntotal); # int 向量总数 256 * 8 * 1024
Index::idx_t dummy = 1 << 20;
WRITE1 (dummy); # int64 1 << 20
WRITE1 (dummy); # int64 1 << 20
WRITE1 (idx->is_trained); # bool 必为true
WRITE1 (idx->metric_type); #类型:enum MetricType 值:METRIC_L2 即 1
}
文件偏移 索引文件偏移 4-36
0x04~0x07 00 01 00 00 #256 little-endian
0x08~0x15 00 00 20 00 00 00 00 00 #1024*1024*2 little-endian
0x16~0x23 00 00 10 00 00 00 00 00
0x24~0x31 00 00 10 00 00 00 00 00
0x32~0x32 01 #1
0x33~0x36 01 00 00 00 #1 little-endian
WRITE1 (ivf->nlist); # 8字节 1024
WRITE1 (ivf->nprobe);
索引文件偏移37~52
01 00 00 00 00 00 00 00 # little-endian 1
00 01 00 00 00 00 00 00 # little-endian 256
write_index (ivf->quantizer, f);
if (const IndexFlat * idxf = dynamic_cast<const IndexFlat *> (idx)) {
uint32_t h = fourcc (
idxf->metric_type == METRIC_INNER_PRODUCT ? "IxFI" :
idxf->metric_type == METRIC_L2 ? "IxF2" : nullptr);
WRITE1 (h);
write_index_header (idx, f);
WRITEVECTOR (idxf->xb);
}
文件偏移 53-56
49 78 46 32 IxF2
static void write_index_header (const Index *idx, IOWriter *f) {
WRITE1 (idx->d);
WRITE1 (idx->ntotal);
Index::idx_t dummy = 1 << 20;
WRITE1 (dummy);
WRITE1 (dummy);
WRITE1 (idx->is_trained);
WRITE1 (idx->metric_type);
}
文件偏移 57-89
00 01 00 00 # 256 little-endian
01 00 00 00 00 00 00 00 # 1 little-endian
00 00 10 00 00 00 00 00
00 00 10 00 00 00 00 00
01 #1
01 00 00 00 #1 little-endian
WRITEVECTOR (idxf->xb);
#define WRITEVECTOR(vec) { \
size_t size = (vec).size (); \
WRITEANDCHECK (&size, 1); \
WRITEANDCHECK ((vec).data (), size); \
}
std::vecrot.size() 文件偏移 90-97 1 * 256
00 01 00 00 00 00 00 00 # 256 little-endian
std::vecrot.data() 写入内容 256 个float 共 1024字节 文件偏移 98-1121
93 f3 a7 bb # float little-endian
69 ca de 3b # float little-endian
ec 88 59 bb # float little-endian
...........
...........
...........
WRITE1 (ivf->maintain_direct_map);
WRITEVECTOR (ivf->direct_map);
maintain_direct_map 默认为false direct_map 默认为空集合 文件偏移 1122-1130
00 # false
00 00 00 00 00 00 00 00 # ......
WRITE1 (ivpq->by_residual);
WRITE1 (ivpq->code_size);
文件偏移 1131-1139
01
20 00 00 00 00 00 00 00 #32 little-endian
write_ProductQuantizer (&ivpq->pq, f);
void write_ProductQuantizer (const ProductQuantizer *pq, IOWriter *f) {
WRITE1 (pq->d);
WRITE1 (pq->M);
WRITE1 (pq->nbits);
WRITEVECTOR (pq->centroids);
}
WRITE1 (pq->d); #256
WRITE1 (pq->M); #32
WRITE1 (pq->nbits); #8
文件偏移 1140-1171
00 01 00 00 00 00 00 00 #256 little-endian
20 00 00 00 00 00 00 00 #32 little-endian
08 00 00 00 00 00 00 00 #8 little-endian
WRITEVECTOR (pq->centroids);
#define WRITEVECTOR(vec) { \
size_t size = (vec).size (); \
WRITEANDCHECK (&size, 1); \
WRITEANDCHECK ((vec).data (), size); \
}
共M * ksub * dsub 即 32 * 256 * 8 (65536)个float 文件偏移 1172-1179
00 00 01 00 00 00 00 00 #65536 little-endian
总共65536*4 个字节 文件偏移 1180-263323
f8 1c 70 3d # float little-endian
5a 55 ec 3c # float little-endian
0e 69 35 3d # float little-endian
...........
...........
...........
write_InvertedLists (ivpq->invlists, f);
void write_InvertedLists (const InvertedLists *ils, IOWriter *f) {
if (ils == nullptr) {
uint32_t h = fourcc ("il00");
WRITE1 (h);
} else if (const auto & ails =
dynamic_cast<const ArrayInvertedLists *>(ils)) {
uint32_t h = fourcc ("ilar");
WRITE1 (h);
WRITE1 (ails->nlist);
WRITE1 (ails->code_size);
// here we store either as a full or a sparse data buffer
size_t n_non0 = 0;
for (size_t i = 0; i < ails->nlist; i++) {
if (ails->ids[i].size() > 0)
n_non0++;
}
if (n_non0 > ails->nlist / 2) {
uint32_t list_type = fourcc("full");
WRITE1 (list_type);
std::vector<size_t> sizes;
for (size_t i = 0; i < ails->nlist; i++) {
sizes.push_back (ails->ids[i].size());
}
WRITEVECTOR (sizes);
} else {
int list_type = fourcc("sprs"); // sparse
WRITE1 (list_type);
std::vector<size_t> sizes;
for (size_t i = 0; i < ails->nlist; i++) {
size_t n = ails->ids[i].size();
if (n > 0) {
sizes.push_back (i);
sizes.push_back (n);
}
}
WRITEVECTOR (sizes);
}
// make a single contiguous data buffer (useful for mmapping)
for (size_t i = 0; i < ails->nlist; i++) {
size_t n = ails->ids[i].size();
if (n > 0) {
WRITEANDCHECK (ails->codes[i].data(), n * ails->code_size);
WRITEANDCHECK (ails->ids[i].data(), n);
}
}
} else if (const auto & od =
dynamic_cast<const OnDiskInvertedLists *>(ils)) {
uint32_t h = fourcc ("ilod");
WRITE1 (h);
WRITE1 (ils->nlist);
WRITE1 (ils->code_size);
// this is a POD object
WRITEVECTOR (od->lists);
{
std::vector<OnDiskInvertedLists::Slot> v(
od->slots.begin(), od->slots.end());
WRITEVECTOR(v);
}
{
std::vector<char> x(od->filename.begin(), od->filename.end());
WRITEVECTOR(x);
}
WRITE1(od->totsize);
} else {
fprintf(stderr, "WARN! write_InvertedLists: unsupported invlist type, "
"saving null invlist\n");
uint32_t h = fourcc ("il00");
WRITE1 (h);
}
}
uint32_t h = fourcc ("ilar");
WRITE1 (h);
文件相对偏移 0-4
69 6c 61 72 # ilar
写入 nlist
WRITE1 (ails->nlist);
文件相对偏移 5-12
01 00 00 00 00 00 00 00 #1 little-endian
写入 code_size
WRITE1 (ails->code_size); # code size per vector in bytes
文件相对偏移 13-20
20 00 00 00 00 00 00 00 #32 little-endian
if (n_non0 > ails->nlist / 2) {
uint32_t list_type = fourcc("full");
WRITE1 (list_type);
std::vector<size_t> sizes;
for (size_t i = 0; i < ails->nlist; i++) {
sizes.push_back (ails->ids[i].size());
}
WRITEVECTOR (sizes);
} else {
int list_type = fourcc("sprs"); // sparse
WRITE1 (list_type);
std::vector<size_t> sizes;
for (size_t i = 0; i < ails->nlist; i++) {
size_t n = ails->ids[i].size();
if (n > 0) {
sizes.push_back (i);
sizes.push_back (n);
}
}
WRITEVECTOR (sizes);
}
本次演示走了 full分支 写入 magic number
uint32_t list_type = fourcc("full");
WRITE1 (list_type);
'full' 文件相对偏移 21-24
66 75 6c 6c # full
#### 写入 nilis->ids.size() list
for (size_t i = 0; i < ails->nlist; i++) {
sizes.push_back (ails->ids[i].size());
}
WRITEVECTOR (sizes);
文件相对偏移 25-32
01 00 00 00 00 00 00 00 # 1 little-endian
00 00 20 00 00 00 00 00 # 2097152= 1024*1024*2 little-endian
for (size_t i = 0; i < ails->nlist; i++) {
size_t n = ails->ids[i].size();
if (n > 0) {
WRITEANDCHECK (ails->codes[i].data(), n * ails->code_size);
WRITEANDCHECK (ails->ids[i].data(), n);
}
}
WRITEANDCHECK (ails->codes[i].data(), n * ails->code_size);
codes 为原始数据pq后的量化编码 code 大小是code_size(32)字节 因此一共得花费 n*32字节空间
31 f0 ab e1 27 9f b6 d7 3f 1f e4 28 b4 09 07 38 1f 72 44 cb 13 e9 ce ac db 6e db 61 94 21 0f b1 # 32字节
...............................................................................................
...............................................................................................
...............................................................................................
...............................................................................................
写入用户子定义ids总共需花费空间 n*long(8)
WRITEANDCHECK (ails->ids[i].data(), n);
01 00 00 00 00 00 00 00 # 1 little-endian
不执行
if (ivfpqr) {
write_ProductQuantizer (&ivfpqr->refine_pq, f);
WRITEVECTOR (ivfpqr->refine_codes);
WRITE1 (ivfpqr->k_factor);
}
残差列表不存储文件,每次加载重新计算残差列表 默认不使用残差列表
// precomputed table not stored. It is cheaper to recompute it
ivpq->use_precomputed_table = 0;
if (ivpq->by_residual)
ivpq->precompute_table ();