Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add optimization for flushing zero vectors to pinecone #15

Open
wants to merge 5 commits into
base: feature/remote_indexes
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ jobs:
- run: psql test -c 'alter database test set enable_seqscan = off'

# setup the database for testing
- run: make installcheck REGRESS="pinecone_crud pinecone_medium_create pinecone_zero_vector_insert pinecone_build_after_insert pinecone_invalid_config" REGRESS_OPTS="--dbname=test --inputdir=./test --use-existing"
- run: make installcheck REGRESS="pinecone_crud pinecone_medium_create pinecone_zero_vector_insert pinecone_build_after_insert" REGRESS_OPTS="--dbname=test --inputdir=./test --use-existing"
- if: ${{ failure() }}
run: cat regression.diffs
# mac:
Expand Down
2 changes: 1 addition & 1 deletion src/pinecone/pinecone.h
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ IndexBulkDeleteResult *no_vacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteRe
void pinecone_spec_validator(const PineconeOptions *opts);
void pinecone_host_validator(const char *spec);
void validate_api_key(void);
void validate_vector_nonzero(Vector* vector);
bool validate_vector_nonzero(Vector* vector);
bool no_validate(Oid opclassoid);

// utils
Expand Down
1 change: 1 addition & 0 deletions src/pinecone/pinecone_build.c
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ void pinecone_build_callback(Relation index, ItemPointer tid, Datum *values, boo
cJSON *json_vector;
char* pinecone_id = pinecone_id_from_heap_tid(*tid);
json_vector = tuple_get_pinecone_vector(itup_desc, values, isnull, pinecone_id);
if(json_vector==NULL) return;
cJSON_AddItemToArray(buildstate->json_vectors, json_vector);
if (cJSON_GetArraySize(buildstate->json_vectors) >= PINECONE_BATCH_SIZE) {
pinecone_bulk_upsert(pinecone_api_key, buildstate->host, buildstate->json_vectors, pinecone_vectors_per_request);
Expand Down
6 changes: 2 additions & 4 deletions src/pinecone/pinecone_insert.c
Original file line number Diff line number Diff line change
Expand Up @@ -156,15 +156,12 @@ bool AppendBufferTupleInCtx(Relation index, Datum *values, bool *isnull, ItemPoi
MemoryContext oldCtx;
MemoryContext insertCtx;
bool checkpoint_created;
Vector* vector;
// use a memory context because index_form_tuple can allocate
insertCtx = AllocSetContextCreate(CurrentMemoryContext,
"Pinecone insert tuple temporary context",
ALLOCSET_DEFAULT_SIZES);
oldCtx = MemoryContextSwitchTo(insertCtx);

vector = DatumGetVector(values[0]);
validate_vector_nonzero(vector);

checkpoint_created = AppendBufferTuple(index, values, isnull, heap_tid, heapRel);
MemoryContextSwitchTo(oldCtx);
Expand Down Expand Up @@ -286,7 +283,8 @@ void FlushToPinecone(Relation index)

vector_id = pinecone_id_from_heap_tid(buffer_tup.tid);
json_vector = tuple_get_pinecone_vector(index->rd_att, index_values, index_isnull, vector_id);
cJSON_AddItemToArray(json_vectors, json_vector);
if(json_vector!=NULL)
cJSON_AddItemToArray(json_vectors, json_vector);
}
}

Expand Down
2 changes: 1 addition & 1 deletion src/pinecone/pinecone_scan.c
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,7 @@ void load_buffer_into_sort(Relation index, PineconeScanOpaque so, Datum query_da
page = BufferGetPage(buf);

// add all tuples on the page to the sortstate
for (OffsetNumber offno = FirstOffsetNumber; offno <= PageGetMaxOffsetNumber(page); offno = OffsetNumberNext(offno)) {
for (OffsetNumber offno = FirstOffsetNumber; offno <= PageGetMaxOffsetNumber(page) && n_sortedtuple > pinecone_max_buffer_scan; offno = OffsetNumberNext(offno)) {
// get the tid and the vector from the heap tuple
ItemId itemid;
Item item;
Expand Down
32 changes: 10 additions & 22 deletions src/pinecone/pinecone_utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,17 @@ cJSON* tuple_get_pinecone_vector(TupleDesc tup_desc, Datum *values, bool *isnull
cJSON *metadata = cJSON_CreateObject();
Vector *vector;
cJSON *json_values;
bool isNonZero;
// Check if the first Datum is zero, which indicates a NULL pointer/ NULL vector
if(values[0]==0) {
ereport(WARNING, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("Invalid vector: NULL vector"),
errhint("Pinecone insists that vectors cannot be NULL.")));
return NULL;
}
vector = DatumGetVector(values[0]);
validate_vector_nonzero(vector);
isNonZero = validate_vector_nonzero(vector);
if(!isNonZero) return NULL;
json_values = cJSON_CreateFloatArray(vector->x, vector->dim);
// prepare metadata
for (int i = 1; i < tup_desc->natts; i++) // skip the first column which is the vector
Expand Down Expand Up @@ -52,27 +61,6 @@ cJSON* tuple_get_pinecone_vector(TupleDesc tup_desc, Datum *values, bool *isnull
return json_vector;
}

cJSON* index_tuple_get_pinecone_vector(Relation index, IndexTuple itup) {
int natts = index->rd_att->natts;
Datum *itup_values = (Datum *) palloc(sizeof(Datum) * natts);
bool *itup_isnull = (bool *) palloc(sizeof(bool) * natts);
TupleDesc itup_desc = index->rd_att;
char* vector_id;
index_deform_tuple(itup, itup_desc, itup_values, itup_isnull);
vector_id = pinecone_id_from_heap_tid(itup->t_tid);
return tuple_get_pinecone_vector(itup_desc, itup_values, itup_isnull, vector_id);
}

cJSON* heap_tuple_get_pinecone_vector(Relation heap, HeapTuple htup) {
int natts = heap->rd_att->natts;
Datum *htup_values = (Datum *) palloc(sizeof(Datum) * natts);
bool *htup_isnull = (bool *) palloc(sizeof(bool) * natts);
TupleDesc htup_desc = heap->rd_att;
char* vector_id;
heap_deform_tuple(htup, htup_desc, htup_values, htup_isnull);
vector_id = pinecone_id_from_heap_tid(htup->t_self);
return tuple_get_pinecone_vector(htup_desc, htup_values, htup_isnull, vector_id);
}

ItemPointerData pinecone_id_get_heap_tid(char *id)
{
Expand Down
10 changes: 7 additions & 3 deletions src/pinecone/pinecone_validate.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,18 @@ void validate_api_key(void) {
}
}

void validate_vector_nonzero(Vector* vector) {
bool validate_vector_nonzero(Vector* vector) {
if (vector_eq_zero_internal(vector)) {
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
ereport(WARNING, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("Invalid vector: zero vector"),
errhint("Pinecone insists that dense vectors cannot be zero in all dimensions. I don't know why they do this to you even when your metric isn't cosine.")));
return false;
}
return true;
}


#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wnonnull"
void pinecone_spec_validator(const PineconeOptions *opts)
{
if (opts == NULL || cJSON_Parse(GET_STRING_RELOPTION(opts, spec)) == NULL || strcmp(GET_STRING_RELOPTION(opts, spec), "") == 0)
Expand All @@ -32,6 +35,7 @@ void pinecone_spec_validator(const PineconeOptions *opts)
Refer to https://docs.pinecone.io/reference/create_index")));
}
}
#pragma GCC diagnostic pop

void pinecone_host_validator(const char *host)
{
Expand Down
3 changes: 3 additions & 0 deletions test/expected/pinecone_crud.out
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
-- SETUP
-- suppress output
\o /dev/null
\o /dev/null
delete from pinecone_mock;
-- logging level
SET client_min_messages = 'notice';
-- flush each vector individually
SET pinecone.vectors_per_request = 1;
SET pinecone.requests_per_batch = 1;
SET pinecone.max_buffer_scan = 1000;
-- disable flat scan to force use of the index
SET enable_seqscan = off;
-- Testing database is responsible for initializing the mock table with
Expand Down
126 changes: 106 additions & 20 deletions test/expected/pinecone_zero_vector_insert.out
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ SET client_min_messages = 'notice';
-- flush each vector individually
SET pinecone.vectors_per_request = 1;
SET pinecone.requests_per_batch = 1;
SET pinecone.max_buffer_scan = 0;
-- disable flat scan to force use of the index
SET enable_seqscan = off;
-- CREATE TABLE
Expand Down Expand Up @@ -35,29 +36,114 @@ VALUES ('https://api.pinecone.io/indexes', 'POST', $${
}$$);
-- mock describe index stats
INSERT INTO pinecone_mock (url_prefix, method, response)
VALUES ('https://fakehost/describe_index_stats', 'GET', '{"namespaces":{},"dimension":3,"indexFullness":0,"totalVectorCount":0}');
INSERT INTO t (id, val) VALUES (2, '[0,0,0]');
-- create index after inserting 0 vector - Throws an error
CREATE INDEX i2 ON t USING pinecone (val) WITH (spec = '{"serverless":{"cloud":"aws","region":"us-west-2"}}');
ERROR: Invalid vector: zero vector
HINT: Pinecone insists that dense vectors cannot be zero in all dimensions. I don't know why they do this to you even when your metric isn't cosine.
-- Truncate the table to remove the values for creating an index successfully
TRUNCATE TABLE t;
VALUES ('https://fakehost/describe_index_stats', 'GET', '{"namespaces":{},"dimension":3,"indexFullness":0,"totalVectorCount":2}');
-- mock upsert
INSERT INTO pinecone_mock (url_prefix, method, response)
VALUES ('https://fakehost/vectors/upsert', 'POST', '{"upsertedCount":1}');
-- mock query
INSERT INTO pinecone_mock (url_prefix, method, response)
VALUES ('https://fakehost/query', 'POST', $${
"results": [],
"matches": [{
"id": "000000000001",
"score": 2,
"values": []
}],
"namespace": "",
"usage": {
"readUnits": 5
}
}$$);
-- mock fetch
INSERT INTO pinecone_mock (url_prefix, method, response)
VALUES ('https://fakehost/vectors/fetch', 'GET', $${
"code": 3,
"message": "No IDs provided for fetch query",
"details": []
}$$);
-- create index
CREATE INDEX i2 ON t USING pinecone (val) WITH (spec = '{"serverless":{"cloud":"aws","region":"us-west-2"}}');
INSERT INTO pinecone_mock (url_prefix, method, response)
VALUES ('https://fakehost/vectors/upsert',
'{ "vectors": [{
"id": "000000000001",
"values": [100, 1, 1],
"metadata": {
}
}]
}',
'{"upsertedCount":1}'
);
-- insert vectors: throws warning while flushing zero-vector
INSERT INTO t (id, val) VALUES (1, '[100,1,1]');
INSERT INTO t (id, val) VALUES (2, '[0,0,0]');
ERROR: Invalid vector: zero vector
INSERT INTO t (id, val) VALUES (3, NULL);
WARNING: Invalid vector: zero vector
HINT: Pinecone insists that dense vectors cannot be zero in all dimensions. I don't know why they do this to you even when your metric isn't cosine.
WARNING: No vectors to flush to pinecone
INSERT INTO t (id, val) VALUES (4, '[10120,76,1]');
WARNING: Invalid vector: NULL vector
HINT: Pinecone insists that vectors cannot be NULL.
WARNING: No vectors to flush to pinecone
-- returns only id = 1 as it is flushed to pinecone (zero vector not flushed to pinecone)
SELECT * FROM t ORDER BY val <-> '[3,3,3]';
NOTICE: Buffer is too large
HINT: There are 0 tuples in the buffer that have not yet been flushed to pinecone and 3 tuples in pinecone that are not yet live. You may want to consider flushing the buffer.
NOTICE: Reached max local scan
id | val
----+-----------
1 | [100,1,1]
(1 row)

SELECT * FROM t;
id | val
----+--------------
1 | [100,1,1]
2 | [0,0,0]
3 |
4 | [10120,76,1]
(4 rows)

DROP INDEX i2;
SELECT * FROM t ORDER BY val <-> '[3,3,3]';
id | val
----+--------------
2 | [0,0,0]
1 | [100,1,1]
4 | [10120,76,1]
3 |
(4 rows)

DELETE FROM pinecone_mock
WHERE url_prefix = 'https://fakehost/query' AND method = 'POST';
INSERT INTO pinecone_mock (url_prefix, method, response)
VALUES ('https://fakehost/query', 'POST', $${
"results": [],
"matches": [{
"id": "000000000001",
"score": 2,
"values": []
},
{
"id": "000000000004",
"score": 2,
"values": []
}],
"namespace": "",
"usage": {
"readUnits": 5
}
}$$);
-- displays warning while flushing zero vector to pinecone
CREATE INDEX i2 ON t USING pinecone (val) WITH (spec = '{"serverless":{"cloud":"aws","region":"us-west-2"}}');
WARNING: Invalid vector: zero vector
HINT: Pinecone insists that dense vectors cannot be zero in all dimensions. I don't know why they do this to you even when your metric isn't cosine.
WARNING: Invalid vector: NULL vector
HINT: Pinecone insists that vectors cannot be NULL.
SELECT * FROM t ORDER BY val <-> '[3,3,3]';
NOTICE: Reached max local scan
id | val
----+--------------
1 | [100,1,1]
4 | [10120,76,1]
(2 rows)

SELECT * FROM t;
id | val
----+--------------
1 | [100,1,1]
2 | [0,0,0]
3 |
4 | [10120,76,1]
(4 rows)

DROP TABLE t;
3 changes: 3 additions & 0 deletions test/sql/pinecone_crud.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,13 @@
-- suppress output
\o /dev/null
-- logging level
\o /dev/null
delete from pinecone_mock;
SET client_min_messages = 'notice';
-- flush each vector individually
SET pinecone.vectors_per_request = 1;
SET pinecone.requests_per_batch = 1;
SET pinecone.max_buffer_scan = 1000;
-- disable flat scan to force use of the index
SET enable_seqscan = off;
-- Testing database is responsible for initializing the mock table with
Expand Down
Loading
Loading