This repository has been archived by the owner on Oct 26, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathcreate_index.py
executable file
·58 lines (51 loc) · 2.54 KB
/
create_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#!/usr/bin/env python3
# -*- coding: utf8 -*-
# Copyright (c) 2020 Roberto Treviño Cervantes
#########################################################################
# #
# This file is part of FUTURE (Powered by Monad). #
# #
# FUTURE is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
# (at your option) any later version. #
# #
# FUTURE is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with FUTURE. If not, see <https://www.gnu.org/licenses/>. #
# #
#########################################################################
import lmdb, base64, hnswlib, bson
import numpy as np
from Monad import Monad, getSentenceMeanVector
# Expose json-style `loads` / `dumps` entry points on the bson module; the
# record-decoding code further down calls `bson.loads(...)`.
bson.loads, bson.dumps = bson.BSON.decode, bson.BSON.encode

# Create and compile the "FUTURE_url_vecs" index on the "future_urls" Monad
# store. NOTE(review): the exact semantics of createIndex/compileIndex live in
# the Monad module, which is not visible from this file.
futureURLs = Monad("future_urls")
futureURLs.createIndex("FUTURE_url_vecs")
futureURLs.compileIndex()
# Build an HNSW index (cosine space, 50 dimensions) from the vectors stored in
# the "./future_images" LMDB database, then save both the URL index and the
# image index to disk.
hnswImagesLookup = hnswlib.Index(space="cosine", dim=50)
hnswImagesLookup.init_index(max_elements=1000000, ef_construction=200, M=16)
hnswImagesLookup.set_ef(100)

# Indexing is best-effort: a record that cannot be added is skipped, but the
# skips are counted and reported instead of being discarded silently.
skippedImageRecords = 0
firstImageError = None

# The environment and the read transaction are both context-managed so the
# LMDB handles are released once the scan finishes or aborts.
with lmdb.open("./future_images", readonly=True) as imageDBIndex:
    with imageDBIndex.begin() as imageDBTransaction:
        imageDBSelector = imageDBTransaction.cursor()
        for key, value in imageDBSelector:
            value = bson.loads(value)
            try:
                # One row per record: the raw "vec" bytes are read as float32
                # and labelled with the integer parsed from the LMDB key.
                hnswImagesLookup.add_items(
                    np.array([np.frombuffer(value["vec"], dtype="float32")]),
                    np.array([int(key.decode("utf-8"))]),
                )
            except Exception as error:
                # `Exception` (not a bare except) keeps Ctrl-C and SystemExit
                # working while still tolerating malformed records.
                skippedImageRecords += 1
                if firstImageError is None:
                    firstImageError = error

if skippedImageRecords:
    print(
        "Skipped {} image record(s) that could not be indexed; "
        "first error: {!r}".format(skippedImageRecords, firstImageError)
    )

# Disabled manual sanity check for the URL index:
# search = futureURLs.searchIndex(getSentenceMeanVector("web hosting"), 5, 1)
futureURLs.saveIndex()

# Disabled manual sanity check for the image index:
# labels, distances = hnswImagesLookup.knn_query(
#     getSentenceMeanVector("web hosting"), k=5)
# print(labels)
# print(distances)
hnswImagesLookup.save_index("FUTURE_images_vecs.bin")