From 7a439432b79fb6c3c2646a6d268b99760bca0f97 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Mon, 22 Jul 2024 16:42:23 -0400 Subject: [PATCH 01/91] init --- common/config.py | 9 +- common/extractors/GraphExtractor.py | 21 ++ common/extractors/__init__.py | 1 + .../louvain/louvain_2_other_passes.gsql | 217 +++++++++++++++ .../louvain/louvain_3_final_community.gsql | 44 +++ .../louvain_4_modularity_1_for_pass.gsql | 39 +++ .../louvain/louvain_4_modularity_2_final.gsql | 52 ++++ .../graphRAG/louvain/louvain_5_reset.gsql | 13 + common/gsql/supportai/Scan_For_Updates.gsql | 10 +- common/gsql/supportai/SupportAI_Schema.gsql | 18 +- common/llm_services/openai_service.py | 2 +- common/py_schemas/schemas.py | 17 +- copilot/app/routers/supportai.py | 223 +++------------ copilot/app/supportai/supportai.py | 185 +++++++++++++ copilot/docs/notebooks/graphrag.ipynb | 261 ++++++++++++++++++ .../app/eventual_consistency_checker.py | 3 +- .../app/graphrag/__init__.py | 1 + .../app/graphrag/graph_rag.py | 138 +++++++++ .../app/graphrag/util.py | 36 +++ .../app/graphrag/worker.py | 27 ++ eventual-consistency-service/app/main.py | 142 +++++++--- 21 files changed, 1226 insertions(+), 233 deletions(-) create mode 100644 common/extractors/GraphExtractor.py create mode 100644 common/gsql/graphRAG/louvain/louvain_2_other_passes.gsql create mode 100644 common/gsql/graphRAG/louvain/louvain_3_final_community.gsql create mode 100644 common/gsql/graphRAG/louvain/louvain_4_modularity_1_for_pass.gsql create mode 100644 common/gsql/graphRAG/louvain/louvain_4_modularity_2_final.gsql create mode 100644 common/gsql/graphRAG/louvain/louvain_5_reset.gsql create mode 100644 copilot/app/supportai/supportai.py create mode 100644 copilot/docs/notebooks/graphrag.ipynb create mode 100644 eventual-consistency-service/app/graphrag/__init__.py create mode 100644 eventual-consistency-service/app/graphrag/graph_rag.py create mode 100644 eventual-consistency-service/app/graphrag/util.py create mode 100644 eventual-consistency-service/app/graphrag/worker.py diff --git a/common/config.py b/common/config.py index 8eb9432a..2546e38a 100644 --- a/common/config.py +++ b/common/config.py @@ -15,14 +15,15 @@ AWSBedrock, AzureOpenAI, GoogleVertexAI, - OpenAI, Groq, + HuggingFaceEndpoint, + LLM_Model, Ollama, - HuggingFaceEndpoint + OpenAI, ) +from common.logs.logwriter import LogWriter from common.session import SessionHandler from common.status import StatusManager -from common.logs.logwriter import LogWriter security = HTTPBasic() session_handler = SessionHandler() @@ -102,7 +103,7 @@ raise Exception("Embedding service not implemented") -def get_llm_service(llm_config): +def get_llm_service(llm_config) -> LLM_Model: if llm_config["completion_service"]["llm_service"].lower() == "openai": return OpenAI(llm_config["completion_service"]) elif llm_config["completion_service"]["llm_service"].lower() == "azure": diff --git a/common/extractors/GraphExtractor.py b/common/extractors/GraphExtractor.py new file mode 100644 index 00000000..c8f24355 --- /dev/null +++ b/common/extractors/GraphExtractor.py @@ -0,0 +1,21 @@ +from langchain_community.graphs.graph_document import GraphDocument +from langchain_core.documents import Document +from langchain_experimental.graph_transformers import LLMGraphTransformer + +from common.config import get_llm_service, llm_config +from common.extractors.BaseExtractor import BaseExtractor + + +class GraphExtractor(BaseExtractor): + def __init__(self): + llm = 
get_llm_service(llm_config).llm
+        self.transformer = LLMGraphTransformer(
+            llm=llm,
+            node_properties=["description"],
+            relationship_properties=["description"],
+        )
+
+    def extract(self, text) -> list[GraphDocument]:
+        doc = Document(page_content=text)
+        graph_docs = self.transformer.convert_to_graph_documents([doc])
+        return graph_docs
diff --git a/common/extractors/__init__.py b/common/extractors/__init__.py
index ced539e4..e2f0bcdf 100644
--- a/common/extractors/__init__.py
+++ b/common/extractors/__init__.py
@@ -1,3 +1,4 @@
+from common.extractors.GraphExtractor import GraphExtractor
 from common.extractors.LLMEntityRelationshipExtractor import (
     LLMEntityRelationshipExtractor,
 )
diff --git a/common/gsql/graphRAG/louvain/louvain_2_other_passes.gsql b/common/gsql/graphRAG/louvain/louvain_2_other_passes.gsql
new file mode 100644
index 00000000..231631d6
--- /dev/null
+++ b/common/gsql/graphRAG/louvain/louvain_2_other_passes.gsql
@@ -0,0 +1,217 @@
+USE GRAPH {graph_name}
+DROP QUERY {query_name}
+CREATE OR REPLACE DISTRIBUTED QUERY graphRAG_louvain_2(
+    UINT layer = 0,
+    UINT max_hop = 10,
+    UINT batch_num = 1
+) FOR GRAPH {graph_name} SYNTAX v1 {{
+    TYPEDEF TUPLE <DOUBLE delta_Q_add, VERTEX<{entity_vertex_name}> community, STRING ext_vid> MyTuple;
+    SumAccum<DOUBLE> @@m; // the sum of the weights of all the links in the network
+    MinAccum<VERTEX<{entity_vertex_name}>> @{community_id_attribute_name}; // the community ID of the node
+    MinAccum<STRING> @community_vid; // the community ID of the node
+    SumAccum<DOUBLE> @k; // the sum of the weights of the links incident to the node
+    SumAccum<DOUBLE> @k_in; // the sum of the weights of the links inside the previous community of the node
+    SumAccum<DOUBLE> @k_self_loop; // the weight of the self-loop link
+    MapAccum<VERTEX<{entity_vertex_name}>, SumAccum<DOUBLE>> @community_k_in_map; // the community of the neighbors of the nodes -> the sum of the weights of the links inside the community
+    MapAccum<VERTEX<{entity_vertex_name}>, SumAccum<DOUBLE>> @@community_sum_total_map; // community ID C -> the sum of the weights of the links incident to nodes in C
+    SumAccum<DOUBLE> @community_sum_total; // the sum of the weights of the links incident to nodes in the community of the node
+    MapAccum<VERTEX<{entity_vertex_name}>, SumAccum<DOUBLE>> @@community_sum_in_map; // community ID -> the sum of the weights of the links inside the community
+    MapAccum<VERTEX<{entity_vertex_name}>, MapAccum<VERTEX<{entity_vertex_name}>, SumAccum<DOUBLE>>> @@source_target_k_in_map; // source community ID -> (target community ID -> the sum of the weights of the links from the source community to the target community)
+    SumAccum<DOUBLE> @delta_Q_remove; // delta Q to remove the node from the previous community
+    MaxAccum<MyTuple> @best_move; // best move of the node with the highest delta Q to move the isolated node into the new community
+    MaxAccum<DOUBLE> @@min_double; // used to reset the @best_move
+    SumAccum<INT> @@move_cnt;
+    OrAccum @to_change_community;
+    SumAccum<INT> @batch_id;
+    SumAccum<INT> @vid;
+    SumAccum<INT> @@links_to_check;
+
+    // Initialization
+    LOG(TRUE, "Query started!");
+    All_Nodes = {{{entity_vertex_name}.*}};
+    _tmp =
+        SELECT s
+        FROM All_Nodes:s -({links_to_edge_name}:e)- :t
+        ACCUM
+            @@links_to_check += 1;
+
+    All_Nodes =
+        SELECT s
+        FROM All_Nodes:s -({links_to_edge_name}:e)- :t
+        WHERE e.layer_weight_map.containsKey(layer)
+        ACCUM DOUBLE weight = e.layer_weight_map.get(layer),
+              @@m += weight / 2,
+              s.@k += weight,
+              IF s == t THEN // self-loop link
+                  s.@k_self_loop += weight
+              END
+        POST-ACCUM
+            s.@{community_id_attribute_name} = s,
+            s.@community_vid = to_string(s.id),
+            s.@vid = getvid(s),
+            s.@batch_id = s.@vid % batch_num
+        ;
+    LOG(TRUE, All_Nodes.size());
+    IF @@m < 0.00000000001 THEN
+        PRINT "Warning: the sum of the weights in the edges should be greater than zero!";
+        
RETURN; + END; + + // Local moving + INT hop = 0; + Candidates = All_Nodes; + WHILE Candidates.size() > 0 AND hop < max_hop DO + hop = hop + 1; + LOG(TRUE, hop); + IF hop == 1 THEN // first iteration + ChangedNodes = + SELECT s + FROM Candidates:s -({links_to_edge_name}:e)- :t + WHERE e.layer_weight_map.containsKey(layer) + AND s.@{community_id_attribute_name} != t.@{community_id_attribute_name} + ACCUM s.@best_move += MyTuple(1 - s.@k * t.@k / (2 * @@m), t.@{community_id_attribute_name}, t.@community_vid) + POST-ACCUM + IF s.@best_move.delta_Q_add > 0 THEN // the gain (delta Q) is positive + s.@to_change_community = TRUE + END + HAVING s.@to_change_community == TRUE + ; + ELSE // remaining iterations + // Calculate sum_total + Tmp = + SELECT s + FROM All_Nodes:s + POST-ACCUM + @@community_sum_total_map += (s.@{community_id_attribute_name} -> s.@k) + ; + Tmp = + SELECT s + FROM All_Nodes:s + POST-ACCUM + s.@community_sum_total = @@community_sum_total_map.get(s.@{community_id_attribute_name}) + ; + LOG(TRUE, @@community_sum_total_map.size()); + @@community_sum_total_map.clear(); + // Find the best move + ChangedNodes = {{}}; + FOREACH batch_id IN RANGE[0, batch_num-1] DO + LOG(TRUE, batch_id); + // Calculate the delta Q to remove the node from the previous community + Nodes = + SELECT s + FROM Candidates:s -({links_to_edge_name}:e)- :t + WHERE e.layer_weight_map.containsKey(layer) + AND s.@batch_id == batch_id + ACCUM DOUBLE weight = e.layer_weight_map.get(layer), + IF s.@{community_id_attribute_name} == t.@{community_id_attribute_name} THEN + s.@k_in += weight + ELSE + s.@community_k_in_map += (t.@{community_id_attribute_name} -> weight) + END + POST-ACCUM + s.@delta_Q_remove = 2 * s.@k_self_loop - 2 * s.@k_in + s.@k * (s.@community_sum_total - s.@k) / @@m, + s.@k_in = 0, + s.@best_move = MyTuple(@@min_double, s, to_string(s.id)) // reset the delta_Q_add + ; + // Find the best move + Nodes = + SELECT s + FROM Nodes:s -({links_to_edge_name}:e)- :t + WHERE e.layer_weight_map.containsKey(layer) + AND s.@{community_id_attribute_name} != t.@{community_id_attribute_name} + ACCUM DOUBLE delta_Q_add = 2 * s.@community_k_in_map.get(t.@{community_id_attribute_name}) - s.@k * t.@community_sum_total / @@m, + s.@best_move += MyTuple(delta_Q_add, t.@{community_id_attribute_name}, t.@community_vid) + POST-ACCUM + IF s.@delta_Q_remove + s.@best_move.delta_Q_add > 0 THEN // the gain (delta Q) is positive + s.@to_change_community = TRUE + END, + s.@community_k_in_map.clear() + HAVING s.@to_change_community == TRUE + ; + ChangedNodes = ChangedNodes UNION Nodes; + END; + END; + // If two nodes swap, only change the community of one of them + SwapNodes = + SELECT s + FROM ChangedNodes:s -({links_to_edge_name}:e)- :t + WHERE e.layer_weight_map.containsKey(layer) + AND s.@best_move.community == t.@{community_id_attribute_name} + AND t.@to_change_community == TRUE + AND t.@best_move.community == s.@{community_id_attribute_name} + // only change the one with larger delta Q or the one with smaller @vid if delta Q are the same + AND (s.@delta_Q_remove + s.@best_move.delta_Q_add < t.@delta_Q_remove + t.@best_move.delta_Q_add + OR (abs((s.@delta_Q_remove + s.@best_move.delta_Q_add) - (t.@delta_Q_remove + t.@best_move.delta_Q_add)) < 0.00000000001 + AND s.@vid > t.@vid)) + POST-ACCUM + s.@to_change_community = FALSE + ; + LOG(TRUE, SwapNodes.size()); + ChangedNodes = ChangedNodes MINUS SwapNodes; + LOG(TRUE, ChangedNodes.size()); + // Place each node of ChangedNodes in the community in which the gain is maximum + 
ChangedNodes = + SELECT s + FROM ChangedNodes:s + POST-ACCUM + s.@{community_id_attribute_name} = s.@best_move.community, + s.@community_vid = s.@best_move.ext_vid, + s.@to_change_community = FALSE + ; + + @@move_cnt += ChangedNodes.size(); + // Get all neighbours of the changed node that do not belong to the node’s new community + Candidates = + SELECT t + FROM ChangedNodes:s -({links_to_edge_name}:e)- :t + WHERE e.layer_weight_map.containsKey(layer) + AND t.@{community_id_attribute_name} != s.@{community_id_attribute_name} + ; + LOG(TRUE, Candidates.size()); + END; + + PRINT @@move_cnt AS Delta; + + // Coarsening + LOG(TRUE, "Coarsening"); + UINT new_layer = layer + 1; + @@community_sum_total_map.clear(); + Tmp = + SELECT s + FROM All_Nodes:s -({links_to_edge_name}:e)- :t + WHERE e.layer_weight_map.containsKey(layer) + ACCUM IF s.@{community_id_attribute_name} == t.@{community_id_attribute_name} THEN + DOUBLE weight = e.layer_weight_map.get(layer), + @@community_sum_in_map += (s.@{community_id_attribute_name} -> weight) + END + POST-ACCUM + //f_belongs_to.println(s.id, s.@{community_id_attribute_name}, new_layer), + INSERT INTO {belongs_to_edge_name} VALUES (s, str_to_int(s.@community_vid), new_layer), + IF @@community_sum_in_map.containsKey(s) THEN + //f_links_to.println(s.id, s.id, @@community_sum_in_map.get(s), new_layer) + INSERT INTO {links_to_edge_name} VALUES (s,s, (new_layer -> @@community_sum_in_map.get(s))) + END + ; + LOG(TRUE, @@community_sum_in_map.size()); + @@community_sum_in_map.clear(); + Tmp = + SELECT s + FROM All_Nodes:s -({links_to_edge_name}:e)- :t + WHERE e.layer_weight_map.containsKey(layer) + ACCUM DOUBLE weight = e.layer_weight_map.get(layer), + IF s.@{community_id_attribute_name} != t.@{community_id_attribute_name} THEN + @@source_target_k_in_map += (s.@{community_id_attribute_name} -> (t.@{community_id_attribute_name} -> weight)) + END + POST-ACCUM + IF @@source_target_k_in_map.containsKey(s) THEN + FOREACH (target_community, k_in) IN @@source_target_k_in_map.get(s) DO + //f_links_to.println(s.uniq_id, target_community, k_in, new_layer) + INSERT INTO {links_to_edge_name} VALUES (s,target_community, (new_layer -> k_in)) + END + END + ; + LOG(TRUE, @@source_target_k_in_map.size()); + @@source_target_k_in_map.clear(); + PRINT @@links_to_check; + LOG(TRUE, "Query finished!"); +}} diff --git a/common/gsql/graphRAG/louvain/louvain_3_final_community.gsql b/common/gsql/graphRAG/louvain/louvain_3_final_community.gsql new file mode 100644 index 00000000..75cbad7e --- /dev/null +++ b/common/gsql/graphRAG/louvain/louvain_3_final_community.gsql @@ -0,0 +1,44 @@ +USE GRAPH {graph_name} +DROP QUERY {query_name} +CREATE OR REPLACE DISTRIBUTED QUERY graphRAG_louvain_3( + UINT top_layer = 2 +) FOR GRAPH {graph_name} SYNTAX v1 {{ + MinAccum @{community_id_attribute_name}; // the community ID of the node + INT layer = top_layer; + + // Initialization + LOG(TRUE, "Query started!"); + All_Nodes = {{{entity_vertex_name}.*}}; + + // Top layer + Nodes = + SELECT t + FROM All_Nodes:s -(reverse_{belongs_to_edge_name}:e)- :t + WHERE layer IN e.layer_set + ACCUM t.@{community_id_attribute_name} = to_string(s.id) + ; + LOG(TRUE, layer, Nodes.size()); + + // Other layers + WHILE Nodes.size() > 0 AND layer > 0 DO + layer = layer - 1; + Nodes = + SELECT t + FROM Nodes:s -(reverse_{belongs_to_edge_name}:e)- :t + WHERE layer IN e.layer_set + ACCUM t.@{community_id_attribute_name} = s.@{community_id_attribute_name} + ; + LOG(TRUE, layer, Nodes.size()); + END; + + // Write to the file + Nodes = + 
SELECT s + FROM Nodes:s + POST-ACCUM + //f.println(s.uniq_id, s.@{community_id_attribute_name}) + s.{community_id_attribute_name} = s.@{community_id_attribute_name} + + ; + LOG(TRUE, "Query finished!"); +}} diff --git a/common/gsql/graphRAG/louvain/louvain_4_modularity_1_for_pass.gsql b/common/gsql/graphRAG/louvain/louvain_4_modularity_1_for_pass.gsql new file mode 100644 index 00000000..0058d0ee --- /dev/null +++ b/common/gsql/graphRAG/louvain/louvain_4_modularity_1_for_pass.gsql @@ -0,0 +1,39 @@ +USE GRAPH {graph_name} +DROP QUERY {query_name} +CREATE OR REPLACE DISTRIBUTED QUERY graphRAG_louvain_4a( + UINT layer=0 +) FOR GRAPH {graph_name} SYNTAX v1 {{ + SumAccum @@sum_weight; // the sum of the weights of all the links in the network + MapAccum, SumAccum> @@community_total_weight_map; // community ID C -> the sum of the weights of the links incident to nodes in C + MapAccum, SumAccum> @@community_in_weight_map; // community ID -> the sum of the weights of the links inside the community + SumAccum @@modularity; + + All_Nodes = {{{entity_vertex_name}.*}}; + All_Nodes = + SELECT s + FROM All_Nodes:s -({links_to_edge_name}:e)- :t + WHERE e.layer_weight_map.containsKey(layer) + ACCUM DOUBLE weight = e.layer_weight_map.get(layer), + IF s == t THEN + @@community_in_weight_map += (s -> weight) + END, + @@community_total_weight_map += (s -> weight), + @@sum_weight += weight + ; + LOG(TRUE, All_Nodes.size()); + @@modularity = 0; + FOREACH (community, total_weight) IN @@community_total_weight_map DO + DOUBLE in_weight = 0; + IF @@community_in_weight_map.containsKey(community) THEN + in_weight = @@community_in_weight_map.get(community); + END; + @@modularity += in_weight / @@sum_weight - pow(total_weight / @@sum_weight, 2); + END; + // PRINT @@modularity, @@community_in_weight_map, @@community_total_weight_map, @@sum_weight; + PRINT layer; + PRINT @@modularity AS modularity; + PRINT @@community_total_weight_map.size() AS community_number; + PRINT All_Nodes.size(); + @@community_in_weight_map.clear(); + @@community_total_weight_map.clear(); +}} diff --git a/common/gsql/graphRAG/louvain/louvain_4_modularity_2_final.gsql b/common/gsql/graphRAG/louvain/louvain_4_modularity_2_final.gsql new file mode 100644 index 00000000..31ba4d0b --- /dev/null +++ b/common/gsql/graphRAG/louvain/louvain_4_modularity_2_final.gsql @@ -0,0 +1,52 @@ +USE GRAPH {graph_name} +DROP QUERY {query_name} +CREATE OR REPLACE DISTRIBUTED QUERY graphRAG_louvain_4b( +) FOR GRAPH {graph_name} SYNTAX v1 {{ + SumAccum @@sum_weight; // the sum of the weights of all the links in the network + MapAccum> @@community_total_weight_map; // community ID C -> the sum of the weights of the links incident to nodes in C + MapAccum> @@community_in_weight_map; // community ID -> the sum of the weights of the links inside the community + SumAccum @@modularity; + MapAccum> @@Community_sizes; + MapAccum> @@count_of_sizes; + AvgAccum @@avg_community_size; + + DOUBLE wt = 1.0; + All_Nodes = {{{entity_vertex_name}.*}}; + Nodes = + SELECT s + FROM All_Nodes:s -({relation_edge_name}:e)- :t + ACCUM IF s.{community_id_attribute_name} == t.{community_id_attribute_name} THEN + @@community_in_weight_map += (s.{community_id_attribute_name} -> wt) + END, + @@community_total_weight_map += (s.{community_id_attribute_name} -> wt), + @@sum_weight += wt + ; + @@modularity = 0; + FOREACH (community, total_weight) IN @@community_total_weight_map DO + DOUBLE in_weight = 0; + IF @@community_in_weight_map.containsKey(community) THEN + in_weight = 
@@community_in_weight_map.get(community); + END; + @@modularity += in_weight / @@sum_weight - pow(total_weight / @@sum_weight, 2); + END; + + _tmp = + SELECT s + FROM All_Nodes:s + POST-ACCUM + @@Community_sizes += (s.{community_id_attribute_name} -> 1); + + FOREACH (comm, cnt) IN @@Community_sizes DO + @@count_of_sizes += (cnt -> 1); + @@avg_community_size += cnt; + END; + + // PRINT @@modularity, @@community_in_weight_map, @@community_total_weight_map, @@sum_weight; + PRINT @@modularity AS modularity; + PRINT @@community_total_weight_map.size() AS community_number; + PRINT @@count_of_sizes AS num_communities_by_size; + PRINT @@avg_community_size AS avg_community_size; + + @@community_in_weight_map.clear(); + @@community_total_weight_map.clear(); +}} diff --git a/common/gsql/graphRAG/louvain/louvain_5_reset.gsql b/common/gsql/graphRAG/louvain/louvain_5_reset.gsql new file mode 100644 index 00000000..7590935a --- /dev/null +++ b/common/gsql/graphRAG/louvain/louvain_5_reset.gsql @@ -0,0 +1,13 @@ +USE GRAPH {graph_name} +DROP QUERY {query_name} +CREATE OR REPLACE DISTRIBUTED QUERY graphRAG_louvain_5_reset( +) FOR GRAPH {graph_name} SYNTAX v1 {{ + + // Initialization + Nodes = {{{entity_vertex_name}.*}}; + + // Top layer + DELETE e + FROM Nodes:s -(({belongs_to_edge_name}|{links_to_edge_name}):e)- :t + ; +}} diff --git a/common/gsql/supportai/Scan_For_Updates.gsql b/common/gsql/supportai/Scan_For_Updates.gsql index 03ced2ec..7d9d1b83 100644 --- a/common/gsql/supportai/Scan_For_Updates.gsql +++ b/common/gsql/supportai/Scan_For_Updates.gsql @@ -24,10 +24,10 @@ CREATE DISTRIBUTED QUERY Scan_For_Updates(STRING v_type = "Document", res = SELECT s FROM start:s -(HAS_CONTENT)-> Content:c ACCUM @@v_and_text += (s.id -> c.text) POST-ACCUM s.epoch_processing = datetime_to_epoch(now()); - ELSE IF v_type == "Concept" THEN - res = SELECT s FROM start:s - POST-ACCUM @@v_and_text += (s.id -> s.description), - s.epoch_processing = datetime_to_epoch(now()); + // ELSE IF v_type == "Concept" THEN + // res = SELECT s FROM start:s + // POST-ACCUM @@v_and_text += (s.id -> s.description), + // s.epoch_processing = datetime_to_epoch(now()); ELSE IF v_type == "Entity" THEN res = SELECT s FROM start:s POST-ACCUM @@v_and_text += (s.id -> s.definition), @@ -42,4 +42,4 @@ CREATE DISTRIBUTED QUERY Scan_For_Updates(STRING v_type = "Document", POST-ACCUM s.epoch_processing = datetime_to_epoch(now()); END; PRINT @@v_and_text; -} \ No newline at end of file +} diff --git a/common/gsql/supportai/SupportAI_Schema.gsql b/common/gsql/supportai/SupportAI_Schema.gsql index 061993bb..0998affe 100644 --- a/common/gsql/supportai/SupportAI_Schema.gsql +++ b/common/gsql/supportai/SupportAI_Schema.gsql @@ -2,7 +2,7 @@ CREATE SCHEMA_CHANGE JOB add_supportai_schema { ADD VERTEX DocumentChunk(PRIMARY_ID id STRING, idx INT, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX Document(PRIMARY_ID id STRING, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX Concept(PRIMARY_ID id STRING, description STRING, concept_type STRING, human_curated BOOL, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; - ADD VERTEX Entity(PRIMARY_ID id STRING, definition STRING, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; + 
ADD VERTEX Entity(PRIMARY_ID id STRING, definition STRING, description STRING, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX Relationship(PRIMARY_ID id STRING, definition STRING, short_name STRING, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX DocumentCollection(PRIMARY_ID id STRING, epoch_added UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX Content(PRIMARY_ID id STRING, text STRING, epoch_added UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; @@ -18,4 +18,18 @@ CREATE SCHEMA_CHANGE JOB add_supportai_schema { ADD DIRECTED EDGE HAS_CHILD(FROM Document, TO DocumentChunk) WITH REVERSE_EDGE="reverse_HAS_CHILD"; ADD DIRECTED EDGE HAS_RELATIONSHIP(FROM Concept, TO Concept, relation_type STRING) WITH REVERSE_EDGE="reverse_HAS_RELATIONSHIP"; ADD DIRECTED EDGE CONTAINS_DOCUMENT(FROM DocumentCollection, TO Document) WITH REVERSE_EDGE="reverse_CONTAINS_DOCUMENT"; -} \ No newline at end of file + + // GraphRAG + ADD VERTEX Community(PRIMARY_ID id STRING, description INT) WITH PRIMARY_ID_AS_ATTRIBUTE="true"; + ADD VERTEX ResolvedEntity(PRIMARY_ID id STRING, description STRING) WITH PRIMARY_ID_AS_ATTRIBUTE="true"; + + ADD DIRECTED EDGE KNN(FROM Entity, TO Entity); // TODO: check where knn algo writes results + ADD DIRECTED EDGE RESOLVES_TO(FROM Entity, TO ResolvedEntity); // Connect ResolvedEntities with their children entities + ADD DIRECTED EDGE RESOLVED_RELATIONSHIP(FROM ResolvedEntity, TO ResolvedEntity, weight UINT); // store edges between entities after they're resolved + ADD DIRECTED EDGE IN_COMMUNITY(FROM ResolvedEntity, TO Community); + + // TODO: louvain will be run on resolved entities, but stored in community then on communities until louvain runs out + // Hierarchical communities (Louvain/Leiden) + // ADD UNDIRECTED EDGE LINKS_TO(FROM Community, TO Community); + // ADD DIRECTED EDGE BELONGS_TO(FROM Community, TO Community); +} diff --git a/common/llm_services/openai_service.py b/common/llm_services/openai_service.py index 914f6364..c7274720 100644 --- a/common/llm_services/openai_service.py +++ b/common/llm_services/openai_service.py @@ -1,5 +1,6 @@ import logging import os +from langchain_openai import ChatOpenAI from common.llm_services import LLM_Model from common.logs.log import req_id_cv @@ -16,7 +17,6 @@ def __init__(self, config): auth_detail ] - from langchain.chat_models import ChatOpenAI model_name = config["llm_model"] self.llm = ChatOpenAI( diff --git a/common/py_schemas/schemas.py b/common/py_schemas/schemas.py index e5dd1faf..07a2113f 100644 --- a/common/py_schemas/schemas.py +++ b/common/py_schemas/schemas.py @@ -15,11 +15,13 @@ class SupportAIQuestion(BaseModel): method_params: dict = {} -class SupportAIInitConfig(BaseModel): - chunker: str - chunker_params: dict - extractor: str - extractor_params: dict +class SupportAIMethod(enum.StrEnum): + SUPPORTAI = enum.auto() + GRAPHRAG = enum.auto() + + +class EccConfig(BaseModel): + method: SupportAIMethod = SupportAIMethod.SUPPORTAI class GSQLQueryInfo(BaseModel): @@ -126,15 +128,18 @@ class QueryUpsertRequest(BaseModel): id: Optional[str] query_info: Optional[GSQLQueryInfo] + class MessageContext(BaseModel): # TODO: fix this to contain proper message context user: str content: str + class ReportQuestions(BaseModel): question: str reasoning: str + class ReportSection(BaseModel): 
section_name: str description: str @@ -142,6 +147,7 @@ class ReportSection(BaseModel): copilot_fortify: bool = True actions: Optional[List[str]] = None + class ReportCreationRequest(BaseModel): topic: str sections: Union[List[ReportSection], str] = None @@ -150,6 +156,7 @@ class ReportCreationRequest(BaseModel): conversation_id: Optional[str] = None message_context: Optional[List[MessageContext]] = None + class Role(enum.StrEnum): SYSTEM = enum.auto() USER = enum.auto() diff --git a/copilot/app/routers/supportai.py b/copilot/app/routers/supportai.py index a3c94951..a829d3a4 100644 --- a/copilot/app/routers/supportai.py +++ b/copilot/app/routers/supportai.py @@ -1,22 +1,38 @@ import json import logging -import uuid from typing import Annotated -from fastapi import APIRouter, BackgroundTasks, Depends, Request +from fastapi import APIRouter, BackgroundTasks, Depends, Request, Response, status from fastapi.security.http import HTTPBase +from supportai import supportai from supportai.concept_management.create_concepts import ( - CommunityConceptCreator, EntityConceptCreator, HigherLevelConceptCreator, - RelationshipConceptCreator) -from supportai.retrievers import (EntityRelationshipRetriever, - HNSWOverlapRetriever, HNSWRetriever, - HNSWSiblingRetriever) - -from common.config import (db_config, embedding_service, embedding_store, - get_llm_service, llm_config) + CommunityConceptCreator, + EntityConceptCreator, + HigherLevelConceptCreator, + RelationshipConceptCreator, +) +from supportai.retrievers import ( + EntityRelationshipRetriever, + HNSWOverlapRetriever, + HNSWRetriever, + HNSWSiblingRetriever, +) + +from common.config import ( + db_config, + embedding_service, + embedding_store, + get_llm_service, + llm_config, +) from common.logs.logwriter import LogWriter -from common.py_schemas.schemas import (CoPilotResponse, CreateIngestConfig, - LoadingInfo, SupportAIQuestion) +from common.py_schemas.schemas import ( # SupportAIInitConfig,; SupportAIMethod, + CoPilotResponse, + CreateIngestConfig, + LoadingInfo, + SupportAIMethod, + SupportAIQuestion, +) logger = logging.getLogger(__name__) router = APIRouter(tags=["SupportAI"]) @@ -26,50 +42,14 @@ @router.post("/{graphname}/supportai/initialize") def initialize( - graphname, conn: Request, credentials: Annotated[HTTPBase, Depends(security)] + graphname, + conn: Request, + credentials: Annotated[HTTPBase, Depends(security)], ): conn = conn.state.conn - # need to open the file using the absolute path - file_path = "common/gsql/supportai/SupportAI_Schema.gsql" - with open(file_path, "r") as f: - schema = f.read() - schema_res = conn.gsql( - """USE GRAPH {}\n{}\nRUN SCHEMA_CHANGE JOB add_supportai_schema""".format( - graphname, schema - ) - ) - - file_path = "common/gsql/supportai/SupportAI_IndexCreation.gsql" - with open(file_path) as f: - index = f.read() - index_res = conn.gsql( - """USE GRAPH {}\n{}\nRUN SCHEMA_CHANGE JOB add_supportai_indexes""".format( - graphname, index - ) - ) - - file_path = "common/gsql/supportai/Scan_For_Updates.gsql" - with open(file_path) as f: - scan_for_updates = f.read() - res = conn.gsql( - "USE GRAPH " - + conn.graphname - + "\n" - + scan_for_updates - + "\n INSTALL QUERY Scan_For_Updates" - ) - - file_path = "common/gsql/supportai/Update_Vertices_Processing_Status.gsql" - with open(file_path) as f: - update_vertices = f.read() - res = conn.gsql( - "USE GRAPH " - + conn.graphname - + "\n" - + update_vertices - + "\n INSTALL QUERY Update_Vertices_Processing_Status" - ) + resp = supportai.init_supportai(conn, 
graphname) + schema_res, index_res = resp[0], resp[1] return { "host_name": conn._tg_connection.host, # include host_name for debugging from client. Their pyTG conn might not have the same host as what's configured in copilot "schema_creation_status": json.dumps(schema_res), @@ -80,132 +60,13 @@ def initialize( @router.post("/{graphname}/supportai/create_ingest") def create_ingest( graphname, - ingest_config: CreateIngestConfig, + cfg: CreateIngestConfig, conn: Request, credentials: Annotated[HTTPBase, Depends(security)], ): conn = conn.state.conn - if ingest_config.file_format.lower() == "json": - file_path = "common/gsql/supportai/SupportAI_InitialLoadJSON.gsql" - - with open(file_path) as f: - ingest_template = f.read() - ingest_template = ingest_template.replace("@uuid@", str(uuid.uuid4().hex)) - doc_id = ingest_config.loader_config.get("doc_id_field", "doc_id") - doc_text = ingest_config.loader_config.get("content_field", "content") - ingest_template = ingest_template.replace('"doc_id"', '"{}"'.format(doc_id)) - ingest_template = ingest_template.replace('"content"', '"{}"'.format(doc_text)) - - if ingest_config.file_format.lower() == "csv": - file_path = "common/gsql/supportai/SupportAI_InitialLoadCSV.gsql" - - with open(file_path) as f: - ingest_template = f.read() - ingest_template = ingest_template.replace("@uuid@", str(uuid.uuid4().hex)) - separator = ingest_config.get("separator", "|") - header = ingest_config.get("header", "true") - eol = ingest_config.get("eol", "\n") - quote = ingest_config.get("quote", "double") - ingest_template = ingest_template.replace('"|"', '"{}"'.format(separator)) - ingest_template = ingest_template.replace('"true"', '"{}"'.format(header)) - ingest_template = ingest_template.replace('"\\n"', '"{}"'.format(eol)) - ingest_template = ingest_template.replace('"double"', '"{}"'.format(quote)) - - file_path = "common/gsql/supportai/SupportAI_DataSourceCreation.gsql" - - with open(file_path) as f: - data_stream_conn = f.read() - - # assign unique identifier to the data stream connection - - data_stream_conn = data_stream_conn.replace( - "@source_name@", "SupportAI_" + graphname + "_" + str(uuid.uuid4().hex) - ) - - # check the data source and create the appropriate connection - if ingest_config.data_source.lower() == "s3": - data_conn = ingest_config.data_source_config - if ( - data_conn.get("aws_access_key") is None - or data_conn.get("aws_secret_key") is None - ): - raise Exception("AWS credentials not provided") - connector = { - "type": "s3", - "access.key": data_conn["aws_access_key"], - "secret.key": data_conn["aws_secret_key"], - } - - data_stream_conn = data_stream_conn.replace( - "@source_config@", json.dumps(connector) - ) - - elif ingest_config.data_source.lower() == "azure": - if ingest_config.data_source_config.get("account_key") is not None: - connector = { - "type": "abs", - "account.key": ingest_config.data_source_config["account_key"], - } - elif ingest_config.data_source_config.get("client_id") is not None: - # verify that the client secret is also provided - if ingest_config.data_source_config.get("client_secret") is None: - raise Exception("Client secret not provided") - # verify that the tenant id is also provided - if ingest_config.data_source_config.get("tenant_id") is None: - raise Exception("Tenant id not provided") - connector = { - "type": "abs", - "client.id": ingest_config.data_source_config["client_id"], - "client.secret": ingest_config.data_source_config["client_secret"], - "tenant.id": 
ingest_config.data_source_config["tenant_id"], - } - else: - raise Exception("Azure credentials not provided") - data_stream_conn = data_stream_conn.replace( - "@source_config@", json.dumps(connector) - ) - elif ingest_config.data_source.lower() == "gcs": - # verify that the correct fields are provided - if ingest_config.data_source_config.get("project_id") is None: - raise Exception("Project id not provided") - if ingest_config.data_source_config.get("private_key_id") is None: - raise Exception("Private key id not provided") - if ingest_config.data_source_config.get("private_key") is None: - raise Exception("Private key not provided") - if ingest_config.data_source_config.get("client_email") is None: - raise Exception("Client email not provided") - connector = { - "type": "gcs", - "project_id": ingest_config.data_source_config["project_id"], - "private_key_id": ingest_config.data_source_config["private_key_id"], - "private_key": ingest_config.data_source_config["private_key"], - "client_email": ingest_config.data_source_config["client_email"], - } - data_stream_conn = data_stream_conn.replace( - "@source_config@", json.dumps(connector) - ) - else: - raise Exception("Data source not implemented") - - load_job_created = conn.gsql("USE GRAPH {}\n".format(graphname) + ingest_template) - - data_source_created = conn.gsql( - "USE GRAPH {}\n".format(graphname) + data_stream_conn - ) - - return { - "load_job_id": load_job_created.split(":")[1] - .strip(" [") - .strip(" ") - .strip(".") - .strip("]"), - "data_source_id": data_source_created.split(":")[1] - .strip(" [") - .strip(" ") - .strip(".") - .strip("]"), - } + return supportai.create_ingest(graphname, cfg, conn) @router.post("/{graphname}/supportai/ingest") @@ -387,18 +248,24 @@ def build_concepts( return {"status": "success"} -@router.get("/{graphname}/supportai/forceupdate") -def ecc( - graphname, +@router.get("/{graphname}/{method}/forceupdate") +def supportai_update( + graphname: str, + method: str, conn: Request, credentials: Annotated[HTTPBase, Depends(security)], bg_tasks: BackgroundTasks, + response: Response, ): + if method != SupportAIMethod.SUPPORTAI and method != SupportAIMethod.GRAPHRAG: + response.status_code = status.HTTP_404_NOT_FOUND + return f"{method} is not a valid method. 
{SupportAIMethod.SUPPORTAI} or {SupportAIMethod.GRAPHRAG}" + from httpx import get as http_get ecc = ( db_config.get("ecc", "http://localhost:8001") - + f"/{graphname}/consistency_status" + + f"/{graphname}/consistency_status/{method}" ) LogWriter.info(f"Sending ECC request to: {ecc}") bg_tasks.add_task( diff --git a/copilot/app/supportai/supportai.py b/copilot/app/supportai/supportai.py new file mode 100644 index 00000000..e96663a3 --- /dev/null +++ b/copilot/app/supportai/supportai.py @@ -0,0 +1,185 @@ +import json +import uuid + +from pyTigerGraph import TigerGraphConnection + +from common.py_schemas.schemas import ( + # CoPilotResponse, + CreateIngestConfig, + # LoadingInfo, + # SupportAIInitConfig, + # SupportAIMethod, + # SupportAIQuestion, +) + + +def init_supportai(conn: TigerGraphConnection, graphname: str) -> tuple[dict, dict]: + # need to open the file using the absolute path + file_path = "common/gsql/supportai/SupportAI_Schema.gsql" + with open(file_path, "r") as f: + schema = f.read() + schema_res = conn.gsql( + """USE GRAPH {}\n{}\nRUN SCHEMA_CHANGE JOB add_supportai_schema""".format( + graphname, schema + ) + ) + + file_path = "common/gsql/supportai/SupportAI_IndexCreation.gsql" + with open(file_path) as f: + index = f.read() + index_res = conn.gsql( + """USE GRAPH {}\n{}\nRUN SCHEMA_CHANGE JOB add_supportai_indexes""".format( + graphname, index + ) + ) + + file_path = "common/gsql/supportai/Scan_For_Updates.gsql" + with open(file_path) as f: + scan_for_updates = f.read() + res = conn.gsql( + "USE GRAPH " + + conn.graphname + + "\n" + + scan_for_updates + + "\n INSTALL QUERY Scan_For_Updates" + ) + + file_path = "common/gsql/supportai/Update_Vertices_Processing_Status.gsql" + with open(file_path) as f: + update_vertices = f.read() + res = conn.gsql( + "USE GRAPH " + + conn.graphname + + "\n" + + update_vertices + + "\n INSTALL QUERY Update_Vertices_Processing_Status" + ) + + return schema_res, index_res + + +def create_ingest( + graphname: str, + ingest_config: CreateIngestConfig, + conn: TigerGraphConnection, +): + if ingest_config.file_format.lower() == "json": + file_path = "common/gsql/supportai/SupportAI_InitialLoadJSON.gsql" + + with open(file_path) as f: + ingest_template = f.read() + ingest_template = ingest_template.replace("@uuid@", str(uuid.uuid4().hex)) + doc_id = ingest_config.loader_config.get("doc_id_field", "doc_id") + doc_text = ingest_config.loader_config.get("content_field", "content") + ingest_template = ingest_template.replace('"doc_id"', '"{}"'.format(doc_id)) + ingest_template = ingest_template.replace('"content"', '"{}"'.format(doc_text)) + + if ingest_config.file_format.lower() == "csv": + file_path = "common/gsql/supportai/SupportAI_InitialLoadCSV.gsql" + + with open(file_path) as f: + ingest_template = f.read() + ingest_template = ingest_template.replace("@uuid@", str(uuid.uuid4().hex)) + separator = ingest_config.get("separator", "|") + header = ingest_config.get("header", "true") + eol = ingest_config.get("eol", "\n") + quote = ingest_config.get("quote", "double") + ingest_template = ingest_template.replace('"|"', '"{}"'.format(separator)) + ingest_template = ingest_template.replace('"true"', '"{}"'.format(header)) + ingest_template = ingest_template.replace('"\\n"', '"{}"'.format(eol)) + ingest_template = ingest_template.replace('"double"', '"{}"'.format(quote)) + + file_path = "common/gsql/supportai/SupportAI_DataSourceCreation.gsql" + + with open(file_path) as f: + data_stream_conn = f.read() + + # assign unique identifier to the data stream 
connection + + data_stream_conn = data_stream_conn.replace( + "@source_name@", "SupportAI_" + graphname + "_" + str(uuid.uuid4().hex) + ) + + # check the data source and create the appropriate connection + if ingest_config.data_source.lower() == "s3": + data_conn = ingest_config.data_source_config + if ( + data_conn.get("aws_access_key") is None + or data_conn.get("aws_secret_key") is None + ): + raise Exception("AWS credentials not provided") + connector = { + "type": "s3", + "access.key": data_conn["aws_access_key"], + "secret.key": data_conn["aws_secret_key"], + } + + data_stream_conn = data_stream_conn.replace( + "@source_config@", json.dumps(connector) + ) + + elif ingest_config.data_source.lower() == "azure": + if ingest_config.data_source_config.get("account_key") is not None: + connector = { + "type": "abs", + "account.key": ingest_config.data_source_config["account_key"], + } + elif ingest_config.data_source_config.get("client_id") is not None: + # verify that the client secret is also provided + if ingest_config.data_source_config.get("client_secret") is None: + raise Exception("Client secret not provided") + # verify that the tenant id is also provided + if ingest_config.data_source_config.get("tenant_id") is None: + raise Exception("Tenant id not provided") + connector = { + "type": "abs", + "client.id": ingest_config.data_source_config["client_id"], + "client.secret": ingest_config.data_source_config["client_secret"], + "tenant.id": ingest_config.data_source_config["tenant_id"], + } + else: + raise Exception("Azure credentials not provided") + data_stream_conn = data_stream_conn.replace( + "@source_config@", json.dumps(connector) + ) + elif ingest_config.data_source.lower() == "gcs": + # verify that the correct fields are provided + if ingest_config.data_source_config.get("project_id") is None: + raise Exception("Project id not provided") + if ingest_config.data_source_config.get("private_key_id") is None: + raise Exception("Private key id not provided") + if ingest_config.data_source_config.get("private_key") is None: + raise Exception("Private key not provided") + if ingest_config.data_source_config.get("client_email") is None: + raise Exception("Client email not provided") + connector = { + "type": "gcs", + "project_id": ingest_config.data_source_config["project_id"], + "private_key_id": ingest_config.data_source_config["private_key_id"], + "private_key": ingest_config.data_source_config["private_key"], + "client_email": ingest_config.data_source_config["client_email"], + } + data_stream_conn = data_stream_conn.replace( + "@source_config@", json.dumps(connector) + ) + else: + raise Exception("Data source not implemented") + + load_job_created = conn.gsql("USE GRAPH {}\n".format(graphname) + ingest_template) + + data_source_created = conn.gsql( + "USE GRAPH {}\n".format(graphname) + data_stream_conn + ) + + return { + "load_job_id": load_job_created.split(":")[1] + .strip(" [") + .strip(" ") + .strip(".") + .strip("]"), + "data_source_id": data_source_created.split(":")[1] + .strip(" [") + .strip(" ") + .strip(".") + .strip("]"), + } diff --git a/copilot/docs/notebooks/graphrag.ipynb b/copilot/docs/notebooks/graphrag.ipynb new file mode 100644 index 00000000..3b1200af --- /dev/null +++ b/copilot/docs/notebooks/graphrag.ipynb @@ -0,0 +1,261 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from pyTigerGraph import TigerGraphConnection\n", + "from dotenv import load_dotenv\n", + "\n", + 
"load_dotenv()\n", + "# We first create a connection to the database\n", + "host = os.environ[\"HOST\"]\n", + "username = os.getenv(\"USERNAME\", \"tigergraph\")\n", + "password = os.getenv(\"PASS\", \"tigergraph\")\n", + "conn = TigerGraphConnection(\n", + " host=host, username=username, password=password, graphname=\"GraphRAG_pytgdocs\"\n", + ")\n", + "\n", + "conn.getToken()\n", + "\n", + "# And then add CoPilot's address to the connection. This address\n", + "# is the host's address where the CoPilot container is running.\n", + "conn.ai.configureCoPilotHost(\"http://localhost:8000\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "conn.graphname = \"GraphRAG_pytgdocs\"\n", + "# conn.gsql(\"\"\"CREATE GRAPH pyTigerGraphRAG()\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'host_name': 'https://algotesting.i.tgcloud.io',\n", + " 'schema_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_schema].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_schema\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 0)\\\\nTrying to add local vertex \\'DocumentChunk\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Document\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Concept\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Entity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Relationship\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'DocumentCollection\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Content\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Community\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'ResolvedEntity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CONTENT\\' and its reverse edge \\'reverse_HAS_CONTENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_CHILD_OF\\' and its reverse edge \\'reverse_IS_CHILD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_HEAD_OF\\' and its reverse edge \\'reverse_IS_HEAD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_TAIL\\' and its reverse edge \\'reverse_HAS_TAIL\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_RELATIONSHIP\\' and its reverse edge \\'reverse_DESCRIBES_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_ENTITY\\' and its reverse edge \\'reverse_DESCRIBES_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_ENTITY\\' and its reverse edge \\'reverse_CONTAINS_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'MENTIONS_RELATIONSHIP\\' and its reverse edge \\'reverse_MENTIONS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_AFTER\\' and its reverse edge 
\\'reverse_IS_AFTER\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CHILD\\' and its reverse edge \\'reverse_HAS_CHILD\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_RELATIONSHIP\\' and its reverse edge \\'reverse_HAS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_DOCUMENT\\' and its reverse edge \\'reverse_CONTAINS_DOCUMENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'KNN\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVES_TO\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVED_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IN_COMMUNITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 1\\\\nThe job add_supportai_schema completes in 2.829 seconds!\\\\nLocal schema change succeeded.\"',\n", + " 'index_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_indexes].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_indexes\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 1)\\\\nTrying to add index \\'doc_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_indexepoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 2\\\\nThe job add_supportai_indexes completes in 2.002 seconds!\\\\nLocal schema change succeeded.\"'}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.ai.initializeSupportAI()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "access = os.environ[\"AWS_ACCESS_KEY_ID\"]\n", 
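+    "# AWS credentials for the S3 data source are read from the environment (loaded via load_dotenv in the first cell)\n",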
+ "sec = os.environ[\"AWS_SECRET_ACCESS_KEY\"]\n", + "res = conn.ai.createDocumentIngest(\n", + " data_source=\"s3\",\n", + " data_source_config={\"aws_access_key\": access, \"aws_secret_key\": sec},\n", + " loader_config={\"doc_id_field\": \"url\", \"content_field\": \"content\"},\n", + " file_format=\"json\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'job_name': 'load_documents_content_json_75b43aab4f714888b2be3f30441e745a',\n", + " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_75b43aab4f714888b2be3f30441e745a.stream.SupportAI_GraphRAG_pytgdocs_f0e175af264a4a18b1aa3bf8f4063d0e.1721674044503',\n", + " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_75b43aab4f714888b2be3f30441e745a.stream.SupportAI_GraphRAG_pytgdocs_f0e175af264a4a18b1aa3bf8f4063d0e.1721674044503'}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.ai.runDocumentIngest(\n", + " res[\"load_job_id\"],\n", + " res[\"data_source_id\"],\n", + " \"s3://tg-documentation/pytg_current/pytg_current.jsonl\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import httpx\n", + "import base64\n", + "\n", + "# conn.ai.forceConsistencyUpdate()\n", + "# url = self.nlqs_host+\"/\"+self.conn.graphname+\"/supportai/forceupdate\"\n", + "# return self.conn._req(\"GET\", url, authMode=\"pwd\", resKey=None)\n", + "httpx.get(f\"http://localhost:8000/{conn.graphname}/supportai/forceupdate\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "content='Hello! How can I assist you today?' 
response_metadata={'token_usage': {'completion_tokens': 9, 'prompt_tokens': 8, 'total_tokens': 17}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None} id='run-2a50fab6-62fc-433c-98b4-221346ca41c6-0' usage_metadata={'input_tokens': 8, 'output_tokens': 9, 'total_tokens': 17}\n" + ] + }, + { + "data": { + "text/plain": [ + "Joke(setup='Why was the cat sitting on the computer?', punchline='To keep an eye on the mouse!')" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain_core.pydantic_v1 import BaseModel, Field\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "\n", + "class Joke(BaseModel):\n", + " setup: str = Field(description=\"The setup of the joke\")\n", + " punchline: str = Field(description=\"The punchline to the joke\")\n", + "\n", + "\n", + "model = ChatOpenAI(model=\"gpt-3.5-turbo-0125\", temperature=0)\n", + "print(model.invoke('hi'))\n", + "structured_llm = model.with_structured_output(Joke)\n", + "structured_llm.invoke(\"Tell me a joke about cats\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_core.documents import Document\n", + "from langchain_experimental.graph_transformers import LLMGraphTransformer\n", + "from langchain_openai import ChatOpenAI\n", + "import os\n", + "# from langchain_core.pydantic_v1 import BaseModel\n", + "from pydantic import BaseModel\n", + "\n", + "\n", + "class AnswerWithJustification(BaseModel):\n", + " \"\"\"An answer to the user question along with justification for the answer.\"\"\"\n", + " answer: str\n", + " justification: str\n", + "\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = \"\"\n", + "model_name = \"gpt-4o-mini\"\n", + "llm = ChatOpenAI(model=model_name, temperature=0)\n", + "# sllm = llm.with_structured_output(AnswerWithJustification)\n", + "# print(sllm.invoke(\"What weighs more a pound of bricks or a pound of feathers\"))\n", + "\n", + "class GraphExtractor:\n", + " def __init__(self):\n", + " self.transformer = LLMGraphTransformer(\n", + " llm=llm,\n", + " node_properties=[\"description\"],\n", + " relationship_properties=[\"description\"],\n", + " )\n", + "\n", + " def extract(self, text):\n", + " doc = Document(page_content=text)\n", + " graph_docs = self.transformer.convert_to_graph_documents([doc])\n", + " return graph_docs" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id='Marie Curie' type='Person' properties={'description': 'A Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity.'}\n", + "id='Pierre Curie' type='Person' properties={'description': 'Husband of Marie Curie and co-winner of her first Nobel Prize.'}\n", + "id='University Of Paris' type='Institution' properties={'description': 'The institution where Marie Curie became the first woman professor in 1906.'}\n", + "id='Nobel Prize' type='Award' properties={'description': 'An award won by Marie Curie, first woman to win it and first person to win it twice.'}\n", + "source=Node(id='Marie Curie', type='Person') target=Node(id='Pierre Curie', type='Person') type='HUSBAND' properties={'description': \"Marie Curie's husband and co-winner of her first Nobel Prize.\"}\n", + "source=Node(id='Marie Curie', type='Person') target=Node(id='Nobel Prize', type='Award') type='WINNER' 
properties={'description': 'First woman to win a Nobel Prize.'}\n", + "source=Node(id='Marie Curie', type='Person') target=Node(id='Nobel Prize', type='Award') type='WINNER' properties={'description': 'First person to win a Nobel Prize twice.'}\n", + "source=Node(id='Marie Curie', type='Person') target=Node(id='Nobel Prize', type='Award') type='WINNER' properties={'description': 'Only person to win a Nobel Prize in two scientific fields.'}\n", + "source=Node(id='Marie Curie', type='Person') target=Node(id='University Of Paris', type='Institution') type='PROFESSOR' properties={'description': 'First woman to become a professor at the University of Paris in 1906.'}\n" + ] + } + ], + "source": [ + "text = \"\"\"\n", + "Marie Curie, born in 1867, was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity.\n", + "She was the first woman to win a Nobel Prize, the first person to win a Nobel Prize twice, and the only person to win a Nobel Prize in two scientific fields.\n", + "Her husband, Pierre Curie, was a co-winner of her first Nobel Prize, making them the first-ever married couple to win the Nobel Prize and launching the Curie family legacy of five Nobel Prizes.\n", + "She was, in 1906, the first woman to become a professor at the University of Paris.\n", + "\"\"\"\n", + "ge = GraphExtractor()\n", + "\n", + "docs = ge.extract(text)\n", + "for d in docs:\n", + " for n in d.nodes:\n", + " print(n)\n", + " for r in d.relationships:\n", + " print(r)\n", + "# print(f\"Nodes:{docs[0].nodes}\")\n", + "# print(f\"Relationships:{docs[0].relationships}\")\n", + "# docs" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ml", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/eventual-consistency-service/app/eventual_consistency_checker.py b/eventual-consistency-service/app/eventual_consistency_checker.py index 007330bd..fa16694e 100644 --- a/eventual-consistency-service/app/eventual_consistency_checker.py +++ b/eventual-consistency-service/app/eventual_consistency_checker.py @@ -1,4 +1,3 @@ -import json import logging import time from typing import Dict, List @@ -367,4 +366,4 @@ def get_status(self): )[0] LogWriter.info(f"ECC_Status for graphname {self.graphname}: {status}") statuses[v_type] = status - return statuses \ No newline at end of file + return statuses diff --git a/eventual-consistency-service/app/graphrag/__init__.py b/eventual-consistency-service/app/graphrag/__init__.py new file mode 100644 index 00000000..953b2a0b --- /dev/null +++ b/eventual-consistency-service/app/graphrag/__init__.py @@ -0,0 +1 @@ +from .graph_rag import * diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py new file mode 100644 index 00000000..637546d6 --- /dev/null +++ b/eventual-consistency-service/app/graphrag/graph_rag.py @@ -0,0 +1,138 @@ +import asyncio +import logging + +from graphrag.util import install_query +from graphrag.worker import worker +from pyTigerGraph import TigerGraphConnection + +from common.chunkers import character_chunker, regex_chunker, semantic_chunker +from common.chunkers.base_chunker import BaseChunker +from common.config import 
(doc_processing_config, embedding_service, + get_llm_service, llm_config, milvus_config) +from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore +from common.extractors import GraphExtractor, LLMEntityRelationshipExtractor +from common.extractors.BaseExtractor import BaseExtractor + +logger = logging.getLogger(__name__) +consistency_checkers = {} + + +def get_chunker(): + if doc_processing_config.get("chunker") == "semantic": + chunker = semantic_chunker.SemanticChunker( + embedding_service, + doc_processing_config["chunker_config"].get("method", "percentile"), + doc_processing_config["chunker_config"].get("threshold", 0.95), + ) + elif doc_processing_config.get("chunker") == "regex": + chunker = regex_chunker.RegexChunker( + pattern=doc_processing_config["chunker_config"].get("pattern", "\\r?\\n") + ) + elif doc_processing_config.get("chunker") == "character": + chunker = character_chunker.CharacterChunker( + chunk_size=doc_processing_config["chunker_config"].get("chunk_size", 1024), + overlap_size=doc_processing_config["chunker_config"].get("overlap_size", 0), + ) + else: + raise ValueError("Invalid chunker type") + + return chunker + + +async def install_queries( + requried_queries: list[str], conn: TigerGraphConnection, n_workers=8 +): + loop = asyncio.get_event_loop() + tasks: list[asyncio.Task] = [] + + # queries that are currently installed + installed_queries = [q.split("/")[-1] for q in conn.getEndpoints(dynamic=True)] + + # add queries to be installed into the queue + tq = asyncio.Queue() + for q in requried_queries: + if q not in installed_queries: + tq.put_nowait((install_query, (conn, q))) + # break + + print("starting workers") + # start workers + for n in range(min(tq.qsize(), n_workers)): + task = loop.create_task(worker(n, tq)) + tasks.append(task) + + # wait for workers to finish jobs + await tq.join() + for t in tasks: + print(t.result()) + return "", "", "" + + +async def init( + graphname: str, conn: TigerGraphConnection +) -> tuple[BaseChunker, dict[str, MilvusEmbeddingStore], BaseExtractor]: + # install requried queries + requried_queries = [ + "Scan_For_Updates", + "Update_Vertices_Processing_Status", + "ECC_Status", + "Check_Nonexistent_Vertices", + ] + await install_queries(requried_queries, conn) + + # init processing tools + chunker = get_chunker() + vector_indices = {} + vertex_field = milvus_config.get("vertex_field", "vertex_id") + index_names = milvus_config.get( + "indexes", + ["Document", "DocumentChunk", "Entity", "Relationship"], + ) + for index_name in index_names: + vector_indices[graphname + "_" + index_name] = MilvusEmbeddingStore( + embedding_service, + host=milvus_config["host"], + port=milvus_config["port"], + support_ai_instance=True, + collection_name=graphname + "_" + index_name, + username=milvus_config.get("username", ""), + password=milvus_config.get("password", ""), + vector_field=milvus_config.get("vector_field", "document_vector"), + text_field=milvus_config.get("text_field", "document_content"), + vertex_field=vertex_field, + ) + + if doc_processing_config.get("extractor") == "llm": + extractor = GraphExtractor() + elif doc_processing_config.get("extractor") == "llm": + extractor = LLMEntityRelationshipExtractor(get_llm_service(llm_config)) + else: + raise ValueError("Invalid extractor type") + + if vertex_field is None: + raise ValueError( + "vertex_field is not defined. Ensure Milvus is enabled in the configuration." 
+ ) + + return chunker, vector_indices, extractor + + +async def run(graphname: str, conn: TigerGraphConnection): + """ + ecc flow + + initialize_eventual_consistency_checker + instantiates ecc object + writes checker to checker dict + runs ecc_obj.initialize() + + ECC.initialize + loops and calls fetch and process + + """ + + chunker, vector_indices, extractor = await init(graphname, conn) + + # process docs + + return f"hi from graph rag ecc: {conn.graphname} ({graphname})" diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py new file mode 100644 index 00000000..ae6fbcf7 --- /dev/null +++ b/eventual-consistency-service/app/graphrag/util.py @@ -0,0 +1,36 @@ +import base64 +from urllib.parse import quote_plus + +import httpx +from pyTigerGraph import TigerGraphConnection + +from common.logs.logwriter import LogWriter + + +async def install_query( + conn: TigerGraphConnection, query_name: str +) -> dict[str, httpx.Response | str | None]: + print("install --", query_name) + LogWriter.info(f"Installing query {query_name}") + with open(f"common/gsql/supportai/{query_name}.gsql", "r") as f: + query = f.read() + + query = f"""\ +USE GRAPH {conn.graphname} +{query} +INSTALL QUERY {query_name}""" + tkn = base64.b64encode(f"{conn.username}:{conn.password}".encode()).decode() + headers = {"Authorization": f"Basic {tkn}"} + + async with httpx.AsyncClient(timeout=None) as client: + res = await client.post( + conn.gsUrl + "/gsqlserver/gsql/file", + data=quote_plus(query.encode("utf-8")), + headers=headers, + ) + + if "error" in res.text.lower(): + LogWriter.error(res.text) + return {"result": None, "error": f"Failed to install query {query_name}"} + + return {"result": res, "error": False} diff --git a/eventual-consistency-service/app/graphrag/worker.py b/eventual-consistency-service/app/graphrag/worker.py new file mode 100644 index 00000000..4edd561a --- /dev/null +++ b/eventual-consistency-service/app/graphrag/worker.py @@ -0,0 +1,27 @@ +import asyncio + + +async def worker( + n: int, + task_queue: asyncio.Queue, +): + worker_name = f"worker-{n+1}" + worker_name += " " if n + 1 < 10 else "" + responses = [] + i = 0 + + while not task_queue.empty(): + # get the next task + func, args = await task_queue.get() + response = await func(*args) + + responses.append(response) + i += 1 + task_queue.task_done() + + # collate results + results = [] + for r in responses: + results.append(r) + + return results diff --git a/eventual-consistency-service/app/main.py b/eventual-consistency-service/app/main.py index 4ca26c2c..4c486bc0 100644 --- a/eventual-consistency-service/app/main.py +++ b/eventual-consistency-service/app/main.py @@ -1,54 +1,79 @@ +import asyncio +import json import logging -from typing import Annotated +from contextlib import asynccontextmanager +from threading import Thread +from typing import Annotated, Callable -from fastapi import Depends, FastAPI, BackgroundTasks +import graphrag +from eventual_consistency_checker import EventualConsistencyChecker +from fastapi import BackgroundTasks, Depends, FastAPI, Response, status from fastapi.security.http import HTTPBase from common.config import ( db_config, + doc_processing_config, embedding_service, get_llm_service, llm_config, milvus_config, security, - doc_processing_config, ) +from common.db.connections import elevate_db_connection_to_token from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore from common.logs.logwriter import LogWriter from common.metrics.tg_proxy 
import TigerGraphConnectionProxy -from common.db.connections import elevate_db_connection_to_token -from eventual_consistency_checker import EventualConsistencyChecker -import json -from threading import Thread +from common.py_schemas.schemas import SupportAIMethod logger = logging.getLogger(__name__) consistency_checkers = {} -app = FastAPI() -@app.on_event("startup") -def startup_event(): - if not db_config.get("enable_consistency_checker", True): - LogWriter.info("Eventual consistency checker disabled") - return +@asynccontextmanager +async def lifespan(_: FastAPI): + if not db_config.get("enable_consistency_checker", False): + LogWriter.info("Eventual Consistency Checker not run on startup") + + else: + startup_checkers = db_config.get("graph_names", []) + for graphname in startup_checkers: + conn = elevate_db_connection_to_token( + db_config["hostname"], + db_config["username"], + db_config["password"], + graphname, + ) + start_ecc_in_thread(graphname, conn) + yield + LogWriter.info("ECC Shutdown") + + +app = FastAPI(lifespan=lifespan) - startup_checkers = db_config.get("graph_names", []) - for graphname in startup_checkers: - conn = elevate_db_connection_to_token(db_config["hostname"], db_config["username"], db_config["password"], graphname) - start_ecc_in_thread(graphname, conn) def start_ecc_in_thread(graphname: str, conn: TigerGraphConnectionProxy): - thread = Thread(target=initialize_eventual_consistency_checker, args=(graphname, conn), daemon=True) + thread = Thread( + target=initialize_eventual_consistency_checker, + args=(graphname, conn), + daemon=True, + ) thread.start() LogWriter.info(f"Eventual consistency checker started for graph {graphname}") -def initialize_eventual_consistency_checker(graphname: str, conn: TigerGraphConnectionProxy): + +def initialize_eventual_consistency_checker( + graphname: str, conn: TigerGraphConnectionProxy +): if graphname in consistency_checkers: return consistency_checkers[graphname] try: - process_interval_seconds = milvus_config.get("process_interval_seconds", 1800) # default 30 minutes - cleanup_interval_seconds = milvus_config.get("cleanup_interval_seconds", 86400) # default 30 days, + process_interval_seconds = milvus_config.get( + "process_interval_seconds", 1800 + ) # default 30 minutes + cleanup_interval_seconds = milvus_config.get( + "cleanup_interval_seconds", 86400 + ) # default 30 days, batch_size = milvus_config.get("batch_size", 10) vector_indices = {} vertex_field = None @@ -70,7 +95,7 @@ def initialize_eventual_consistency_checker(graphname: str, conn: TigerGraphConn password=milvus_config.get("password", ""), vector_field=milvus_config.get("vector_field", "document_vector"), text_field=milvus_config.get("text_field", "document_content"), - vertex_field=vertex_field + vertex_field=vertex_field, ) if doc_processing_config.get("chunker") == "semantic": @@ -111,7 +136,9 @@ def initialize_eventual_consistency_checker(graphname: str, conn: TigerGraphConn raise ValueError("Invalid extractor type") if vertex_field is None: - raise ValueError("vertex_field is not defined. Ensure Milvus is enabled in the configuration.") + raise ValueError( + "vertex_field is not defined. Ensure Milvus is enabled in the configuration." 
+ ) checker = EventualConsistencyChecker( process_interval_seconds, @@ -124,7 +151,7 @@ def initialize_eventual_consistency_checker(graphname: str, conn: TigerGraphConn conn, chunker, extractor, - batch_size + batch_size, ) consistency_checkers[graphname] = checker @@ -138,22 +165,65 @@ def initialize_eventual_consistency_checker(graphname: str, conn: TigerGraphConn return checker except Exception as e: - LogWriter.error(f"Failed to start eventual consistency checker for graph {graphname}: {e}") + LogWriter.error( + f"Failed to start eventual consistency checker for graph {graphname}: {e}" + ) + + +def start_func_in_thread(f: Callable, *args, **kwargs): + thread = Thread( + target=f, + args=args, + kwargs=kwargs, + daemon=True, + ) + thread.start() + LogWriter.info(f'Thread started for function: "{f.__name__}"') + + +# def start_async_func(f: Callable, *args, **kwargs): +# asyncio.run(f(args, kwargs)) +# LogWriter.info(f'Thread started for function: "{f.__name__}"') + @app.get("/") def root(): LogWriter.info(f"Healthcheck") return {"status": "ok"} -@app.get("/{graphname}/consistency_status") -def consistency_status(graphname: str, credentials: Annotated[HTTPBase, Depends(security)]): - if graphname in consistency_checkers: - ecc = consistency_checkers[graphname] - status = json.dumps(ecc.get_status()) - else: - conn = elevate_db_connection_to_token(db_config["hostname"], credentials.username, credentials.password, graphname) - start_ecc_in_thread(graphname, conn) - status = f"Eventual consistency checker started for graph {graphname}" - LogWriter.info(f"Returning consistency status for {graphname}: {status}") - return status +@app.get("/{graphname}/consistency_status/{ecc_method}") +def consistency_status( + graphname: str, + ecc_method: str, + background: BackgroundTasks, + credentials: Annotated[HTTPBase, Depends(security)], + response: Response, +): + conn = elevate_db_connection_to_token( + db_config["hostname"], + credentials.username, + credentials.password, + graphname, + ) + match ecc_method: + case SupportAIMethod.SUPPORTAI: + if graphname in consistency_checkers: + ecc = consistency_checkers[graphname] + ecc_status = json.dumps(ecc.get_status()) + else: + start_ecc_in_thread(graphname, conn) + ecc_status = ( + f"Eventual consistency checker started for graph {graphname}" + ) + + LogWriter.info(f"Returning consistency status for {graphname}: {status}") + case SupportAIMethod.GRAPHRAG: + background.add_task(graphrag.run, graphname, conn) + # asyncio.run(graphrag.run(graphname, conn)) + ecc_status = f"hi from graph rag ecc: {conn.graphname} ({graphname})" + case _: + response.status_code = status.HTTP_404_NOT_FOUND + return f"Method unsupported, must be {SupportAIMethod.SUPPORTAI}, {SupportAIMethod.GRAPHRAG}" + + return ecc_status From 8e0ed554c8041c5cc9b070f51636a8c636275b2f Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 23 Jul 2024 17:30:08 -0400 Subject: [PATCH 02/91] save: docs handled concurrently -- writing upsert_edge --- docker-compose.yml | 154 +++++++++--------- .../app/graphrag/graph_rag.py | 95 +++++++---- .../app/graphrag/util.py | 151 ++++++++++++++++- .../app/graphrag/worker.py | 33 +++- eventual-consistency-service/app/main.py | 37 +---- 5 files changed, 309 insertions(+), 161 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 4da38a25..f0a80154 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,30 +1,30 @@ services: - copilot: - image: tigergraphml/copilot:latest - 
container_name: copilot - build: - context: . - dockerfile: copilot/Dockerfile - ports: - - 8000:8000 - depends_on: - - eventual-consistency-service - - chat-history - environment: - LLM_CONFIG: "/code/configs/llm_config.json" - DB_CONFIG: "/code/configs/db_config.json" - MILVUS_CONFIG: "/code/configs/milvus_config.json" - LOGLEVEL: "INFO" - USE_CYPHER: "true" - volumes: - - ./configs/:/code/configs - - ./common:/code/common - networks: - - copilot_local - +# copilot: +# image: tigergraphml/copilot:latest +# container_name: copilot +# build: +# context: . +# dockerfile: copilot/Dockerfile +# ports: +# - 8000:8000 +# depends_on: +# - eventual-consistency-service +# - chat-history +# environment: +# LLM_CONFIG: "/code/configs/llm_config.json" +# DB_CONFIG: "/code/configs/db_config.json" +# MILVUS_CONFIG: "/code/configs/milvus_config.json" +# LOGLEVEL: "INFO" +# USE_CYPHER: "true" +# volumes: +# - ./configs/:/code/configs +# - ./common:/code/common +# networks: +# - copilot_local +# eventual-consistency-service: image: tigergraphml/ecc:latest - container_name: eventual-consistency-service + # container_name: eventual-consistency-service build: context: . dockerfile: eventual-consistency-service/Dockerfile @@ -40,64 +40,64 @@ services: - ./common:/code/common networks: - copilot_local - - chat-history: - image: tigergraphml/chat-history:latest - container_name: chat-history - build: - context: chat-history/ - dockerfile: Dockerfile - ports: - - 8002:8002 - environment: - CONFIG: "/configs/config.json" - LOGLEVEL: "INFO" - volumes: - - ./chat-history/:/configs - networks: - - copilot_local - # report-service: - # image: tigergraphml/report-service:latest - # container_name: report-service +# + # chat-history: + # image: tigergraphml/chat-history:latest + # container_name: chat-history # build: - # context: . - # dockerfile: report-service/Dockerfile + # context: chat-history/ + # dockerfile: Dockerfile # ports: # - 8002:8002 # environment: - # LLM_CONFIG: "/code/configs/llm_config.json" - # DB_CONFIG: "/code/configs/db_config.json" - # MILVUS_CONFIG: "/code/configs/milvus_config.json" + # CONFIG: "/configs/config.json" # LOGLEVEL: "INFO" # volumes: - # - ./configs/:/code/configs - # - ./common:/code/common - # - ui: - image: tigergraphml/copilot-ui:latest - container_name: ui - build: - context: copilot-ui - dockerfile: Dockerfile - ports: - - 3000:3000 - depends_on: - - copilot - networks: - - copilot_local - - nginx: - container_name: nginx - image: nginx - volumes: - - ./nginx/nginx.conf:/etc/nginx/conf.d/default.conf - ports: - - "80:80" - depends_on: - - ui - - copilot - networks: - - copilot_local - + # - ./configs/:/configs + # networks: + # - copilot_local +# # report-service: +# # image: tigergraphml/report-service:latest +# # container_name: report-service +# # build: +# # context: . 
+# # dockerfile: report-service/Dockerfile +# # ports: +# # - 8002:8002 +# # environment: +# # LLM_CONFIG: "/code/configs/llm_config.json" +# # DB_CONFIG: "/code/configs/db_config.json" +# # MILVUS_CONFIG: "/code/configs/milvus_config.json" +# # LOGLEVEL: "INFO" +# # volumes: +# # - ./configs/:/code/configs +# # - ./common:/code/common +# # +# ui: +# image: tigergraphml/copilot-ui:latest +# container_name: ui +# build: +# context: copilot-ui +# dockerfile: Dockerfile +# ports: +# - 3000:3000 +# depends_on: +# - copilot +# networks: +# - copilot_local +# +# nginx: +# container_name: nginx +# image: nginx +# volumes: +# - ./nginx/nginx.conf:/etc/nginx/conf.d/default.conf +# ports: +# - "80:80" +# depends_on: +# - ui +# - copilot +# networks: +# - copilot_local +# networks: copilot_local: diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py index 637546d6..1477d9e0 100644 --- a/eventual-consistency-service/app/graphrag/graph_rag.py +++ b/eventual-consistency-service/app/graphrag/graph_rag.py @@ -1,14 +1,19 @@ import asyncio import logging -from graphrag.util import install_query +import ecc_util +from graphrag.util import install_query, stream_docs, upsert_chunk from graphrag.worker import worker from pyTigerGraph import TigerGraphConnection -from common.chunkers import character_chunker, regex_chunker, semantic_chunker from common.chunkers.base_chunker import BaseChunker -from common.config import (doc_processing_config, embedding_service, - get_llm_service, llm_config, milvus_config) +from common.config import ( + doc_processing_config, + embedding_service, + get_llm_service, + llm_config, + milvus_config, +) from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore from common.extractors import GraphExtractor, LLMEntityRelationshipExtractor from common.extractors.BaseExtractor import BaseExtractor @@ -17,28 +22,6 @@ consistency_checkers = {} -def get_chunker(): - if doc_processing_config.get("chunker") == "semantic": - chunker = semantic_chunker.SemanticChunker( - embedding_service, - doc_processing_config["chunker_config"].get("method", "percentile"), - doc_processing_config["chunker_config"].get("threshold", 0.95), - ) - elif doc_processing_config.get("chunker") == "regex": - chunker = regex_chunker.RegexChunker( - pattern=doc_processing_config["chunker_config"].get("pattern", "\\r?\\n") - ) - elif doc_processing_config.get("chunker") == "character": - chunker = character_chunker.CharacterChunker( - chunk_size=doc_processing_config["chunker_config"].get("chunk_size", 1024), - overlap_size=doc_processing_config["chunker_config"].get("overlap_size", 0), - ) - else: - raise ValueError("Invalid chunker type") - - return chunker - - async def install_queries( requried_queries: list[str], conn: TigerGraphConnection, n_workers=8 ): @@ -51,11 +34,10 @@ async def install_queries( # add queries to be installed into the queue tq = asyncio.Queue() for q in requried_queries: - if q not in installed_queries: + q_name = q.split("/")[-1] + if q_name not in installed_queries: tq.put_nowait((install_query, (conn, q))) - # break - print("starting workers") # start workers for n in range(min(tq.qsize(), n_workers)): task = loop.create_task(worker(n, tq)) @@ -65,23 +47,48 @@ async def install_queries( await tq.join() for t in tasks: print(t.result()) + # TODO: Check if anything had an error return "", "", "" +async def process_doc( + conn: TigerGraphConnection, doc: dict[str, str], sem: asyncio.Semaphore +): + # TODO: 
Embed document and chunks + chunker = ecc_util.get_chunker() + try: + print(">>>>>", doc["v_id"], len(doc["attributes"]["text"])) + # await asyncio.sleep(5) + chunks = chunker.chunk(doc["attributes"]["text"]) + v_id = doc["v_id"] + # TODO: n chunks at a time + for i, chunk in enumerate(chunks): + await upsert_chunk(conn, v_id, f"{v_id}_chunk_{i}", chunk) + # break # single chunk FIXME: delete + finally: + sem.release() + + return doc["v_id"] + + async def init( graphname: str, conn: TigerGraphConnection ) -> tuple[BaseChunker, dict[str, MilvusEmbeddingStore], BaseExtractor]: # install requried queries requried_queries = [ - "Scan_For_Updates", - "Update_Vertices_Processing_Status", - "ECC_Status", - "Check_Nonexistent_Vertices", + # "common/gsql/supportai/Scan_For_Updates", + # "common/gsql/supportai/Update_Vertices_Processing_Status", + # "common/gsql/supportai/ECC_Status", + # "common/gsql/supportai/Check_Nonexistent_Vertices", + "common/gsql/graphRAG/StreamDocIds", + "common/gsql/graphRAG/StreamDocContent", ] - await install_queries(requried_queries, conn) + # await install_queries(requried_queries, conn) + return await install_queries(requried_queries, conn) # init processing tools - chunker = get_chunker() + chunker = ecc_util.get_chunker() + vector_indices = {} vertex_field = milvus_config.get("vertex_field", "vertex_id") index_names = milvus_config.get( @@ -131,8 +138,26 @@ async def run(graphname: str, conn: TigerGraphConnection): """ + # init configurable objects chunker, vector_indices, extractor = await init(graphname, conn) # process docs + doc_workers = 48 # TODO: make configurable + doc_tasks = [] + doc_sem = asyncio.Semaphore(doc_workers) + + async with asyncio.TaskGroup() as tg: + async for content in stream_docs(conn): + # only n workers at a time -- held up by semaphore + print(">>>>>>>>>>>>>>>>>>>>>>>>\n", len(doc_tasks), "<<<<<<<<<") + await doc_sem.acquire() + task = tg.create_task(process_doc(conn, content, doc_sem)) + doc_tasks.append(task) + break + + # do something with doc_tasks + for t in doc_tasks: + print(t.result()) + print("DONE") return f"hi from graph rag ecc: {conn.graphname} ({graphname})" diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index ae6fbcf7..ce2efe52 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -1,4 +1,7 @@ import base64 +import json +import time +import traceback from urllib.parse import quote_plus import httpx @@ -7,14 +10,24 @@ from common.logs.logwriter import LogWriter +def make_headers(conn: TigerGraphConnection): + if conn.apiToken is None or conn.apiToken == "": + tkn = base64.b64encode(f"{conn.username}:{conn.password}".encode()).decode() + headers = {"Authorization": f"Basic {tkn}"} + else: + headers = {"Authorization": f"Bearer {conn.apiToken}"} + + return headers + + async def install_query( - conn: TigerGraphConnection, query_name: str + conn: TigerGraphConnection, query_path: str ) -> dict[str, httpx.Response | str | None]: - print("install --", query_name) - LogWriter.info(f"Installing query {query_name}") - with open(f"common/gsql/supportai/{query_name}.gsql", "r") as f: + LogWriter.info(f"Installing query {query_path}") + with open(f"{query_path}.gsql", "r") as f: query = f.read() + query_name = query_path.split("/")[-1] query = f"""\ USE GRAPH {conn.graphname} {query} @@ -31,6 +44,134 @@ async def install_query( if "error" in res.text.lower(): LogWriter.error(res.text) - return 
{"result": None, "error": f"Failed to install query {query_name}"} + return { + "result": None, + "error": True, + "message": f"Failed to install query {query_name}", + } return {"result": res, "error": False} + + +async def stream_doc_ids( + conn: TigerGraphConnection, current_batch: int, ttl_batches: int +) -> dict[str, str | list[str]]: + headers = make_headers(conn) + + try: + async with httpx.AsyncClient(timeout=None) as client: + res = await client.post( + f"{conn.restppUrl}/query/{conn.graphname}/StreamDocIds", + params={ + "current_batch": current_batch, + "ttl_batches": ttl_batches, + }, + headers=headers, + ) + ids = res.json()["results"][0]["@@doc_ids"] + return {"error": False, "ids": ids} + + except Exception as e: + exc = traceback.format_exc() + LogWriter.error( + f"/{conn.graphname}/query/StreamDocIds\nException Trace:\n{exc}" + ) + + return {"error": True, "message": str(e)} + + +async def stream_docs(conn: TigerGraphConnection, ttl_batches: int = 10): + headers = make_headers(conn) + for i in range(ttl_batches): + doc_ids = await stream_doc_ids(conn, i, ttl_batches) + if doc_ids["error"]: + print(doc_ids) + break # TODO: handle error + + print("*******") + print(doc_ids) + print("*******") + for d in doc_ids["ids"]: + async with httpx.AsyncClient(timeout=None) as client: + res = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/StreamDocContent/", + params={"doc": d}, + headers=headers, + ) + + # TODO: check for errors + yield res.json()["results"][0]["DocContent"][0] + return # single doc test FIXME: delete + # return # single batch test FIXME: delete + + +def map_attrs(attributes: dict): + # map attrs + attrs = {} + for k, v in attributes.items(): + if isinstance(v, tuple): + attrs[k] = {"value": v[0], "op": v[1]} + elif isinstance(v, dict): + attrs[k] = { + "value": {"keylist": list(v.keys()), "valuelist": list(v.values())} + } + else: + attrs[k] = {"value": v} + return attrs + + +async def upsert_vertex( + conn: TigerGraphConnection, + vertex_type: str, + vertex_id: str, + attributes: dict = None, +): + attrs = map_attrs(attributes) + data = json.dumps({"vertices": {vertex_type: {vertex_id: attrs}}}) + headers = make_headers(conn) + async with httpx.AsyncClient(timeout=None) as client: + res = await client.post( + f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers + ) + print(res) + +async def upsert_edge( + conn: TigerGraphConnection, + vertex_type: str, + vertex_id: str, + attributes: dict = None, +): + TODO + attrs = map_attrs(attributes) + data = json.dumps({"vertices": {vertex_type: {vertex_id: attrs}}}) + headers = make_headers(conn) + async with httpx.AsyncClient(timeout=None) as client: + res = await client.post( + f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers + ) + print(res) + +async def upsert_chunk(conn: TigerGraphConnection, doc_id, chunk_id, chunk): + date_added = int(time.time()) + await upsert_vertex( + conn, + "DocumentChunk", + chunk_id, + attributes={"epoch_added": date_added, "idx": int(chunk_id.split("_")[-1])}, + ) + await upsert_vertex( + conn, + "Content", + chunk_id, + attributes={"text": chunk, "epoch_added": date_added}, + ) + conn.upsertEdge("DocumentChunk", chunk_id, "HAS_CONTENT", "Content", chunk_id) + # self.conn.upsertEdge("Document", doc_id, "HAS_CHILD", "DocumentChunk", chunk_id) + # if int(chunk_id.split("_")[-1]) > 0: + # self.conn.upsertEdge( + # "DocumentChunk", + # chunk_id, + # "IS_AFTER", + # "DocumentChunk", + # doc_id + "_chunk_" + str(int(chunk_id.split("_")[-1]) - 
1), + # ) diff --git a/eventual-consistency-service/app/graphrag/worker.py b/eventual-consistency-service/app/graphrag/worker.py index 4edd561a..a2c7bbb6 100644 --- a/eventual-consistency-service/app/graphrag/worker.py +++ b/eventual-consistency-service/app/graphrag/worker.py @@ -1,27 +1,42 @@ import asyncio +# class Channel(asyncio.Queue): +# def __init__(self, maxsize=0): +# self.is_open = True +# super().__init__(maxsize) +# +# def close(self): +# self.is_open = False + + async def worker( n: int, task_queue: asyncio.Queue, ): + # init worker logging/reporting (TODO) worker_name = f"worker-{n+1}" worker_name += " " if n + 1 < 10 else "" - responses = [] - i = 0 + while task_queue.empty(): + print(f"{worker_name} waiting") + await asyncio.sleep(1) + + # consume task queue + print(f"{worker_name} started") + responses = [] while not task_queue.empty(): # get the next task func, args = await task_queue.get() + + # execute the task response = await func(*args) + # append task results to worker results/response responses.append(response) - i += 1 - task_queue.task_done() - # collate results - results = [] - for r in responses: - results.append(r) + # mark task as done + task_queue.task_done() - return results + print(f"{worker_name} done") + return responses diff --git a/eventual-consistency-service/app/main.py b/eventual-consistency-service/app/main.py index 4c486bc0..0277a272 100644 --- a/eventual-consistency-service/app/main.py +++ b/eventual-consistency-service/app/main.py @@ -1,10 +1,10 @@ -import asyncio import json import logging from contextlib import asynccontextmanager from threading import Thread from typing import Annotated, Callable +import ecc_util import graphrag from eventual_consistency_checker import EventualConsistencyChecker from fastapi import BackgroundTasks, Depends, FastAPI, Response, status @@ -98,35 +98,7 @@ def initialize_eventual_consistency_checker( vertex_field=vertex_field, ) - if doc_processing_config.get("chunker") == "semantic": - from common.chunkers.semantic_chunker import SemanticChunker - - chunker = SemanticChunker( - embedding_service, - doc_processing_config["chunker_config"].get("method", "percentile"), - doc_processing_config["chunker_config"].get("threshold", 0.95), - ) - elif doc_processing_config.get("chunker") == "regex": - from common.chunkers.regex_chunker import RegexChunker - - chunker = RegexChunker( - pattern=doc_processing_config["chunker_config"].get( - "pattern", "\\r?\\n" - ) - ) - elif doc_processing_config.get("chunker") == "character": - from common.chunkers.character_chunker import CharacterChunker - - chunker = CharacterChunker( - chunk_size=doc_processing_config["chunker_config"].get( - "chunk_size", 1024 - ), - overlap_size=doc_processing_config["chunker_config"].get( - "overlap_size", 0 - ), - ) - else: - raise ValueError("Invalid chunker type") + chunker = ecc_util.get_chunker() if doc_processing_config.get("extractor") == "llm": from common.extractors import LLMEntityRelationshipExtractor @@ -181,11 +153,6 @@ def start_func_in_thread(f: Callable, *args, **kwargs): LogWriter.info(f'Thread started for function: "{f.__name__}"') -# def start_async_func(f: Callable, *args, **kwargs): -# asyncio.run(f(args, kwargs)) -# LogWriter.info(f'Thread started for function: "{f.__name__}"') - - @app.get("/") def root(): LogWriter.info(f"Healthcheck") From ec299a27f84121f9e8e85666847dc517f80e2291 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 23 Jul 2024 17:39:04 -0400 
Subject: [PATCH 03/91] save: docs handled concurrently -- writing upsert_edge

---
 common/gsql/graphRAG/StreamDocContent.gsql    |   5 +
 common/gsql/graphRAG/StreamDocIds.gsql        |  10 +
 .../louvain/louvain_1_first_pass.gsql         | 176 ++++++++++++++++++
 eventual-consistency-service/app/ecc_util.py  |  24 +++
 4 files changed, 215 insertions(+)
 create mode 100644 common/gsql/graphRAG/StreamDocContent.gsql
 create mode 100644 common/gsql/graphRAG/StreamDocIds.gsql
 create mode 100644 common/gsql/graphRAG/louvain/louvain_1_first_pass.gsql
 create mode 100644 eventual-consistency-service/app/ecc_util.py

diff --git a/common/gsql/graphRAG/StreamDocContent.gsql b/common/gsql/graphRAG/StreamDocContent.gsql
new file mode 100644
index 00000000..fb7338b7
--- /dev/null
+++ b/common/gsql/graphRAG/StreamDocContent.gsql
@@ -0,0 +1,5 @@
+CREATE QUERY StreamDocContent(Vertex<Document> doc) {
+    Doc = {doc};
+    DocContent = SELECT c FROM Doc:d -(HAS_CONTENT)-> Content:c;
+    PRINT DocContent;
+}
diff --git a/common/gsql/graphRAG/StreamDocIds.gsql b/common/gsql/graphRAG/StreamDocIds.gsql
new file mode 100644
index 00000000..fb373490
--- /dev/null
+++ b/common/gsql/graphRAG/StreamDocIds.gsql
@@ -0,0 +1,10 @@
+CREATE QUERY StreamDocIds(INT current_batch, INT ttl_batches) {
+    ListAccum<STRING> @@doc_ids;
+    Docs = {Document.*};
+
+    Docs = SELECT d FROM Docs:d
+           WHERE vertex_to_int(d) % ttl_batches == current_batch
+           ACCUM @@doc_ids += d.id;
+
+    PRINT @@doc_ids;
+}
diff --git a/common/gsql/graphRAG/louvain/louvain_1_first_pass.gsql b/common/gsql/graphRAG/louvain/louvain_1_first_pass.gsql
new file mode 100644
index 00000000..4ca06029
--- /dev/null
+++ b/common/gsql/graphRAG/louvain/louvain_1_first_pass.gsql
@@ -0,0 +1,176 @@
+CREATE OR REPLACE DISTRIBUTED QUERY graphRAG_louvain_1(
+    UINT max_hop = 10,
+    UINT batch_num = 12,
+    UINT sample_edge_num = 100
+) FOR GRAPH {graph_name} SYNTAX v1 {
+
+    TYPEDEF TUPLE<DOUBLE delta_Q_add, VERTEX community, STRING ext_vid> MyTuple; --> this should be Community, I think
+    SumAccum<DOUBLE> @@m; // the sum of the weights of all the links in the network
+    MinAccum<VERTEX> @{community_id_attribute_name}; // the community ID of the node
+    MinAccum<STRING> @community_vid; // the community ID of the node
+    SumAccum<DOUBLE> @k; // the sum of the weights of the links incident to the node
+    SumAccum<DOUBLE> @k_in; // the sum of the weights of the links inside the previous community of the node
+    SumAccum<DOUBLE> @k_self_loop; // the weight of the self-loop link
+    MapAccum<VERTEX, SumAccum<DOUBLE>> @community_k_in_map; // the community of the neighbors of the nodes -> the sum of the weights of the links inside the community
+    MapAccum<VERTEX, SumAccum<DOUBLE>> @@community_sum_total_map; // community ID C -> the sum of the weights of the links incident to nodes in C
+    SumAccum<DOUBLE> @community_sum_total; // the sum of the weights of the links incident to nodes in the community of the node
+    MapAccum<VERTEX, SumAccum<DOUBLE>> @@community_sum_in_map; // community ID -> the sum of the weights of the links inside the community
+    MapAccum<VERTEX, MapAccum<VERTEX, SumAccum<DOUBLE>>> @@source_target_k_in_map; // source community ID -> (target community ID -> the sum of the weights of the links from the source community to the target community)
+    SumAccum<DOUBLE> @delta_Q_remove; // delta Q to remove the node from the previous community
+    MaxAccum<MyTuple> @best_move; // best move of the node with the highest delta Q to move the isolated node into the new community
+    MaxAccum<DOUBLE> @@min_double; // used to reset the @best_move
+    SumAccum<INT> @@move_cnt;
+    OrAccum @to_change_community;
+    SumAccum<INT> @batch_id;
+    SumAccum<INT> @vid;
+
+    DOUBLE wt = 1.0;
+
+    // Initialization
+    All_Nodes = {{{entity_vertex_name}.*}};
+    
All_Nodes = SELECT s FROM All_Nodes:s -({relation_edge_name}:e)- :t
+        ACCUM @@m += wt / 2,
+              s.@k += wt,
+              IF s == t THEN // self-loop link
+                  s.@k_self_loop += wt
+              END
+        POST-ACCUM
+            s.@{community_id_attribute_name} = s,
+            s.@community_vid = to_string(s.id),
+            s.@vid = getvid(s),
+            s.@batch_id = s.@vid % batch_num;
+
+    IF @@m < 0.00000000001 THEN
+        PRINT "Warning: the sum of the weights in the edges should be greater than zero!";
+        RETURN;
+    END;
+
+    // Local moving
+    INT hop = 0;
+    Candidates = All_Nodes;
+    WHILE Candidates.size() > 0 AND hop < max_hop DO
+        hop = hop + 1;
+        LOG(TRUE, hop);
+        IF hop == 1 THEN // first iteration
+            ChangedNodes = SELECT s FROM Candidates:s -({relation_edge_name}:e)- :t
+                           WHERE s.@{community_id_attribute_name} != t.@{community_id_attribute_name}
+                           ACCUM s.@best_move += MyTuple(1 - s.@k * t.@k / (2 * @@m), t.@{community_id_attribute_name}, t.@community_vid)
+                           POST-ACCUM
+                               IF s.@best_move.delta_Q_add > 0 THEN // the gain (delta Q) is positive
+                                   s.@to_change_community = TRUE
+                               END
+                           HAVING s.@to_change_community == TRUE;
+
+        ELSE // remaining iterations
+            // Calculate sum_total
+            Tmp = SELECT s FROM All_Nodes:s
+                  POST-ACCUM
+                      @@community_sum_total_map += (s.@{community_id_attribute_name} -> s.@k);
+            Tmp = SELECT s FROM All_Nodes:s
+                  POST-ACCUM
+                      s.@community_sum_total = @@community_sum_total_map.get(s.@{community_id_attribute_name});
+
+            @@community_sum_total_map.clear();
+            // Find the best move
+            ChangedNodes = {{}};
+            FOREACH batch_id IN RANGE[0, batch_num-1] DO
+                LOG(TRUE, batch_id);
+                // Calculate the delta Q to remove the node from the previous community
+                Nodes = SELECT s FROM Candidates:s -({relation_edge_name}:e)- :t
+                        WHERE s.@batch_id == batch_id
+                        ACCUM
+                            IF s.@{community_id_attribute_name} == t.@{community_id_attribute_name} THEN
+                                s.@k_in += wt
+                            ELSE
+                                s.@community_k_in_map += (t.@{community_id_attribute_name} -> wt)
+                            END
+                        POST-ACCUM
+                            s.@delta_Q_remove = 2 * s.@k_self_loop - 2 * s.@k_in + s.@k * (s.@community_sum_total - s.@k) / @@m,
+                            s.@k_in = 0,
+                            s.@best_move = MyTuple(@@min_double, s, to_string(s.id)) // reset the delta_Q_add
+                        ;
+
+                // Find the best move
+                Nodes = SELECT s FROM Nodes:s -({relation_edge_name}:e)- :t
+                        //SAMPLE sample_edge_num EDGE WHEN s.outdegree("{relation_edge_name}") > sample_edge_num
+                        WHERE s.@{community_id_attribute_name} != t.@{community_id_attribute_name}
+                        ACCUM DOUBLE delta_Q_add = 2 * s.@community_k_in_map.get(t.@{community_id_attribute_name}) - s.@k * t.@community_sum_total / @@m,
+                              s.@best_move += MyTuple(delta_Q_add, t.@{community_id_attribute_name}, t.@community_vid)
+                        POST-ACCUM
+                            IF s.@delta_Q_remove + s.@best_move.delta_Q_add > 0 THEN // the gain (delta Q) is positive
+                                s.@to_change_community = TRUE
+                            END,
+                            s.@community_k_in_map.clear()
+                        HAVING s.@to_change_community == TRUE;
+
+                ChangedNodes = ChangedNodes UNION Nodes;
+            END;
+        END;
+        // If two nodes swap, only change the community of one of them
+        SwapNodes = SELECT s FROM ChangedNodes:s -({relation_edge_name}:e)- :t
+                    WHERE s.@best_move.community == t.@{community_id_attribute_name}
+                        AND t.@to_change_community == TRUE
+                        AND t.@best_move.community == s.@{community_id_attribute_name}
+                        // only change the one with larger delta Q or the one with smaller @vid if delta Q are the same
+                        AND (
+                            s.@delta_Q_remove + s.@best_move.delta_Q_add < t.@delta_Q_remove + t.@best_move.delta_Q_add
+                            OR (
+                                abs((s.@delta_Q_remove + s.@best_move.delta_Q_add) - (t.@delta_Q_remove + t.@best_move.delta_Q_add)) < 0.00000000001
+                                AND s.@vid > t.@vid
+                            )
+                        )
+                    POST-ACCUM
+                        
s.@to_change_community = FALSE; + + ChangedNodes = ChangedNodes MINUS SwapNodes; + + // Place each node of ChangedNodes in the community in which the gain is maximum + ChangedNodes = SELECT s FROM ChangedNodes:s + POST-ACCUM + s.@{community_id_attribute_name} = s.@best_move.community, + s.@community_vid = s.@best_move.ext_vid, + s.@to_change_community = FALSE; + + @@move_cnt += ChangedNodes.size(); + + // Get all neighbours of the changed node that do not belong to the node’s new community + Candidates = SELECT t FROM ChangedNodes:s -({relation_edge_name}:e)- :t + WHERE t.@{community_id_attribute_name} != s.@{community_id_attribute_name}; + END; + + PRINT @@move_cnt AS Delta; + + // Coarsening + UINT new_layer = 0; + @@community_sum_total_map.clear(); + Tmp = + SELECT s FROM All_Nodes:s -({relation_edge_name}:e)- :t + ACCUM + IF s.@{community_id_attribute_name} == t.@{community_id_attribute_name} THEN + @@community_sum_in_map += (s.@{community_id_attribute_name} -> wt) + END + POST-ACCUM + //f_belongs_to.println(s.id, s.@{community_id_attribute_name}, new_layer), + INSERT INTO {belongs_to_edge_name} VALUES (s, str_to_int(s.@community_vid), new_layer), + IF @@community_sum_in_map.containsKey(s) THEN + //f_links_to.println(s.id, s.id, @@community_sum_in_map.get(s), new_layer) + INSERT INTO {links_to_edge_name} VALUES (s,s, (new_layer -> @@community_sum_in_map.get(s))) + END; + + @@community_sum_in_map.clear(); + + Tmp = SELECT s FROM All_Nodes:s -({relation_edge_name}:e)- :t + ACCUM + IF s.@{community_id_attribute_name} != t.@{community_id_attribute_name} THEN + @@source_target_k_in_map += (s.@{community_id_attribute_name} -> (t.@{community_id_attribute_name} -> wt)) + END + POST-ACCUM + IF @@source_target_k_in_map.containsKey(s) THEN + FOREACH (target_community, k_in) IN @@source_target_k_in_map.get(s) DO + //f_links_to.println(s.id, target_community, k_in, new_layer) + INSERT INTO {links_to_edge_name} VALUES (s,target_community, (new_layer -> k_in)) + END + END; + + @@source_target_k_in_map.clear(); +} diff --git a/eventual-consistency-service/app/ecc_util.py b/eventual-consistency-service/app/ecc_util.py new file mode 100644 index 00000000..5656e219 --- /dev/null +++ b/eventual-consistency-service/app/ecc_util.py @@ -0,0 +1,24 @@ +from common.chunkers import character_chunker, regex_chunker, semantic_chunker +from common.config import doc_processing_config, embedding_service + + +def get_chunker(): + if doc_processing_config.get("chunker") == "semantic": + chunker = semantic_chunker.SemanticChunker( + embedding_service, + doc_processing_config["chunker_config"].get("method", "percentile"), + doc_processing_config["chunker_config"].get("threshold", 0.95), + ) + elif doc_processing_config.get("chunker") == "regex": + chunker = regex_chunker.RegexChunker( + pattern=doc_processing_config["chunker_config"].get("pattern", "\\r?\\n") + ) + elif doc_processing_config.get("chunker") == "character": + chunker = character_chunker.CharacterChunker( + chunk_size=doc_processing_config["chunker_config"].get("chunk_size", 1024), + overlap_size=doc_processing_config["chunker_config"].get("overlap_size", 0), + ) + else: + raise ValueError("Invalid chunker type") + + return chunker From fce72c43c73aa425d859b8120bf5ccb94e6c995f Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Wed, 24 Jul 2024 15:24:47 -0400 Subject: [PATCH 04/91] changing queues for channels --- .../app/graphrag/graph_rag.py | 154 +++++++++++------- .../app/graphrag/util.py | 99 
++++++++--- .../app/graphrag/worker.py | 11 +- eventual-consistency-service/requirements.txt | 1 + 4 files changed, 173 insertions(+), 92 deletions(-) diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py index 1477d9e0..0b5265b1 100644 --- a/eventual-consistency-service/app/graphrag/graph_rag.py +++ b/eventual-consistency-service/app/graphrag/graph_rag.py @@ -1,9 +1,10 @@ import asyncio import logging +import time import ecc_util -from graphrag.util import install_query, stream_docs, upsert_chunk -from graphrag.worker import worker +from aiochannel import Channel +from graphrag.util import chunk_doc, install_query, stream_docs from pyTigerGraph import TigerGraphConnection from common.chunkers.base_chunker import BaseChunker @@ -25,52 +26,25 @@ async def install_queries( requried_queries: list[str], conn: TigerGraphConnection, n_workers=8 ): - loop = asyncio.get_event_loop() - tasks: list[asyncio.Task] = [] - # queries that are currently installed installed_queries = [q.split("/")[-1] for q in conn.getEndpoints(dynamic=True)] - # add queries to be installed into the queue - tq = asyncio.Queue() - for q in requried_queries: - q_name = q.split("/")[-1] - if q_name not in installed_queries: - tq.put_nowait((install_query, (conn, q))) - - # start workers - for n in range(min(tq.qsize(), n_workers)): - task = loop.create_task(worker(n, tq)) - tasks.append(task) + tasks = [] + async with asyncio.TaskGroup() as grp: + for q in requried_queries: + async with asyncio.Semaphore(n_workers): + q_name = q.split("/")[-1] + # if the query is not installed, install it + if q_name not in installed_queries: + task = grp.create_task(install_query(conn, q)) + tasks.append(task) - # wait for workers to finish jobs - await tq.join() for t in tasks: print(t.result()) # TODO: Check if anything had an error return "", "", "" -async def process_doc( - conn: TigerGraphConnection, doc: dict[str, str], sem: asyncio.Semaphore -): - # TODO: Embed document and chunks - chunker = ecc_util.get_chunker() - try: - print(">>>>>", doc["v_id"], len(doc["attributes"]["text"])) - # await asyncio.sleep(5) - chunks = chunker.chunk(doc["attributes"]["text"]) - v_id = doc["v_id"] - # TODO: n chunks at a time - for i, chunk in enumerate(chunks): - await upsert_chunk(conn, v_id, f"{v_id}_chunk_{i}", chunk) - # break # single chunk FIXME: delete - finally: - sem.release() - - return doc["v_id"] - - async def init( graphname: str, conn: TigerGraphConnection ) -> tuple[BaseChunker, dict[str, MilvusEmbeddingStore], BaseExtractor]: @@ -124,6 +98,62 @@ async def init( return chunker, vector_indices, extractor +async def process_docs( + conn: TigerGraphConnection, + docs_chan: Channel, + embed_q: Channel, + chunk_q: Channel, +): + doc_tasks = [] + async with asyncio.TaskGroup() as grp: + async for content in stream_docs(conn): + # only n workers at a time -- held up by semaphore size + async with asyncio.Semaphore(doc_workers): + task = grp.create_task(chunk_doc(conn, content, chunk_q, embed_q)) + doc_tasks.append(task) + break # single doc FIXME: delete + + # do something with doc_tasks? 
+ for t in doc_tasks: + print(t.result()) + + +async def embed(embed_q: Channel): + pass + + +async def upsert(upsert_q: Channel): + """ + queue expects: + (func, args) <- q.get() + """ + while upsert_q.empty(): + await asyncio.sleep(1) + + # consume task queue + print("upsert started") + responses = [] + while not upsert_q.empty(): + # get the next task + func, args = await upsert_q.get() + + # execute the task + response = await func(*args) + + # append task results to worker results/response + responses.append(response) + + # mark task as done + upsert_q.task_done() + + print(f"upsert done") + return responses + + +async def extract(extract_q: Channel): + pass + + async def run(graphname: str, conn: TigerGraphConnection): """ ecc flow @@ -139,25 +169,33 @@ async def run(graphname: str, conn: TigerGraphConnection): """ # init configurable objects - chunker, vector_indices, extractor = await init(graphname, conn) - - # process docs - doc_workers = 48 # TODO: make configurable - doc_tasks = [] - doc_sem = asyncio.Semaphore(doc_workers) - - async with asyncio.TaskGroup() as tg: - async for content in stream_docs(conn): - # only n workers at a time -- held up by semaphore - print(">>>>>>>>>>>>>>>>>>>>>>>>\n", len(doc_tasks), "<<<<<<<<<") - await doc_sem.acquire() - task = tg.create_task(process_doc(conn, content, doc_sem)) - doc_tasks.append(task) - break - - # do something with doc_tasks - for t in doc_tasks: - print(t.result()) + await init(graphname, conn) + # return + start = time.perf_counter() + + # TODO: make configurable + tasks = [] + docs_chan = Channel(48) # process n chunks at a time max + chunk_chan = Channel(100) # process 100 chunks at a time max + embed_chan = Channel(100) + upsert_chan = Channel(100) + async with asyncio.TaskGroup() as grp: + # get docs + t = grp.create_task(stream_docs(conn, docs_chan,10)) + tasks.append(t) + # process docs + t = grp.create_task(process_docs(conn, docs_chan, embed_chan, chunk_chan)) + tasks.append(t) + # embed + t = grp.create_task(embed(conn, doc_workers, embed_chan, chunk_chan)) + tasks.append(t) + # upsert chunks + t = grp.create_task(upsert(conn, doc_workers, embed_chan, chunk_chan)) + tasks.append(t) + # extract entities + t = grp.create_task(extract(conn, doc_workers, embed_chan, chunk_chan)) + tasks.append(t) + end = time.perf_counter() print("DONE") - return f"hi from graph rag ecc: {conn.graphname} ({graphname})" + print(end - start) diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index ce2efe52..c18ec86a 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -5,6 +5,8 @@ from urllib.parse import quote_plus import httpx +from aiochannel import Channel +from app import ecc_util from pyTigerGraph import TigerGraphConnection from common.logs.logwriter import LogWriter @@ -80,7 +82,11 @@ async def stream_doc_ids( return {"error": True, "message": str(e)} -async def stream_docs(conn: TigerGraphConnection, ttl_batches: int = 10): +async def stream_docs( + conn: TigerGraphConnection, + docs_chan: Channel, + ttl_batches: int = 10, +): headers = make_headers(conn) for i in range(ttl_batches): doc_ids = await stream_doc_ids(conn, i, ttl_batches) @@ -88,9 +94,6 @@ async def stream_docs(conn: TigerGraphConnection, ttl_batches: int = 10): print(doc_ids) break # TODO: handle error - print("*******") - print(doc_ids) - print("*******") for d in doc_ids["ids"]: async with httpx.AsyncClient(timeout=None) as client: 
res = await client.get( @@ -98,13 +101,35 @@ async def stream_docs(conn: TigerGraphConnection, ttl_batches: int = 10): params={"doc": d}, headers=headers, ) - # TODO: check for errors - yield res.json()["results"][0]["DocContent"][0] - return # single doc test FIXME: delete + # this will block and wait if the channel is full + await docs_chan.put(res.json()["results"][0]["DocContent"][0]) + # return # single doc test FIXME: delete # return # single batch test FIXME: delete +async def chunk_doc( + conn: TigerGraphConnection, + doc: dict[str, str], + chunk_chan: Channel, + embed_chan: Channel, +): + # TODO: Embed document and chunks + chunker = ecc_util.get_chunker() + chunks = chunker.chunk(doc["attributes"]["text"]) + v_id = doc["v_id"] + # TODO: n chunks at a time + for i, chunk in enumerate(chunks): + # send chunks to be upserted (func, args) + await chunk_chan.put((upsert_chunk, (conn, v_id, f"{v_id}_chunk_{i}", chunk))) + + # send chunks to be embedded + + # break # single chunk FIXME: delete + + return doc["v_id"] + + def map_attrs(attributes: dict): # map attrs attrs = {} @@ -124,7 +149,7 @@ async def upsert_vertex( conn: TigerGraphConnection, vertex_type: str, vertex_id: str, - attributes: dict = None, + attributes: dict, ): attrs = map_attrs(attributes) data = json.dumps({"vertices": {vertex_type: {vertex_id: attrs}}}) @@ -133,23 +158,44 @@ async def upsert_vertex( res = await client.post( f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers ) - print(res) + print(res.json()) + async def upsert_edge( conn: TigerGraphConnection, - vertex_type: str, - vertex_id: str, + src_v_type: str, + src_v_id: str, + edge_type: str, + tgt_v_type: str, + tgt_v_id: str, attributes: dict = None, ): - TODO - attrs = map_attrs(attributes) - data = json.dumps({"vertices": {vertex_type: {vertex_id: attrs}}}) + if attributes is None: + attrs = {} + else: + attrs = map_attrs(attributes) + data = json.dumps( + { + "edges": { + src_v_type: { + src_v_id: { + edge_type: { + tgt_v_type: { + tgt_v_id: attrs, + } + } + }, + } + } + } + ) headers = make_headers(conn) async with httpx.AsyncClient(timeout=None) as client: res = await client.post( f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers ) - print(res) + print(res.json()) + async def upsert_chunk(conn: TigerGraphConnection, doc_id, chunk_id, chunk): date_added = int(time.time()) @@ -165,13 +211,16 @@ async def upsert_chunk(conn: TigerGraphConnection, doc_id, chunk_id, chunk): chunk_id, attributes={"text": chunk, "epoch_added": date_added}, ) - conn.upsertEdge("DocumentChunk", chunk_id, "HAS_CONTENT", "Content", chunk_id) - # self.conn.upsertEdge("Document", doc_id, "HAS_CHILD", "DocumentChunk", chunk_id) - # if int(chunk_id.split("_")[-1]) > 0: - # self.conn.upsertEdge( - # "DocumentChunk", - # chunk_id, - # "IS_AFTER", - # "DocumentChunk", - # doc_id + "_chunk_" + str(int(chunk_id.split("_")[-1]) - 1), - # ) + await upsert_edge( + conn, "DocumentChunk", chunk_id, "HAS_CONTENT", "Content", chunk_id + ) + await upsert_edge(conn, "Document", doc_id, "HAS_CHILD", "DocumentChunk", chunk_id) + if int(chunk_id.split("_")[-1]) > 0: + await upsert_edge( + conn, + "DocumentChunk", + chunk_id, + "IS_AFTER", + "DocumentChunk", + doc_id + "_chunk_" + str(int(chunk_id.split("_")[-1]) - 1), + ) diff --git a/eventual-consistency-service/app/graphrag/worker.py b/eventual-consistency-service/app/graphrag/worker.py index a2c7bbb6..40720deb 100644 --- a/eventual-consistency-service/app/graphrag/worker.py +++ 
b/eventual-consistency-service/app/graphrag/worker.py @@ -1,18 +1,11 @@ import asyncio - -# class Channel(asyncio.Queue): -# def __init__(self, maxsize=0): -# self.is_open = True -# super().__init__(maxsize) -# -# def close(self): -# self.is_open = False +from aiochannel import Channel async def worker( n: int, - task_queue: asyncio.Queue, + task_queue: Channel, ): # init worker logging/reporting (TODO) worker_name = f"worker-{n+1}" diff --git a/eventual-consistency-service/requirements.txt b/eventual-consistency-service/requirements.txt index 90cc7f2c..3bc0dae0 100644 --- a/eventual-consistency-service/requirements.txt +++ b/eventual-consistency-service/requirements.txt @@ -1,3 +1,4 @@ +aiochannel==1.2.1 aiohttp==3.9.3 aiosignal==1.3.1 annotated-types==0.5.0 From 46d73dc039ef005c4680c525c2e417225f1d2951 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Wed, 24 Jul 2024 16:45:18 -0400 Subject: [PATCH 05/91] graphrag etl with channels --- copilot/docs/notebooks/graphrag.ipynb | 154 +++++++++++------- .../app/graphrag/graph_rag.py | 125 ++++++++++---- .../app/graphrag/util.py | 35 +++- 3 files changed, 207 insertions(+), 107 deletions(-) diff --git a/copilot/docs/notebooks/graphrag.ipynb b/copilot/docs/notebooks/graphrag.ipynb index 3b1200af..57ea4b48 100644 --- a/copilot/docs/notebooks/graphrag.ipynb +++ b/copilot/docs/notebooks/graphrag.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 8, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -16,51 +16,70 @@ "username = os.getenv(\"USERNAME\", \"tigergraph\")\n", "password = os.getenv(\"PASS\", \"tigergraph\")\n", "conn = TigerGraphConnection(\n", - " host=host, username=username, password=password, graphname=\"GraphRAG_pytgdocs\"\n", - ")\n", - "\n", - "conn.getToken()\n", - "\n", - "# And then add CoPilot's address to the connection. This address\n", - "# is the host's address where the CoPilot container is running.\n", - "conn.ai.configureCoPilotHost(\"http://localhost:8000\")" + " host=host,\n", + " username=username,\n", + " password=password,\n", + ")" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'The graph GraphRAG_pytgdocs is created.'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "conn.graphname = \"GraphRAG_pytgdocs\"\n", - "# conn.gsql(\"\"\"CREATE GRAPH pyTigerGraphRAG()\"\"\")" + "conn.gsql(\"\"\"CREATE GRAPH GraphRAG_pytgdocs()\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "_ = conn.getToken()" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'host_name': 'https://algotesting.i.tgcloud.io',\n", - " 'schema_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_schema].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. 
To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_schema\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 0)\\\\nTrying to add local vertex \\'DocumentChunk\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Document\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Concept\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Entity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Relationship\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'DocumentCollection\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Content\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Community\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'ResolvedEntity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CONTENT\\' and its reverse edge \\'reverse_HAS_CONTENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_CHILD_OF\\' and its reverse edge \\'reverse_IS_CHILD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_HEAD_OF\\' and its reverse edge \\'reverse_IS_HEAD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_TAIL\\' and its reverse edge \\'reverse_HAS_TAIL\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_RELATIONSHIP\\' and its reverse edge \\'reverse_DESCRIBES_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_ENTITY\\' and its reverse edge \\'reverse_DESCRIBES_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_ENTITY\\' and its reverse edge \\'reverse_CONTAINS_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'MENTIONS_RELATIONSHIP\\' and its reverse edge \\'reverse_MENTIONS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_AFTER\\' and its reverse edge \\'reverse_IS_AFTER\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CHILD\\' and its reverse edge \\'reverse_HAS_CHILD\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_RELATIONSHIP\\' and its reverse edge \\'reverse_HAS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_DOCUMENT\\' and its reverse edge \\'reverse_CONTAINS_DOCUMENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'KNN\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVES_TO\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVED_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IN_COMMUNITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 1\\\\nThe job add_supportai_schema completes in 2.829 seconds!\\\\nLocal schema change succeeded.\"',\n", - " 'index_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_indexes].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. 
To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_indexes\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 1)\\\\nTrying to add index \\'doc_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_indexepoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 2\\\\nThe job add_supportai_indexes completes in 2.002 seconds!\\\\nLocal schema change succeeded.\"'}" + " 'schema_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_schema].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. 
To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_schema\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 0)\\\\nTrying to add local vertex \\'DocumentChunk\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Document\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Concept\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Entity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Relationship\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'DocumentCollection\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Content\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Community\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'ResolvedEntity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CONTENT\\' and its reverse edge \\'reverse_HAS_CONTENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_CHILD_OF\\' and its reverse edge \\'reverse_IS_CHILD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_HEAD_OF\\' and its reverse edge \\'reverse_IS_HEAD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_TAIL\\' and its reverse edge \\'reverse_HAS_TAIL\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_RELATIONSHIP\\' and its reverse edge \\'reverse_DESCRIBES_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_ENTITY\\' and its reverse edge \\'reverse_DESCRIBES_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_ENTITY\\' and its reverse edge \\'reverse_CONTAINS_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'MENTIONS_RELATIONSHIP\\' and its reverse edge \\'reverse_MENTIONS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_AFTER\\' and its reverse edge \\'reverse_IS_AFTER\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CHILD\\' and its reverse edge \\'reverse_HAS_CHILD\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_RELATIONSHIP\\' and its reverse edge \\'reverse_HAS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_DOCUMENT\\' and its reverse edge \\'reverse_CONTAINS_DOCUMENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'KNN\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVES_TO\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVED_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IN_COMMUNITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 1\\\\nThe job add_supportai_schema completes in 2.434 seconds!\\\\nLocal schema change succeeded.\"',\n", + " 'index_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_indexes].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. 
To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_indexes\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 1)\\\\nTrying to add index \\'doc_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_indexepoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 2\\\\nThe job add_supportai_indexes completes in 1.932 seconds!\\\\nLocal schema change succeeded.\"'}" ] }, - "execution_count": 10, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "# And then add CoPilot's address to the connection. 
This address\n", + "# is the host's address where the CoPilot container is running.\n", + "conn.ai.configureCoPilotHost(\"http://localhost:8000\")\n", "conn.ai.initializeSupportAI()" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -76,18 +95,18 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'job_name': 'load_documents_content_json_75b43aab4f714888b2be3f30441e745a',\n", - " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_75b43aab4f714888b2be3f30441e745a.stream.SupportAI_GraphRAG_pytgdocs_f0e175af264a4a18b1aa3bf8f4063d0e.1721674044503',\n", - " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_75b43aab4f714888b2be3f30441e745a.stream.SupportAI_GraphRAG_pytgdocs_f0e175af264a4a18b1aa3bf8f4063d0e.1721674044503'}" + "{'job_name': 'load_documents_content_json_203b064024e3499ea41b876cc67a85cf',\n", + " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_203b064024e3499ea41b876cc67a85cf.stream.SupportAI_GraphRAG_pytgdocs_5b098715edbd4c878f7425918eb553c0.1721853566538',\n", + " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_203b064024e3499ea41b876cc67a85cf.stream.SupportAI_GraphRAG_pytgdocs_5b098715edbd4c878f7425918eb553c0.1721853566538'}" ] }, - "execution_count": 12, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -102,42 +121,67 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "import httpx\n", - "import base64\n", - "\n", - "# conn.ai.forceConsistencyUpdate()\n", - "# url = self.nlqs_host+\"/\"+self.conn.graphname+\"/supportai/forceupdate\"\n", - "# return self.conn._req(\"GET\", url, authMode=\"pwd\", resKey=None)\n", - "httpx.get(f\"http://localhost:8000/{conn.graphname}/supportai/forceupdate\")" + "asdf" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 16, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "content='Hello! How can I assist you today?' 
response_metadata={'token_usage': {'completion_tokens': 9, 'prompt_tokens': 8, 'total_tokens': 17}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None} id='run-2a50fab6-62fc-433c-98b4-221346ca41c6-0' usage_metadata={'input_tokens': 8, 'output_tokens': 9, 'total_tokens': 17}\n" - ] - }, { "data": { "text/plain": [ - "Joke(setup='Why was the cat sitting on the computer?', punchline='To keep an eye on the mouse!')" + "{'job_name': 'load_documents_content_json_203b064024e3499ea41b876cc67a85cf',\n", + " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_203b064024e3499ea41b876cc67a85cf.stream.SupportAI_GraphRAG_pytgdocs_5b098715edbd4c878f7425918eb553c0.1721853623658',\n", + " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_203b064024e3499ea41b876cc67a85cf.stream.SupportAI_GraphRAG_pytgdocs_5b098715edbd4c878f7425918eb553c0.1721853623658'}" ] }, - "execution_count": 13, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], + "source": [ + "for v in [\"Document\", \"Content\", \"DocumentChunk\"]:\n", + " try:\n", + " conn.delVertices(v)\n", + " except:\n", + " pass\n", + "\n", + "import time\n", + "time.sleep(3)\n", + "conn.ai.runDocumentIngest(\n", + " res[\"load_job_id\"],\n", + " res[\"data_source_id\"],\n", + " \"s3://tg-documentation/pytg_current/pytg_current.jsonl\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import httpx\n", + "import base64\n", + "\n", + "# conn.ai.forceConsistencyUpdate()\n", + "# url = self.nlqs_host+\"/\"+self.conn.graphname+\"/supportai/forceupdate\"\n", + "# return self.conn._req(\"GET\", url, authMode=\"pwd\", resKey=None)\n", + "httpx.get(f\"http://localhost:8000/{conn.graphname}/supportai/forceupdate\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from langchain_core.pydantic_v1 import BaseModel, Field\n", "from langchain_openai import ChatOpenAI\n", @@ -149,15 +193,14 @@ "\n", "\n", "model = ChatOpenAI(model=\"gpt-3.5-turbo-0125\", temperature=0)\n", - "print(model.invoke('hi'))\n", + "print(model.invoke(\"hi\"))\n", "structured_llm = model.with_structured_output(Joke)\n", - "structured_llm.invoke(\"Tell me a joke about cats\")\n", - "\n" + "structured_llm.invoke(\"Tell me a joke about cats\")" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -165,12 +208,14 @@ "from langchain_experimental.graph_transformers import LLMGraphTransformer\n", "from langchain_openai import ChatOpenAI\n", "import os\n", + "\n", "# from langchain_core.pydantic_v1 import BaseModel\n", "from pydantic import BaseModel\n", "\n", "\n", "class AnswerWithJustification(BaseModel):\n", " \"\"\"An answer to the user question along with justification for the answer.\"\"\"\n", + "\n", " answer: str\n", " justification: str\n", "\n", @@ -181,6 +226,7 @@ "# sllm = llm.with_structured_output(AnswerWithJustification)\n", "# print(sllm.invoke(\"What weighs more a pound of bricks or a pound of feathers\"))\n", "\n", + "\n", "class GraphExtractor:\n", " def __init__(self):\n", " self.transformer = LLMGraphTransformer(\n", @@ -197,25 +243,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "id='Marie Curie' type='Person' 
properties={'description': 'A Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity.'}\n", - "id='Pierre Curie' type='Person' properties={'description': 'Husband of Marie Curie and co-winner of her first Nobel Prize.'}\n", - "id='University Of Paris' type='Institution' properties={'description': 'The institution where Marie Curie became the first woman professor in 1906.'}\n", - "id='Nobel Prize' type='Award' properties={'description': 'An award won by Marie Curie, first woman to win it and first person to win it twice.'}\n", - "source=Node(id='Marie Curie', type='Person') target=Node(id='Pierre Curie', type='Person') type='HUSBAND' properties={'description': \"Marie Curie's husband and co-winner of her first Nobel Prize.\"}\n", - "source=Node(id='Marie Curie', type='Person') target=Node(id='Nobel Prize', type='Award') type='WINNER' properties={'description': 'First woman to win a Nobel Prize.'}\n", - "source=Node(id='Marie Curie', type='Person') target=Node(id='Nobel Prize', type='Award') type='WINNER' properties={'description': 'First person to win a Nobel Prize twice.'}\n", - "source=Node(id='Marie Curie', type='Person') target=Node(id='Nobel Prize', type='Award') type='WINNER' properties={'description': 'Only person to win a Nobel Prize in two scientific fields.'}\n", - "source=Node(id='Marie Curie', type='Person') target=Node(id='University Of Paris', type='Institution') type='PROFESSOR' properties={'description': 'First woman to become a professor at the University of Paris in 1906.'}\n" - ] - } - ], + "outputs": [], "source": [ "text = \"\"\"\n", "Marie Curie, born in 1867, was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity.\n", diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py index 0b5265b1..96a591bc 100644 --- a/eventual-consistency-service/app/graphrag/graph_rag.py +++ b/eventual-consistency-service/app/graphrag/graph_rag.py @@ -98,60 +98,111 @@ async def init( return chunker, vector_indices, extractor -async def process_docs( +async def chunk_docs( conn: TigerGraphConnection, docs_chan: Channel, - embed_q: Channel, - chunk_q: Channel, + embed_chan: Channel, + upsert_chan: Channel, + extract_chan: Channel, ): + """ + Creates and starts one worker for each document + in the docs channel. + """ doc_tasks = [] async with asyncio.TaskGroup() as grp: - async for content in stream_docs(conn): - # only n workers at a time -- held up by semaphore size - async with asyncio.Semaphore(doc_workers): - task = grp.create_task(chunk_doc(conn, content, chunk_q, embed_q)) - doc_tasks.append(task) - break # single doc FIXME: delete + async for content in docs_chan: + await embed_chan.put(content) # send the document to be embedded + task = grp.create_task( + chunk_doc(conn, content, upsert_chan, embed_chan, extract_chan) + ) + doc_tasks.append(task) + # break # single doc FIXME: delete # do something with doc_tasks? 
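# --- editor's note: illustrative sketch only, not part of this patch. ---
# The pattern chunk_docs relies on here: a single producer puts items onto an
# aiochannel Channel and closes it when finished; consumers drain the channel
# with `async for`, which exits once the channel is closed and empty. This is
# a minimal, self-contained version of that idiom, assuming only the aiochannel
# API already used in this diff (Channel, put, close, async iteration); the
# function and variable names below are hypothetical.
import asyncio
from aiochannel import Channel

async def producer(chan: Channel):
    for i in range(5):
        await chan.put(i)       # blocks when the channel is full (backpressure)
    chan.close()                # the sole sender closes the channel when done

async def consumer(chan: Channel):
    async for item in chan:     # ends when the channel is closed and drained
        print("got", item)

async def main():
    chan = Channel(2)           # small buffer to make the backpressure visible
    async with asyncio.TaskGroup() as grp:
        grp.create_task(producer(chan))
        grp.create_task(consumer(chan))

if __name__ == "__main__":
    asyncio.run(main())
# --- end editor's note ---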
for t in doc_tasks: print(t.result()) + # FIXME: don't close these there, other functions will send to them + upsert_chan.close() + embed_chan.close() -async def embed(embed_q: Channel): - pass + # close the extract chan -- chunk_doc is the only sender + # and chunk_doc calls are kicked off from here (this is technically the sender) + extract_chan.close() -async def upsert(upsert_q: Channel): +async def upsert(upsert_chan: Channel): """ + Creates and starts one worker for each upsert job queue expects: (func, args) <- q.get() """ - while upsert_q.empty(): - await asyncio.sleep(1) # consume task queue - print("upsert started") + upsert_tasks = [] + async with asyncio.TaskGroup() as grp: + async for func, args in upsert_chan: + # print("func name >>>>>", func.__name__, args) + # grp.create_task(todo()) + # continue + + # execute the task + t = grp.create_task(func(*args)) + upsert_tasks.append(t) + + print(f"upsert done") + # do something with doc_tasks? + for t in upsert_tasks: + print(t.result()) + + +async def embed(embed_chan: Channel): + """ + Creates and starts one worker for each embed job + """ + + # consume task queue responses = [] - while not upsert_q.empty(): - # get the next task - func, args = await upsert_q.get() + async with asyncio.TaskGroup() as grp: + async for item in embed_chan: + print("embed item>>>>>", type(item)) + grp.create_task(todo()) + continue + # execute the task + # response = await func(*args) - # execute the task - response = await func(*args) + # append task results to worker results/response + # responses.append(response) - # append task results to worker results/response - responses.append(response) + print(f"embed done") + return responses - # mark task as done - upsert_q.task_done() - print(f"upsert done") +async def extract(extract_chan: Channel): + """ + Creates and starts one worker for each extract job + """ + + # consume task queue + responses = [] + async with asyncio.TaskGroup() as grp: + async for item in extract_chan: + print("extract item>>>>>", type(item)) + grp.create_task(todo()) + continue + # execute the task + # response = await func(*args) + + # append task results to worker results/response + # responses.append(response) + + print(f"embed done") return responses -async def extract(extract_q: Channel): - pass +async def todo(): + await asyncio.sleep(1) async def run(graphname: str, conn: TigerGraphConnection): @@ -175,25 +226,27 @@ async def run(graphname: str, conn: TigerGraphConnection): # TODO: make configurable tasks = [] - docs_chan = Channel(48) # process n chunks at a time max - chunk_chan = Channel(100) # process 100 chunks at a time max + docs_chan = Channel(15) # process n chunks at a time max embed_chan = Channel(100) upsert_chan = Channel(100) + extract_chan = Channel(100) async with asyncio.TaskGroup() as grp: # get docs - t = grp.create_task(stream_docs(conn, docs_chan,10)) + t = grp.create_task(stream_docs(conn, docs_chan, 10)) tasks.append(t) # process docs - t = grp.create_task(process_docs(conn, docs_chan, embed_chan, chunk_chan)) - tasks.append(t) - # embed - t = grp.create_task(embed(conn, doc_workers, embed_chan, chunk_chan)) + t = grp.create_task( + chunk_docs(conn, docs_chan, embed_chan, upsert_chan, extract_chan) + ) tasks.append(t) # upsert chunks - t = grp.create_task(upsert(conn, doc_workers, embed_chan, chunk_chan)) + t = grp.create_task(upsert(upsert_chan)) + tasks.append(t) + # # embed + t = grp.create_task(embed(embed_chan)) tasks.append(t) # extract entities - t = grp.create_task(extract(conn, 
doc_workers, embed_chan, chunk_chan)) + t = grp.create_task(extract(extract_chan)) tasks.append(t) end = time.perf_counter() diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index c18ec86a..cfb84e5a 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -4,9 +4,9 @@ import traceback from urllib.parse import quote_plus +import ecc_util import httpx from aiochannel import Channel -from app import ecc_util from pyTigerGraph import TigerGraphConnection from common.logs.logwriter import LogWriter @@ -87,13 +87,18 @@ async def stream_docs( docs_chan: Channel, ttl_batches: int = 10, ): + """ + Streams the document contents into the docs_chan + """ headers = make_headers(conn) for i in range(ttl_batches): doc_ids = await stream_doc_ids(conn, i, ttl_batches) if doc_ids["error"]: - print(doc_ids) break # TODO: handle error + print("********") + print(doc_ids) + print("********") for d in doc_ids["ids"]: async with httpx.AsyncClient(timeout=None) as client: res = await client.get( @@ -104,26 +109,38 @@ async def stream_docs( # TODO: check for errors # this will block and wait if the channel is full await docs_chan.put(res.json()["results"][0]["DocContent"][0]) - # return # single doc test FIXME: delete - # return # single batch test FIXME: delete + # break # single doc test FIXME: delete + # break # single batch test FIXME: delete + + # close the docs chan -- this function is the only sender + docs_chan.close() async def chunk_doc( conn: TigerGraphConnection, doc: dict[str, str], - chunk_chan: Channel, + upsert_chan: Channel, embed_chan: Channel, + extract_chan: Channel, ): - # TODO: Embed document and chunks + """ + Chunks a document. 
+ Places the resulting chunks into the upsert channel (to be upserted to TG) + and the embed channel (to be embedded and written to the vector store) + """ chunker = ecc_util.get_chunker() chunks = chunker.chunk(doc["attributes"]["text"]) v_id = doc["v_id"] # TODO: n chunks at a time for i, chunk in enumerate(chunks): # send chunks to be upserted (func, args) - await chunk_chan.put((upsert_chunk, (conn, v_id, f"{v_id}_chunk_{i}", chunk))) + await upsert_chan.put((upsert_chunk, (conn, v_id, f"{v_id}_chunk_{i}", chunk))) # send chunks to be embedded + await embed_chan.put(chunk) + + # send chunks to have entities extracted + await extract_chan.put(chunk) # break # single chunk FIXME: delete @@ -158,7 +175,7 @@ async def upsert_vertex( res = await client.post( f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers ) - print(res.json()) + print("upsert vertex>>>", res.json()) async def upsert_edge( @@ -194,7 +211,7 @@ async def upsert_edge( res = await client.post( f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers ) - print(res.json()) + print("upsert edge >>>", res.json()) async def upsert_chunk(conn: TigerGraphConnection, doc_id, chunk_id, chunk): From 7501a37b400eff0334c11aa8adc264ded66d66ca Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Mon, 29 Jul 2024 16:31:24 -0400 Subject: [PATCH 06/91] pytg in 175 seconds --- common/config.py | 2 +- common/embeddings/embedding_services.py | 37 ++- common/embeddings/milvus_embedding_store.py | 87 +++++- common/extractors/BaseExtractor.py | 13 +- common/extractors/GraphExtractor.py | 50 ++++ .../LLMEntityRelationshipExtractor.py | 35 ++- common/gsql/graphRAG/StreamDocContent.gsql | 5 +- common/gsql/graphRAG/StreamDocIds.gsql | 7 +- common/gsql/supportai/SupportAI_Schema.gsql | 12 +- common/logs/logwriter.py | 2 +- common/py_schemas/tool_io_schemas.py | 2 +- .../app/graphrag/graph_rag.py | 252 ++++++++---------- .../app/graphrag/util.py | 240 ++++++++--------- .../app/graphrag/worker.py | 35 --- .../app/graphrag/workers.py | 226 ++++++++++++++++ eventual-consistency-service/app/main.py | 3 +- 16 files changed, 668 insertions(+), 340 deletions(-) delete mode 100644 eventual-consistency-service/app/graphrag/worker.py create mode 100644 eventual-consistency-service/app/graphrag/workers.py diff --git a/common/config.py b/common/config.py index 2546e38a..ec72455d 100644 --- a/common/config.py +++ b/common/config.py @@ -167,7 +167,7 @@ def get_llm_service(llm_config) -> LLM_Model: doc_processing_config = { "chunker": "semantic", "chunker_config": {"method": "percentile", "threshold": 0.95}, - "extractor": "llm", + "extractor": "graphrag", "extractor_config": {}, } elif DOC_PROCESSING_CONFIG.endswith(".json"): diff --git a/common/embeddings/embedding_services.py b/common/embeddings/embedding_services.py index c76bf46d..dd506670 100644 --- a/common/embeddings/embedding_services.py +++ b/common/embeddings/embedding_services.py @@ -1,11 +1,13 @@ +import logging import os +import time from typing import List + from langchain.schema.embeddings import Embeddings -import logging -import time + from common.logs.log import req_id_cv -from common.metrics.prometheus_metrics import metrics from common.logs.logwriter import LogWriter +from common.metrics.prometheus_metrics import metrics logger = logging.getLogger(__name__) @@ -87,6 +89,33 @@ def embed_query(self, question: str) -> List[float]: duration ) + async def aembed_query(self, question: str) -> List[float]: + """Embed 
Query Async. + Embed a string. + + Args: + question (str): + A string to embed. + """ + # start_time = time.time() + # metrics.llm_inprogress_requests.labels(self.model_name).inc() + + # try: + logger.debug_pii(f"aembed_query() embedding question={question}") + query_embedding = await self.embeddings.aembed_query(question) + # metrics.llm_success_response_total.labels(self.model_name).inc() + return query_embedding + # except Exception as e: + # # metrics.llm_query_error_total.labels(self.model_name).inc() + # raise e + # finally: + # metrics.llm_request_total.labels(self.model_name).inc() + # metrics.llm_inprogress_requests.labels(self.model_name).dec() + # duration = time.time() - start_time + # metrics.llm_request_duration_seconds.labels(self.model_name).observe( + # duration + # ) + class AzureOpenAI_Ada002(EmbeddingModel): """Azure OpenAI Ada-002 Embedding Model""" @@ -124,8 +153,8 @@ class AWS_Bedrock_Embedding(EmbeddingModel): """AWS Bedrock Embedding Model""" def __init__(self, config): - from langchain_community.embeddings import BedrockEmbeddings import boto3 + from langchain_community.embeddings import BedrockEmbeddings super().__init__(config=config, model_name=config["embedding_model"]) diff --git a/common/embeddings/milvus_embedding_store.py b/common/embeddings/milvus_embedding_store.py index 8a52d05f..ac9c5389 100644 --- a/common/embeddings/milvus_embedding_store.py +++ b/common/embeddings/milvus_embedding_store.py @@ -1,18 +1,17 @@ import logging +import traceback from time import sleep, time from typing import Iterable, List, Optional, Tuple from langchain_community.vectorstores import Milvus from langchain_core.documents.base import Document -from pymilvus import connections, utility -from pymilvus.exceptions import MilvusException +from pymilvus import MilvusException, connections, utility from common.embeddings.base_embedding_store import EmbeddingStore from common.embeddings.embedding_services import EmbeddingModel from common.logs.log import req_id_cv -from common.metrics.prometheus_metrics import metrics from common.logs.logwriter import LogWriter -from pymilvus import MilvusException +from common.metrics.prometheus_metrics import metrics logger = logging.getLogger(__name__) @@ -77,7 +76,7 @@ def connect_to_milvus(self): while retry_attempt < self.max_retry_attempts: try: connections.connect(**self.milvus_connection) - metrics.milvus_active_connections.labels(self.collection_name).inc + # metrics.milvus_active_connections.labels(self.collection_name).inc LogWriter.info( f"""Initializing Milvus with host={self.milvus_connection.get("host", self.milvus_connection.get("uri", "unknown host"))}, port={self.milvus_connection.get('port', 'unknown')}, username={self.milvus_connection.get('user', 'unknown')}, collection={self.collection_name}""" @@ -213,6 +212,76 @@ def add_embeddings( error_message = f"An error occurred while registering document: {str(e)}" LogWriter.error(error_message) + async def aadd_embeddings( + self, + embeddings: Iterable[Tuple[str, List[float]]], + metadatas: List[dict] = None, + ): + """Async Add Embeddings. + Add embeddings to the Embedding store. + Args: + embeddings (Iterable[Tuple[str, List[float]]]): + Iterable of content and embedding of the document. + metadatas (List[Dict]): + List of dictionaries containing the metadata for each document. + The embeddings and metadatas list need to have identical indexing. 
+ """ + try: + if metadatas is None: + metadatas = [] + + # add fields required by Milvus if they do not exist + if self.support_ai_instance: + for metadata in metadatas: + if self.vertex_field not in metadata: + metadata[self.vertex_field] = "" + else: + for metadata in metadatas: + if "seq_num" not in metadata: + metadata["seq_num"] = 1 + if "source" not in metadata: + metadata["source"] = "" + + LogWriter.info( + f"request_id={req_id_cv.get()} Milvus ENTRY aadd_embeddings()" + ) + texts = [text for text, _ in embeddings] + + # operation_type = "add_texts" + # metrics.milvus_query_total.labels( + # self.collection_name, operation_type + # ).inc() + # start_time = time() + + added = await self.milvus.aadd_texts(texts=texts, metadatas=metadatas) + + # duration = time() - start_time + # metrics.milvus_query_duration_seconds.labels( + # self.collection_name, operation_type + # ).observe(duration) + + LogWriter.info( + f"request_id={req_id_cv.get()} Milvus EXIT aadd_embeddings()" + ) + + # Check if registration was successful + if added: + success_message = f"Document registered with id: {added[0]}" + LogWriter.info(success_message) + return success_message + else: + error_message = f"Failed to register document {added}" + LogWriter.error(error_message) + raise Exception(error_message) + + except Exception as e: + error_message = f"An error occurred while registering document:{metadatas} ({len(texts)},{len(metadatas)})\nErr: {str(e)}" + LogWriter.error(error_message) + exc = traceback.format_exc() + LogWriter.error(exc) + LogWriter.error(f"{texts}") + raise e + def get_pks( self, expr: str, @@ -506,11 +575,11 @@ def query(self, expr: str, output_fields: List[str]): return None try: - query_result = self.milvus.col.query( - expr=expr, output_fields=output_fields - ) + query_result = self.milvus.col.query(expr=expr, output_fields=output_fields) except MilvusException as exc: - LogWriter.error(f"Failed to get outputs: {self.milvus.collection_name} error: {exc}") + LogWriter.error( + f"Failed to get outputs: {self.milvus.collection_name} error: {exc}" + ) raise exc return query_result diff --git a/common/extractors/BaseExtractor.py b/common/extractors/BaseExtractor.py index 3f1ec92b..e8638665 100644 --- a/common/extractors/BaseExtractor.py +++ b/common/extractors/BaseExtractor.py @@ -1,6 +1,13 @@ -class BaseExtractor: - def __init__(): +from abc import ABC, abstractmethod + +from langchain_community.graphs.graph_document import GraphDocument + + +class BaseExtractor(ABC): + @abstractmethod + def extract(self, text:str): pass - def extract(self, text): + @abstractmethod + async def aextract(self, text:str) -> list[GraphDocument]: pass diff --git a/common/extractors/GraphExtractor.py b/common/extractors/GraphExtractor.py index c8f24355..282729a4 100644 --- a/common/extractors/GraphExtractor.py +++ b/common/extractors/GraphExtractor.py @@ -16,6 +16,56 @@ def __init__(self): ) def extract(self, text) -> list[GraphDocument]: + """ + returns a list of GraphDocument: + Each doc is: + nodes=[ + Node( + id='Marie Curie', + type='Person', + properties={ + 'description': 'A Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity.' + } + ), + ... + ], + relationships=[ + Relationship( + source=Node(id='Marie Curie', type='Person'), + target=Node(id='Pierre Curie', type='Person'), + type='SPOUSE' + ), + ... 
+ ] + """ doc = Document(page_content=text) graph_docs = self.transformer.convert_to_graph_documents([doc]) + translated_docs = self.translate(graph_docs) + return translated_docs + + async def aextract(self, text:str) -> list[GraphDocument]: + """ + returns a list of GraphDocument: + Each doc is: + nodes=[ + Node( + id='Marie Curie', + type='Person', + properties={ + 'description': 'A Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity.' + } + ), + ... + ], + relationships=[ + Relationship( + source=Node(id='Marie Curie', type='Person'), + target=Node(id='Pierre Curie', type='Person'), + type='SPOUSE' + ), + ... + ] + """ + doc = Document(page_content=text) + graph_docs = await self.transformer.aconvert_to_graph_documents([doc]) return graph_docs diff --git a/common/extractors/LLMEntityRelationshipExtractor.py b/common/extractors/LLMEntityRelationshipExtractor.py index d5a0a970..415c3235 100644 --- a/common/extractors/LLMEntityRelationshipExtractor.py +++ b/common/extractors/LLMEntityRelationshipExtractor.py @@ -1,8 +1,9 @@ -from common.llm_services import LLM_Model +import json +from typing import List + from common.extractors.BaseExtractor import BaseExtractor +from common.llm_services import LLM_Model from common.py_schemas import KnowledgeGraph -from typing import List -import json class LLMEntityRelationshipExtractor(BaseExtractor): @@ -19,6 +20,34 @@ def __init__( self.strict_mode = strict_mode def _extract_kg_from_doc(self, doc, chain, parser): + """ + returns: + { + "nodes": [ + { + "id": "str", + "type": "string", + "definition": "string" + } + ], + "rels": [ + { + "source":{ + "id": "str", + "type": "string", + "definition": "string" + } + "target":{ + "id": "str", + "type": "string", + "definition": "string" + } + "definition" + } + ] + } + """ + try: out = chain.invoke( {"input": doc, "format_instructions": parser.get_format_instructions()} diff --git a/common/gsql/graphRAG/StreamDocContent.gsql b/common/gsql/graphRAG/StreamDocContent.gsql index fb7338b7..87f12566 100644 --- a/common/gsql/graphRAG/StreamDocContent.gsql +++ b/common/gsql/graphRAG/StreamDocContent.gsql @@ -1,5 +1,6 @@ -CREATE QUERY StreamDocContent(Vertex doc) { +CREATE DISTRIBUTED QUERY StreamDocContent(Vertex doc) { Doc = {doc}; - DocContent = SELECT c FROM Doc:d -(HAS_CONTENT)-> Content:c; + DocContent = SELECT c FROM Doc:d -(HAS_CONTENT)-> Content:c + POST-ACCUM d.epoch_processed = datetime_to_epoch(now()); PRINT DocContent; } diff --git a/common/gsql/graphRAG/StreamDocIds.gsql b/common/gsql/graphRAG/StreamDocIds.gsql index fb373490..d5ec982e 100644 --- a/common/gsql/graphRAG/StreamDocIds.gsql +++ b/common/gsql/graphRAG/StreamDocIds.gsql @@ -1,10 +1,13 @@ -CREATE QUERY StreamDocIds(INT current_batch, INT ttl_batches) { +CREATE DISTRIBUTED QUERY StreamDocIds(INT current_batch, INT ttl_batches) { ListAccum @@doc_ids; Docs = {Document.*}; Docs = SELECT d FROM Docs:d WHERE vertex_to_int(d) % ttl_batches == current_batch - ACCUM @@doc_ids += d.id; + AND d.epoch_processed == 0 + AND d.epoch_processing == 0 + ACCUM @@doc_ids += d.id + POST-ACCUM d.epoch_processing = datetime_to_epoch(now()); PRINT @@doc_ids; } diff --git a/common/gsql/supportai/SupportAI_Schema.gsql b/common/gsql/supportai/SupportAI_Schema.gsql index 0998affe..0e3cf6c3 100644 --- a/common/gsql/supportai/SupportAI_Schema.gsql +++ b/common/gsql/supportai/SupportAI_Schema.gsql @@ -2,7 +2,7 @@ CREATE SCHEMA_CHANGE JOB add_supportai_schema { ADD VERTEX DocumentChunk(PRIMARY_ID id STRING, idx INT, 
epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX Document(PRIMARY_ID id STRING, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX Concept(PRIMARY_ID id STRING, description STRING, concept_type STRING, human_curated BOOL, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; - ADD VERTEX Entity(PRIMARY_ID id STRING, definition STRING, description STRING, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; + ADD VERTEX Entity(PRIMARY_ID id STRING, definition STRING, description STRING, entity_type STRING, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX Relationship(PRIMARY_ID id STRING, definition STRING, short_name STRING, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX DocumentCollection(PRIMARY_ID id STRING, epoch_added UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX Content(PRIMARY_ID id STRING, text STRING, epoch_added UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; @@ -21,12 +21,12 @@ CREATE SCHEMA_CHANGE JOB add_supportai_schema { // GraphRAG ADD VERTEX Community(PRIMARY_ID id STRING, description INT) WITH PRIMARY_ID_AS_ATTRIBUTE="true"; - ADD VERTEX ResolvedEntity(PRIMARY_ID id STRING, description STRING) WITH PRIMARY_ID_AS_ATTRIBUTE="true"; + ADD VERTEX ResolvedEntity(PRIMARY_ID id STRING, description STRING, entity_type STRING) WITH PRIMARY_ID_AS_ATTRIBUTE="true"; - ADD DIRECTED EDGE KNN(FROM Entity, TO Entity); // TODO: check where knn algo writes results - ADD DIRECTED EDGE RESOLVES_TO(FROM Entity, TO ResolvedEntity); // Connect ResolvedEntities with their children entities - ADD DIRECTED EDGE RESOLVED_RELATIONSHIP(FROM ResolvedEntity, TO ResolvedEntity, weight UINT); // store edges between entities after they're resolved - ADD DIRECTED EDGE IN_COMMUNITY(FROM ResolvedEntity, TO Community); + ADD DIRECTED EDGE RELATIONSHIP(FROM Entity, TO Entity, relation_type STRING) WITH REVERSE_EDGE="reverse_RELATIONSHIP"; // TODO: check where knn algo writes results + ADD DIRECTED EDGE RESOLVES_TO(FROM Entity, TO ResolvedEntity, relation_type STRING) WITH REVERSE_EDGE="reverse_RESOLVES_TO"; // Connect ResolvedEntities with their children entities + ADD DIRECTED EDGE RESOLVED_RELATIONSHIP(FROM ResolvedEntity, TO ResolvedEntity) WITH REVERSE_EDGE="reverse_RESOLVED_RELATIONSHIP"; // store edges between entities after they're resolved + ADD DIRECTED EDGE IN_COMMUNITY(FROM ResolvedEntity, TO Community) WITH REVERSE_EDGE="reverse_IN_COMMUNITY"; // TODO: louvain will be run on resolved entities, but stored in community then on communities until louvain runs out // Hierarchical communities (Louvain/Leiden) diff --git a/common/logs/logwriter.py b/common/logs/logwriter.py index ff13feed..f75be00c 100644 --- a/common/logs/logwriter.py +++ b/common/logs/logwriter.py @@ -142,7 +142,7 @@ def log(level, message, mask_pii=True, **kwargs): LogWriter.general_logger.info(message) @staticmethod - def info(message, mask_pii=True, **kwargs): + def info(message, mask_pii=False, **kwargs): LogWriter.log("info", message, mask_pii, 
**kwargs) @staticmethod diff --git a/common/py_schemas/tool_io_schemas.py b/common/py_schemas/tool_io_schemas.py index 1fe16de4..1ea6ed3e 100644 --- a/common/py_schemas/tool_io_schemas.py +++ b/common/py_schemas/tool_io_schemas.py @@ -91,4 +91,4 @@ class ReportSection(BaseModel): questions: List[ReportQuestion] = Field("List of questions and reasoning for the section") class ReportSections(BaseModel): - sections: List[ReportSection] = Field("List of sections for the report") \ No newline at end of file + sections: List[ReportSection] = Field("List of sections for the report") diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py index 96a591bc..e248510c 100644 --- a/eventual-consistency-service/app/graphrag/graph_rag.py +++ b/eventual-consistency-service/app/graphrag/graph_rag.py @@ -2,100 +2,59 @@ import logging import time -import ecc_util +import httpx from aiochannel import Channel -from graphrag.util import chunk_doc, install_query, stream_docs +from graphrag import workers +from graphrag.util import init, make_headers, stream_doc_ids,http_timeout from pyTigerGraph import TigerGraphConnection -from common.chunkers.base_chunker import BaseChunker -from common.config import ( - doc_processing_config, - embedding_service, - get_llm_service, - llm_config, - milvus_config, -) +from common.config import embedding_service from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore -from common.extractors import GraphExtractor, LLMEntityRelationshipExtractor from common.extractors.BaseExtractor import BaseExtractor +http_logs = logging.getLogger("httpx") +http_logs.setLevel(logging.WARNING) logger = logging.getLogger(__name__) + consistency_checkers = {} -async def install_queries( - requried_queries: list[str], conn: TigerGraphConnection, n_workers=8 +async def stream_docs( + conn: TigerGraphConnection, + docs_chan: Channel, + ttl_batches: int = 10, ): - # queries that are currently installed - installed_queries = [q.split("/")[-1] for q in conn.getEndpoints(dynamic=True)] - - tasks = [] - async with asyncio.TaskGroup() as grp: - for q in requried_queries: - async with asyncio.Semaphore(n_workers): - q_name = q.split("/")[-1] - # if the query is not installed, install it - if q_name not in installed_queries: - task = grp.create_task(install_query(conn, q)) - tasks.append(task) - - for t in tasks: - print(t.result()) - # TODO: Check if anything had an error - return "", "", "" - - -async def init( - graphname: str, conn: TigerGraphConnection -) -> tuple[BaseChunker, dict[str, MilvusEmbeddingStore], BaseExtractor]: - # install requried queries - requried_queries = [ - # "common/gsql/supportai/Scan_For_Updates", - # "common/gsql/supportai/Update_Vertices_Processing_Status", - # "common/gsql/supportai/ECC_Status", - # "common/gsql/supportai/Check_Nonexistent_Vertices", - "common/gsql/graphRAG/StreamDocIds", - "common/gsql/graphRAG/StreamDocContent", - ] - # await install_queries(requried_queries, conn) - return await install_queries(requried_queries, conn) - - # init processing tools - chunker = ecc_util.get_chunker() - - vector_indices = {} - vertex_field = milvus_config.get("vertex_field", "vertex_id") - index_names = milvus_config.get( - "indexes", - ["Document", "DocumentChunk", "Entity", "Relationship"], - ) - for index_name in index_names: - vector_indices[graphname + "_" + index_name] = MilvusEmbeddingStore( - embedding_service, - host=milvus_config["host"], - port=milvus_config["port"], - 
support_ai_instance=True, - collection_name=graphname + "_" + index_name, - username=milvus_config.get("username", ""), - password=milvus_config.get("password", ""), - vector_field=milvus_config.get("vector_field", "document_vector"), - text_field=milvus_config.get("text_field", "document_content"), - vertex_field=vertex_field, - ) - - if doc_processing_config.get("extractor") == "llm": - extractor = GraphExtractor() - elif doc_processing_config.get("extractor") == "llm": - extractor = LLMEntityRelationshipExtractor(get_llm_service(llm_config)) - else: - raise ValueError("Invalid extractor type") - - if vertex_field is None: - raise ValueError( - "vertex_field is not defined. Ensure Milvus is enabled in the configuration." - ) - - return chunker, vector_indices, extractor + """ + Streams the document contents into the docs_chan + """ + logger.info("streaming docs") + headers = make_headers(conn) + for i in range(ttl_batches): + doc_ids = await stream_doc_ids(conn, i, ttl_batches) + if doc_ids["error"]: + continue # TODO: handle error + + logger.info("********doc_ids") + logger.info(doc_ids) + logger.info("********") + for d in doc_ids["ids"]: + async with httpx.AsyncClient(timeout=http_timeout) as client: + res = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/StreamDocContent/", + params={"doc": d}, + headers=headers, + ) + # TODO: check for errors + # this will block and wait if the channel is full + logger.info("steam_docs writes to docs") + await docs_chan.put(res.json()["results"][0]["DocContent"][0]) + # break # single doc test FIXME: delete + # break # single batch test FIXME: delete + + logger.info("stream_docs done") + # close the docs chan -- this function is the only sender + logger.info("****** closing docs chan") + docs_chan.close() async def chunk_docs( @@ -109,100 +68,120 @@ async def chunk_docs( Creates and starts one worker for each document in the docs channel. """ + logger.info("Reading from docs channel") doc_tasks = [] async with asyncio.TaskGroup() as grp: async for content in docs_chan: - await embed_chan.put(content) # send the document to be embedded + logger.info("*********reading from docs chan") + # continue + v_id = content["v_id"] + txt = content["attributes"]["text"] + # send the document to be embedded + logger.info("chunk writes to extract") + await embed_chan.put((v_id, txt, "Document")) + task = grp.create_task( - chunk_doc(conn, content, upsert_chan, embed_chan, extract_chan) + workers.chunk_doc(conn, content, upsert_chan, embed_chan, extract_chan) ) doc_tasks.append(task) # break # single doc FIXME: delete + logger.info("*********done reading from docs chan") + logger.info("chunk_docs done") # do something with doc_tasks? 
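# --- editor's note: illustrative sketch only, not part of this patch. ---
# chunk_docs and the workers communicate by queueing (coroutine, args) tuples
# on upsert_chan; the upsert() stage below unpacks each tuple and schedules it
# on its TaskGroup. A minimal, self-contained version of that dispatch idiom,
# with a hypothetical `save` job standing in for upsert_chunk / upsert_vertex:
import asyncio
from aiochannel import Channel

async def save(v_id: str, text: str):            # hypothetical queued job
    print("saving", v_id, len(text))

async def producer(jobs: Channel):
    await jobs.put((save, ("doc_1_chunk_0", "some chunk text")))
    jobs.close()                                  # sole sender closes the channel

async def dispatcher(jobs: Channel):
    async with asyncio.TaskGroup() as grp:
        async for func, args in jobs:             # same unpacking as upsert() below
            grp.create_task(func(*args))          # run each queued job concurrently

async def main():
    jobs = Channel(100)
    async with asyncio.TaskGroup() as grp:
        grp.create_task(producer(jobs))
        grp.create_task(dispatcher(jobs))

if __name__ == "__main__":
    asyncio.run(main())
# --- end editor's note ---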
- for t in doc_tasks: - print(t.result()) - - # FIXME: don't close these there, other functions will send to them - upsert_chan.close() - embed_chan.close() + # for t in doc_tasks: + # logger.info(t.result()) # close the extract chan -- chunk_doc is the only sender - # and chunk_doc calls are kicked off from here (this is technically the sender) + # and chunk_doc calls are kicked off from here + logger.info("********closing extract chan") extract_chan.close() async def upsert(upsert_chan: Channel): """ Creates and starts one worker for each upsert job - queue expects: + chan expects: (func, args) <- q.get() """ + logger.info("Reading from upsert channel") # consume task queue upsert_tasks = [] async with asyncio.TaskGroup() as grp: async for func, args in upsert_chan: - # print("func name >>>>>", func.__name__, args) - # grp.create_task(todo()) + logger.info("*********reading from upsert chan") + logger.info(f"{func.__name__}, {args[1]}") # continue - # execute the task t = grp.create_task(func(*args)) upsert_tasks.append(t) + logger.info("*********done reading from upsert chan") - print(f"upsert done") + logger.info(f"upsert done") # do something with doc_tasks? - for t in upsert_tasks: - print(t.result()) + # for t in upsert_tasks: + # logger.info(t.result()) -async def embed(embed_chan: Channel): +async def embed( + embed_chan: Channel, index_stores: dict[str, MilvusEmbeddingStore], graphname: str +): """ Creates and starts one worker for each embed job + chan expects: + (v_id, content, index_name) <- q.get() """ - - # consume task queue - responses = [] + logger.info("Reading from embed channel") async with asyncio.TaskGroup() as grp: - async for item in embed_chan: - print("embed item>>>>>", type(item)) - grp.create_task(todo()) - continue - # execute the task - # response = await func(*args) - - # append task results to worker results/response - # responses.append(response) + # consume task queue + async for v_id, content, index_name in embed_chan: + logger.info("*********reading from embed chan") + # continue + embedding_store = index_stores[f"{graphname}_{index_name}"] + logger.info(f"Embed to {graphname}_{index_name}: {v_id}") + grp.create_task( + workers.embed( + embedding_service, + embedding_store, + v_id, + content, + ) + ) + logger.info("*********done reading from embed chan") - print(f"embed done") - return responses + logger.info(f"embed done") -async def extract(extract_chan: Channel): +async def extract( + extract_chan: Channel, + upsert_chan: Channel, + embed_chan: Channel, + extractor: BaseExtractor, + conn: TigerGraphConnection, +): """ Creates and starts one worker for each extract job + chan expects: + (chunk , chunk_id) <- q.get() """ - + logger.info("Reading from extract channel") # consume task queue - responses = [] async with asyncio.TaskGroup() as grp: async for item in extract_chan: - print("extract item>>>>>", type(item)) - grp.create_task(todo()) - continue - # execute the task - # response = await func(*args) - + logger.info("*********reading from extract chan") + logger.info("*********done reading from extract chan") + grp.create_task( + workers.extract(upsert_chan, embed_chan, extractor, conn, *item) + ) # append task results to worker results/response - # responses.append(response) + logger.info("*********done reading from extract chan") - print(f"embed done") - return responses + logger.info(f"extract done") - -async def todo(): - await asyncio.sleep(1) + logger.info("****closing upsert and embed chan") + upsert_chan.close() + embed_chan.close() async 
def run(graphname: str, conn: TigerGraphConnection): @@ -219,14 +198,13 @@ async def run(graphname: str, conn: TigerGraphConnection): """ - # init configurable objects - await init(graphname, conn) + extractor, index_stores = await init(conn) # return start = time.perf_counter() # TODO: make configurable tasks = [] - docs_chan = Channel(15) # process n chunks at a time max + docs_chan = Channel(1) # process n chunks at a time max embed_chan = Channel(100) upsert_chan = Channel(100) extract_chan = Channel(100) @@ -243,12 +221,14 @@ async def run(graphname: str, conn: TigerGraphConnection): t = grp.create_task(upsert(upsert_chan)) tasks.append(t) # # embed - t = grp.create_task(embed(embed_chan)) + t = grp.create_task(embed(embed_chan, index_stores, graphname)) tasks.append(t) # extract entities - t = grp.create_task(extract(extract_chan)) + t = grp.create_task( + extract(extract_chan, upsert_chan, embed_chan, extractor, conn) + ) tasks.append(t) end = time.perf_counter() - print("DONE") - print(end - start) + logger.info("DONE") + logger.info(end - start) diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index cfb84e5a..3fb8f916 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -1,58 +1,117 @@ +import asyncio import base64 import json -import time +import logging import traceback -from urllib.parse import quote_plus -import ecc_util import httpx -from aiochannel import Channel +from graphrag import workers from pyTigerGraph import TigerGraphConnection +from common.config import ( + doc_processing_config, + embedding_service, + get_llm_service, + llm_config, + milvus_config, +) +from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore +from common.extractors import GraphExtractor, LLMEntityRelationshipExtractor +from common.extractors.BaseExtractor import BaseExtractor from common.logs.logwriter import LogWriter +logger = logging.getLogger(__name__) +http_timeout = httpx.Timeout(15.0) -def make_headers(conn: TigerGraphConnection): - if conn.apiToken is None or conn.apiToken == "": - tkn = base64.b64encode(f"{conn.username}:{conn.password}".encode()).decode() - headers = {"Authorization": f"Basic {tkn}"} - else: - headers = {"Authorization": f"Bearer {conn.apiToken}"} - return headers +async def install_queries( + requried_queries: list[str], conn: TigerGraphConnection, n_workers=8 +): + # queries that are currently installed + installed_queries = [q.split("/")[-1] for q in conn.getEndpoints(dynamic=True)] + tasks = [] + async with asyncio.TaskGroup() as grp: + for q in requried_queries: + async with asyncio.Semaphore(n_workers): + q_name = q.split("/")[-1] + # if the query is not installed, install it + if q_name not in installed_queries: + task = grp.create_task(workers.install_query(conn, q)) + tasks.append(task) -async def install_query( - conn: TigerGraphConnection, query_path: str -) -> dict[str, httpx.Response | str | None]: - LogWriter.info(f"Installing query {query_path}") - with open(f"{query_path}.gsql", "r") as f: - query = f.read() + for t in tasks: + logger.info(t.result()) + # TODO: Check if anything had an error - query_name = query_path.split("/")[-1] - query = f"""\ -USE GRAPH {conn.graphname} -{query} -INSTALL QUERY {query_name}""" - tkn = base64.b64encode(f"{conn.username}:{conn.password}".encode()).decode() - headers = {"Authorization": f"Basic {tkn}"} - async with httpx.AsyncClient(timeout=None) as client: - res = await 
client.post( - conn.gsUrl + "/gsqlserver/gsql/file", - data=quote_plus(query.encode("utf-8")), - headers=headers, +async def init( + conn: TigerGraphConnection, +) -> tuple[BaseExtractor, dict[str, MilvusEmbeddingStore]]: + # install requried queries + requried_queries = [ + # "common/gsql/supportai/Scan_For_Updates", + # "common/gsql/supportai/Update_Vertices_Processing_Status", + # "common/gsql/supportai/ECC_Status", + # "common/gsql/supportai/Check_Nonexistent_Vertices", + "common/gsql/graphRAG/StreamDocIds", + "common/gsql/graphRAG/StreamDocContent", + ] + await install_queries(requried_queries, conn) + + # extractor + if doc_processing_config.get("extractor") == "graphrag": + extractor = GraphExtractor() + elif doc_processing_config.get("extractor") == "llm": + extractor = LLMEntityRelationshipExtractor(get_llm_service(llm_config)) + else: + raise ValueError("Invalid extractor type") + vertex_field = milvus_config.get("vertex_field", "vertex_id") + index_names = milvus_config.get( + "indexes", + [ + "Document", + "DocumentChunk", + "Entity", + "Relationship", + # "Concept", + ], + ) + index_stores = {} + content = "init" + # TODO:do concurrently + for index_name in index_names: + name = conn.graphname + "_" + index_name + s = MilvusEmbeddingStore( + embedding_service, + host=milvus_config["host"], + port=milvus_config["port"], + support_ai_instance=True, + collection_name=name, + username=milvus_config.get("username", ""), + password=milvus_config.get("password", ""), + vector_field=milvus_config.get("vector_field", "document_vector"), + text_field=milvus_config.get("text_field", "document_content"), + vertex_field=vertex_field, ) + # TODO: only do this if collection doesn't exist + vec = embedding_service.embed_query(content) + LogWriter.info(f"Initializing {name}") + s.add_embeddings([(content, vec)], [{vertex_field: content}]) + s.remove_embeddings(expr=f"{vertex_field} in ['{content}']") + index_stores[name] = s - if "error" in res.text.lower(): - LogWriter.error(res.text) - return { - "result": None, - "error": True, - "message": f"Failed to install query {query_name}", - } + return extractor, index_stores - return {"result": res, "error": False} + +def make_headers(conn: TigerGraphConnection): + if conn.apiToken is None or conn.apiToken == "": + tkn = base64.b64encode(f"{conn.username}:{conn.password}".encode()).decode() + headers = {"Authorization": f"Basic {tkn}"} + else: + headers = {"Authorization": f"Bearer {conn.apiToken}"} + + return headers async def stream_doc_ids( @@ -61,7 +120,7 @@ async def stream_doc_ids( headers = make_headers(conn) try: - async with httpx.AsyncClient(timeout=None) as client: + async with httpx.AsyncClient(timeout=http_timeout) as client: res = await client.post( f"{conn.restppUrl}/query/{conn.graphname}/StreamDocIds", params={ @@ -82,71 +141,6 @@ async def stream_doc_ids( return {"error": True, "message": str(e)} -async def stream_docs( - conn: TigerGraphConnection, - docs_chan: Channel, - ttl_batches: int = 10, -): - """ - Streams the document contents into the docs_chan - """ - headers = make_headers(conn) - for i in range(ttl_batches): - doc_ids = await stream_doc_ids(conn, i, ttl_batches) - if doc_ids["error"]: - break # TODO: handle error - - print("********") - print(doc_ids) - print("********") - for d in doc_ids["ids"]: - async with httpx.AsyncClient(timeout=None) as client: - res = await client.get( - f"{conn.restppUrl}/query/{conn.graphname}/StreamDocContent/", - params={"doc": d}, - headers=headers, - ) - # TODO: check for errors - # 
this will block and wait if the channel is full - await docs_chan.put(res.json()["results"][0]["DocContent"][0]) - # break # single doc test FIXME: delete - # break # single batch test FIXME: delete - - # close the docs chan -- this function is the only sender - docs_chan.close() - - -async def chunk_doc( - conn: TigerGraphConnection, - doc: dict[str, str], - upsert_chan: Channel, - embed_chan: Channel, - extract_chan: Channel, -): - """ - Chunks a document. - Places the resulting chunks into the upsert channel (to be upserted to TG) - and the embed channel (to be embedded and written to the vector store) - """ - chunker = ecc_util.get_chunker() - chunks = chunker.chunk(doc["attributes"]["text"]) - v_id = doc["v_id"] - # TODO: n chunks at a time - for i, chunk in enumerate(chunks): - # send chunks to be upserted (func, args) - await upsert_chan.put((upsert_chunk, (conn, v_id, f"{v_id}_chunk_{i}", chunk))) - - # send chunks to be embedded - await embed_chan.put(chunk) - - # send chunks to have entities extracted - await extract_chan.put(chunk) - - # break # single chunk FIXME: delete - - return doc["v_id"] - - def map_attrs(attributes: dict): # map attrs attrs = {} @@ -171,11 +165,13 @@ async def upsert_vertex( attrs = map_attrs(attributes) data = json.dumps({"vertices": {vertex_type: {vertex_id: attrs}}}) headers = make_headers(conn) - async with httpx.AsyncClient(timeout=None) as client: + # print("upsert vertex>>>", vertex_id) + async with httpx.AsyncClient(timeout=http_timeout) as client: res = await client.post( f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers ) - print("upsert vertex>>>", res.json()) + + res.raise_for_status() async def upsert_edge( @@ -207,37 +203,9 @@ async def upsert_edge( } ) headers = make_headers(conn) - async with httpx.AsyncClient(timeout=None) as client: + # print("upsert edge >>>", src_v_id, tgt_v_id) + async with httpx.AsyncClient(timeout=http_timeout) as client: res = await client.post( f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers ) - print("upsert edge >>>", res.json()) - - -async def upsert_chunk(conn: TigerGraphConnection, doc_id, chunk_id, chunk): - date_added = int(time.time()) - await upsert_vertex( - conn, - "DocumentChunk", - chunk_id, - attributes={"epoch_added": date_added, "idx": int(chunk_id.split("_")[-1])}, - ) - await upsert_vertex( - conn, - "Content", - chunk_id, - attributes={"text": chunk, "epoch_added": date_added}, - ) - await upsert_edge( - conn, "DocumentChunk", chunk_id, "HAS_CONTENT", "Content", chunk_id - ) - await upsert_edge(conn, "Document", doc_id, "HAS_CHILD", "DocumentChunk", chunk_id) - if int(chunk_id.split("_")[-1]) > 0: - await upsert_edge( - conn, - "DocumentChunk", - chunk_id, - "IS_AFTER", - "DocumentChunk", - doc_id + "_chunk_" + str(int(chunk_id.split("_")[-1]) - 1), - ) + res.raise_for_status() diff --git a/eventual-consistency-service/app/graphrag/worker.py b/eventual-consistency-service/app/graphrag/worker.py deleted file mode 100644 index 40720deb..00000000 --- a/eventual-consistency-service/app/graphrag/worker.py +++ /dev/null @@ -1,35 +0,0 @@ -import asyncio - -from aiochannel import Channel - - -async def worker( - n: int, - task_queue: Channel, -): - # init worker logging/reporting (TODO) - worker_name = f"worker-{n+1}" - worker_name += " " if n + 1 < 10 else "" - - while task_queue.empty(): - print(f"{worker_name} waiting") - await asyncio.sleep(1) - - # consume task queue - print(f"{worker_name} started") - responses = [] - while not task_queue.empty(): - # 
get the next task - func, args = await task_queue.get() - - # execute the task - response = await func(*args) - - # append task results to worker results/response - responses.append(response) - - # mark task as done - task_queue.task_done() - - print(f"{worker_name} done") - return responses diff --git a/eventual-consistency-service/app/graphrag/workers.py b/eventual-consistency-service/app/graphrag/workers.py new file mode 100644 index 00000000..3eb0d0dd --- /dev/null +++ b/eventual-consistency-service/app/graphrag/workers.py @@ -0,0 +1,226 @@ +import base64 +import logging +import time +from urllib.parse import quote_plus + +import ecc_util +import httpx +from aiochannel import Channel +from graphrag import util # import upsert_edge, upsert_vertex +from langchain_community.graphs.graph_document import GraphDocument +from pyTigerGraph import TigerGraphConnection + +from common.config import milvus_config +from common.embeddings.embedding_services import EmbeddingModel +from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore +from common.extractors.BaseExtractor import BaseExtractor +from common.logs.logwriter import LogWriter + +vertex_field = milvus_config.get("vertex_field", "vertex_id") + +logger = logging.getLogger(__name__) + + +async def install_query( + conn: TigerGraphConnection, query_path: str +) -> dict[str, httpx.Response | str | None]: + LogWriter.info(f"Installing query {query_path}") + with open(f"{query_path}.gsql", "r") as f: + query = f.read() + + query_name = query_path.split("/")[-1] + query = f"""\ +USE GRAPH {conn.graphname} +{query} +INSTALL QUERY {query_name}""" + tkn = base64.b64encode(f"{conn.username}:{conn.password}".encode()).decode() + headers = {"Authorization": f"Basic {tkn}"} + + async with httpx.AsyncClient(timeout=util.http_timeout) as client: + res = await client.post( + conn.gsUrl + "/gsqlserver/gsql/file", + data=quote_plus(query.encode("utf-8")), + headers=headers, + ) + + if "error" in res.text.lower(): + LogWriter.error(res.text) + return { + "result": None, + "error": True, + "message": f"Failed to install query {query_name}", + } + + return {"result": res, "error": False} + + +async def chunk_doc( + conn: TigerGraphConnection, + doc: dict[str, str], + upsert_chan: Channel, + embed_chan: Channel, + extract_chan: Channel, +): + """ + Chunks a document. 
+ Places the resulting chunks into the upsert channel (to be upserted to TG) + and the embed channel (to be embedded and written to the vector store) + """ + chunker = ecc_util.get_chunker() + chunks = chunker.chunk(doc["attributes"]["text"]) + v_id = doc["v_id"] + logger.info(f"Chunking {v_id}") + # TODO: n chunks at a time + for i, chunk in enumerate(chunks): + chunk_id = f"{v_id}_chunk_{i}" + # send chunks to be upserted (func, args) + logger.info("chunk writes to upsert") + await upsert_chan.put((upsert_chunk, (conn, v_id, chunk_id, chunk))) + + # send chunks to be embedded + logger.info("chunk writes to embed") + await embed_chan.put((v_id, chunk, "DocumentChunk")) + + # send chunks to have entities extracted + logger.info("chunk writes to extract") + await extract_chan.put((chunk, chunk_id)) + + return doc["v_id"] + + +async def upsert_chunk(conn: TigerGraphConnection, doc_id, chunk_id, chunk): + logger.info(f"Upserting chunk {chunk_id}") + logger.info(f"Upserting chunk {chunk_id}") + date_added = int(time.time()) + await util.upsert_vertex( + conn, + "DocumentChunk", + chunk_id, + attributes={"epoch_added": date_added, "idx": int(chunk_id.split("_")[-1])}, + ) + await util.upsert_vertex( + conn, + "Content", + chunk_id, + attributes={"text": chunk, "epoch_added": date_added}, + ) + await util.upsert_edge( + conn, "DocumentChunk", chunk_id, "HAS_CONTENT", "Content", chunk_id + ) + await util.upsert_edge( + conn, "Document", doc_id, "HAS_CHILD", "DocumentChunk", chunk_id + ) + if int(chunk_id.split("_")[-1]) > 0: + await util.upsert_edge( + conn, + "DocumentChunk", + chunk_id, + "IS_AFTER", + "DocumentChunk", + doc_id + "_chunk_" + str(int(chunk_id.split("_")[-1]) - 1), + ) + + +async def embed( + embed_svc: EmbeddingModel, + embed_store: MilvusEmbeddingStore, + v_id: str, + content: str, +): + """ + Args: + graphname: str + the name of the graph the documents are in + embed_svc: EmbeddingModel + The class used to vectorize text + embed_store: + The class used to store the vectore to a vector DB + v_id: str + the vertex id that will be embedded + content: str + the content of the document/chunk + index_name: str + the vertex index to write to + """ + logger.info(f"Embedding {v_id}, {content}") + + vec = await embed_svc.aembed_query(content) + await embed_store.aadd_embeddings([(content, vec)], [{vertex_field: v_id}]) + + +async def extract( + upsert_chan: Channel, + embed_chan: Channel, + extractor: BaseExtractor, + conn: TigerGraphConnection, + chunk: str, + chunk_id: str, +): + logger.info(f"Extracting chunk: {chunk_id}") + extracted: list[GraphDocument] = await extractor.aextract(chunk) + # upsert nodes and edges to the graph + for doc in extracted: + for node in doc.nodes: + logger.info("extract writes entity vert to upsert") + logger.info(f"Node: {node.id}| props: {node.properties}") + v_id = str(node.id) + desc = node.properties.get("description", "") + await upsert_chan.put( + ( + util.upsert_vertex, # func to call + # conn, v_id, chunk_id, chunk + ( + conn, + "Entity", # v_type + v_id, # v_id + { # attrs + "description": desc, + "epoch_added": int(time.time()), + }, + ), + ) + ) + + # link the entity to the chunk it came from + logger.info("extract writes contains edge to upsert") + await upsert_chan.put( + ( + util.upsert_edge, + ( + conn, + "DocumentChunk", # src_type + chunk_id, # src_id + "CONTAINS_ENTITY", # edge_type + "Entity", # tgt_type + str(node.id), # tgt_id + None, # attributes + ), + ) + ) + + # embed the entity + # (v_id, content, index_name) + await 
embed_chan.put((v_id, desc, "Entity")) + + for edge in doc.relationships: + logger.info("extract writes relates edge to upsert") + logger.info(f"{edge}") + await upsert_chan.put( + ( + util.upsert_edge, + ( + conn, + "Entity", # src_type + edge.source.id, # src_id + "RELATIONSHIP", # edgeType + "Entity", # tgt_type + edge.target.id, # tgt_id + {"relation_type": edge.type}, # attributes + ), + ) + ) + # embed "Relationship", + # (v_id, content, index_name) + + # TODO: + # embed the extracted entities diff --git a/eventual-consistency-service/app/main.py b/eventual-consistency-service/app/main.py index 0277a272..85a1f8ae 100644 --- a/eventual-consistency-service/app/main.py +++ b/eventual-consistency-service/app/main.py @@ -188,7 +188,8 @@ def consistency_status( case SupportAIMethod.GRAPHRAG: background.add_task(graphrag.run, graphname, conn) # asyncio.run(graphrag.run(graphname, conn)) - ecc_status = f"hi from graph rag ecc: {conn.graphname} ({graphname})" + import time + ecc_status = f"hi from graph rag ecc: {conn.graphname} ({graphname}) {time.ctime()}" case _: response.status_code = status.HTTP_404_NOT_FOUND return f"Method unsupported, must be {SupportAIMethod.SUPPORTAI}, {SupportAIMethod.GRAPHRAG}" From bb37198f74e1012880868d44f8d6cbfa09acbfb0 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 30 Jul 2024 12:36:22 -0400 Subject: [PATCH 07/91] docs processing done -- start community passes --- common/embeddings/embedding_services.py | 2 +- common/gsql/graphRAG/StreamDocContent.gsql | 2 + common/gsql/graphRAG/StreamDocIds.gsql | 7 +- copilot/docs/notebooks/graphrag.ipynb | 159 ++++-------------- .../app/graphrag/graph_rag.py | 69 +++----- .../app/graphrag/util.py | 63 ++++--- .../app/graphrag/workers.py | 30 ++-- 7 files changed, 118 insertions(+), 214 deletions(-) diff --git a/common/embeddings/embedding_services.py b/common/embeddings/embedding_services.py index dd506670..7ce17478 100644 --- a/common/embeddings/embedding_services.py +++ b/common/embeddings/embedding_services.py @@ -136,7 +136,7 @@ def __init__(self, config): ) from langchain.embeddings import OpenAIEmbeddings - self.embeddings = OpenAIEmbeddings() + self.embeddings = OpenAIEmbeddings().aembed_query class VertexAI_PaLM_Embedding(EmbeddingModel): diff --git a/common/gsql/graphRAG/StreamDocContent.gsql b/common/gsql/graphRAG/StreamDocContent.gsql index 87f12566..a2845148 100644 --- a/common/gsql/graphRAG/StreamDocContent.gsql +++ b/common/gsql/graphRAG/StreamDocContent.gsql @@ -1,5 +1,7 @@ CREATE DISTRIBUTED QUERY StreamDocContent(Vertex doc) { Doc = {doc}; + + // Get the document's content and mark it as processed DocContent = SELECT c FROM Doc:d -(HAS_CONTENT)-> Content:c POST-ACCUM d.epoch_processed = datetime_to_epoch(now()); PRINT DocContent; diff --git a/common/gsql/graphRAG/StreamDocIds.gsql b/common/gsql/graphRAG/StreamDocIds.gsql index d5ec982e..2fb4a9c4 100644 --- a/common/gsql/graphRAG/StreamDocIds.gsql +++ b/common/gsql/graphRAG/StreamDocIds.gsql @@ -1,13 +1,16 @@ CREATE DISTRIBUTED QUERY StreamDocIds(INT current_batch, INT ttl_batches) { + /* + * Get the IDs of documents that have not already been processed (one + * batch at a time) + */ ListAccum @@doc_ids; Docs = {Document.*}; Docs = SELECT d FROM Docs:d WHERE vertex_to_int(d) % ttl_batches == current_batch AND d.epoch_processed == 0 - AND d.epoch_processing == 0 ACCUM @@doc_ids += d.id - POST-ACCUM d.epoch_processing = datetime_to_epoch(now()); + POST-ACCUM d.epoch_processing = 
datetime_to_epoch(now()); // set the processing time PRINT @@doc_ids; } diff --git a/copilot/docs/notebooks/graphrag.ipynb b/copilot/docs/notebooks/graphrag.ipynb index 57ea4b48..38b4939b 100644 --- a/copilot/docs/notebooks/graphrag.ipynb +++ b/copilot/docs/notebooks/graphrag.ipynb @@ -45,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -54,18 +54,18 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'host_name': 'https://algotesting.i.tgcloud.io',\n", - " 'schema_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_schema].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_schema\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 0)\\\\nTrying to add local vertex \\'DocumentChunk\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Document\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Concept\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Entity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Relationship\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'DocumentCollection\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Content\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Community\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'ResolvedEntity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CONTENT\\' and its reverse edge \\'reverse_HAS_CONTENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_CHILD_OF\\' and its reverse edge \\'reverse_IS_CHILD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_HEAD_OF\\' and its reverse edge \\'reverse_IS_HEAD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_TAIL\\' and its reverse edge \\'reverse_HAS_TAIL\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_RELATIONSHIP\\' and its reverse edge \\'reverse_DESCRIBES_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_ENTITY\\' and its reverse edge \\'reverse_DESCRIBES_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_ENTITY\\' and its reverse edge \\'reverse_CONTAINS_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'MENTIONS_RELATIONSHIP\\' and its reverse edge \\'reverse_MENTIONS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_AFTER\\' and its reverse edge \\'reverse_IS_AFTER\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CHILD\\' and its reverse edge \\'reverse_HAS_CHILD\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_RELATIONSHIP\\' and its reverse edge \\'reverse_HAS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_DOCUMENT\\' and 
its reverse edge \\'reverse_CONTAINS_DOCUMENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'KNN\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVES_TO\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVED_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IN_COMMUNITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 1\\\\nThe job add_supportai_schema completes in 2.434 seconds!\\\\nLocal schema change succeeded.\"',\n", - " 'index_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_indexes].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_indexes\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 1)\\\\nTrying to add index \\'doc_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_indexepoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 2\\\\nThe job add_supportai_indexes completes in 1.932 seconds!\\\\nLocal schema change succeeded.\"'}" + " 'schema_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_schema].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. 
To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_schema\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 0)\\\\nTrying to add local vertex \\'DocumentChunk\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Document\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Concept\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Entity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Relationship\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'DocumentCollection\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Content\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Community\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'ResolvedEntity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CONTENT\\' and its reverse edge \\'reverse_HAS_CONTENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_CHILD_OF\\' and its reverse edge \\'reverse_IS_CHILD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_HEAD_OF\\' and its reverse edge \\'reverse_IS_HEAD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_TAIL\\' and its reverse edge \\'reverse_HAS_TAIL\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_RELATIONSHIP\\' and its reverse edge \\'reverse_DESCRIBES_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_ENTITY\\' and its reverse edge \\'reverse_DESCRIBES_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_ENTITY\\' and its reverse edge \\'reverse_CONTAINS_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'MENTIONS_RELATIONSHIP\\' and its reverse edge \\'reverse_MENTIONS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_AFTER\\' and its reverse edge \\'reverse_IS_AFTER\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CHILD\\' and its reverse edge \\'reverse_HAS_CHILD\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_RELATIONSHIP\\' and its reverse edge \\'reverse_HAS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_DOCUMENT\\' and its reverse edge \\'reverse_CONTAINS_DOCUMENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RELATIONSHIP\\' and its reverse edge \\'reverse_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVES_TO\\' and its reverse edge \\'reverse_RESOLVES_TO\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVED_RELATIONSHIP\\' and its reverse edge \\'reverse_RESOLVED_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IN_COMMUNITY\\' and its reverse edge \\'reverse_IN_COMMUNITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 1\\\\nThe job add_supportai_schema completes in 2.335 seconds!\\\\nLocal schema change succeeded.\"',\n", + " 'index_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: 
[add_supportai_indexes].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_indexes\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 1)\\\\nTrying to add index \\'doc_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_indexepoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 2\\\\nThe job add_supportai_indexes completes in 2.059 seconds!\\\\nLocal schema change succeeded.\"'}" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -79,7 +79,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -95,18 +95,18 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'job_name': 'load_documents_content_json_203b064024e3499ea41b876cc67a85cf',\n", - " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_203b064024e3499ea41b876cc67a85cf.stream.SupportAI_GraphRAG_pytgdocs_5b098715edbd4c878f7425918eb553c0.1721853566538',\n", - " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_203b064024e3499ea41b876cc67a85cf.stream.SupportAI_GraphRAG_pytgdocs_5b098715edbd4c878f7425918eb553c0.1721853566538'}" + "{'job_name': 'load_documents_content_json_a245f14bb5f443acaa051125e4d9a497',\n", + " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_a245f14bb5f443acaa051125e4d9a497.stream.SupportAI_GraphRAG_pytgdocs_025b08b3cf60477dbbcfd22b4254d268.1722356202522',\n", + " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_a245f14bb5f443acaa051125e4d9a497.stream.SupportAI_GraphRAG_pytgdocs_025b08b3cf60477dbbcfd22b4254d268.1722356202522'}" ] }, - 
"execution_count": 15, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -121,39 +121,39 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'asdf' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[7], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43masdf\u001b[49m\n", + "\u001b[0;31mNameError\u001b[0m: name 'asdf' is not defined" + ] + } + ], "source": [ "asdf" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'job_name': 'load_documents_content_json_203b064024e3499ea41b876cc67a85cf',\n", - " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_203b064024e3499ea41b876cc67a85cf.stream.SupportAI_GraphRAG_pytgdocs_5b098715edbd4c878f7425918eb553c0.1721853623658',\n", - " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_203b064024e3499ea41b876cc67a85cf.stream.SupportAI_GraphRAG_pytgdocs_5b098715edbd4c878f7425918eb553c0.1721853623658'}" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "for v in [\"Document\", \"Content\", \"DocumentChunk\"]:\n", + "for v in [\"Document\", \"Content\", \"DocumentChunk\",\"Entity\"]:\n", " try:\n", " conn.delVertices(v)\n", " except:\n", " pass\n", "\n", "import time\n", + "\n", "time.sleep(3)\n", "conn.ai.runDocumentIngest(\n", " res[\"load_job_id\"],\n", @@ -168,102 +168,11 @@ "metadata": {}, "outputs": [], "source": [ - "import httpx\n", - "import base64\n", - "\n", - "# conn.ai.forceConsistencyUpdate()\n", - "# url = self.nlqs_host+\"/\"+self.conn.graphname+\"/supportai/forceupdate\"\n", - "# return self.conn._req(\"GET\", url, authMode=\"pwd\", resKey=None)\n", - "httpx.get(f\"http://localhost:8000/{conn.graphname}/supportai/forceupdate\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_core.pydantic_v1 import BaseModel, Field\n", - "from langchain_openai import ChatOpenAI\n", - "\n", - "\n", - "class Joke(BaseModel):\n", - " setup: str = Field(description=\"The setup of the joke\")\n", - " punchline: str = Field(description=\"The punchline to the joke\")\n", - "\n", - "\n", - "model = ChatOpenAI(model=\"gpt-3.5-turbo-0125\", temperature=0)\n", - "print(model.invoke(\"hi\"))\n", - "structured_llm = model.with_structured_output(Joke)\n", - "structured_llm.invoke(\"Tell me a joke about cats\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_core.documents import Document\n", - "from langchain_experimental.graph_transformers import LLMGraphTransformer\n", - "from langchain_openai import ChatOpenAI\n", - "import os\n", - "\n", - "# from langchain_core.pydantic_v1 import BaseModel\n", - "from pydantic import BaseModel\n", - "\n", - "\n", - "class AnswerWithJustification(BaseModel):\n", - " \"\"\"An answer to the user question along with justification for the answer.\"\"\"\n", - "\n", - " answer: str\n", - " justification: str\n", - "\n", - "\n", - "os.environ[\"OPENAI_API_KEY\"] = \"\"\n", - "model_name = 
\"gpt-4o-mini\"\n", - "llm = ChatOpenAI(model=model_name, temperature=0)\n", - "# sllm = llm.with_structured_output(AnswerWithJustification)\n", - "# print(sllm.invoke(\"What weighs more a pound of bricks or a pound of feathers\"))\n", - "\n", - "\n", - "class GraphExtractor:\n", - " def __init__(self):\n", - " self.transformer = LLMGraphTransformer(\n", - " llm=llm,\n", - " node_properties=[\"description\"],\n", - " relationship_properties=[\"description\"],\n", - " )\n", - "\n", - " def extract(self, text):\n", - " doc = Document(page_content=text)\n", - " graph_docs = self.transformer.convert_to_graph_documents([doc])\n", - " return graph_docs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "text = \"\"\"\n", - "Marie Curie, born in 1867, was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity.\n", - "She was the first woman to win a Nobel Prize, the first person to win a Nobel Prize twice, and the only person to win a Nobel Prize in two scientific fields.\n", - "Her husband, Pierre Curie, was a co-winner of her first Nobel Prize, making them the first-ever married couple to win the Nobel Prize and launching the Curie family legacy of five Nobel Prizes.\n", - "She was, in 1906, the first woman to become a professor at the University of Paris.\n", - "\"\"\"\n", - "ge = GraphExtractor()\n", - "\n", - "docs = ge.extract(text)\n", - "for d in docs:\n", - " for n in d.nodes:\n", - " print(n)\n", - " for r in d.relationships:\n", - " print(r)\n", - "# print(f\"Nodes:{docs[0].nodes}\")\n", - "# print(f\"Relationships:{docs[0].relationships}\")\n", - "# docs" + "conn.gsql(f\"\"\"\n", + "USE GRAPH {conn.graphname}\n", + "DROP QUERY StreamDocIds\n", + "DROP QUERY StreamDocContent\n", + "\"\"\")" ] } ], diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py index e248510c..7e67b342 100644 --- a/eventual-consistency-service/app/graphrag/graph_rag.py +++ b/eventual-consistency-service/app/graphrag/graph_rag.py @@ -1,16 +1,16 @@ import asyncio import logging import time +import traceback import httpx from aiochannel import Channel -from graphrag import workers -from graphrag.util import init, make_headers, stream_doc_ids,http_timeout -from pyTigerGraph import TigerGraphConnection - from common.config import embedding_service from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore from common.extractors.BaseExtractor import BaseExtractor +from graphrag import workers +from graphrag.util import http_timeout, init, make_headers, stream_doc_ids +from pyTigerGraph import TigerGraphConnection http_logs = logging.getLogger("httpx") http_logs.setLevel(logging.WARNING) @@ -32,28 +32,32 @@ async def stream_docs( for i in range(ttl_batches): doc_ids = await stream_doc_ids(conn, i, ttl_batches) if doc_ids["error"]: - continue # TODO: handle error + # continue to the next batch. + # These docs will not be marked as processed, so the ecc will process it eventually. 
+ continue - logger.info("********doc_ids") - logger.info(doc_ids) - logger.info("********") for d in doc_ids["ids"]: async with httpx.AsyncClient(timeout=http_timeout) as client: - res = await client.get( - f"{conn.restppUrl}/query/{conn.graphname}/StreamDocContent/", - params={"doc": d}, - headers=headers, - ) - # TODO: check for errors - # this will block and wait if the channel is full - logger.info("steam_docs writes to docs") - await docs_chan.put(res.json()["results"][0]["DocContent"][0]) - # break # single doc test FIXME: delete - # break # single batch test FIXME: delete + try: + res = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/StreamDocContent/", + params={"doc": d}, + headers=headers, + ) + if res.status_code != 200: + # continue to the next doc. + # This doc will not be marked as processed, so the ecc will process it eventually. + continue + logger.info("steam_docs writes to docs") + await docs_chan.put(res.json()["results"][0]["DocContent"][0]) + except Exception as e: + exc = traceback.format_exc() + logger.error(f"Error retrieving doc: {d} --> {e}\n{exc}") + continue # try retrieving the next doc logger.info("stream_docs done") # close the docs chan -- this function is the only sender - logger.info("****** closing docs chan") + logger.info("closing docs chan") docs_chan.close() @@ -72,8 +76,6 @@ async def chunk_docs( doc_tasks = [] async with asyncio.TaskGroup() as grp: async for content in docs_chan: - logger.info("*********reading from docs chan") - # continue v_id = content["v_id"] txt = content["attributes"]["text"] # send the document to be embedded @@ -84,17 +86,12 @@ async def chunk_docs( workers.chunk_doc(conn, content, upsert_chan, embed_chan, extract_chan) ) doc_tasks.append(task) - # break # single doc FIXME: delete - logger.info("*********done reading from docs chan") logger.info("chunk_docs done") - # do something with doc_tasks? - # for t in doc_tasks: - # logger.info(t.result()) # close the extract chan -- chunk_doc is the only sender # and chunk_doc calls are kicked off from here - logger.info("********closing extract chan") + logger.info("closing extract_chan") extract_chan.close() @@ -110,13 +107,11 @@ async def upsert(upsert_chan: Channel): upsert_tasks = [] async with asyncio.TaskGroup() as grp: async for func, args in upsert_chan: - logger.info("*********reading from upsert chan") logger.info(f"{func.__name__}, {args[1]}") # continue # execute the task t = grp.create_task(func(*args)) upsert_tasks.append(t) - logger.info("*********done reading from upsert chan") logger.info(f"upsert done") # do something with doc_tasks? 
@@ -136,7 +131,6 @@ async def embed( async with asyncio.TaskGroup() as grp: # consume task queue async for v_id, content, index_name in embed_chan: - logger.info("*********reading from embed chan") # continue embedding_store = index_stores[f"{graphname}_{index_name}"] logger.info(f"Embed to {graphname}_{index_name}: {v_id}") @@ -148,7 +142,6 @@ async def embed( content, ) ) - logger.info("*********done reading from embed chan") logger.info(f"embed done") @@ -169,17 +162,13 @@ async def extract( # consume task queue async with asyncio.TaskGroup() as grp: async for item in extract_chan: - logger.info("*********reading from extract chan") - logger.info("*********done reading from extract chan") grp.create_task( workers.extract(upsert_chan, embed_chan, extractor, conn, *item) ) - # append task results to worker results/response - logger.info("*********done reading from extract chan") logger.info(f"extract done") - logger.info("****closing upsert and embed chan") + logger.info("closing upsert and embed chan") upsert_chan.close() embed_chan.close() @@ -202,9 +191,8 @@ async def run(graphname: str, conn: TigerGraphConnection): # return start = time.perf_counter() - # TODO: make configurable tasks = [] - docs_chan = Channel(1) # process n chunks at a time max + docs_chan = Channel(1) embed_chan = Channel(100) upsert_chan = Channel(100) extract_chan = Channel(100) @@ -230,5 +218,4 @@ async def run(graphname: str, conn: TigerGraphConnection): tasks.append(t) end = time.perf_counter() - logger.info("DONE") - logger.info(end - start) + logger.info(f"DONE. graphrag.run elapsed: {end-start}") diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index 3fb8f916..8f2c2141 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -5,9 +5,6 @@ import traceback import httpx -from graphrag import workers -from pyTigerGraph import TigerGraphConnection - from common.config import ( doc_processing_config, embedding_service, @@ -19,6 +16,8 @@ from common.extractors import GraphExtractor, LLMEntityRelationshipExtractor from common.extractors.BaseExtractor import BaseExtractor from common.logs.logwriter import LogWriter +from graphrag import workers +from pyTigerGraph import TigerGraphConnection logger = logging.getLogger(__name__) http_timeout = httpx.Timeout(15.0) @@ -33,6 +32,7 @@ async def install_queries( tasks = [] async with asyncio.TaskGroup() as grp: for q in requried_queries: + # only install n queries at a time (n=n_workers) async with asyncio.Semaphore(n_workers): q_name = q.split("/")[-1] # if the query is not installed, install it @@ -41,8 +41,17 @@ async def install_queries( tasks.append(task) for t in tasks: - logger.info(t.result()) - # TODO: Check if anything had an error + res = t.result() + # stop system if a required query doesn't install + if res["error"]: + raise Exception(res["message"]) + + +async def init_embedding_index(s: MilvusEmbeddingStore, vertex_field: str): + content = "init" + vec = embedding_service.embed_query(content) + await s.aadd_embeddings([(content, vec)], [{vertex_field: content}]) + s.remove_embeddings(expr=f"{vertex_field} in ['{content}']") async def init( @@ -78,28 +87,28 @@ async def init( ], ) index_stores = {} - content = "init" - # TODO:do concurrently - for index_name in index_names: - name = conn.graphname + "_" + index_name - s = MilvusEmbeddingStore( - embedding_service, - host=milvus_config["host"], - port=milvus_config["port"], - 
support_ai_instance=True, - collection_name=name, - username=milvus_config.get("username", ""), - password=milvus_config.get("password", ""), - vector_field=milvus_config.get("vector_field", "document_vector"), - text_field=milvus_config.get("text_field", "document_content"), - vertex_field=vertex_field, - ) - # TODO: only do this if collection doesn't exist - vec = embedding_service.embed_query(content) - LogWriter.info(f"Initializing {name}") - s.add_embeddings([(content, vec)], [{vertex_field: content}]) - s.remove_embeddings(expr=f"{vertex_field} in ['{content}']") - index_stores[name] = s + async with asyncio.TaskGroup() as tg: + for index_name in index_names: + name = conn.graphname + "_" + index_name + s = MilvusEmbeddingStore( + embedding_service, + host=milvus_config["host"], + port=milvus_config["port"], + support_ai_instance=True, + collection_name=name, + username=milvus_config.get("username", ""), + password=milvus_config.get("password", ""), + vector_field=milvus_config.get("vector_field", "document_vector"), + text_field=milvus_config.get("text_field", "document_content"), + vertex_field=vertex_field, + ) + + LogWriter.info(f"Initializing {name}") + # init collection if it doesn't exist + if not s.check_collection_exists(): + tg.create_task(init_embedding_index(s, vertex_field)) + + index_stores[name] = s return extractor, index_stores diff --git a/eventual-consistency-service/app/graphrag/workers.py b/eventual-consistency-service/app/graphrag/workers.py index 3eb0d0dd..b7267b60 100644 --- a/eventual-consistency-service/app/graphrag/workers.py +++ b/eventual-consistency-service/app/graphrag/workers.py @@ -6,15 +6,14 @@ import ecc_util import httpx from aiochannel import Channel -from graphrag import util # import upsert_edge, upsert_vertex -from langchain_community.graphs.graph_document import GraphDocument -from pyTigerGraph import TigerGraphConnection - from common.config import milvus_config from common.embeddings.embedding_services import EmbeddingModel from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore from common.extractors.BaseExtractor import BaseExtractor from common.logs.logwriter import LogWriter +from graphrag import util # import upsert_edge, upsert_vertex +from langchain_community.graphs.graph_document import GraphDocument +from pyTigerGraph import TigerGraphConnection vertex_field = milvus_config.get("vertex_field", "vertex_id") @@ -36,7 +35,7 @@ async def install_query( tkn = base64.b64encode(f"{conn.username}:{conn.password}".encode()).decode() headers = {"Authorization": f"Basic {tkn}"} - async with httpx.AsyncClient(timeout=util.http_timeout) as client: + async with httpx.AsyncClient(timeout=None) as client: res = await client.post( conn.gsUrl + "/gsqlserver/gsql/file", data=quote_plus(query.encode("utf-8")), @@ -70,26 +69,24 @@ async def chunk_doc( chunks = chunker.chunk(doc["attributes"]["text"]) v_id = doc["v_id"] logger.info(f"Chunking {v_id}") - # TODO: n chunks at a time for i, chunk in enumerate(chunks): chunk_id = f"{v_id}_chunk_{i}" # send chunks to be upserted (func, args) - logger.info("chunk writes to upsert") + logger.info("chunk writes to upsert_chan") await upsert_chan.put((upsert_chunk, (conn, v_id, chunk_id, chunk))) # send chunks to be embedded - logger.info("chunk writes to embed") + logger.info("chunk writes to embed_chan") await embed_chan.put((v_id, chunk, "DocumentChunk")) # send chunks to have entities extracted - logger.info("chunk writes to extract") + logger.info("chunk writes to extract_chan") await 
extract_chan.put((chunk, chunk_id)) return doc["v_id"] async def upsert_chunk(conn: TigerGraphConnection, doc_id, chunk_id, chunk): - logger.info(f"Upserting chunk {chunk_id}") logger.info(f"Upserting chunk {chunk_id}") date_added = int(time.time()) await util.upsert_vertex( @@ -142,7 +139,7 @@ async def embed( index_name: str the vertex index to write to """ - logger.info(f"Embedding {v_id}, {content}") + logger.info(f"Embedding {v_id}") vec = await embed_svc.aembed_query(content) await embed_store.aadd_embeddings([(content, vec)], [{vertex_field: v_id}]) @@ -161,8 +158,7 @@ async def extract( # upsert nodes and edges to the graph for doc in extracted: for node in doc.nodes: - logger.info("extract writes entity vert to upsert") - logger.info(f"Node: {node.id}| props: {node.properties}") + logger.info(f"extract writes entity vert to upsert\nNode: {node.id}") v_id = str(node.id) desc = node.properties.get("description", "") await upsert_chan.put( @@ -203,8 +199,9 @@ async def extract( await embed_chan.put((v_id, desc, "Entity")) for edge in doc.relationships: - logger.info("extract writes relates edge to upsert") - logger.info(f"{edge}") + logger.info( + f"extract writes relates edge to upsert\n{edge.source.id} -({edge.type})-> {edge.target.id}" + ) await upsert_chan.put( ( util.upsert_edge, @@ -221,6 +218,3 @@ async def extract( ) # embed "Relationship", # (v_id, content, index_name) - - # TODO: - # embed the extracted entities From e9f178e34e39404774e76dd599f3917ba5856ac6 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Thu, 1 Aug 2024 18:09:26 -0400 Subject: [PATCH 08/91] save --- common/embeddings/embedding_services.py | 2 +- common/embeddings/milvus_embedding_store.py | 73 ++++- common/gsql/graphRAG/.clang-format | 269 ++++++++++++++++++ common/gsql/graphRAG/.clangd | 2 + .../gsql/graphRAG/ResolveRelationships.gsql | 26 ++ common/gsql/graphRAG/SetEpochProcessing.gsql | 7 + common/gsql/graphRAG/StreamIds.gsql | 16 ++ common/gsql/graphRAG/leven.cpp | 59 ++++ common/gsql/graphRAG/louvain/louvain1.gsql | 17 ++ .../louvain_1_first_pass.gsql | 16 +- .../louvain_2_other_passes.gsql | 0 .../louvain_3_final_community.gsql | 0 .../louvain_4_modularity_1_for_pass.gsql | 0 .../louvain_4_modularity_2_final.gsql | 0 .../louvain_5_reset.gsql | 0 common/gsql/supportai/SupportAI_Schema.gsql | 4 +- copilot/docs/notebooks/graphrag.ipynb | 227 +++++++++++++-- .../app/graphrag/graph_rag.py | 188 ++++++++---- .../app/graphrag/util.py | 50 +++- .../app/graphrag/workers.py | 148 +++++++++- 20 files changed, 989 insertions(+), 115 deletions(-) create mode 100644 common/gsql/graphRAG/.clang-format create mode 100644 common/gsql/graphRAG/.clangd create mode 100644 common/gsql/graphRAG/ResolveRelationships.gsql create mode 100644 common/gsql/graphRAG/SetEpochProcessing.gsql create mode 100644 common/gsql/graphRAG/StreamIds.gsql create mode 100644 common/gsql/graphRAG/leven.cpp create mode 100644 common/gsql/graphRAG/louvain/louvain1.gsql rename common/gsql/graphRAG/{louvain => louvain_old}/louvain_1_first_pass.gsql (88%) rename common/gsql/graphRAG/{louvain => louvain_old}/louvain_2_other_passes.gsql (100%) rename common/gsql/graphRAG/{louvain => louvain_old}/louvain_3_final_community.gsql (100%) rename common/gsql/graphRAG/{louvain => louvain_old}/louvain_4_modularity_1_for_pass.gsql (100%) rename common/gsql/graphRAG/{louvain => louvain_old}/louvain_4_modularity_2_final.gsql (100%) rename common/gsql/graphRAG/{louvain => 
louvain_old}/louvain_5_reset.gsql (100%) diff --git a/common/embeddings/embedding_services.py b/common/embeddings/embedding_services.py index 7ce17478..dd506670 100644 --- a/common/embeddings/embedding_services.py +++ b/common/embeddings/embedding_services.py @@ -136,7 +136,7 @@ def __init__(self, config): ) from langchain.embeddings import OpenAIEmbeddings - self.embeddings = OpenAIEmbeddings().aembed_query + self.embeddings = OpenAIEmbeddings() class VertexAI_PaLM_Embedding(EmbeddingModel): diff --git a/common/embeddings/milvus_embedding_store.py b/common/embeddings/milvus_embedding_store.py index ac9c5389..fd57c783 100644 --- a/common/embeddings/milvus_embedding_store.py +++ b/common/embeddings/milvus_embedding_store.py @@ -3,15 +3,16 @@ from time import sleep, time from typing import Iterable, List, Optional, Tuple -from langchain_community.vectorstores import Milvus -from langchain_core.documents.base import Document -from pymilvus import MilvusException, connections, utility - +import Levenshtein as lev +from asyncer import asyncify from common.embeddings.base_embedding_store import EmbeddingStore from common.embeddings.embedding_services import EmbeddingModel from common.logs.log import req_id_cv from common.logs.logwriter import LogWriter from common.metrics.prometheus_metrics import metrics +from langchain_community.vectorstores import Milvus +from langchain_core.documents.base import Document +from pymilvus import MilvusException, SearchResult, connections, utility logger = logging.getLogger(__name__) @@ -32,6 +33,7 @@ def __init__( alias: str = "alias", retry_interval: int = 2, max_retry_attempts: int = 10, + drop_old=False, ): self.embedding_service = embedding_service self.vector_field = vector_field @@ -42,6 +44,7 @@ def __init__( self.milvus_alias = alias self.retry_interval = retry_interval self.max_retry_attempts = max_retry_attempts + self.drop_old = drop_old if host.startswith("http"): if host.endswith(str(port)): @@ -86,7 +89,7 @@ def connect_to_milvus(self): collection_name=self.collection_name, connection_args=self.milvus_connection, auto_id=True, - drop_old=False, + drop_old=self.drop_old, text_field=self.text_field, vector_field=self.vector_field, ) @@ -118,6 +121,9 @@ def metadata_func(record: dict, metadata: dict) -> dict: return metadata LogWriter.info("Milvus add initial load documents init()") + import os + + logger.info(f"*******{os.path.exists('tg_documents')}") loader = DirectoryLoader( "./tg_documents/", glob="*.json", @@ -584,5 +590,62 @@ def query(self, expr: str, output_fields: List[str]): return query_result + def edit_dist_check(self, a: str, b: str, edit_dist_threshold: float, p=False): + a = a.lower() + b = b.lower() + # if the words are short, they should be the same + if len(a) < 5 and len(b) < 5: + return a == b + + # edit_dist_threshold (as a percent) of word must match + threshold = int(min(len(a), len(b)) * (1 - edit_dist_threshold)) + if p: + print(a, b, threshold, lev.distance(a, b)) + return lev.distance(a, b) < threshold + + async def aget_k_closest( + self, v_id: str, k=15, threshold_similarity=0.90, edit_dist_threshold_pct=0.75 + ) -> list[Document]: + """ + asdf + """ + threshold_dist = 1 - threshold_similarity + + # asyncify necessary funcs + query = asyncify(self.milvus.col.query) + search = asyncify(self.milvus.similarity_search_with_score_by_vector) + + # Get all vectors with this ID + verts = await query( + f'{self.vertex_field} == "{v_id}"', + output_fields=[self.vertex_field, self.vector_field], + ) + result = [] + for v in verts: 
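+            # v is a dict holding this vertex's id and its stored embedding vector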
+ # get the k closest verts + sim = await search( + v["document_vector"], + k=k, + ) + # filter verts using similiarity threshold and leven_dist + similar_verts = [ + doc.metadata["vertex_id"] + for doc, dist in sim + # check semantic similarity + if dist < threshold_dist + # check name similarity (won't merge Apple and Google if they're semantically similar) + and self.edit_dist_check( + doc.metadata["vertex_id"], + v_id, + edit_dist_threshold_pct, + # v_id == "Dataframe", + ) + # don't have to merge verts with the same id (they're the same) + and doc.metadata["vertex_id"] != v_id + ] + result.extend(similar_verts) + result.append(v_id) + return set(result) + def __del__(self): metrics.milvus_active_connections.labels(self.collection_name).dec diff --git a/common/gsql/graphRAG/.clang-format b/common/gsql/graphRAG/.clang-format new file mode 100644 index 00000000..f0dcec6c --- /dev/null +++ b/common/gsql/graphRAG/.clang-format @@ -0,0 +1,269 @@ +--- +Language: Cpp +# BasedOnStyle: Google +AccessModifierOffset: -1 +AlignAfterOpenBracket: Align +AlignArrayOfStructures: None +AlignConsecutiveAssignments: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + PadOperators: true +AlignConsecutiveBitFields: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + PadOperators: false +AlignConsecutiveDeclarations: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + PadOperators: false +AlignConsecutiveMacros: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + PadOperators: false +AlignConsecutiveShortCaseStatements: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCaseColons: false +AlignEscapedNewlines: Left +AlignOperands: Align +AlignTrailingComments: + Kind: Always + OverEmptyLines: 0 +AllowAllArgumentsOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortEnumsOnASingleLine: true +AllowShortFunctionsOnASingleLine: All +AllowShortIfStatementsOnASingleLine: WithoutElse +AllowShortLambdasOnASingleLine: All +AllowShortLoopsOnASingleLine: true +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: Yes +AttributeMacros: + - __capability +BinPackArguments: true +BinPackParameters: true +BitFieldColonSpacing: Both +BraceWrapping: + AfterCaseLabel: false + AfterClass: false + AfterControlStatement: Never + AfterEnum: false + AfterExternBlock: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + BeforeCatch: false + BeforeElse: false + BeforeLambdaBody: false + BeforeWhile: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +BreakAfterAttributes: Never +BreakAfterJavaFieldAnnotations: false +BreakArrays: true +BreakBeforeBinaryOperators: None +BreakBeforeConceptDeclarations: Always +BreakBeforeBraces: Attach +BreakBeforeInlineASMColon: OnlyMultiline +BreakBeforeTernaryOperators: true +BreakConstructorInitializers: BeforeColon +BreakInheritanceList: BeforeColon +BreakStringLiterals: true +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: true +DisableFormat: 
false +EmptyLineAfterAccessModifier: Never +EmptyLineBeforeAccessModifier: LogicalBlock +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IfMacros: + - KJ_IF_MAYBE +IncludeBlocks: Regroup +IncludeCategories: + - Regex: '^' + Priority: 2 + SortPriority: 0 + CaseSensitive: false + - Regex: '^<.*\.h>' + Priority: 1 + SortPriority: 0 + CaseSensitive: false + - Regex: '^<.*' + Priority: 2 + SortPriority: 0 + CaseSensitive: false + - Regex: '.*' + Priority: 3 + SortPriority: 0 + CaseSensitive: false +IncludeIsMainRegex: '([-_](test|unittest))?$' +IncludeIsMainSourceRegex: '' +IndentAccessModifiers: false +IndentCaseBlocks: false +IndentCaseLabels: true +IndentExternBlock: AfterExternBlock +IndentGotoLabels: true +IndentPPDirectives: None +IndentRequiresClause: true +IndentWidth: 4 +IndentWrappedFunctionNames: false +InsertBraces: false +InsertNewlineAtEOF: false +InsertTrailingCommas: None +IntegerLiteralSeparator: + Binary: 0 + BinaryMinDigits: 0 + Decimal: 0 + DecimalMinDigits: 0 + Hex: 0 + HexMinDigits: 0 +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +KeepEmptyLinesAtEOF: false +LambdaBodyIndentation: Signature +LineEnding: DeriveLF +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Never +ObjCBlockIndentWidth: 4 +ObjCBreakBeforeNestedBlockParam: true +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PackConstructorInitializers: NextLine +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakOpenParenthesis: 0 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyIndentedWhitespace: 0 +PenaltyReturnTypeOnItsOwnLine: 200 +PointerAlignment: Left +PPIndentWidth: -1 +QualifierAlignment: Leave +RawStringFormats: + - Language: Cpp + Delimiters: + - cc + - CC + - cpp + - Cpp + - CPP + - 'c++' + - 'C++' + CanonicalDelimiter: '' + BasedOnStyle: google + - Language: TextProto + Delimiters: + - pb + - PB + - proto + - PROTO + EnclosingFunctions: + - EqualsProto + - EquivToProto + - PARSE_PARTIAL_TEXT_PROTO + - PARSE_TEST_PROTO + - PARSE_TEXT_PROTO + - ParseTextOrDie + - ParseTextProtoOrDie + - ParseTestProto + - ParsePartialTestProto + CanonicalDelimiter: pb + BasedOnStyle: google +ReferenceAlignment: Pointer +ReflowComments: true +RemoveBracesLLVM: false +RemoveParentheses: Leave +RemoveSemicolon: false +RequiresClausePosition: OwnLine +RequiresExpressionIndentation: OuterScope +SeparateDefinitionBlocks: Leave +ShortNamespaceLines: 1 +SortIncludes: CaseSensitive +SortJavaStaticImport: Before +SortUsingDeclarations: LexicographicNumeric +SpaceAfterCStyleCast: false +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceAroundPointerQualifiers: Default +SpaceBeforeAssignmentOperators: true +SpaceBeforeCaseColon: false +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeJsonColon: false +SpaceBeforeParens: ControlStatements +SpaceBeforeParensOptions: + AfterControlStatements: true + AfterForeachMacros: true + AfterFunctionDefinitionName: false + AfterFunctionDeclarationName: false + AfterIfMacros: true + AfterOverloadedOperator: false + AfterRequiresInClause: false + AfterRequiresInExpression: false + BeforeNonEmptyParentheses: false +SpaceBeforeRangeBasedForLoopColon: true 
+SpaceBeforeSquareBrackets: false +SpaceInEmptyBlock: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: Never +SpacesInContainerLiterals: true +SpacesInLineCommentPrefix: + Minimum: 1 + Maximum: -1 +SpacesInParens: Never +SpacesInParensOptions: + InCStyleCasts: false + InConditionalStatements: false + InEmptyParentheses: false + Other: false +SpacesInSquareBrackets: false +Standard: Auto +StatementAttributeLikeMacros: + - Q_EMIT +StatementMacros: + - Q_UNUSED + - QT_REQUIRE_VERSION +TabWidth: 8 +UseTab: Never +VerilogBreakBetweenInstancePorts: true +WhitespaceSensitiveMacros: + - BOOST_PP_STRINGIZE + - CF_SWIFT_NAME + - NS_SWIFT_NAME + - PP_STRINGIZE + - STRINGIZE +... diff --git a/common/gsql/graphRAG/.clangd b/common/gsql/graphRAG/.clangd new file mode 100644 index 00000000..ec3be0d8 --- /dev/null +++ b/common/gsql/graphRAG/.clangd @@ -0,0 +1,2 @@ +CompileFlags: + Add: [ -std=c++23 ] diff --git a/common/gsql/graphRAG/ResolveRelationships.gsql b/common/gsql/graphRAG/ResolveRelationships.gsql new file mode 100644 index 00000000..d3c69297 --- /dev/null +++ b/common/gsql/graphRAG/ResolveRelationships.gsql @@ -0,0 +1,26 @@ +CREATE DISTRIBUTED QUERY ResolveRelationships(BOOL printResults=FALSE) SYNTAX V2 { + /* + * RE1 <- entity -RELATES-> entity -> RE2 + * to + * RE1 -resolved-> RE + * + * Combines all of a Resolved entity's children's relationships into + * RESOLVED_RELATIONSHIP + */ + REs = {ResolvedEntity.*}; + + + REs = SELECT re1 FROM REs:re1 -(:rel)- Entity:e_tgt -(RESOLVES_TO>:r)- ResolvedEntity:re2 + // Connect the The first RE to the second RE + ACCUM + INSERT INTO RESOLVED_RELATIONSHIP(FROM,TO) VALUES(re1, re2); + + + IF printResults THEN + // show which entities didn't get resolved + Ents = {Entity.*}; + rEnts = SELECT e FROM Ents:e -(RESOLVES_TO>)- _; + ents = Ents minus rEnts; + PRINT ents; + END; +} diff --git a/common/gsql/graphRAG/SetEpochProcessing.gsql b/common/gsql/graphRAG/SetEpochProcessing.gsql new file mode 100644 index 00000000..9a92ecf9 --- /dev/null +++ b/common/gsql/graphRAG/SetEpochProcessing.gsql @@ -0,0 +1,7 @@ +CREATE DISTRIBUTED QUERY SetEpochProcessing(Vertex v_id) { + Verts = {v_id}; + + // mark the vertex as processed + Verts = SELECT v FROM Verts:v + POST-ACCUM v.epoch_processed = datetime_to_epoch(now()); +} diff --git a/common/gsql/graphRAG/StreamIds.gsql b/common/gsql/graphRAG/StreamIds.gsql new file mode 100644 index 00000000..41181007 --- /dev/null +++ b/common/gsql/graphRAG/StreamIds.gsql @@ -0,0 +1,16 @@ +CREATE DISTRIBUTED QUERY StreamIds(INT current_batch, INT ttl_batches, STRING v_type) { + /* + * Get the IDs of entities that have not already been processed + * (one batch at a time) + */ + ListAccum @@ids; + Verts = {v_type}; + + Verts = SELECT v FROM Verts:v + WHERE vertex_to_int(v) % ttl_batches == current_batch + AND v.epoch_processed == 0 + ACCUM @@ids += v.id + POST-ACCUM v.epoch_processing = datetime_to_epoch(now()); // set the processing time + + PRINT @@ids; +} diff --git a/common/gsql/graphRAG/leven.cpp b/common/gsql/graphRAG/leven.cpp new file mode 100644 index 00000000..10c45669 --- /dev/null +++ b/common/gsql/graphRAG/leven.cpp @@ -0,0 +1,59 @@ +#include +#include + +// Returns the Levenshtein distance between word1 and word2. +int levenshteinDist(std::string word1, std::string word2) { + int size1 = word1.size(); + int size2 = word2.size(); + int verif[size1 + 1][size2 + 1]; // Verification matrix i.e. 2D array + // which will store the calculated distance. 
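+    // (Note: a variable-length array like this is a compiler extension, not standard C++.)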
+ + // If one of the words has zero length, the distance is equal to the size of + // the other word. + if (size1 == 0) return size2; + if (size2 == 0) return size1; + + // Sets the first row and the first column of the verification matrix with + // the numerical order from 0 to the length of each word. + for (int i = 0; i <= size1; i++) verif[i][0] = i; + for (int j = 0; j <= size2; j++) verif[0][j] = j; + + // Verification step / matrix filling. + for (int i = 1; i <= size1; i++) { + for (int j = 1; j <= size2; j++) { + // Sets the modification cost. + // 0 means no modification (i.e. equal letters) and 1 means that a + // modification is needed (i.e. unequal letters). + int cost = (word2[j - 1] == word1[i - 1]) ? 0 : 1; + + // Sets the current position of the matrix as the minimum value + // between a (deletion), b (insertion) and c (substitution). a = the + // upper adjacent value plus 1: verif[i - 1][j] + 1 b = the left + // adjacent value plus 1: verif[i][j - 1] + 1 c = the upper left + // adjacent value plus the modification cost: verif[i - 1][j - 1] + + // cost + verif[i][j] = + std::min(std::min(verif[i - 1][j] + 1, verif[i][j - 1] + 1), + verif[i - 1][j - 1] + cost); + } + } + + // The last position of the matrix will contain the Levenshtein distance. + return verif[size1][size2]; +} + +int main() { + std::string word1, word2; + + std::cout << "Please input the first word: " << std::endl; + std::cin >> word1; + std::cout << "Please input the second word: " << std::endl; + std::cin >> word2; + + // cout << "The number of modifications needed in order to make one word " + // "equal to the other is: " + std::cout << "The edit distance is: " << levenshteinDist(word1, word2) + << std::endl; + + return 0; +} diff --git a/common/gsql/graphRAG/louvain/louvain1.gsql b/common/gsql/graphRAG/louvain/louvain1.gsql new file mode 100644 index 00000000..494a3625 --- /dev/null +++ b/common/gsql/graphRAG/louvain/louvain1.gsql @@ -0,0 +1,17 @@ +CREATE DISTRIBUTED QUERY graphRAG_louvain_1() { + + Ents = {ResolvedEntity.*}; + + // Put each node into a distinct community + // Assume each Entity starts in its own community + + // For each node i + // Compute ∆Q (modularity) when putting node i into the community of some neighbor j + // move i to community that yields the largest gain in ∆Q + + Z = SELECT v FROM Ents:v -(_:e)-> ResolvedEntity:r + + + ; +} + diff --git a/common/gsql/graphRAG/louvain/louvain_1_first_pass.gsql b/common/gsql/graphRAG/louvain_old/louvain_1_first_pass.gsql similarity index 88% rename from common/gsql/graphRAG/louvain/louvain_1_first_pass.gsql rename to common/gsql/graphRAG/louvain_old/louvain_1_first_pass.gsql index 4ca06029..0251909f 100644 --- a/common/gsql/graphRAG/louvain/louvain_1_first_pass.gsql +++ b/common/gsql/graphRAG/louvain_old/louvain_1_first_pass.gsql @@ -2,20 +2,20 @@ CREATE OR REPLACE DISTRIBUTED QUERY graphRAG_louvain_1( UINT max_hop = 10, UINT batch_num = 12, UINT sample_edge_num = 100 -) FOR GRAPH {graph_name} SYNTAX v1 { +) { - TYPEDEF TUPLE community, STRING ext_vid> MyTuple; --> this should be Community, I think + TYPEDEF TUPLE community, STRING ext_vid> MyTuple; //--> this should be Community, I think SumAccum @@m; // the sum of the weights of all the links in the network - MinAccum> @{community_id_attribute_name}; // the community ID of the node + MinAccum> @{community_id_attribute_name}; // the community ID of the node MinAccum @community_vid; // the community ID of the node SumAccum @k; // the sum of the weights of the links incident to the node 
SumAccum @k_in; // the sum of the weights of the links inside the previous community of the node SumAccum @k_self_loop; // the weight of the self-loop link - MapAccum, SumAccum> @community_k_in_map; // the community of the neighbors of the nodes -> the sum of the weights of the links inside the community - MapAccum, SumAccum> @@community_sum_total_map; // community ID C -> the sum of the weights of the links incident to nodes in C + MapAccum, SumAccum> @community_k_in_map; // the community of the neighbors of the nodes -> the sum of the weights of the links inside the community + MapAccum, SumAccum> @@community_sum_total_map; // community ID C -> the sum of the weights of the links incident to nodes in C SumAccum @community_sum_total; // the sum of the weights of the links incident to nodes in the community of the node - MapAccum, SumAccum> @@community_sum_in_map; // community ID -> the sum of the weights of the links inside the community - MapAccum, MapAccum, SumAccum>> @@source_target_k_in_map; // source community ID -> (target community ID -> the sum of the weights of the links from the source community to the target community) + MapAccum, SumAccum> @@community_sum_in_map; // community ID -> the sum of the weights of the links inside the community + MapAccum, MapAccum, SumAccum>> @@source_target_k_in_map; // source community ID -> (target community ID -> the sum of the weights of the links from the source community to the target community) SumAccum @delta_Q_remove; // delta Q to remove the node from the previous community MaxAccum @best_move; // best move of the node with the highest delta Q to move the isolated node into the new community MaxAccum @@min_double; // used to reset the @best_move @@ -27,7 +27,7 @@ CREATE OR REPLACE DISTRIBUTED QUERY graphRAG_louvain_1( DOUBLE wt = 1.0; // Initialization - All_Nodes = {{{entity_vertex_name}.*}}; + All_Nodes = {{ResolvedEntity.*}}; All_Nodes = SELECT s FROM All_Nodes:s -({relation_edge_name}:e)- :t ACCUM @@m += wt / 2, s.@k += wt, diff --git a/common/gsql/graphRAG/louvain/louvain_2_other_passes.gsql b/common/gsql/graphRAG/louvain_old/louvain_2_other_passes.gsql similarity index 100% rename from common/gsql/graphRAG/louvain/louvain_2_other_passes.gsql rename to common/gsql/graphRAG/louvain_old/louvain_2_other_passes.gsql diff --git a/common/gsql/graphRAG/louvain/louvain_3_final_community.gsql b/common/gsql/graphRAG/louvain_old/louvain_3_final_community.gsql similarity index 100% rename from common/gsql/graphRAG/louvain/louvain_3_final_community.gsql rename to common/gsql/graphRAG/louvain_old/louvain_3_final_community.gsql diff --git a/common/gsql/graphRAG/louvain/louvain_4_modularity_1_for_pass.gsql b/common/gsql/graphRAG/louvain_old/louvain_4_modularity_1_for_pass.gsql similarity index 100% rename from common/gsql/graphRAG/louvain/louvain_4_modularity_1_for_pass.gsql rename to common/gsql/graphRAG/louvain_old/louvain_4_modularity_1_for_pass.gsql diff --git a/common/gsql/graphRAG/louvain/louvain_4_modularity_2_final.gsql b/common/gsql/graphRAG/louvain_old/louvain_4_modularity_2_final.gsql similarity index 100% rename from common/gsql/graphRAG/louvain/louvain_4_modularity_2_final.gsql rename to common/gsql/graphRAG/louvain_old/louvain_4_modularity_2_final.gsql diff --git a/common/gsql/graphRAG/louvain/louvain_5_reset.gsql b/common/gsql/graphRAG/louvain_old/louvain_5_reset.gsql similarity index 100% rename from common/gsql/graphRAG/louvain/louvain_5_reset.gsql rename to common/gsql/graphRAG/louvain_old/louvain_5_reset.gsql diff --git 
a/common/gsql/supportai/SupportAI_Schema.gsql b/common/gsql/supportai/SupportAI_Schema.gsql index 0e3cf6c3..1a705eaf 100644 --- a/common/gsql/supportai/SupportAI_Schema.gsql +++ b/common/gsql/supportai/SupportAI_Schema.gsql @@ -2,7 +2,7 @@ CREATE SCHEMA_CHANGE JOB add_supportai_schema { ADD VERTEX DocumentChunk(PRIMARY_ID id STRING, idx INT, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX Document(PRIMARY_ID id STRING, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX Concept(PRIMARY_ID id STRING, description STRING, concept_type STRING, human_curated BOOL, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; - ADD VERTEX Entity(PRIMARY_ID id STRING, definition STRING, description STRING, entity_type STRING, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; + ADD VERTEX Entity(PRIMARY_ID id STRING, definition STRING, description SET, entity_type STRING, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX Relationship(PRIMARY_ID id STRING, definition STRING, short_name STRING, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX DocumentCollection(PRIMARY_ID id STRING, epoch_added UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX Content(PRIMARY_ID id STRING, text STRING, epoch_added UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; @@ -21,7 +21,7 @@ CREATE SCHEMA_CHANGE JOB add_supportai_schema { // GraphRAG ADD VERTEX Community(PRIMARY_ID id STRING, description INT) WITH PRIMARY_ID_AS_ATTRIBUTE="true"; - ADD VERTEX ResolvedEntity(PRIMARY_ID id STRING, description STRING, entity_type STRING) WITH PRIMARY_ID_AS_ATTRIBUTE="true"; + ADD VERTEX ResolvedEntity(PRIMARY_ID id STRING, entity_type STRING) WITH PRIMARY_ID_AS_ATTRIBUTE="true"; ADD DIRECTED EDGE RELATIONSHIP(FROM Entity, TO Entity, relation_type STRING) WITH REVERSE_EDGE="reverse_RELATIONSHIP"; // TODO: check where knn algo writes results ADD DIRECTED EDGE RESOLVES_TO(FROM Entity, TO ResolvedEntity, relation_type STRING) WITH REVERSE_EDGE="reverse_RESOLVES_TO"; // Connect ResolvedEntities with their children entities diff --git a/copilot/docs/notebooks/graphrag.ipynb b/copilot/docs/notebooks/graphrag.ipynb index 38b4939b..bde1b78f 100644 --- a/copilot/docs/notebooks/graphrag.ipynb +++ b/copilot/docs/notebooks/graphrag.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -24,7 +24,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -33,7 +33,7 @@ "'The graph GraphRAG_pytgdocs is created.'" ] }, - "execution_count": 2, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -45,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -54,18 +54,18 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'host_name': 
'https://algotesting.i.tgcloud.io',\n", - " 'schema_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_schema].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_schema\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 0)\\\\nTrying to add local vertex \\'DocumentChunk\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Document\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Concept\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Entity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Relationship\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'DocumentCollection\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Content\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Community\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'ResolvedEntity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CONTENT\\' and its reverse edge \\'reverse_HAS_CONTENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_CHILD_OF\\' and its reverse edge \\'reverse_IS_CHILD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_HEAD_OF\\' and its reverse edge \\'reverse_IS_HEAD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_TAIL\\' and its reverse edge \\'reverse_HAS_TAIL\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_RELATIONSHIP\\' and its reverse edge \\'reverse_DESCRIBES_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_ENTITY\\' and its reverse edge \\'reverse_DESCRIBES_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_ENTITY\\' and its reverse edge \\'reverse_CONTAINS_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'MENTIONS_RELATIONSHIP\\' and its reverse edge \\'reverse_MENTIONS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_AFTER\\' and its reverse edge \\'reverse_IS_AFTER\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CHILD\\' and its reverse edge \\'reverse_HAS_CHILD\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_RELATIONSHIP\\' and its reverse edge \\'reverse_HAS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_DOCUMENT\\' and its reverse edge \\'reverse_CONTAINS_DOCUMENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RELATIONSHIP\\' and its reverse edge \\'reverse_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVES_TO\\' and its reverse edge \\'reverse_RESOLVES_TO\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVED_RELATIONSHIP\\' and its reverse edge \\'reverse_RESOLVED_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IN_COMMUNITY\\' and its reverse edge 
\\'reverse_IN_COMMUNITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 1\\\\nThe job add_supportai_schema completes in 2.335 seconds!\\\\nLocal schema change succeeded.\"',\n", - " 'index_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_indexes].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_indexes\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 1)\\\\nTrying to add index \\'doc_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_indexepoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 2\\\\nThe job add_supportai_indexes completes in 2.059 seconds!\\\\nLocal schema change succeeded.\"'}" + " 'schema_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_schema].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. 
To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_schema\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 0)\\\\nTrying to add local vertex \\'DocumentChunk\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Document\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Concept\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Entity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Relationship\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'DocumentCollection\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Content\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Community\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'ResolvedEntity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CONTENT\\' and its reverse edge \\'reverse_HAS_CONTENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_CHILD_OF\\' and its reverse edge \\'reverse_IS_CHILD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_HEAD_OF\\' and its reverse edge \\'reverse_IS_HEAD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_TAIL\\' and its reverse edge \\'reverse_HAS_TAIL\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_RELATIONSHIP\\' and its reverse edge \\'reverse_DESCRIBES_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_ENTITY\\' and its reverse edge \\'reverse_DESCRIBES_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_ENTITY\\' and its reverse edge \\'reverse_CONTAINS_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'MENTIONS_RELATIONSHIP\\' and its reverse edge \\'reverse_MENTIONS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_AFTER\\' and its reverse edge \\'reverse_IS_AFTER\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CHILD\\' and its reverse edge \\'reverse_HAS_CHILD\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_RELATIONSHIP\\' and its reverse edge \\'reverse_HAS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_DOCUMENT\\' and its reverse edge \\'reverse_CONTAINS_DOCUMENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RELATIONSHIP\\' and its reverse edge \\'reverse_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVES_TO\\' and its reverse edge \\'reverse_RESOLVES_TO\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVED_RELATIONSHIP\\' and its reverse edge \\'reverse_RESOLVED_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IN_COMMUNITY\\' and its reverse edge \\'reverse_IN_COMMUNITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 1\\\\nThe job add_supportai_schema completes in 2.208 seconds!\\\\nLocal schema change succeeded.\"',\n", + " 'index_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: 
[add_supportai_indexes].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_indexes\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 1)\\\\nTrying to add index \\'doc_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_indexepoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 2\\\\nThe job add_supportai_indexes completes in 3.025 seconds!\\\\nLocal schema change succeeded.\"'}" ] }, - "execution_count": 4, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -79,7 +79,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -95,18 +95,18 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'job_name': 'load_documents_content_json_a245f14bb5f443acaa051125e4d9a497',\n", - " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_a245f14bb5f443acaa051125e4d9a497.stream.SupportAI_GraphRAG_pytgdocs_025b08b3cf60477dbbcfd22b4254d268.1722356202522',\n", - " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_a245f14bb5f443acaa051125e4d9a497.stream.SupportAI_GraphRAG_pytgdocs_025b08b3cf60477dbbcfd22b4254d268.1722356202522'}" + "{'job_name': 'load_documents_content_json_b89acfebac9e4fb98efd20a49659808e',\n", + " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_b89acfebac9e4fb98efd20a49659808e.stream.SupportAI_GraphRAG_pytgdocs_5698bff74d844534901cba9e1b3d55bf.1722466964295',\n", + " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_b89acfebac9e4fb98efd20a49659808e.stream.SupportAI_GraphRAG_pytgdocs_5698bff74d844534901cba9e1b3d55bf.1722466964295'}" ] }, - 
"execution_count": 6, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -121,7 +121,41 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import httpx\n", + "import base64\n", + "\n", + "\n", + "def make_headers(conn: TigerGraphConnection):\n", + " tkn = base64.b64encode(f\"{conn.username}:{conn.password}\".encode()).decode()\n", + " headers = {\"Authorization\": f\"Basic {tkn}\"}\n", + " return headers\n", + "\n", + "\n", + "httpx.get(\n", + " \"http://localhost:8001/GraphRAG_pytgdocs/consistency_status/graphrag\",\n", + " headers=make_headers(conn),\n", + ")\n", + "# conn.ai.forceConsistencyUpdate()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -131,7 +165,7 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[7], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43masdf\u001b[49m\n", + "Cell \u001b[0;32mIn[16], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43masdf\u001b[49m\n", "\u001b[0;31mNameError\u001b[0m: name 'asdf' is not defined" ] } @@ -142,11 +176,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'job_name': 'load_documents_content_json_b89acfebac9e4fb98efd20a49659808e',\n", + " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_b89acfebac9e4fb98efd20a49659808e.stream.SupportAI_GraphRAG_pytgdocs_5698bff74d844534901cba9e1b3d55bf.1722531204658',\n", + " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_b89acfebac9e4fb98efd20a49659808e.stream.SupportAI_GraphRAG_pytgdocs_5698bff74d844534901cba9e1b3d55bf.1722531204658'}" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "for v in [\"Document\", \"Content\", \"DocumentChunk\",\"Entity\"]:\n", + "for v in [\"Document\", \"Content\", \"DocumentChunk\", \"Entity\",\"ResolvedEntity\"]:\n", + "# for v in [\"ResolvedEntity\"]:\n", " try:\n", " conn.delVertices(v)\n", " except:\n", @@ -170,10 +218,147 @@ "source": [ "conn.gsql(f\"\"\"\n", "USE GRAPH {conn.graphname}\n", - "DROP QUERY StreamDocIds\n", - "DROP QUERY StreamDocContent\n", + "DROP QUERY ResolveRelationships\n", "\"\"\")" ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'deleted_vertices'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[33], line 5\u001b[0m\n\u001b[1;32m 2\u001b[0m conn\u001b[38;5;241m.\u001b[39mgetToken()\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCommunity\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[1;32m 4\u001b[0m \u001b[38;5;66;03m# for v in [\"ResolvedEntity\"]:\u001b[39;00m\n\u001b[0;32m----> 5\u001b[0m 
\u001b[43mconn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdelVertices\u001b[49m\u001b[43m(\u001b[49m\u001b[43mv\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.venv/ml/lib/python3.11/site-packages/pyTigerGraph/pyTigerGraphVertex.py:688\u001b[0m, in \u001b[0;36mpyTigerGraphVertex.delVertices\u001b[0;34m(self, vertexType, where, limit, sort, permanent, timeout)\u001b[0m\n\u001b[1;32m 685\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mand\u001b[39;00m timeout \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 686\u001b[0m url \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m (\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m?\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m isFirst \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m&\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtimeout=\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(timeout)\n\u001b[0;32m--> 688\u001b[0m ret \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_delete\u001b[49m\u001b[43m(\u001b[49m\u001b[43murl\u001b[49m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdeleted_vertices\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[1;32m 690\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m logger\u001b[38;5;241m.\u001b[39mlevel \u001b[38;5;241m==\u001b[39m logging\u001b[38;5;241m.\u001b[39mDEBUG:\n\u001b[1;32m 691\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mreturn: \u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(ret))\n", + "\u001b[0;31mKeyError\u001b[0m: 'deleted_vertices'" + ] + } + ], + "source": [ + "conn.graphname = \"Cora\"\n", + "conn.getToken()\n", + "for v in [\"Community\"]:\n", + " # for v in [\"ResolvedEntity\"]:\n", + " conn.delVertices(v)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import base64\n", + "import json\n", + "import httpx\n", + "import logging\n", + "\n", + "_ = logging.getLogger(__name__)\n", + "\n", + "\n", + "http_timeout = None\n", + "\n", + "\n", + "def make_headers(conn: TigerGraphConnection):\n", + " if conn.apiToken is None or conn.apiToken == \"\":\n", + " tkn = base64.b64encode(f\"{conn.username}:{conn.password}\".encode()).decode()\n", + " headers = {\"Authorization\": f\"Basic {tkn}\"}\n", + " else:\n", + " headers = {\"Authorization\": f\"Bearer {conn.apiToken}\"}\n", + "\n", + " return headers\n", + "\n", + "\n", + "def check_vertex_exists(conn, id):\n", + " headers = make_headers(conn)\n", + " with httpx.Client(timeout=http_timeout) as client:\n", + " res = client.get(\n", + " f\"{conn.restppUrl}/graph/{conn.graphname}/vertices/Entity/{id}\",\n", + " headers=headers,\n", + " )\n", + "\n", + " res.raise_for_status()\n", + " return res.json()\n", + "\n", + "\n", + "# r = check_vertex_exists(conn, \"asdfTigergraphexception\")\n", + "# print(json.dumps(r, indent=2), r[\"error\"])\n", + "r = check_vertex_exists(conn, \"Tigergraphexception\")\n", + "print(json.dumps(r, indent=2), r[\"error\"])\n", + "r[\"results\"][0][\"attributes\"][\"description\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def 
map_attrs(attributes: dict):\n", + " # map attrs\n", + " attrs = {}\n", + " for k, v in attributes.items():\n", + " if isinstance(v, tuple):\n", + " attrs[k] = {\"value\": v[0], \"op\": v[1]}\n", + " elif isinstance(v, dict):\n", + " attrs[k] = {\n", + " \"value\": {\"keylist\": list(v.keys()), \"valuelist\": list(v.values())}\n", + " }\n", + " else:\n", + " attrs[k] = {\"value\": v}\n", + " return attrs\n", + "\n", + "\n", + "def process_id(v_id: str):\n", + " return v_id.replace(\" \", \"_\").replace(\"/\", \"\")\n", + "\n", + "\n", + "def a(vertex_id=\"Post /Requesttoken\"):\n", + " vertex_id = process_id(vertex_id)\n", + " attributes = { # attrs\n", + " \"description\": [\"test\"],\n", + " \"epoch_added\": int(time.time()),\n", + " }\n", + "\n", + " vertex_id = vertex_id.replace(\" \", \"_\")\n", + " attrs = map_attrs(attributes)\n", + " data = json.dumps({\"vertices\": {\"Entity\": {vertex_id: attrs}}})\n", + " headers = make_headers(conn)\n", + " with httpx.Client(timeout=http_timeout) as client:\n", + " res = client.post(\n", + " f\"{conn.restppUrl}/graph/{conn.graphname}\", data=data, headers=headers\n", + " )\n", + "\n", + " res.raise_for_status()\n", + "\n", + " return res.json()\n", + "\n", + "\n", + "a()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from urllib import parse\n", + "\n", + "v_id = \"Post_/Requesttoken\"\n", + "v_id = process_id(v_id)\n", + "print(v_id)\n", + "\n", + "r = check_vertex_exists(conn, v_id)\n", + "print(json.dumps(r, indent=2), r[\"error\"])\n", + "r[\"results\"][0][\"attributes\"][\"description\"]" + ] } ], "metadata": { diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py index 7e67b342..4403756d 100644 --- a/eventual-consistency-service/app/graphrag/graph_rag.py +++ b/eventual-consistency-service/app/graphrag/graph_rag.py @@ -9,7 +9,7 @@ from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore from common.extractors.BaseExtractor import BaseExtractor from graphrag import workers -from graphrag.util import http_timeout, init, make_headers, stream_doc_ids +from graphrag.util import http_timeout, init, make_headers, stream_ids from pyTigerGraph import TigerGraphConnection http_logs = logging.getLogger("httpx") @@ -29,15 +29,15 @@ async def stream_docs( """ logger.info("streaming docs") headers = make_headers(conn) - for i in range(ttl_batches): - doc_ids = await stream_doc_ids(conn, i, ttl_batches) - if doc_ids["error"]: - # continue to the next batch. - # These docs will not be marked as processed, so the ecc will process it eventually. - continue + async with httpx.AsyncClient(timeout=http_timeout) as client: + for i in range(ttl_batches): + doc_ids = await stream_ids(conn, "Document", i, ttl_batches) + if doc_ids["error"]: + # continue to the next batch. + # These docs will not be marked as processed, so the ecc will process it eventually. 
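+                # (Skipped IDs keep epoch_processed == 0, so StreamIds will
+                # return them again on a later consistency-check run.)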
+ continue - for d in doc_ids["ids"]: - async with httpx.AsyncClient(timeout=http_timeout) as client: + for d in doc_ids["ids"]: try: res = await client.get( f"{conn.restppUrl}/query/{conn.graphname}/StreamDocContent/", @@ -104,19 +104,13 @@ async def upsert(upsert_chan: Channel): logger.info("Reading from upsert channel") # consume task queue - upsert_tasks = [] async with asyncio.TaskGroup() as grp: async for func, args in upsert_chan: logger.info(f"{func.__name__}, {args[1]}") - # continue # execute the task - t = grp.create_task(func(*args)) - upsert_tasks.append(t) + grp.create_task(func(*args)) logger.info(f"upsert done") - # do something with doc_tasks? - # for t in upsert_tasks: - # logger.info(t.result()) async def embed( @@ -131,7 +125,6 @@ async def embed( async with asyncio.TaskGroup() as grp: # consume task queue async for v_id, content, index_name in embed_chan: - # continue embedding_store = index_stores[f"{graphname}_{index_name}"] logger.info(f"Embed to {graphname}_{index_name}: {v_id}") grp.create_task( @@ -173,49 +166,136 @@ async def extract( embed_chan.close() -async def run(graphname: str, conn: TigerGraphConnection): +async def stream_entities( + conn: TigerGraphConnection, + entity_chan: Channel, + ttl_batches: int = 50, +): + """ + Streams entity IDs from the grpah """ - ecc flow + logger.info("streaming entities") + for i in range(ttl_batches): + ids = await stream_ids(conn, "Entity", i, ttl_batches) + if ids["error"]: + # continue to the next batch. + # These docs will not be marked as processed, so the ecc will process it eventually. + continue - initialize_eventual_consistency_checker - instantiates ecc object - writes checker to checker dict - runs ecc_obj.initialize() + for i in ids["ids"]: + if len(i) > 0: + await entity_chan.put(i) + # break + # break # one batch + + logger.info("stream_enities done") + # close the docs chan -- this function is the only sender + logger.info("closing entities chan") + entity_chan.close() - ECC.initialize - loops and calls fetch and process +async def resolve_entities( + conn: TigerGraphConnection, + emb_store: MilvusEmbeddingStore, + entity_chan: Channel, + upsert_chan: Channel, +): """ + Merges entities into their ResolvedEntity form + Groups what should be the same entity into a resolved entity (e.g. 
V_type and VType should be merged) - extractor, index_stores = await init(conn) - # return - start = time.perf_counter() - - tasks = [] - docs_chan = Channel(1) - embed_chan = Channel(100) - upsert_chan = Channel(100) - extract_chan = Channel(100) + Copies edges between entities to their respective ResolvedEntities + """ async with asyncio.TaskGroup() as grp: - # get docs - t = grp.create_task(stream_docs(conn, docs_chan, 10)) - tasks.append(t) - # process docs - t = grp.create_task( - chunk_docs(conn, docs_chan, embed_chan, upsert_chan, extract_chan) - ) - tasks.append(t) - # upsert chunks - t = grp.create_task(upsert(upsert_chan)) - tasks.append(t) - # # embed - t = grp.create_task(embed(embed_chan, index_stores, graphname)) - tasks.append(t) - # extract entities - t = grp.create_task( - extract(extract_chan, upsert_chan, embed_chan, extractor, conn) + # for every entity + async for entity_id in entity_chan: + print(f"***Etity ID from chan {entity_id}") + grp.create_task( + workers.resolve_entity(conn, upsert_chan, emb_store, entity_id) + ) + logger.info("closing upsert_chan") + upsert_chan.close() + + # Copy RELATIONSHIP edges to RESOLVED_RELATIONSHIP + headers = make_headers(conn) + async with httpx.AsyncClient(timeout=http_timeout) as client: + res = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/ResolveRelationships/", + headers=headers, ) - tasks.append(t) - end = time.perf_counter() + res.raise_for_status() + + +async def communities(conn: TigerGraphConnection): + pass + # Setup + + +async def run(graphname: str, conn: TigerGraphConnection): + """ + Set up GraphRAG: + - Install necessary queries. + - Process the documents into: + - chunks + - embeddings + - entities/relationships (and their embeddings) + - upsert everything to the graph + """ + + extractor, index_stores = await init(conn) + init_start = time.perf_counter() + + if False: + docs_chan = Channel(1) + embed_chan = Channel(100) + upsert_chan = Channel(100) + extract_chan = Channel(100) + async with asyncio.TaskGroup() as grp: + # get docs + grp.create_task(stream_docs(conn, docs_chan, 10)) + # process docs + grp.create_task( + chunk_docs(conn, docs_chan, embed_chan, upsert_chan, extract_chan) + ) + # upsert chunks + grp.create_task(upsert(upsert_chan)) + # embed + grp.create_task(embed(embed_chan, index_stores, graphname)) + # extract entities + grp.create_task( + extract(extract_chan, upsert_chan, embed_chan, extractor, conn) + ) + init_end = time.perf_counter() + + # Entity Resolution + entity_start = time.perf_counter() + + if False: + entities_chan = Channel(100) + upsert_chan = Channel(100) + async with asyncio.TaskGroup() as grp: + grp.create_task(stream_entities(conn, entities_chan, 50)) + grp.create_task( + resolve_entities( + conn, + index_stores[f"{conn.graphname}_Entity"], + entities_chan, + upsert_chan, + ) + ) + grp.create_task(upsert(upsert_chan)) + entity_end = time.perf_counter() - logger.info(f"DONE. graphrag.run elapsed: {end-start}") + # Community Detection + community_start = time.perf_counter() + if True: + await communities(conn) + + community_end = time.perf_counter() + + # Community Summarization + end = time.perf_counter() + logger.info(f"DONE. graphrag system initializer dT: {init_end-init_start}") + logger.info(f"DONE. graphrag entity resolution dT: {entity_end-entity_start}") + logger.info(f"DONE. graphrag initializer dT: {community_end-community_start}") + logger.info(f"DONE. 
graphrag.run() total time elaplsed: {end-init_start}") diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index 8f2c2141..74dbc56d 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -3,6 +3,7 @@ import json import logging import traceback +from glob import glob import httpx from common.config import ( @@ -42,6 +43,7 @@ async def install_queries( for t in tasks: res = t.result() + print(res) # stop system if a required query doesn't install if res["error"]: raise Exception(res["message"]) @@ -63,9 +65,14 @@ async def init( # "common/gsql/supportai/Update_Vertices_Processing_Status", # "common/gsql/supportai/ECC_Status", # "common/gsql/supportai/Check_Nonexistent_Vertices", - "common/gsql/graphRAG/StreamDocIds", + "common/gsql/graphRAG/StreamIds", "common/gsql/graphRAG/StreamDocContent", + "common/gsql/graphRAG/SetEpochProcessing", + "common/gsql/graphRAG/ResolveRelationships", ] + # add louvain to queries + q = [x.split('.gsql')[0] for x in glob("common/gsql/graphRAG/louvain/*")] + requried_queries.extend(q) await install_queries(requried_queries, conn) # extractor @@ -101,13 +108,14 @@ async def init( vector_field=milvus_config.get("vector_field", "document_vector"), text_field=milvus_config.get("text_field", "document_content"), vertex_field=vertex_field, + drop_old=False, ) LogWriter.info(f"Initializing {name}") # init collection if it doesn't exist if not s.check_collection_exists(): tg.create_task(init_embedding_index(s, vertex_field)) - + index_stores[name] = s return extractor, index_stores @@ -123,29 +131,28 @@ def make_headers(conn: TigerGraphConnection): return headers -async def stream_doc_ids( - conn: TigerGraphConnection, current_batch: int, ttl_batches: int +async def stream_ids( + conn: TigerGraphConnection, v_type: str, current_batch: int, ttl_batches: int ) -> dict[str, str | list[str]]: headers = make_headers(conn) try: async with httpx.AsyncClient(timeout=http_timeout) as client: res = await client.post( - f"{conn.restppUrl}/query/{conn.graphname}/StreamDocIds", + f"{conn.restppUrl}/query/{conn.graphname}/StreamIds", params={ "current_batch": current_batch, "ttl_batches": ttl_batches, + "v_type": v_type, }, headers=headers, ) - ids = res.json()["results"][0]["@@doc_ids"] + ids = res.json()["results"][0]["@@ids"] return {"error": False, "ids": ids} except Exception as e: exc = traceback.format_exc() - LogWriter.error( - f"/{conn.graphname}/query/StreamDocIds\nException Trace:\n{exc}" - ) + LogWriter.error(f"/{conn.graphname}/query/StreamIds\nException Trace:\n{exc}") return {"error": True, "message": str(e)} @@ -165,16 +172,24 @@ def map_attrs(attributes: dict): return attrs +def process_id(v_id: str): + v_id = v_id.replace(" ", "_").replace("/", "") + if v_id == "''" or v_id == '""': + return "" + + return v_id + + async def upsert_vertex( conn: TigerGraphConnection, vertex_type: str, vertex_id: str, attributes: dict, ): + vertex_id = vertex_id.replace(" ", "_") attrs = map_attrs(attributes) data = json.dumps({"vertices": {vertex_type: {vertex_id: attrs}}}) headers = make_headers(conn) - # print("upsert vertex>>>", vertex_id) async with httpx.AsyncClient(timeout=http_timeout) as client: res = await client.post( f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers @@ -183,6 +198,18 @@ async def upsert_vertex( res.raise_for_status() +async def check_vertex_exists(conn, v_id: str): + headers = make_headers(conn) + async with 
httpx.AsyncClient(timeout=http_timeout) as client: + res = await client.get( + f"{conn.restppUrl}/graph/{conn.graphname}/vertices/Entity/{v_id}", + headers=headers, + ) + + res.raise_for_status() + return res.json() + + async def upsert_edge( conn: TigerGraphConnection, src_v_type: str, @@ -196,6 +223,8 @@ async def upsert_edge( attrs = {} else: attrs = map_attrs(attributes) + src_v_id = src_v_id.replace(" ", "_") + tgt_v_id = tgt_v_id.replace(" ", "_") data = json.dumps( { "edges": { @@ -212,7 +241,6 @@ async def upsert_edge( } ) headers = make_headers(conn) - # print("upsert edge >>>", src_v_id, tgt_v_id) async with httpx.AsyncClient(timeout=http_timeout) as client: res = await client.post( f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers diff --git a/eventual-consistency-service/app/graphrag/workers.py b/eventual-consistency-service/app/graphrag/workers.py index b7267b60..4c1174df 100644 --- a/eventual-consistency-service/app/graphrag/workers.py +++ b/eventual-consistency-service/app/graphrag/workers.py @@ -11,8 +11,8 @@ from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore from common.extractors.BaseExtractor import BaseExtractor from common.logs.logwriter import LogWriter -from graphrag import util # import upsert_edge, upsert_vertex -from langchain_community.graphs.graph_document import GraphDocument +from graphrag import util +from langchain_community.graphs.graph_document import GraphDocument, Node from pyTigerGraph import TigerGraphConnection vertex_field = milvus_config.get("vertex_field", "vertex_id") @@ -67,7 +67,7 @@ async def chunk_doc( """ chunker = ecc_util.get_chunker() chunks = chunker.chunk(doc["attributes"]["text"]) - v_id = doc["v_id"] + v_id = util.process_id(doc["v_id"]) logger.info(f"Chunking {v_id}") for i, chunk in enumerate(chunks): chunk_id = f"{v_id}_chunk_{i}" @@ -145,6 +145,17 @@ async def embed( await embed_store.aadd_embeddings([(content, vec)], [{vertex_field: v_id}]) +async def get_vert_desc(conn, v_id, node: Node): + desc = [node.properties.get("description", "")] + exists = await util.check_vertex_exists(conn, v_id) + # if vertex exists, get description content and append this description to it + if not exists["error"]: + # dedup descriptions + desc.extend(exists["results"][0]["attributes"]["description"]) + desc = list(set(desc)) + return desc + + async def extract( upsert_chan: Channel, embed_chan: Channel, @@ -159,12 +170,22 @@ async def extract( for doc in extracted: for node in doc.nodes: logger.info(f"extract writes entity vert to upsert\nNode: {node.id}") - v_id = str(node.id) - desc = node.properties.get("description", "") + v_id = util.process_id(str(node.id)) + if len(v_id) == 0: + continue + desc = await get_vert_desc(conn, v_id, node) + + # embed the entity + # embed with the v_id if the description is blank + if len(desc[0]): + await embed_chan.put((v_id, v_id, "Entity")) + else: + # (v_id, content, index_name) + await embed_chan.put((v_id, desc[0], "Entity")) + await upsert_chan.put( ( util.upsert_vertex, # func to call - # conn, v_id, chunk_id, chunk ( conn, "Entity", # v_type @@ -188,33 +209,134 @@ async def extract( chunk_id, # src_id "CONTAINS_ENTITY", # edge_type "Entity", # tgt_type - str(node.id), # tgt_id + v_id, # tgt_id None, # attributes ), ) ) - # embed the entity - # (v_id, content, index_name) - await embed_chan.put((v_id, desc, "Entity")) - for edge in doc.relationships: logger.info( f"extract writes relates edge to upsert\n{edge.source.id} -({edge.type})-> {edge.target.id}" ) + # 
upsert verts first to make sure their ID becomes an attr + v_id = util.process_id(edge.source.id) # src_id + if len(v_id) == 0: + continue + desc = await get_vert_desc(conn, v_id, edge.source) + await upsert_chan.put( + ( + util.upsert_vertex, # func to call + ( + conn, + "Entity", # v_type + v_id, + { # attrs + "description": desc, + "epoch_added": int(time.time()), + }, + ), + ) + ) + v_id = util.process_id(edge.target.id) + if len(v_id) == 0: + continue + desc = await get_vert_desc(conn, v_id, edge.target) + await upsert_chan.put( + ( + util.upsert_vertex, # func to call + ( + conn, + "Entity", # v_type + v_id, # src_id + { # attrs + "description": desc, + "epoch_added": int(time.time()), + }, + ), + ) + ) + + # upsert the edge between the two entities await upsert_chan.put( ( util.upsert_edge, ( conn, "Entity", # src_type - edge.source.id, # src_id + util.process_id(edge.source.id), # src_id "RELATIONSHIP", # edgeType "Entity", # tgt_type - edge.target.id, # tgt_id + util.process_id(edge.target.id), # tgt_id {"relation_type": edge.type}, # attributes ), ) ) # embed "Relationship", # (v_id, content, index_name) + + +async def resolve_entity( + conn: TigerGraphConnection, + upsert_chan: Channel, + emb_store: MilvusEmbeddingStore, + entity_id: str, +): + """ + get all vectors of E (one name can have multiple discriptions) + get ents close to E + for e in ents: + if e is 95% similar to E and edit_dist(E,e) <=3: + merge + mark e as processed + + mark as processed + """ + results = await emb_store.aget_k_closest(entity_id) + if len(results) == 0: + logger.error( + f"aget_k_closest should, minimally, return the entity itself.\n{results}" + ) + raise Exception() + if entity_id == "Dataframe": + print("result:", entity_id, results) + + # merge all entities into the ResolvedEntity vertex + # use the longest v_id as the resolved entity's v_id + resolved_entity_id = "" + for v in results: + # v_id = v.metadata["vertex_id"] + if len(v) > len(resolved_entity_id): + resolved_entity_id = v + + # upsert the resolved entity + await upsert_chan.put( + ( + util.upsert_vertex, # func to call + ( + conn, + "ResolvedEntity", # v_type + resolved_entity_id, # v_id + { # attrs + "description": [] + }, + ), + ) + ) + + # create RESOLVES_TO edges from each entity to the ResolvedEntity + for v in results: + await upsert_chan.put( + ( + util.upsert_edge, + ( + conn, + "Entity", # src_type + v, # src_id + "RESOLVES_TO", # edge_type + "ResolvedEntity", # tgt_type + resolved_entity_id, # tgt_id + None, # attributes + ), + ) + ) From 8ab8774cc160445a1602c18ddf2b9e7bc1b87a35 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Fri, 9 Aug 2024 18:47:13 -0400 Subject: [PATCH 09/91] starting to upsert community summaries --- common/embeddings/embedding_services.py | 2 +- .../gsql/graphRAG/ResolveRelationships.gsql | 2 +- .../gsql/graphRAG/get_community_children.gsql | 12 ++ common/gsql/graphRAG/leven.cpp | 59 ------ .../louvain/graphrag_louvain_communities.gsql | 199 ++++++++++++++++++ .../louvain/graphrag_louvain_init.gsql | 185 ++++++++++++++++ common/gsql/graphRAG/louvain/louvain1.gsql | 17 -- common/gsql/graphRAG/louvain/modularity.gsql | 49 +++++ .../graphRAG/louvain/stream_community.gsql | 9 + common/gsql/supportai/SupportAI_Schema.gsql | 14 +- common/py_schemas/tool_io_schemas.py | 25 ++- copilot/docs/notebooks/graphrag.ipynb | 127 +++++------ eventual-consistency-service/app/ecc_util.py | 33 ++- .../app/graphrag/community_summarizer.py | 138 ++++++++++++ 
.../app/graphrag/graph_rag.py | 158 ++++++++++++-- .../app/graphrag/util.py | 63 ++++-- .../app/graphrag/workers.py | 63 +++++- eventual-consistency-service/requirements.txt | 34 +-- 18 files changed, 968 insertions(+), 221 deletions(-) create mode 100644 common/gsql/graphRAG/get_community_children.gsql delete mode 100644 common/gsql/graphRAG/leven.cpp create mode 100644 common/gsql/graphRAG/louvain/graphrag_louvain_communities.gsql create mode 100644 common/gsql/graphRAG/louvain/graphrag_louvain_init.gsql delete mode 100644 common/gsql/graphRAG/louvain/louvain1.gsql create mode 100644 common/gsql/graphRAG/louvain/modularity.gsql create mode 100644 common/gsql/graphRAG/louvain/stream_community.gsql create mode 100644 eventual-consistency-service/app/graphrag/community_summarizer.py diff --git a/common/embeddings/embedding_services.py b/common/embeddings/embedding_services.py index dd506670..13c2cfd0 100644 --- a/common/embeddings/embedding_services.py +++ b/common/embeddings/embedding_services.py @@ -134,7 +134,7 @@ def __init__(self, config): super().__init__( config, model_name=config.get("model_name", "OpenAI gpt-4-0613") ) - from langchain.embeddings import OpenAIEmbeddings + from langchain_openai import OpenAIEmbeddings self.embeddings = OpenAIEmbeddings() diff --git a/common/gsql/graphRAG/ResolveRelationships.gsql b/common/gsql/graphRAG/ResolveRelationships.gsql index d3c69297..6a0e515d 100644 --- a/common/gsql/graphRAG/ResolveRelationships.gsql +++ b/common/gsql/graphRAG/ResolveRelationships.gsql @@ -13,7 +13,7 @@ CREATE DISTRIBUTED QUERY ResolveRelationships(BOOL printResults=FALSE) SYNTAX V2 REs = SELECT re1 FROM REs:re1 -(:rel)- Entity:e_tgt -(RESOLVES_TO>:r)- ResolvedEntity:re2 // Connect the The first RE to the second RE ACCUM - INSERT INTO RESOLVED_RELATIONSHIP(FROM,TO) VALUES(re1, re2); + INSERT INTO RESOLVED_RELATIONSHIP(FROM,TO, relation_type) VALUES(re1, re2, rel.relation_type); IF printResults THEN diff --git a/common/gsql/graphRAG/get_community_children.gsql b/common/gsql/graphRAG/get_community_children.gsql new file mode 100644 index 00000000..7913e1b7 --- /dev/null +++ b/common/gsql/graphRAG/get_community_children.gsql @@ -0,0 +1,12 @@ +CREATE DISTRIBUTED QUERY get_community_children(Vertex comm, UINT iter) SYNTAX V2{ + Comms = {comm}; + + IF iter > 1 THEN + Comms = SELECT t FROM Comms:c -()- ResolvedEntity -(_>)- Entity:t; + + PRINT Ents[Ents.description as description] as children; + END; +} diff --git a/common/gsql/graphRAG/leven.cpp b/common/gsql/graphRAG/leven.cpp deleted file mode 100644 index 10c45669..00000000 --- a/common/gsql/graphRAG/leven.cpp +++ /dev/null @@ -1,59 +0,0 @@ -#include -#include - -// Returns the Levenshtein distance between word1 and word2. -int levenshteinDist(std::string word1, std::string word2) { - int size1 = word1.size(); - int size2 = word2.size(); - int verif[size1 + 1][size2 + 1]; // Verification matrix i.e. 2D array - // which will store the calculated distance. - - // If one of the words has zero length, the distance is equal to the size of - // the other word. - if (size1 == 0) return size2; - if (size2 == 0) return size1; - - // Sets the first row and the first column of the verification matrix with - // the numerical order from 0 to the length of each word. - for (int i = 0; i <= size1; i++) verif[i][0] = i; - for (int j = 0; j <= size2; j++) verif[0][j] = j; - - // Verification step / matrix filling. - for (int i = 1; i <= size1; i++) { - for (int j = 1; j <= size2; j++) { - // Sets the modification cost. 
- // 0 means no modification (i.e. equal letters) and 1 means that a - // modification is needed (i.e. unequal letters). - int cost = (word2[j - 1] == word1[i - 1]) ? 0 : 1; - - // Sets the current position of the matrix as the minimum value - // between a (deletion), b (insertion) and c (substitution). a = the - // upper adjacent value plus 1: verif[i - 1][j] + 1 b = the left - // adjacent value plus 1: verif[i][j - 1] + 1 c = the upper left - // adjacent value plus the modification cost: verif[i - 1][j - 1] + - // cost - verif[i][j] = - std::min(std::min(verif[i - 1][j] + 1, verif[i][j - 1] + 1), - verif[i - 1][j - 1] + cost); - } - } - - // The last position of the matrix will contain the Levenshtein distance. - return verif[size1][size2]; -} - -int main() { - std::string word1, word2; - - std::cout << "Please input the first word: " << std::endl; - std::cin >> word1; - std::cout << "Please input the second word: " << std::endl; - std::cin >> word2; - - // cout << "The number of modifications needed in order to make one word " - // "equal to the other is: " - std::cout << "The edit distance is: " << levenshteinDist(word1, word2) - << std::endl; - - return 0; -} diff --git a/common/gsql/graphRAG/louvain/graphrag_louvain_communities.gsql b/common/gsql/graphRAG/louvain/graphrag_louvain_communities.gsql new file mode 100644 index 00000000..366b7ea7 --- /dev/null +++ b/common/gsql/graphRAG/louvain/graphrag_louvain_communities.gsql @@ -0,0 +1,199 @@ +CREATE DISTRIBUTED QUERY graphrag_louvain_communities(UINT iteration=1, UINT max_hop = 10, UINT n_batches = 1) SYNTAX V2{ + /* + * This is the same query as tg_louvain, just that Paper-related schema + * are changed to Community-related schema + * + * For the first call to this query, iteration = 1 + */ + TYPEDEF TUPLE community, STRING ext_vid> Move; + SumAccum @@m; // the sum of the weights of all the links in the network + MinAccum> @community_id; // the community ID of the node + MinAccum @community_vid; // the community ID of the node + SumAccum @k; // the sum of the weights of the links incident to the node + SumAccum @k_in; // the sum of the weights of the links inside the previous community of the node + SumAccum @k_self_loop; // the weight of the self-loop link + MapAccum, SumAccum> @community_k_in_map; // the community of the neighbors of the nodes -> the sum of the weights of the links inside the community + MapAccum, SumAccum> @@community_sum_total_map; // community ID C -> the sum of the weights of the links incident to nodes in C + SumAccum @community_sum_total; // the sum of the weights of the links incident to nodes in the community of the node + MapAccum, SumAccum> @@community_sum_in_map; // community ID -> the sum of the weights of the links inside the community + MapAccum>> @@source_target_k_in_map; // source community ID -> (target community ID -> the sum of the weights of the links from the source community to the target community) + SumAccum @delta_Q_remove; // delta Q to remove the node from the previous community + MaxAccum @best_move; // best move of the node with the highest delta Q to move the isolated node into the new community + MaxAccum @@min_double; // used to reset the @best_move + SumAccum @@move_cnt; + OrAccum @to_change_community, @is_current_iter, @has_parent; + SumAccum @batch_id; + MinAccum @vid; + + AllNodes = {Community.*}; + + // Get communities of the current iteration + AllNodes = SELECT s FROM AllNodes:s + WHERE s.iteration == iteration + ACCUM s.@is_current_iter += TRUE; + + // init + z = SELECT s 
FROM AllNodes:s -(_>:e)- Community:t + WHERE s.@is_current_iter AND t.@is_current_iter + ACCUM s.@k += e.weight, + @@m += e.weight/2, + IF s == t THEN // self loop + s.@k_self_loop += e.weight + END + POST-ACCUM + s.@community_id = s, // assign node to its own community + s.@community_vid = to_string(s.id), // external id + s.@vid = getvid(s), // internal id (used in batching) + s.@batch_id = s.@vid % n_batches; // get batch number + + IF @@m < 0.00000000001 THEN + PRINT "Warning: the sum of the weights in the edges should be greater than zero!"; + RETURN; + END; + + // Local moving + INT hop = 0; + Candidates = AllNodes; + WHILE Candidates.size() > 0 AND hop < max_hop DO + hop += 1; + IF hop == 1 THEN // first iteration + ChangedNodes = SELECT s FROM Candidates:s -(_>:e)- Community:t + WHERE s.@community_id != t.@community_id // can't move within the same community + AND s.@is_current_iter AND t.@is_current_iter // only use Communities in the current iteration + ACCUM + DOUBLE dq = 1 - s.@k * t.@k / (2 * @@m), + s.@best_move += Move(dq, t.@community_id, t.@community_vid) // find the best move + POST-ACCUM + IF s.@best_move.delta_q > 0 THEN // if the move increases dq + s.@to_change_community += TRUE + END + HAVING s.@to_change_community == TRUE; // only select nodes that will move + ELSE // other iterations + // Calculate sum_total of links in each community + Tmp = SELECT s FROM AllNodes:s + POST-ACCUM + @@community_sum_total_map += (s.@community_id -> s.@k); + // store community's total edges in each vert (easier access) + Tmp = SELECT s FROM AllNodes:s + POST-ACCUM + s.@community_sum_total = @@community_sum_total_map.get(s.@community_id); + @@community_sum_total_map.clear(); + + // find the best move + ChangedNodes = {}; + + // process nodes in batch + FOREACH batch_id IN RANGE[0, n_batches-1] DO + Nodes = SELECT s FROM Candidates:s -(_>:e)- Community:t + WHERE s.@batch_id == batch_id + AND s.@is_current_iter AND t.@is_current_iter // only use Communities in the current iteration + ACCUM + IF s.@community_id == t.@community_id THEN + // add edge weights connected to s + s.@k_in += e.weight + ELSE + // add edge weights connecetd to t + s.@community_k_in_map += (t.@community_id -> e.weight) + END + POST-ACCUM + // ∆Q if s is moved out of its current community + s.@delta_Q_remove = 2 * s.@k_self_loop - 2 * s.@k_in + s.@k * (s.@community_sum_total - s.@k) / @@m, + s.@k_in = 0, + s.@best_move = Move(@@min_double, s, to_string(s.id)); // reset best move + + // find the best move + Nodes = SELECT s FROM Nodes:s -(_>:E)- Community:t + WHERE s.@community_id != t.@community_id + AND s.@is_current_iter AND t.@is_current_iter // only use Communities in the current iteration + ACCUM + DOUBLE dq = 2 * s.@community_k_in_map.get(t.@community_id) - s.@k * t.@community_sum_total / @@m, + s.@best_move += Move(dq, t.@community_id, t.@community_vid) // find the best move + POST-ACCUM + IF s.@delta_Q_remove + s.@best_move.delta_q > 0 THEN // if the move increases dq + s.@to_change_community = TRUE// s should move + END, + s.@community_k_in_map.clear() + HAVING s.@to_change_community == TRUE; // only select nodes that will move + + // Add nodes that will move to ChangedNodes + ChangedNodes = ChangedNodes UNION Nodes; + END; + END; + // If two nodes swap, only change the community of one of them + SwapNodes = SELECT s FROM ChangedNodes:s -(_>:e)- Community:t + WHERE s.@best_move.community == t.@community_id + AND s.@is_current_iter AND t.@is_current_iter // only use Communities in the current iteration + AND 
t.@to_change_community + AND t.@best_move.community == s.@community_id + // if delta Q are the same, only change the one with larger delta Q or the one with smaller @vid + AND ( + s.@delta_Q_remove + s.@best_move.delta_q < t.@delta_Q_remove + t.@best_move.delta_q + OR ( + abs( + (s.@delta_Q_remove + s.@best_move.delta_q) + - (t.@delta_Q_remove + t.@best_move.delta_q) + ) < 0.00000000001 + AND s.@vid > t.@vid + ) + ) + POST-ACCUM + s.@to_change_community = FALSE; + + // remove SwapNodes (don't need to be changed) + ChangedNodes = ChangedNodes MINUS SwapNodes; + + // Update node communities (based on max ∆Q) + SwapNodes = SELECT s FROM ChangedNodes:s + POST-ACCUM + s.@community_id = s.@best_move.community, // move the node + s.@community_vid = s.@best_move.ext_vid, // move the node (external v_id update) + s.@to_change_community = FALSE; + @@move_cnt += ChangedNodes.size(); + + // Get all neighbours of the changed node that do not belong to the node’s new community + Candidates = SELECT t FROM ChangedNodes:s -(_>:e)- Community:t + WHERE t.@community_id != s.@community_id + AND s.@is_current_iter AND t.@is_current_iter; // only use Communities in the current iteration + END; + + // Coarsening + @@community_sum_total_map.clear(); + Tmp = SELECT s FROM AllNodes:s -(_>:e)- Community:t + WHERE s.@is_current_iter AND t.@is_current_iter // only use Communities in the current iteration + ACCUM + IF s.@community_id == t.@community_id THEN + // keep track of how many edges are within the community + @@community_sum_in_map += (s.@community_id -> e.weight) + ELSE + // get LINKS_TO edge weights (how many edges are between communities) + // s.@community_k_in_map += (t.@community_id -> 1) + @@source_target_k_in_map += (s.@community_vid -> (t.@community_vid -> e.weight)) + END, + t.@has_parent += TRUE // Used to help find unattached partitions + POST-ACCUM + // Write the results to a new community vertex (iteration + 1) + // ID , iter, edges within the community + INSERT INTO Community VALUES (s.id+"_"+to_string(iteration+1), iteration+1, s.k_in + @@community_sum_in_map.get(s.@community_id), ""), + INSERT INTO HAS_PARENT VALUES (s, s.@community_vid+"_"+to_string(iteration+1)) // link Community's child/parent community + ; + + // Continue community hierarchy for unattached partitions + Tmp = SELECT s FROM AllNodes:s + WHERE s.@is_current_iter + AND NOT s.@has_parent + POST-ACCUM + // if s is a part of an unattached partition, add to its community hierarchy to maintain parity with rest of graph + INSERT INTO Community VALUES (s.id+"_"+to_string(iteration+1), iteration+1, s.k_in + @@community_sum_in_map.get(s.@community_id), ""), + INSERT INTO HAS_PARENT VALUES (s, s.id+"_"+to_string(iteration+1)) // link Community's child/parent community + ; + + // link communities + // "If two communities have an edge between them, their parents should also have an edge bewtween them" + Tmp = SELECT s FROM AllNodes:s -(_>:e)- Community:t + WHERE s.@community_vid != t.@community_vid + AND s.@is_current_iter AND t.@is_current_iter // only use Communities in the current iteration + ACCUM + DOUBLE w = @@source_target_k_in_map.get(s.@community_vid).get(t.@community_vid)/2, + INSERT INTO LINKS_TO VALUES (s.@community_vid+"_"+to_string(iteration+1), t.@community_vid+"_"+to_string(iteration+1), w) + ; +} diff --git a/common/gsql/graphRAG/louvain/graphrag_louvain_init.gsql b/common/gsql/graphRAG/louvain/graphrag_louvain_init.gsql new file mode 100644 index 00000000..2ccbaf2c --- /dev/null +++ 
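The local-moving phase above reduces to two modularity-gain terms: `@delta_Q_remove`, the cost of pulling a node out of its current community, and the `dq` accumulated into `@best_move`, the gain of placing it beside a neighbour (on the first hop, where every node is still alone, this simplifies to `1 - k_s * k_t / (2m)`). A small Python sketch of the same arithmetic, with illustrative names that are not part of the codebase:

def delta_q_remove(k, k_in, k_self_loop, sigma_tot, m):
    # Mirrors: 2*@k_self_loop - 2*@k_in + @k * (@community_sum_total - @k) / @@m
    return 2 * k_self_loop - 2 * k_in + k * (sigma_tot - k) / m


def delta_q_add(k, k_in_target, sigma_tot_target, m):
    # Mirrors: 2*@community_k_in_map.get(t) - @k * t.@community_sum_total / @@m
    return 2 * k_in_target - k * sigma_tot_target / m


def should_move(k, k_in, k_self_loop, sigma_tot, k_in_target, sigma_tot_target, m):
    # A node only moves when leaving its community and joining the best
    # neighbouring one is a net gain, i.e. the
    # `@delta_Q_remove + @best_move.delta_q > 0` check in the query above.
    return delta_q_remove(k, k_in, k_self_loop, sigma_tot, m) + \
        delta_q_add(k, k_in_target, sigma_tot_target, m) > 0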
b/common/gsql/graphRAG/louvain/graphrag_louvain_init.gsql @@ -0,0 +1,185 @@ +CREATE DISTRIBUTED QUERY graphrag_louvain_init(UINT max_hop = 10, UINT n_batches = 1) { + /* + * Initialize GraphRAG's hierarchical communities. + */ + TYPEDEF TUPLE community, STRING ext_vid> Move; + SumAccum @@m; // the sum of the weights of all the links in the network + MinAccum> @community_id; // the community ID of the node + MinAccum @community_vid; // the community ID of the node + SumAccum @k; // the sum of the weights of the links incident to the node + SumAccum @k_in; // the sum of the weights of the links inside the previous community of the node + SumAccum @k_self_loop; // the weight of the self-loop link + MapAccum, SumAccum> @community_k_in_map; // the community of the neighbors of the nodes -> the sum of the weights of the links inside the community + MapAccum, SumAccum> @@community_sum_total_map; // community ID C -> the sum of the weights of the links incident to nodes in C + SumAccum @community_sum_total; // the sum of the weights of the links incident to nodes in the community of the node + MapAccum, SumAccum> @@community_sum_in_map; // community ID -> the sum of the weights of the links inside the community + MapAccum>> @@source_target_k_in_map; // source community ID -> (target community ID -> the sum of the weights of the links from the source community to the target community) + SumAccum @delta_Q_remove; // delta Q to remove the node from the previous community + MaxAccum @best_move; // best move of the node with the highest delta Q to move the isolated node into the new community + MaxAccum @@min_double; // used to reset the @best_move + SumAccum @@move_cnt; + OrAccum @to_change_community; + SumAccum @batch_id; + MinAccum @vid; + + AllNodes = {ResolvedEntity.*}; + DOUBLE wt = 1.0; + + // prevent multiple init runs + // z = SELECT s FROM AllNodes:s -(_)-> Community:t; + // IF z.size() > 0 THEN + // EXCEPTION reinit(400001); + // RAISE reinit("ERROR: the hierarchical communities have already been initialized"); + // END; + + // init + z = SELECT s FROM AllNodes:s + ACCUM + s.@community_id = s, // assign node to its own community + s.@community_vid = s.id, // external id + s.@vid = getvid(s), // internal id (used in batching) + s.@batch_id = s.@vid % n_batches; // get batch number + z = SELECT s FROM AllNodes:s -(_)-> ResolvedEntity:t + ACCUM s.@k += wt, + @@m += 1; + // POST-ACCUM + // s.@community_id = s, // assign node to its own community + // s.@community_vid = s.id, // external id + // s.@vid = getvid(s), // internal id (used in batching) + // s.@batch_id = s.@vid % n_batches; // get batch number + + PRINT z.size(); + PRINT z; + + // Local moving + INT hop = 0; + Candidates = AllNodes; + WHILE Candidates.size() > 0 AND hop < max_hop DO + hop += 1; + IF hop == 1 THEN // first iteration + ChangedNodes = SELECT s FROM Candidates:s -(_:e)-> ResolvedEntity:t + WHERE s.@community_id != t.@community_id // can't move within the same community + ACCUM + DOUBLE dq = 1 - s.@k * t.@k / (2 * @@m), + s.@best_move += Move(dq, t.@community_id, t.@community_vid) // find the best move + POST-ACCUM + IF s.@best_move.delta_q > 0 THEN // if the move increases dq + s.@to_change_community += TRUE + END + HAVING s.@to_change_community == TRUE; // only select nodes that will move + PRINT ChangedNodes.size(); + ELSE // other iterations + // Calculate sum_total of links in each community + Tmp = SELECT s FROM AllNodes:s + POST-ACCUM + @@community_sum_total_map += (s.@community_id -> s.@k); + // store community's 
total edges in each vert (easier access) + Tmp = SELECT s FROM AllNodes:s + POST-ACCUM + s.@community_sum_total = @@community_sum_total_map.get(s.@community_id); + @@community_sum_total_map.clear(); + + // find the best move + ChangedNodes = {}; + + // process nodes in batch + FOREACH batch_id IN RANGE[0, n_batches-1] DO + Nodes = SELECT s FROM Candidates:s -(_:e)-> ResolvedEntity:t + WHERE s.@batch_id == batch_id + ACCUM + IF s.@community_id == t.@community_id THEN + // add edge weights connected to s + s.@k_in += wt + ELSE + // add edge weights connecetd to t + s.@community_k_in_map += (t.@community_id -> wt) + END + POST-ACCUM + // ∆Q if s is moved out of its current community + s.@delta_Q_remove = 2 * s.@k_self_loop - 2 * s.@k_in + s.@k * (s.@community_sum_total - s.@k) / @@m, + s.@k_in = 0, + s.@best_move = Move(@@min_double, s, to_string(s.id)); // reset best move + + // find the best move + Nodes = SELECT s FROM Nodes:s -(_:e)-> ResolvedEntity:t + WHERE s.@community_id != t.@community_id + ACCUM + DOUBLE dq = 2 * s.@community_k_in_map.get(t.@community_id) - s.@k * t.@community_sum_total / @@m, + s.@best_move += Move(dq, t.@community_id, t.@community_vid) // find the best move + POST-ACCUM + IF s.@delta_Q_remove + s.@best_move.delta_q > 0 THEN // if the move increases dq + s.@to_change_community = TRUE// s should move + END, + s.@community_k_in_map.clear() + HAVING s.@to_change_community == TRUE; // only select nodes that will move + + // Add nodes that will move to ChangedNodes + ChangedNodes = ChangedNodes UNION Nodes; + END; + END; + // If two nodes swap, only change the community of one of them + SwapNodes = SELECT s FROM ChangedNodes:s -(_:e)-> ResolvedEntity:t + WHERE s.@best_move.community == t.@community_id + AND t.@to_change_community + AND t.@best_move.community == s.@community_id + // if delta Q are the same, only change the one with larger delta Q or the one with smaller @vid + AND ( + s.@delta_Q_remove + s.@best_move.delta_q < t.@delta_Q_remove + t.@best_move.delta_q + OR ( + abs( + (s.@delta_Q_remove + s.@best_move.delta_q) + - (t.@delta_Q_remove + t.@best_move.delta_q) + ) < 0.00000000001 + AND s.@vid > t.@vid + ) + ) + POST-ACCUM + s.@to_change_community = FALSE; + + // remove SwapNodes (don't need to be changed) + ChangedNodes = ChangedNodes MINUS SwapNodes; + + // Update node communities (based on max ∆Q) + SwapNodes = SELECT s FROM ChangedNodes:s + POST-ACCUM + s.@community_id = s.@best_move.community, // move the node + s.@community_vid = s.@best_move.ext_vid, // move the node (external v_id update) + s.@to_change_community = FALSE; + @@move_cnt += ChangedNodes.size(); + + // Get all neighbours of the changed node that do not belong to the node’s new community + Candidates = SELECT t FROM ChangedNodes:s -(_:e)-> ResolvedEntity:t + WHERE t.@community_id != s.@community_id; + END; + + // Coarsening + UINT new_layer = 0; + @@community_sum_total_map.clear(); + Tmp = SELECT s FROM AllNodes:s -(_:e)-> ResolvedEntity:t + ACCUM + IF s.@community_id == t.@community_id THEN + // keep track of how many edges are within the community + @@community_sum_in_map += (s.@community_id -> wt) + ELSE + // get LINKS_TO edge weights (how many edges are between communities) + @@source_target_k_in_map += (s.@community_vid -> (t.@community_vid -> 1)) + END + POST-ACCUM + // ID , iter, edges within the community + INSERT INTO Community VALUES (s.@community_vid+"_1", 1, @@community_sum_in_map.get(s.@community_id), ""), + INSERT INTO IN_COMMUNITY VALUES (s, s.@community_vid+"_1") // link 
entity to it's first community + ; + + PRINT @@source_target_k_in_map; + + @@community_sum_total_map.clear(); + // link communities + Tmp = SELECT s FROM AllNodes:s -(_:e)-> ResolvedEntity:t + WHERE s.@community_vid != t.@community_vid + ACCUM + DOUBLE w = @@source_target_k_in_map.get(s.@community_vid).get(t.@community_vid), + INSERT INTO LINKS_TO VALUES (s.@community_vid+"_1", t.@community_vid+"_1", w); + + + PRINT @@source_target_k_in_map; +} diff --git a/common/gsql/graphRAG/louvain/louvain1.gsql b/common/gsql/graphRAG/louvain/louvain1.gsql deleted file mode 100644 index 494a3625..00000000 --- a/common/gsql/graphRAG/louvain/louvain1.gsql +++ /dev/null @@ -1,17 +0,0 @@ -CREATE DISTRIBUTED QUERY graphRAG_louvain_1() { - - Ents = {ResolvedEntity.*}; - - // Put each node into a distinct community - // Assume each Entity starts in its own community - - // For each node i - // Compute ∆Q (modularity) when putting node i into the community of some neighbor j - // move i to community that yields the largest gain in ∆Q - - Z = SELECT v FROM Ents:v -(_:e)-> ResolvedEntity:r - - - ; -} - diff --git a/common/gsql/graphRAG/louvain/modularity.gsql b/common/gsql/graphRAG/louvain/modularity.gsql new file mode 100644 index 00000000..3aaad826 --- /dev/null +++ b/common/gsql/graphRAG/louvain/modularity.gsql @@ -0,0 +1,49 @@ +CREATE DISTRIBUTED QUERY modularity(UINT iteration=1) SYNTAX V2 { + SumAccum @@sum_weight; // the sum of the weights of all the links in the network + MinAccum @community_id; // the community ID of the node + MapAccum> @@community_total_weight_map; // community ID C -> the sum of the weights of the links incident to nodes in C + MapAccum> @@community_in_weight_map; // community ID -> the sum of the weights of the links inside the community + SumAccum @@modularity; + MinAccum @parent; + DOUBLE wt = 1.0; + Comms = {Community.*}; + + // Assign Entities to their correct community (given the specified iteration level) + IF iteration > 1 THEN + Comms = SELECT t FROM Comms:c -()- ResolvedEntity:t + ACCUM t.@community_id = c.@parent; + + ELSE + Entities = SELECT t FROM Comms:c -(_>)- ResolvedEntity:t + WHERE c.iteration == iteration + ACCUM t.@community_id = c.id; + END; + + Nodes = SELECT s FROM Entities:s -(_>:e)- ResolvedEntity:t + ACCUM + IF s.@community_id == t.@community_id THEN + @@community_in_weight_map += (s.@community_id -> wt) + END, + @@community_total_weight_map += (s.@community_id -> wt), + @@sum_weight += wt; + + @@modularity = 0; + FOREACH (community, total_weight) IN @@community_total_weight_map DO + DOUBLE in_weight = 0; + IF @@community_in_weight_map.containsKey(community) THEN + in_weight = @@community_in_weight_map.get(community); + END; + @@modularity += in_weight / @@sum_weight - pow(total_weight / @@sum_weight, 2); + END; + + PRINT @@modularity as mod; +} diff --git a/common/gsql/graphRAG/louvain/stream_community.gsql b/common/gsql/graphRAG/louvain/stream_community.gsql new file mode 100644 index 00000000..d01959d2 --- /dev/null +++ b/common/gsql/graphRAG/louvain/stream_community.gsql @@ -0,0 +1,9 @@ +CREATE DISTRIBUTED QUERY stream_community(UINT iter) { + Comms = {Community.*}; + + // Get communities of the current iteration + Comms = SELECT s FROM Comms:s + WHERE s.iteration == iter; + + PRINT Comms; +} diff --git a/common/gsql/supportai/SupportAI_Schema.gsql b/common/gsql/supportai/SupportAI_Schema.gsql index 1a705eaf..3e127d82 100644 --- a/common/gsql/supportai/SupportAI_Schema.gsql +++ b/common/gsql/supportai/SupportAI_Schema.gsql @@ -20,16 +20,14 @@ CREATE 
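The `modularity` query above computes the standard quantity Q = sum over communities of (in_c / m - (tot_c / m)^2), where in_c is the edge weight inside community c, tot_c the weight incident to c, and m the total edge weight. A short Python sketch of that final aggregation, again with illustrative names:

def modularity(community_in_weight: dict, community_total_weight: dict, sum_weight: float) -> float:
    # Mirrors the FOREACH over @@community_total_weight_map in modularity.gsql
    q = 0.0
    for community, total_weight in community_total_weight.items():
        in_weight = community_in_weight.get(community, 0.0)
        q += in_weight / sum_weight - (total_weight / sum_weight) ** 2
    return q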
SCHEMA_CHANGE JOB add_supportai_schema { ADD DIRECTED EDGE CONTAINS_DOCUMENT(FROM DocumentCollection, TO Document) WITH REVERSE_EDGE="reverse_CONTAINS_DOCUMENT"; // GraphRAG - ADD VERTEX Community(PRIMARY_ID id STRING, description INT) WITH PRIMARY_ID_AS_ATTRIBUTE="true"; + ADD VERTEX Community (PRIMARY_ID id STRING, iteration UINT, k_in UINT, description STRING) WITH PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX ResolvedEntity(PRIMARY_ID id STRING, entity_type STRING) WITH PRIMARY_ID_AS_ATTRIBUTE="true"; - ADD DIRECTED EDGE RELATIONSHIP(FROM Entity, TO Entity, relation_type STRING) WITH REVERSE_EDGE="reverse_RELATIONSHIP"; // TODO: check where knn algo writes results + ADD DIRECTED EDGE RELATIONSHIP(FROM Entity, TO Entity, relation_type STRING) WITH REVERSE_EDGE="reverse_RELATIONSHIP"; ADD DIRECTED EDGE RESOLVES_TO(FROM Entity, TO ResolvedEntity, relation_type STRING) WITH REVERSE_EDGE="reverse_RESOLVES_TO"; // Connect ResolvedEntities with their children entities - ADD DIRECTED EDGE RESOLVED_RELATIONSHIP(FROM ResolvedEntity, TO ResolvedEntity) WITH REVERSE_EDGE="reverse_RESOLVED_RELATIONSHIP"; // store edges between entities after they're resolved - ADD DIRECTED EDGE IN_COMMUNITY(FROM ResolvedEntity, TO Community) WITH REVERSE_EDGE="reverse_IN_COMMUNITY"; + ADD DIRECTED EDGE RESOLVED_RELATIONSHIP(FROM ResolvedEntity, TO ResolvedEntity, relation_type STRING) WITH REVERSE_EDGE="reverse_RESOLVED_RELATIONSHIP"; // store edges between entities after they're resolved - // TODO: louvain will be run on resolved entities, but stored in community then on communities until louvain runs out - // Hierarchical communities (Louvain/Leiden) - // ADD UNDIRECTED EDGE LINKS_TO(FROM Community, TO Community); - // ADD DIRECTED EDGE BELONGS_TO(FROM Community, TO Community); + ADD DIRECTED EDGE IN_COMMUNITY(FROM ResolvedEntity, TO Community) WITH REVERSE_EDGE="reverse_IN_COMMUNITY"; + ADD DIRECTED EDGE LINKS_TO (from Community, to Community, weight DOUBLE) WITH REVERSE_EDGE="reverse_LINKS_TO"; + ADD DIRECTED EDGE HAS_PARENT (from Community, to Community) WITH REVERSE_EDGE="reverse_HAS_PARENT"; } diff --git a/common/py_schemas/tool_io_schemas.py b/common/py_schemas/tool_io_schemas.py index 1ea6ed3e..4ca91b3d 100644 --- a/common/py_schemas/tool_io_schemas.py +++ b/common/py_schemas/tool_io_schemas.py @@ -1,10 +1,8 @@ +from typing import Dict, List, Optional + from langchain.pydantic_v1 import BaseModel, Field -from typing import Optional -from langchain_community.graphs.graph_document import ( - Node as BaseNode, - Relationship as BaseRelationship, -) -from typing import List, Dict, Type +from langchain_community.graphs.graph_document import Node as BaseNode +from langchain_community.graphs.graph_document import Relationship as BaseRelationship class MapQuestionToSchemaResponse(BaseModel): @@ -81,14 +79,27 @@ class KnowledgeGraph(BaseModel): ..., description="List of relationships in the knowledge graph" ) + class ReportQuestion(BaseModel): question: str = Field("The question to be asked") reasoning: str = Field("The reasoning behind the question") + class ReportSection(BaseModel): section: str = Field("Name of the section") description: str = Field("Description of the section") - questions: List[ReportQuestion] = Field("List of questions and reasoning for the section") + questions: List[ReportQuestion] = Field( + "List of questions and reasoning for the section" + ) + class ReportSections(BaseModel): sections: List[ReportSection] = Field("List of sections for the report") + + +class CommunitySummary(BaseModel): + 
"""Generate a summary of the documents that are within this community.""" + + summary: str = Field( + ..., description="The community summary derived from the input documents" + ) diff --git a/copilot/docs/notebooks/graphrag.ipynb b/copilot/docs/notebooks/graphrag.ipynb index bde1b78f..e915f392 100644 --- a/copilot/docs/notebooks/graphrag.ipynb +++ b/copilot/docs/notebooks/graphrag.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 9, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -24,7 +24,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -33,7 +33,7 @@ "'The graph GraphRAG_pytgdocs is created.'" ] }, - "execution_count": 10, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -45,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -54,32 +54,32 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'host_name': 'https://algotesting.i.tgcloud.io',\n", - " 'schema_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_schema].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_schema\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 0)\\\\nTrying to add local vertex \\'DocumentChunk\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Document\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Concept\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Entity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Relationship\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'DocumentCollection\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Content\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Community\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'ResolvedEntity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CONTENT\\' and its reverse edge \\'reverse_HAS_CONTENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_CHILD_OF\\' and its reverse edge \\'reverse_IS_CHILD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_HEAD_OF\\' and its reverse edge \\'reverse_IS_HEAD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_TAIL\\' and its reverse edge \\'reverse_HAS_TAIL\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_RELATIONSHIP\\' and its reverse edge \\'reverse_DESCRIBES_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_ENTITY\\' and its reverse edge \\'reverse_DESCRIBES_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_ENTITY\\' and its reverse edge \\'reverse_CONTAINS_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'MENTIONS_RELATIONSHIP\\' and 
its reverse edge \\'reverse_MENTIONS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_AFTER\\' and its reverse edge \\'reverse_IS_AFTER\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CHILD\\' and its reverse edge \\'reverse_HAS_CHILD\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_RELATIONSHIP\\' and its reverse edge \\'reverse_HAS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_DOCUMENT\\' and its reverse edge \\'reverse_CONTAINS_DOCUMENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RELATIONSHIP\\' and its reverse edge \\'reverse_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVES_TO\\' and its reverse edge \\'reverse_RESOLVES_TO\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVED_RELATIONSHIP\\' and its reverse edge \\'reverse_RESOLVED_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IN_COMMUNITY\\' and its reverse edge \\'reverse_IN_COMMUNITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 1\\\\nThe job add_supportai_schema completes in 2.208 seconds!\\\\nLocal schema change succeeded.\"',\n", - " 'index_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_indexes].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_indexes\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 1)\\\\nTrying to add index \\'doc_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_indexepoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 2\\\\nThe job add_supportai_indexes 
completes in 3.025 seconds!\\\\nLocal schema change succeeded.\"'}" + " 'schema_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_schema].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_schema\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 0)\\\\nTrying to add local vertex \\'DocumentChunk\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Document\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Concept\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Entity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Relationship\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'DocumentCollection\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Content\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Community\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'ResolvedEntity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CONTENT\\' and its reverse edge \\'reverse_HAS_CONTENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_CHILD_OF\\' and its reverse edge \\'reverse_IS_CHILD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_HEAD_OF\\' and its reverse edge \\'reverse_IS_HEAD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_TAIL\\' and its reverse edge \\'reverse_HAS_TAIL\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_RELATIONSHIP\\' and its reverse edge \\'reverse_DESCRIBES_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_ENTITY\\' and its reverse edge \\'reverse_DESCRIBES_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_ENTITY\\' and its reverse edge \\'reverse_CONTAINS_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'MENTIONS_RELATIONSHIP\\' and its reverse edge \\'reverse_MENTIONS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_AFTER\\' and its reverse edge \\'reverse_IS_AFTER\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CHILD\\' and its reverse edge \\'reverse_HAS_CHILD\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_RELATIONSHIP\\' and its reverse edge \\'reverse_HAS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_DOCUMENT\\' and its reverse edge \\'reverse_CONTAINS_DOCUMENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RELATIONSHIP\\' and its reverse edge \\'reverse_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVES_TO\\' and its reverse edge \\'reverse_RESOLVES_TO\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVED_RELATIONSHIP\\' and its reverse edge \\'reverse_RESOLVED_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IN_COMMUNITY\\' and 
its reverse edge \\'reverse_IN_COMMUNITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'LINKS_TO\\' and its reverse edge \\'reverse_LINKS_TO\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_PARENT\\' and its reverse edge \\'reverse_HAS_PARENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 1\\\\nThe job add_supportai_schema completes in 1.043 seconds!\\\\nLocal schema change succeeded.\"',\n", + " 'index_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_indexes].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_indexes\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 1)\\\\nTrying to add index \\'doc_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_indexepoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 2\\\\nThe job add_supportai_indexes completes in 1.066 seconds!\\\\nLocal schema change succeeded.\"'}" ] }, - "execution_count": 12, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# And then add CoPilot's address to the connection. This address\n", - "# is the host's address where the CoPilot container is running.\n", + "# # And then add CoPilot's address to the connection. 
This address\n", + "# # is the host's address where the CoPilot container is running.\n", "conn.ai.configureCoPilotHost(\"http://localhost:8000\")\n", "conn.ai.initializeSupportAI()" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -95,18 +95,18 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'job_name': 'load_documents_content_json_b89acfebac9e4fb98efd20a49659808e',\n", - " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_b89acfebac9e4fb98efd20a49659808e.stream.SupportAI_GraphRAG_pytgdocs_5698bff74d844534901cba9e1b3d55bf.1722466964295',\n", - " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_b89acfebac9e4fb98efd20a49659808e.stream.SupportAI_GraphRAG_pytgdocs_5698bff74d844534901cba9e1b3d55bf.1722466964295'}" + "{'job_name': 'load_documents_content_json_852f54bfd00a475fa4efc3ba9319f0ac',\n", + " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_852f54bfd00a475fa4efc3ba9319f0ac.stream.SupportAI_GraphRAG_pytgdocs_6a6331e3e5e248eaae389788c9bab325.1723217024268',\n", + " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_852f54bfd00a475fa4efc3ba9319f0ac.stream.SupportAI_GraphRAG_pytgdocs_6a6331e3e5e248eaae389788c9bab325.1723217024268'}" ] }, - "execution_count": 14, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -121,41 +121,31 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 22, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "import httpx\n", - "import base64\n", + "# import httpx\n", + "# import base64\n", "\n", "\n", - "def make_headers(conn: TigerGraphConnection):\n", - " tkn = base64.b64encode(f\"{conn.username}:{conn.password}\".encode()).decode()\n", - " headers = {\"Authorization\": f\"Basic {tkn}\"}\n", - " return headers\n", + "# def make_headers(conn: TigerGraphConnection):\n", + "# tkn = base64.b64encode(f\"{conn.username}:{conn.password}\".encode()).decode()\n", + "# headers = {\"Authorization\": f\"Basic {tkn}\"}\n", + "# return headers\n", "\n", "\n", - "httpx.get(\n", - " \"http://localhost:8001/GraphRAG_pytgdocs/consistency_status/graphrag\",\n", - " headers=make_headers(conn),\n", - ")\n", - "# conn.ai.forceConsistencyUpdate()" + "# httpx.get(\n", + "# \"http://localhost:8001/GraphRAG_pytgdocs/consistency_status/graphrag\",\n", + "# headers=make_headers(conn),\n", + "# timeout=None,\n", + "# )\n", + "# # conn.ai.forceConsistencyUpdate()" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -165,7 +155,7 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[16], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43masdf\u001b[49m\n", + "Cell \u001b[0;32mIn[23], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43masdf\u001b[49m\n", "\u001b[0;31mNameError\u001b[0m: name 'asdf' is not defined" ] } @@ -176,24 +166,39 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "for v in 
[\"Community\"]:\n", + " try:\n", + " conn.delVertices(v)\n", + " except:\n", + " pass\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'job_name': 'load_documents_content_json_b89acfebac9e4fb98efd20a49659808e',\n", - " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_b89acfebac9e4fb98efd20a49659808e.stream.SupportAI_GraphRAG_pytgdocs_5698bff74d844534901cba9e1b3d55bf.1722531204658',\n", - " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_b89acfebac9e4fb98efd20a49659808e.stream.SupportAI_GraphRAG_pytgdocs_5698bff74d844534901cba9e1b3d55bf.1722531204658'}" + "{'job_name': 'load_documents_content_json_3e62fb87723945ea9a0380956694b7ec',\n", + " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_3e62fb87723945ea9a0380956694b7ec.stream.SupportAI_GraphRAG_pytgdocs_cc751adab29643b28af1b7bf13b6515b.1723213722186',\n", + " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_3e62fb87723945ea9a0380956694b7ec.stream.SupportAI_GraphRAG_pytgdocs_cc751adab29643b28af1b7bf13b6515b.1723213722186'}" ] }, - "execution_count": 30, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "for v in [\"Document\", \"Content\", \"DocumentChunk\", \"Entity\",\"ResolvedEntity\"]:\n", + "# for v in [\"Document\", \"Content\", \"DocumentChunk\", \"Entity\",\"ResolvedEntity\"]:\n", + "# for v in [\"ResolvedEntity\"]:\n", "# for v in [\"ResolvedEntity\"]:\n", " try:\n", " conn.delVertices(v)\n", @@ -222,32 +227,6 @@ "\"\"\")" ] }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "ename": "KeyError", - "evalue": "'deleted_vertices'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[33], line 5\u001b[0m\n\u001b[1;32m 2\u001b[0m conn\u001b[38;5;241m.\u001b[39mgetToken()\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCommunity\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[1;32m 4\u001b[0m \u001b[38;5;66;03m# for v in [\"ResolvedEntity\"]:\u001b[39;00m\n\u001b[0;32m----> 5\u001b[0m \u001b[43mconn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdelVertices\u001b[49m\u001b[43m(\u001b[49m\u001b[43mv\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/.venv/ml/lib/python3.11/site-packages/pyTigerGraph/pyTigerGraphVertex.py:688\u001b[0m, in \u001b[0;36mpyTigerGraphVertex.delVertices\u001b[0;34m(self, vertexType, where, limit, sort, permanent, timeout)\u001b[0m\n\u001b[1;32m 685\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mand\u001b[39;00m timeout \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 686\u001b[0m url \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m (\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m?\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m isFirst \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m&\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtimeout=\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(timeout)\n\u001b[0;32m--> 
688\u001b[0m ret \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_delete\u001b[49m\u001b[43m(\u001b[49m\u001b[43murl\u001b[49m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdeleted_vertices\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[1;32m 690\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m logger\u001b[38;5;241m.\u001b[39mlevel \u001b[38;5;241m==\u001b[39m logging\u001b[38;5;241m.\u001b[39mDEBUG:\n\u001b[1;32m 691\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mreturn: \u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(ret))\n", - "\u001b[0;31mKeyError\u001b[0m: 'deleted_vertices'" - ] - } - ], - "source": [ - "conn.graphname = \"Cora\"\n", - "conn.getToken()\n", - "for v in [\"Community\"]:\n", - " # for v in [\"ResolvedEntity\"]:\n", - " conn.delVertices(v)" - ] - }, { "cell_type": "code", "execution_count": null, diff --git a/eventual-consistency-service/app/ecc_util.py b/eventual-consistency-service/app/ecc_util.py index 5656e219..bccadd77 100644 --- a/eventual-consistency-service/app/ecc_util.py +++ b/eventual-consistency-service/app/ecc_util.py @@ -1,5 +1,15 @@ from common.chunkers import character_chunker, regex_chunker, semantic_chunker -from common.config import doc_processing_config, embedding_service +from common.config import doc_processing_config, embedding_service, llm_config +from common.llm_services import ( + AWS_SageMaker_Endpoint, + AWSBedrock, + AzureOpenAI, + GoogleVertexAI, + Groq, + HuggingFaceEndpoint, + Ollama, + OpenAI, +) def get_chunker(): @@ -22,3 +32,24 @@ def get_chunker(): raise ValueError("Invalid chunker type") return chunker + + +def get_llm_service(): + if llm_config["completion_service"]["llm_service"].lower() == "openai": + llm_provider = OpenAI(llm_config["completion_service"]) + elif llm_config["completion_service"]["llm_service"].lower() == "azure": + llm_provider = AzureOpenAI(llm_config["completion_service"]) + elif llm_config["completion_service"]["llm_service"].lower() == "sagemaker": + llm_provider = AWS_SageMaker_Endpoint(llm_config["completion_service"]) + elif llm_config["completion_service"]["llm_service"].lower() == "vertexai": + llm_provider = GoogleVertexAI(llm_config["completion_service"]) + elif llm_config["completion_service"]["llm_service"].lower() == "bedrock": + llm_provider = AWSBedrock(llm_config["completion_service"]) + elif llm_config["completion_service"]["llm_service"].lower() == "groq": + llm_provider = Groq(llm_config["completion_service"]) + elif llm_config["completion_service"]["llm_service"].lower() == "ollama": + llm_provider = Ollama(llm_config["completion_service"]) + elif llm_config["completion_service"]["llm_service"].lower() == "huggingface": + llm_provider = HuggingFaceEndpoint(llm_config["completion_service"]) + + return llm_provider diff --git a/eventual-consistency-service/app/graphrag/community_summarizer.py b/eventual-consistency-service/app/graphrag/community_summarizer.py new file mode 100644 index 00000000..d250b1f3 --- /dev/null +++ b/eventual-consistency-service/app/graphrag/community_summarizer.py @@ -0,0 +1,138 @@ +import json + +from langchain.output_parsers import PydanticOutputParser +from langchain.prompts import ChatPromptTemplate +from langchain_core.prompts import PromptTemplate + +from common.llm_services import LLM_Model +from common.py_schemas import CommunitySummary + +# 
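The `get_llm_service` helper added to `ecc_util.py` above maps the configured `llm_service` string onto one of the imported provider classes with an if/elif chain; note that an unrecognized value currently falls through and leaves `llm_provider` unbound. A table-driven sketch of the same selection, purely as a stylistic alternative (the mapping below is inferred from the branches shown above):

from common.config import llm_config
from common.llm_services import (
    AWS_SageMaker_Endpoint,
    AWSBedrock,
    AzureOpenAI,
    GoogleVertexAI,
    Groq,
    HuggingFaceEndpoint,
    Ollama,
    OpenAI,
)

_PROVIDERS = {
    "openai": OpenAI,
    "azure": AzureOpenAI,
    "sagemaker": AWS_SageMaker_Endpoint,
    "vertexai": GoogleVertexAI,
    "bedrock": AWSBedrock,
    "groq": Groq,
    "ollama": Ollama,
    "huggingface": HuggingFaceEndpoint,
}


def get_llm_service():
    # Same selection as the if/elif chain, but unknown names fail loudly.
    service = llm_config["completion_service"]["llm_service"].lower()
    try:
        return _PROVIDERS[service](llm_config["completion_service"])
    except KeyError:
        raise ValueError(f"Unsupported llm_service: {service}")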
src: https://github.com/microsoft/graphrag/blob/main/graphrag/index/graph/extractors/summarize/prompts.py +SUMMARIZE_PROMPT = PromptTemplate.from_template(""" +You are a helpful assistant responsible for generating a comprehensive summary of the data provided below. +Given one or two entities, and a list of descriptions, all related to the same entity or group of entities. +Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions. +If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary. +Make sure it is written in third person, and include the entity names so we the have full context. + +####### +-Data- +Entities: {entity_name} +Description List: {description_list} +####### +Output: +""") + + +class CommunitySummarizer: + def __init__( + self, + llm_service: LLM_Model, + ): + self.llm_service = llm_service + + def _extract_kg_from_doc(self, doc, chain, parser): + try: + out = chain.invoke( + {"input": doc, "format_instructions": parser.get_format_instructions()} + ) + except Exception as e: + print("Error: ", e) + return {"nodes": [], "rels": []} + try: + if "```json" not in out.content: + json_out = json.loads(out.content.strip("content=")) + else: + json_out = json.loads( + out.content.split("```")[1].strip("```").strip("json").strip() + ) + + formatted_rels = [] + for rels in json_out["rels"]: + if isinstance(rels["source"], str) and isinstance(rels["target"], str): + formatted_rels.append( + { + "source": rels["source"], + "target": rels["target"], + "type": rels["relation_type"].replace(" ", "_").upper(), + "definition": rels["definition"], + } + ) + elif isinstance(rels["source"], dict) and isinstance( + rels["target"], str + ): + formatted_rels.append( + { + "source": rels["source"]["id"], + "target": rels["target"], + "type": rels["relation_type"].replace(" ", "_").upper(), + "definition": rels["definition"], + } + ) + elif isinstance(rels["source"], str) and isinstance( + rels["target"], dict + ): + formatted_rels.append( + { + "source": rels["source"], + "target": rels["target"]["id"], + "type": rels["relation_type"].replace(" ", "_").upper(), + "definition": rels["definition"], + } + ) + elif isinstance(rels["source"], dict) and isinstance( + rels["target"], dict + ): + formatted_rels.append( + { + "source": rels["source"]["id"], + "target": rels["target"]["id"], + "type": rels["relation_type"].replace(" ", "_").upper(), + "definition": rels["definition"], + } + ) + else: + raise Exception("Relationship parsing error") + formatted_nodes = [] + for node in json_out["nodes"]: + formatted_nodes.append( + { + "id": node["id"], + "type": node["node_type"].replace(" ", "_").capitalize(), + "definition": node["definition"], + } + ) + + # filter relationships and nodes based on allowed types + if self.strict_mode: + if self.allowed_vertex_types: + formatted_nodes = [ + node + for node in formatted_nodes + if node["type"] in self.allowed_vertex_types + ] + if self.allowed_edge_types: + formatted_rels = [ + rel + for rel in formatted_rels + if rel["type"] in self.allowed_edge_types + ] + return {"nodes": formatted_nodes, "rels": formatted_rels} + except: + print("Error Processing: ", out) + return {"nodes": [], "rels": []} + + async def summarize(self, name: str, text: list[str]) -> CommunitySummary: + # parser = PydanticOutputParser(pydantic_object=CommunitySummary) + structured_llm = 
self.llm_service.model.with_structured_output(CommunitySummary) + chain = SUMMARIZE_PROMPT | structured_llm + summary = await chain.ainvoke( + { + "entity_name": name, + "description_list": text, + # "format_instructions": parser.get_format_instructions(), + } + ) + # summary = self._extract_kg_from_doc(text, chain, parser) + # summary = None + return summary.summary diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py index 4403756d..d4e3a7d6 100644 --- a/eventual-consistency-service/app/graphrag/graph_rag.py +++ b/eventual-consistency-service/app/graphrag/graph_rag.py @@ -5,15 +5,16 @@ import httpx from aiochannel import Channel -from common.config import embedding_service -from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore -from common.extractors.BaseExtractor import BaseExtractor from graphrag import workers from graphrag.util import http_timeout, init, make_headers, stream_ids from pyTigerGraph import TigerGraphConnection -http_logs = logging.getLogger("httpx") -http_logs.setLevel(logging.WARNING) +from common.config import embedding_service +from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore +from common.extractors.BaseExtractor import BaseExtractor + +# http_logs = logging.getLogger("httpx") +# http_logs.setLevel(logging.WARNING) logger = logging.getLogger(__name__) consistency_checkers = {} @@ -209,7 +210,7 @@ async def resolve_entities( async with asyncio.TaskGroup() as grp: # for every entity async for entity_id in entity_chan: - print(f"***Etity ID from chan {entity_id}") + print(f"***Entity ID from chan {entity_id}", flush=True) grp.create_task( workers.resolve_entity(conn, upsert_chan, emb_store, entity_id) ) @@ -226,9 +227,115 @@ async def resolve_entities( res.raise_for_status() -async def communities(conn: TigerGraphConnection): - pass - # Setup +async def communities(conn: TigerGraphConnection, community_chan: Channel): + """ + Run louvain + """ + # first pass: Group ResolvedEntities into Communities + logger.info("Initializing Communities (first louvain pass)") + headers = make_headers(conn) + async with httpx.AsyncClient(timeout=None) as client: + res = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/graphrag_louvain_init", + params={"n_batches": 1}, + headers=headers, + ) + res.raise_for_status() + # get the modularity + async with httpx.AsyncClient(timeout=None) as client: + res = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/modularity", + params={"iteration": 1, "batch_num": 1}, + headers=headers, + ) + res.raise_for_status() + mod = res.json()["results"][0]["mod"] + print(f"****mod 1: {mod}", flush=True) + await community_chan.put(1) + + # nth pass: Iterate on Resolved Entities until modularity stops increasing + prev_mod = -10 + i = 0 + # for _ in range(1, 5): + prev_mod = 0 + while abs(prev_mod - mod) > 0.0000001 and prev_mod != 0: + prev_mod = mod + logger.info(f"Running louvain on Communities (iteration: {i})") + i += 1 + # louvain pass + async with httpx.AsyncClient(timeout=None) as client: + res = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/graphrag_louvain_communities", + params={"n_batches": 1}, + headers=headers, + ) + + res.raise_for_status() + + # get the modularity + async with httpx.AsyncClient(timeout=None) as client: + res = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/modularity", + params={"iteration": i + 1, "batch_num": 1}, + headers=headers, + ) + 
res.raise_for_status() + mod = res.json()["results"][0]["mod"] + print(f"*** mod {i+1}: {mod}", flush=True) + print(f"****** mod diff: {abs(prev_mod - mod)}", flush=True) + + # write iter to chan for layer to be processed + await community_chan.put(i + 1) + + # TODO: erase last run since it's ∆q to the run before it will be small + logger.info("closing communities chan") + community_chan.close() + + +async def stream_communities( + conn: TigerGraphConnection, + community_chan: Channel, + comm_process_chan: Channel, +): + """ + Streams Community IDs from the grpah for a given iteration (from the channel) + """ + logger.info("streaming communities") + + headers = make_headers(conn) + # TODO: + # can only do one layer at a time to ensure that every child community has their descriptions + async for i in community_chan: + # get the community from that layer + async with httpx.AsyncClient(timeout=None) as client: + resp = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/stream_community", + params={"iter": i}, + headers=headers, + ) + resp.raise_for_status() + comms = resp.json()["results"][0]["Comms"] + + for c in comms: + await comm_process_chan.put((i, c["v_id"])) + + logger.info("stream_communities done") + logger.info("closing comm_process_chan") + comm_process_chan.close() + + +async def summarize_communities( + conn: TigerGraphConnection, + comm_process_chan: Channel, + upsert_chan: Channel, +): + async with asyncio.TaskGroup() as tg: + async for c in comm_process_chan: + tg.create_task(workers.process_community(conn, upsert_chan, *c)) + break + + logger.info("closing upsert_chan") + upsert_chan.close() async def run(graphname: str, conn: TigerGraphConnection): @@ -245,7 +352,10 @@ async def run(graphname: str, conn: TigerGraphConnection): extractor, index_stores = await init(conn) init_start = time.perf_counter() - if False: + abc = True + abc = False + if abc: + logger.info("Doc Processing Start") docs_chan = Channel(1) embed_chan = Channel(100) upsert_chan = Channel(100) @@ -266,11 +376,13 @@ async def run(graphname: str, conn: TigerGraphConnection): extract(extract_chan, upsert_chan, embed_chan, extractor, conn) ) init_end = time.perf_counter() + logger.info("Doc Processing End") # Entity Resolution entity_start = time.perf_counter() - if False: + if abc: + logger.info("Entity Processing Start") entities_chan = Channel(100) upsert_chan = Channel(100) async with asyncio.TaskGroup() as grp: @@ -285,13 +397,35 @@ async def run(graphname: str, conn: TigerGraphConnection): ) grp.create_task(upsert(upsert_chan)) entity_end = time.perf_counter() + logger.info("Entity Processing End") # Community Detection community_start = time.perf_counter() if True: - await communities(conn) + # FIXME: delete community delete + for v in ["Community"]: + try: + conn.delVertices(v) + except: + pass + logger.info("Community Processing Start") + communities_chan = Channel(1) + upsert_chan = Channel(10) + comm_process_chan = Channel(100) + upsert_chan = Channel(100) + async with asyncio.TaskGroup() as grp: + # run louvain + grp.create_task(communities(conn, communities_chan)) + # get the communities + grp.create_task( + stream_communities(conn, communities_chan, comm_process_chan) + ) + # summarize each community + grp.create_task(summarize_communities(conn, comm_process_chan, upsert_chan)) + grp.create_task(upsert(upsert_chan)) community_end = time.perf_counter() + logger.info("Community Processing End") # Community Summarization end = time.perf_counter() diff --git 
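The community stage wired into `run()` above follows the same pattern as the document and entity stages: one task produces Louvain iteration numbers into a `Channel`, a second expands each iteration into community IDs on another `Channel`, and a third fans summarization work out under a `TaskGroup`, pushing writes onto a shared upsert `Channel`. A stripped-down sketch of that producer/consumer wiring; the function bodies are placeholders, not the project's code:

import asyncio

from aiochannel import Channel


async def produce(iters: Channel):
    # stands in for communities(): one item per Louvain iteration
    for i in range(1, 4):
        await iters.put(i)
    iters.close()  # closing lets downstream `async for` loops terminate


async def expand(iters: Channel, comms: Channel):
    # stands in for stream_communities(): iteration -> many community IDs
    async for i in iters:
        for c in (f"comm_{i}_a", f"comm_{i}_b"):
            await comms.put((i, c))
    comms.close()


async def consume(comms: Channel):
    # stands in for summarize_communities(): fan work out per community
    async with asyncio.TaskGroup() as tg:
        async for item in comms:
            tg.create_task(asyncio.sleep(0))  # placeholder worker


async def main():
    iters, comms = Channel(1), Channel(100)
    async with asyncio.TaskGroup() as grp:
        grp.create_task(produce(iters))
        grp.create_task(expand(iters, comms))
        grp.create_task(consume(comms))


asyncio.run(main())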
a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index 74dbc56d..6876b5de 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -6,6 +6,9 @@ from glob import glob import httpx +from graphrag import workers +from pyTigerGraph import TigerGraphConnection + from common.config import ( doc_processing_config, embedding_service, @@ -17,36 +20,28 @@ from common.extractors import GraphExtractor, LLMEntityRelationshipExtractor from common.extractors.BaseExtractor import BaseExtractor from common.logs.logwriter import LogWriter -from graphrag import workers -from pyTigerGraph import TigerGraphConnection logger = logging.getLogger(__name__) http_timeout = httpx.Timeout(15.0) async def install_queries( - requried_queries: list[str], conn: TigerGraphConnection, n_workers=8 + requried_queries: list[str], + conn: TigerGraphConnection, ): # queries that are currently installed installed_queries = [q.split("/")[-1] for q in conn.getEndpoints(dynamic=True)] - tasks = [] - async with asyncio.TaskGroup() as grp: - for q in requried_queries: - # only install n queries at a time (n=n_workers) - async with asyncio.Semaphore(n_workers): - q_name = q.split("/")[-1] - # if the query is not installed, install it - if q_name not in installed_queries: - task = grp.create_task(workers.install_query(conn, q)) - tasks.append(task) - - for t in tasks: - res = t.result() - print(res) - # stop system if a required query doesn't install - if res["error"]: - raise Exception(res["message"]) + # doesn't need to be parallel since tg only does it one at a time + for q in requried_queries: + # only install n queries at a time (n=n_workers) + q_name = q.split("/")[-1] + # if the query is not installed, install it + if q_name not in installed_queries: + res = await workers.install_query(conn, q) + # stop system if a required query doesn't install + if res["error"]: + raise Exception(res["message"]) async def init_embedding_index(s: MilvusEmbeddingStore, vertex_field: str): @@ -69,9 +64,14 @@ async def init( "common/gsql/graphRAG/StreamDocContent", "common/gsql/graphRAG/SetEpochProcessing", "common/gsql/graphRAG/ResolveRelationships", + "common/gsql/graphRAG/get_community_children", + "common/gsql/graphRAG/louvain/graphrag_louvain_init", + "common/gsql/graphRAG/louvain/graphrag_louvain_communities", + "common/gsql/graphRAG/louvain/modularity", + "common/gsql/graphRAG/louvain/stream_community", ] # add louvain to queries - q = [x.split('.gsql')[0] for x in glob("common/gsql/graphRAG/louvain/*")] + q = [x.split(".gsql")[0] for x in glob("common/gsql/graphRAG/louvain/*")] requried_queries.extend(q) await install_queries(requried_queries, conn) @@ -246,3 +246,24 @@ async def upsert_edge( f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers ) res.raise_for_status() + + +async def get_commuinty_children(conn, i: int, c: str): + headers = make_headers(conn) + async with httpx.AsyncClient(timeout=None) as client: + resp = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/get_community_children", + params={"comm": c, "iter": i}, + headers=headers, + ) + resp.raise_for_status() + descrs = [] + for d in resp.json()["results"][0]["children"]: + desc = d["attributes"]["description"] + if len(desc) == 0: + desc = d["v_id"] + + descrs.append(desc) + + print(">>>", descrs, flush=True) + return descrs diff --git a/eventual-consistency-service/app/graphrag/workers.py 
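`get_commuinty_children` above reads from an installed query endpoint; writes go the other way, through the `/graph/{graphname}` upsert endpoint that `upsert_edge` posts to. As a reference for what such a write looks like, a minimal vertex-upsert sketch assuming the standard REST++ payload shape, where each attribute value is wrapped in `{"value": ...}` (the `upsert_one_vertex` name and the example attributes are illustrative):

import json

import httpx

from graphrag.util import make_headers


async def upsert_one_vertex(conn, vertex_type: str, vertex_id: str, attrs: dict):
    # e.g. upsert_one_vertex(conn, "Community", "some_comm_1", {"description": "..."})
    payload = {
        "vertices": {
            vertex_type: {
                vertex_id: {k: {"value": v} for k, v in attrs.items()},
            }
        }
    }
    async with httpx.AsyncClient(timeout=None) as client:
        res = await client.post(
            f"{conn.restppUrl}/graph/{conn.graphname}",
            data=json.dumps(payload),
            headers=make_headers(conn),
        )
    res.raise_for_status()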
b/eventual-consistency-service/app/graphrag/workers.py index 4c1174df..22980d96 100644 --- a/eventual-consistency-service/app/graphrag/workers.py +++ b/eventual-consistency-service/app/graphrag/workers.py @@ -6,14 +6,15 @@ import ecc_util import httpx from aiochannel import Channel +from graphrag import community_summarizer, util +from langchain_community.graphs.graph_document import GraphDocument, Node +from pyTigerGraph import TigerGraphConnection + from common.config import milvus_config from common.embeddings.embedding_services import EmbeddingModel from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore from common.extractors.BaseExtractor import BaseExtractor from common.logs.logwriter import LogWriter -from graphrag import util -from langchain_community.graphs.graph_document import GraphDocument, Node -from pyTigerGraph import TigerGraphConnection vertex_field = milvus_config.get("vertex_field", "vertex_id") @@ -298,14 +299,14 @@ async def resolve_entity( f"aget_k_closest should, minimally, return the entity itself.\n{results}" ) raise Exception() - if entity_id == "Dataframe": - print("result:", entity_id, results) + # FIXME: deleteme + # if entity_id == "Dataframe": + # print("result:", entity_id, results) # merge all entities into the ResolvedEntity vertex # use the longest v_id as the resolved entity's v_id - resolved_entity_id = "" + resolved_entity_id = entity_id for v in results: - # v_id = v.metadata["vertex_id"] if len(v) > len(resolved_entity_id): resolved_entity_id = v @@ -318,7 +319,7 @@ async def resolve_entity( "ResolvedEntity", # v_type resolved_entity_id, # v_id { # attrs - "description": [] + # "id": resolved_entity_id, }, ), ) @@ -340,3 +341,49 @@ async def resolve_entity( ), ) ) + + +async def process_community( + conn: TigerGraphConnection, + upsert_chan: Channel, + i: int, + c: str, +): + """ + https://github.com/microsoft/graphrag/blob/main/graphrag/prompt_tune/template/community_report_summarization.py + + Get children verts (Entity for layer-1 Communities, Community otherwise) + if the commuinty only has one child, use its description -- no need to summarize + + embed summaries + """ + print(i, c, flush=True) + + # get the children of the community + children = await util.get_commuinty_children(conn, i, c) + if i == 1: + tmp = [] + for c in children: + tmp.extend(c) + children = list(filter(lambda x: len(x) > 0, tmp)) + print(">>>", children, flush=True) + llm = ecc_util.get_llm_service() + summarizer = community_summarizer.CommunitySummarizer(llm) + summary = await summarizer.summarize(c, children) + await upsert_chan.put((upsert_summary, (conn,summary))) + + +async def upsert_summary(conn: TigerGraphConnection, summary: str): + print(f"SUMMARY:> {summary}", flush=True) + + # vertex_id = vertex_id.replace(" ", "_") + # attrs = map_attrs(attributes) + # data = json.dumps({"vertices": {vertex_type: {vertex_id: attrs}}}) + # headers = make_headers(conn) + # async with httpx.AsyncClient(timeout=http_timeout) as client: + # res = await client.post( + # f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers + # ) + # + # res.raise_for_status() + # diff --git a/eventual-consistency-service/requirements.txt b/eventual-consistency-service/requirements.txt index 3bc0dae0..5d566dd1 100644 --- a/eventual-consistency-service/requirements.txt +++ b/eventual-consistency-service/requirements.txt @@ -7,6 +7,7 @@ appdirs==1.4.4 argon2-cffi==23.1.0 argon2-cffi-bindings==21.2.0 async-timeout==4.0.3 +asyncer==0.0.7 attrs==23.1.0 
azure-core==1.30.1 azure-storage-blob==12.19.1 @@ -24,12 +25,15 @@ cryptography==42.0.5 dataclasses-json==0.5.14 distro==1.8.0 docker-pycreds==0.4.0 +docstring_parser==0.16 emoji==2.8.0 environs==9.5.0 exceptiongroup==1.1.3 fastapi==0.103.1 +filelock==3.15.4 filetype==1.2.0 frozenlist==1.4.0 +fsspec==2024.6.1 gitdb==4.0.11 GitPython==3.1.40 google-api-core==2.14.0 @@ -51,24 +55,28 @@ h11==0.14.0 httpcore==0.18.0 httptools==0.6.0 httpx==0.25.0 -huggingface_hub==0.23.0 +huggingface-hub==0.23.0 idna==3.4 +iniconfig==2.0.0 isodate==0.6.1 +jiter==0.5.0 jmespath==1.0.1 joblib==1.3.2 jq==1.6.0 jsonpatch==1.33 jsonpointer==2.4 -langchain==0.1.12 -langchain-community==0.0.28 -langchain-core==0.1.49 -langchain-experimental==0.0.54 +langchain==0.2.12 +langchain-community==0.2.11 +langchain-core==0.2.29 +langchain-experimental==0.0.64 langchain-groq==0.1.3 -langchain-text-splitters==0.0.1 +langchain-openai==0.1.20 +langchain-text-splitters==0.2.2 langchainhub==0.1.14 langdetect==1.0.9 langgraph==0.0.40 -langsmith==0.1.24 +langsmith==0.1.98 +Levenshtein==0.25.1 lxml==4.9.3 marshmallow==3.20.1 minio==7.2.5 @@ -76,11 +84,12 @@ multidict==6.0.4 mypy-extensions==1.0.0 nltk==3.8.1 numpy==1.26.4 -openai==1.3.7 +openai==1.40.2 orjson==3.9.15 packaging==23.2 pandas==2.1.1 pathtools==0.1.2 +pluggy==1.5.0 prometheus_client==0.20.0 proto-plus==1.22.3 protobuf==4.24.4 @@ -94,15 +103,16 @@ pydantic==2.3.0 pydantic_core==2.6.3 pygit2==1.13.2 pymilvus==2.3.6 +pytest==8.2.0 python-dateutil==2.8.2 python-dotenv==1.0.0 python-iso639==2023.6.15 python-magic==0.4.27 pyTigerDriver==1.0.15 -pyTigerGraph==1.6.1 +pyTigerGraph==1.6.5 pytz==2023.3.post1 PyYAML==6.0.1 -rapidfuzz==3.4.0 +rapidfuzz==3.9.6 regex==2023.10.3 requests==2.31.0 rsa==4.9 @@ -118,12 +128,12 @@ SQLAlchemy==2.0.20 starlette==0.27.0 tabulate==0.9.0 tenacity==8.2.3 -tiktoken==0.5.1 +tiktoken==0.7.0 tqdm==4.66.1 types-requests==2.31.0.6 types-urllib3==1.26.25.14 typing-inspect==0.9.0 -typing_extensions==4.7.1 +typing_extensions==4.12.2 tzdata==2023.3 ujson==5.9.0 unstructured==0.10.23 From ef842ba278fd8cadd9b5be54dd6800040386cb8b Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Mon, 12 Aug 2024 16:50:33 -0400 Subject: [PATCH 10/91] graphrag pipeline done --- common/embeddings/milvus_embedding_store.py | 3 - .../gsql/graphRAG/communities_have_desc.gsql | 14 ++ .../louvain_old/louvain_1_first_pass.gsql | 176 -------------- .../louvain_old/louvain_2_other_passes.gsql | 217 ------------------ .../louvain_3_final_community.gsql | 44 ---- .../louvain_4_modularity_1_for_pass.gsql | 39 ---- .../louvain_4_modularity_2_final.gsql | 52 ----- .../graphRAG/louvain_old/louvain_5_reset.gsql | 13 -- copilot/docs/notebooks/graphrag.ipynb | 82 +++++-- .../app/graphrag/community_summarizer.py | 110 +-------- .../app/graphrag/graph_rag.py | 97 ++++---- .../app/graphrag/util.py | 30 ++- .../app/graphrag/workers.py | 58 ++--- 13 files changed, 196 insertions(+), 739 deletions(-) create mode 100644 common/gsql/graphRAG/communities_have_desc.gsql delete mode 100644 common/gsql/graphRAG/louvain_old/louvain_1_first_pass.gsql delete mode 100644 common/gsql/graphRAG/louvain_old/louvain_2_other_passes.gsql delete mode 100644 common/gsql/graphRAG/louvain_old/louvain_3_final_community.gsql delete mode 100644 common/gsql/graphRAG/louvain_old/louvain_4_modularity_1_for_pass.gsql delete mode 100644 common/gsql/graphRAG/louvain_old/louvain_4_modularity_2_final.gsql delete mode 100644 common/gsql/graphRAG/louvain_old/louvain_5_reset.gsql 
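This commit wires community detection and summarization into an async, channel-based pipeline: a Louvain pass produces community IDs, worker tasks summarize each community with an LLM, and the results are upserted and embedded. What follows is a minimal, self-contained sketch of that producer/consumer pattern only, not the project code; the names (fake_louvain_pass, summarize, upsert_all) and the toy data are illustrative assumptions. It assumes Python 3.11+ (for asyncio.TaskGroup) and the aiochannel package the pipeline already depends on.

# Sketch of the channel-based fan-out pattern used by the graphrag pipeline.
# One task produces (iteration, community_id) pairs into an aiochannel Channel,
# a pool of workers consumes them inside an asyncio.TaskGroup, and closing a
# channel is what ends the consumers' `async for` loops.
import asyncio

from aiochannel import Channel


async def fake_louvain_pass(comm_chan: Channel):
    # stand-in for the Louvain/communities step: emit community IDs per iteration
    for i in range(1, 3):
        for c in (f"Comm_{i}_a", f"Comm_{i}_b"):
            await comm_chan.put((i, c))
    comm_chan.close()  # no more puts; consumers drain what remains, then stop


async def summarize(iteration: int, comm_id: str, out_chan: Channel):
    # stand-in for a per-community summarization worker (e.g. an LLM call)
    await asyncio.sleep(0.1)
    await out_chan.put((comm_id, f"summary of {comm_id} (iteration {iteration})"))


async def summarize_all(comm_chan: Channel, out_chan: Channel):
    # one task per community; the TaskGroup waits for all of them before closing
    async with asyncio.TaskGroup() as tg:
        async for i, comm_id in comm_chan:
            tg.create_task(summarize(i, comm_id, out_chan))
    out_chan.close()


async def upsert_all(out_chan: Channel):
    # stand-in for the upsert/embed consumer at the end of the pipeline
    async for comm_id, summary in out_chan:
        print(f"upsert Community {comm_id}: {summary}")


async def main():
    comm_chan, out_chan = Channel(100), Channel(100)
    async with asyncio.TaskGroup() as tg:
        tg.create_task(fake_louvain_pass(comm_chan))
        tg.create_task(summarize_all(comm_chan, out_chan))
        tg.create_task(upsert_all(out_chan))


if __name__ == "__main__":
    asyncio.run(main())

Closing a channel once its producer is done is what lets each downstream `async for` terminate cleanly, which is why the pipeline below closes comm_process_chan, upsert_chan, and embed_chan when the corresponding stage finishes.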
diff --git a/common/embeddings/milvus_embedding_store.py b/common/embeddings/milvus_embedding_store.py index fd57c783..7384e76f 100644 --- a/common/embeddings/milvus_embedding_store.py +++ b/common/embeddings/milvus_embedding_store.py @@ -606,9 +606,6 @@ def edit_dist_check(self, a: str, b: str, edit_dist_threshold: float, p=False): async def aget_k_closest( self, v_id: str, k=15, threshold_similarity=0.90, edit_dist_threshold_pct=0.75 ) -> list[Document]: - """ - asdf - """ threshold_dist = 1 - threshold_similarity # asyncify necessary funcs diff --git a/common/gsql/graphRAG/communities_have_desc.gsql b/common/gsql/graphRAG/communities_have_desc.gsql new file mode 100644 index 00000000..f5cda70e --- /dev/null +++ b/common/gsql/graphRAG/communities_have_desc.gsql @@ -0,0 +1,14 @@ +CREATE DISTRIBUTED QUERY communities_have_desc(UINT iter) SYNTAX V2{ + SumAccum @@descrs; + Comms = {Community.*}; + Comms = SELECT c FROM Comms:c + WHERE c.iteration == iter + ACCUM + IF length(c.description) > 0 THEN + @@descrs += 1 + END; + + + PRINT (@@descrs == Comms.size()) as all_have_desc; + PRINT @@descrs, Comms.size(); +} diff --git a/common/gsql/graphRAG/louvain_old/louvain_1_first_pass.gsql b/common/gsql/graphRAG/louvain_old/louvain_1_first_pass.gsql deleted file mode 100644 index 0251909f..00000000 --- a/common/gsql/graphRAG/louvain_old/louvain_1_first_pass.gsql +++ /dev/null @@ -1,176 +0,0 @@ -CREATE OR REPLACE DISTRIBUTED QUERY graphRAG_louvain_1( - UINT max_hop = 10, - UINT batch_num = 12, - UINT sample_edge_num = 100 -) { - - TYPEDEF TUPLE community, STRING ext_vid> MyTuple; //--> this should be Community, I think - SumAccum @@m; // the sum of the weights of all the links in the network - MinAccum> @{community_id_attribute_name}; // the community ID of the node - MinAccum @community_vid; // the community ID of the node - SumAccum @k; // the sum of the weights of the links incident to the node - SumAccum @k_in; // the sum of the weights of the links inside the previous community of the node - SumAccum @k_self_loop; // the weight of the self-loop link - MapAccum, SumAccum> @community_k_in_map; // the community of the neighbors of the nodes -> the sum of the weights of the links inside the community - MapAccum, SumAccum> @@community_sum_total_map; // community ID C -> the sum of the weights of the links incident to nodes in C - SumAccum @community_sum_total; // the sum of the weights of the links incident to nodes in the community of the node - MapAccum, SumAccum> @@community_sum_in_map; // community ID -> the sum of the weights of the links inside the community - MapAccum, MapAccum, SumAccum>> @@source_target_k_in_map; // source community ID -> (target community ID -> the sum of the weights of the links from the source community to the target community) - SumAccum @delta_Q_remove; // delta Q to remove the node from the previous community - MaxAccum @best_move; // best move of the node with the highest delta Q to move the isolated node into the new community - MaxAccum @@min_double; // used to reset the @best_move - SumAccum @@move_cnt; - OrAccum @to_change_community; - SumAccum @batch_id; - SumAccum @vid; - - DOUBLE wt = 1.0; - - // Initialization - All_Nodes = {{ResolvedEntity.*}}; - All_Nodes = SELECT s FROM All_Nodes:s -({relation_edge_name}:e)- :t - ACCUM @@m += wt / 2, - s.@k += wt, - IF s == t THEN // self-loop link - js.@k_self_loop += wt - END - POST-ACCUM - s.@{community_id_attribute_name} = s, - s.@community_vid = to_string(s.id), - s.@vid = getvid(s), - s.@batch_id = s.@vid % batch_num; 
- - IF @@m < 0.00000000001 THEN - PRINT "Warning: the sum of the weights in the edges should be greater than zero!"; - RETURN; - END; - - // Local moving - INT hop = 0; - Candidates = All_Nodes; - WHILE Candidates.size() > 0 AND hop < max_hop DO - hop = hop + 1; - LOG(TRUE, hop); - IF hop == 1 THEN // first iteration - ChangedNodes = SELECT s FROM Candidates:s -({relation_edge_name}:e)- :t - WHERE s.@{community_id_attribute_name} != t.@{community_id_attribute_name} - ACCUM s.@best_move += MyTuple(1 - s.@k * t.@k / (2 * @@m), t.@{community_id_attribute_name}, t.@community_vid) - POST-ACCUM - IF s.@best_move.delta_Q_add > 0 THEN // the gain (delta Q) is positive - s.@to_change_community = TRUE - END - HAVING s.@to_change_community == TRUE; - - ELSE // remaining iterations - // Calculate sum_total - Tmp = SELECT s FROM All_Nodes:s - POST-ACCUM - @@community_sum_total_map += (s.@{community_id_attribute_name} -> s.@k); - Tmp = SELECT s FROM All_Nodes:s - POST-ACCUM - s.@community_sum_total = @@community_sum_total_map.get(s.@{community_id_attribute_name}); - - @@community_sum_total_map.clear(); - // Find the best move - ChangedNodes = {{}}; - FOREACH batch_id IN RANGE[0, batch_num-1] DO - LOG(TRUE, batch_id); - // Calculate the delta Q to remove the node from the previous community - Nodes = SELECT s FROM Candidates:s -({relation_edge_name}:e)- :t - WHERE s.@batch_id == batch_id - ACCUM - IF s.@{community_id_attribute_name} == t.@{community_id_attribute_name} THEN - s.@k_in += wt - ELSE - s.@community_k_in_map += (t.@{community_id_attribute_name} -> wt) - END - POST-ACCUM - s.@delta_Q_remove = 2 * s.@k_self_loop - 2 * s.@k_in + s.@k * (s.@community_sum_total - s.@k) / @@m, - s.@k_in = 0, - s.@best_move = MyTuple(@@min_double, s, to_string(s.id)) // reset the delta_Q_add - ; - - // Find the best move - Nodes = SELECT s FROM Nodes:s -({relation_edge_name}:e)- :t - //SAMPLE sample_edge_num EDGE WHEN s.outdegree("{relation_edge_name}") > sample_edge_num - WHERE s.@{community_id_attribute_name} != t.@{community_id_attribute_name} - ACCUM DOUBLE delta_Q_add = 2 * s.@community_k_in_map.get(t.@{community_id_attribute_name}) - s.@k * t.@community_sum_total / @@m, - s.@best_move += MyTuple(delta_Q_add, t.@{community_id_attribute_name}, t.@community_vid) - POST-ACCUM - IF s.@delta_Q_remove + s.@best_move.delta_Q_add > 0 THEN // the gain (delta Q) is positive - s.@to_change_community = TRUE - END, - s.@community_k_in_map.clear() - HAVING s.@to_change_community == TRUE; - - ChangedNodes = ChangedNodes UNION Nodes; - END; - END; - // If two nodes swap, only change the community of one of them - SwapNodes = SELECT s FROM ChangedNodes:s -({relation_edge_name}:e)- :t - WHERE s.@best_move.community == t.@{community_id_attribute_name} - AND t.@to_change_community == TRUE - AND t.@best_move.community == s.@{community_id_attribute_name} - // only change the one with larger delta Q or the one with smaller @vid if delta Q are the same - AND ( - s.@delta_Q_remove + s.@best_move.delta_Q_add < t.@delta_Q_remove + t.@best_move.delta_Q_add - OR ( - abs((s.@delta_Q_remove + s.@best_move.delta_Q_add) - (t.@delta_Q_remove + t.@best_move.delta_Q_add)) < 0.00000000001 - AND s.@vid > t.@vid - ) - ) - POST-ACCUM - s.@to_change_community = FALSE; - - ChangedNodes = ChangedNodes MINUS SwapNodes; - - // Place each node of ChangedNodes in the community in which the gain is maximum - ChangedNodes = SELECT s FROM ChangedNodes:s - POST-ACCUM - s.@{community_id_attribute_name} = s.@best_move.community, - s.@community_vid = 
s.@best_move.ext_vid, - s.@to_change_community = FALSE; - - @@move_cnt += ChangedNodes.size(); - - // Get all neighbours of the changed node that do not belong to the node’s new community - Candidates = SELECT t FROM ChangedNodes:s -({relation_edge_name}:e)- :t - WHERE t.@{community_id_attribute_name} != s.@{community_id_attribute_name}; - END; - - PRINT @@move_cnt AS Delta; - - // Coarsening - UINT new_layer = 0; - @@community_sum_total_map.clear(); - Tmp = - SELECT s FROM All_Nodes:s -({relation_edge_name}:e)- :t - ACCUM - IF s.@{community_id_attribute_name} == t.@{community_id_attribute_name} THEN - @@community_sum_in_map += (s.@{community_id_attribute_name} -> wt) - END - POST-ACCUM - //f_belongs_to.println(s.id, s.@{community_id_attribute_name}, new_layer), - INSERT INTO {belongs_to_edge_name} VALUES (s, str_to_int(s.@community_vid), new_layer), - IF @@community_sum_in_map.containsKey(s) THEN - //f_links_to.println(s.id, s.id, @@community_sum_in_map.get(s), new_layer) - INSERT INTO {links_to_edge_name} VALUES (s,s, (new_layer -> @@community_sum_in_map.get(s))) - END; - - @@community_sum_in_map.clear(); - - Tmp = SELECT s FROM All_Nodes:s -({relation_edge_name}:e)- :t - ACCUM - IF s.@{community_id_attribute_name} != t.@{community_id_attribute_name} THEN - @@source_target_k_in_map += (s.@{community_id_attribute_name} -> (t.@{community_id_attribute_name} -> wt)) - END - POST-ACCUM - IF @@source_target_k_in_map.containsKey(s) THEN - FOREACH (target_community, k_in) IN @@source_target_k_in_map.get(s) DO - //f_links_to.println(s.id, target_community, k_in, new_layer) - INSERT INTO {links_to_edge_name} VALUES (s,target_community, (new_layer -> k_in)) - END - END; - - @@source_target_k_in_map.clear(); -} diff --git a/common/gsql/graphRAG/louvain_old/louvain_2_other_passes.gsql b/common/gsql/graphRAG/louvain_old/louvain_2_other_passes.gsql deleted file mode 100644 index 231631d6..00000000 --- a/common/gsql/graphRAG/louvain_old/louvain_2_other_passes.gsql +++ /dev/null @@ -1,217 +0,0 @@ -USE GRAPH {graph_name} -DROP QUERY {query_name} -CREATE OR REPLACE DISTRIBUTED QUERY graphRAG_louvain_2( - UINT layer = 0, - UINT max_hop = 10, - UINT batch_num = 1 -) FOR GRAPH {graph_name} SYNTAX v1 {{ - TYPEDEF TUPLE community, STRING ext_vid> MyTuple; - SumAccum @@m; // the sum of the weights of all the links in the network - MinAccum> @{community_id_attribute_name}; // the community ID of the node - MinAccum @community_vid; // the community ID of the node - SumAccum @k; // the sum of the weights of the links incident to the node - SumAccum @k_in; // the sum of the weights of the links inside the previous community of the node - SumAccum @k_self_loop; // the weight of the self-loop link - MapAccum, SumAccum> @community_k_in_map; // the community of the neighbors of the nodes -> the sum of the weights of the links inside the community - MapAccum, SumAccum> @@community_sum_total_map; // community ID C -> the sum of the weights of the links incident to nodes in C - SumAccum @community_sum_total; // the sum of the weights of the links incident to nodes in the community of the node - MapAccum, SumAccum> @@community_sum_in_map; // community ID -> the sum of the weights of the links inside the community - MapAccum, MapAccum, SumAccum>> @@source_target_k_in_map; // source community ID -> (target community ID -> the sum of the weights of the links from the source community to the target community) - SumAccum @delta_Q_remove; // delta Q to remove the node from the previous community - MaxAccum @best_move; // best 
move of the node with the highest delta Q to move the isolated node into the new community - MaxAccum @@min_double; // used to reset the @best_move - SumAccum @@move_cnt; - OrAccum @to_change_community; - SumAccum @batch_id; - SumAccum @vid; - SumAccum @@links_to_check; - - // Initialization - LOG(TRUE, "Query started!"); - All_Nodes = {{{entity_vertex_name}.*}}; - _tmp = - SELECT s - FROM All_Nodes:s -({links_to_edge_name}:e)- :t - ACCUM - @@links_to_check += 1; - - All_Nodes = - SELECT s - FROM All_Nodes:s -({links_to_edge_name}:e)- :t - WHERE e.layer_weight_map.containsKey(layer) - ACCUM DOUBLE weight = e.layer_weight_map.get(layer), - @@m += weight / 2, - s.@k += weight, - IF s == t THEN // self-loop link - s.@k_self_loop += weight - END - POST-ACCUM - s.@{community_id_attribute_name} = s, - s.@community_vid = to_string(s.id), - s.@vid = getvid(s), - s.@batch_id = s.@vid % batch_num - ; - LOG(TRUE, All_Nodes.size()); - IF @@m < 0.00000000001 THEN - PRINT "Warning: the sum of the weights in the edges should be greater than zero!"; - RETURN; - END; - - // Local moving - INT hop = 0; - Candidates = All_Nodes; - WHILE Candidates.size() > 0 AND hop < max_hop DO - hop = hop + 1; - LOG(TRUE, hop); - IF hop == 1 THEN // first iteration - ChangedNodes = - SELECT s - FROM Candidates:s -({links_to_edge_name}:e)- :t - WHERE e.layer_weight_map.containsKey(layer) - AND s.@{community_id_attribute_name} != t.@{community_id_attribute_name} - ACCUM s.@best_move += MyTuple(1 - s.@k * t.@k / (2 * @@m), t.@{community_id_attribute_name}, t.@community_vid) - POST-ACCUM - IF s.@best_move.delta_Q_add > 0 THEN // the gain (delta Q) is positive - s.@to_change_community = TRUE - END - HAVING s.@to_change_community == TRUE - ; - ELSE // remaining iterations - // Calculate sum_total - Tmp = - SELECT s - FROM All_Nodes:s - POST-ACCUM - @@community_sum_total_map += (s.@{community_id_attribute_name} -> s.@k) - ; - Tmp = - SELECT s - FROM All_Nodes:s - POST-ACCUM - s.@community_sum_total = @@community_sum_total_map.get(s.@{community_id_attribute_name}) - ; - LOG(TRUE, @@community_sum_total_map.size()); - @@community_sum_total_map.clear(); - // Find the best move - ChangedNodes = {{}}; - FOREACH batch_id IN RANGE[0, batch_num-1] DO - LOG(TRUE, batch_id); - // Calculate the delta Q to remove the node from the previous community - Nodes = - SELECT s - FROM Candidates:s -({links_to_edge_name}:e)- :t - WHERE e.layer_weight_map.containsKey(layer) - AND s.@batch_id == batch_id - ACCUM DOUBLE weight = e.layer_weight_map.get(layer), - IF s.@{community_id_attribute_name} == t.@{community_id_attribute_name} THEN - s.@k_in += weight - ELSE - s.@community_k_in_map += (t.@{community_id_attribute_name} -> weight) - END - POST-ACCUM - s.@delta_Q_remove = 2 * s.@k_self_loop - 2 * s.@k_in + s.@k * (s.@community_sum_total - s.@k) / @@m, - s.@k_in = 0, - s.@best_move = MyTuple(@@min_double, s, to_string(s.id)) // reset the delta_Q_add - ; - // Find the best move - Nodes = - SELECT s - FROM Nodes:s -({links_to_edge_name}:e)- :t - WHERE e.layer_weight_map.containsKey(layer) - AND s.@{community_id_attribute_name} != t.@{community_id_attribute_name} - ACCUM DOUBLE delta_Q_add = 2 * s.@community_k_in_map.get(t.@{community_id_attribute_name}) - s.@k * t.@community_sum_total / @@m, - s.@best_move += MyTuple(delta_Q_add, t.@{community_id_attribute_name}, t.@community_vid) - POST-ACCUM - IF s.@delta_Q_remove + s.@best_move.delta_Q_add > 0 THEN // the gain (delta Q) is positive - s.@to_change_community = TRUE - END, - s.@community_k_in_map.clear() 
- HAVING s.@to_change_community == TRUE - ; - ChangedNodes = ChangedNodes UNION Nodes; - END; - END; - // If two nodes swap, only change the community of one of them - SwapNodes = - SELECT s - FROM ChangedNodes:s -({links_to_edge_name}:e)- :t - WHERE e.layer_weight_map.containsKey(layer) - AND s.@best_move.community == t.@{community_id_attribute_name} - AND t.@to_change_community == TRUE - AND t.@best_move.community == s.@{community_id_attribute_name} - // only change the one with larger delta Q or the one with smaller @vid if delta Q are the same - AND (s.@delta_Q_remove + s.@best_move.delta_Q_add < t.@delta_Q_remove + t.@best_move.delta_Q_add - OR (abs((s.@delta_Q_remove + s.@best_move.delta_Q_add) - (t.@delta_Q_remove + t.@best_move.delta_Q_add)) < 0.00000000001 - AND s.@vid > t.@vid)) - POST-ACCUM - s.@to_change_community = FALSE - ; - LOG(TRUE, SwapNodes.size()); - ChangedNodes = ChangedNodes MINUS SwapNodes; - LOG(TRUE, ChangedNodes.size()); - // Place each node of ChangedNodes in the community in which the gain is maximum - ChangedNodes = - SELECT s - FROM ChangedNodes:s - POST-ACCUM - s.@{community_id_attribute_name} = s.@best_move.community, - s.@community_vid = s.@best_move.ext_vid, - s.@to_change_community = FALSE - ; - - @@move_cnt += ChangedNodes.size(); - // Get all neighbours of the changed node that do not belong to the node’s new community - Candidates = - SELECT t - FROM ChangedNodes:s -({links_to_edge_name}:e)- :t - WHERE e.layer_weight_map.containsKey(layer) - AND t.@{community_id_attribute_name} != s.@{community_id_attribute_name} - ; - LOG(TRUE, Candidates.size()); - END; - - PRINT @@move_cnt AS Delta; - - // Coarsening - LOG(TRUE, "Coarsening"); - UINT new_layer = layer + 1; - @@community_sum_total_map.clear(); - Tmp = - SELECT s - FROM All_Nodes:s -({links_to_edge_name}:e)- :t - WHERE e.layer_weight_map.containsKey(layer) - ACCUM IF s.@{community_id_attribute_name} == t.@{community_id_attribute_name} THEN - DOUBLE weight = e.layer_weight_map.get(layer), - @@community_sum_in_map += (s.@{community_id_attribute_name} -> weight) - END - POST-ACCUM - //f_belongs_to.println(s.id, s.@{community_id_attribute_name}, new_layer), - INSERT INTO {belongs_to_edge_name} VALUES (s, str_to_int(s.@community_vid), new_layer), - IF @@community_sum_in_map.containsKey(s) THEN - //f_links_to.println(s.id, s.id, @@community_sum_in_map.get(s), new_layer) - INSERT INTO {links_to_edge_name} VALUES (s,s, (new_layer -> @@community_sum_in_map.get(s))) - END - ; - LOG(TRUE, @@community_sum_in_map.size()); - @@community_sum_in_map.clear(); - Tmp = - SELECT s - FROM All_Nodes:s -({links_to_edge_name}:e)- :t - WHERE e.layer_weight_map.containsKey(layer) - ACCUM DOUBLE weight = e.layer_weight_map.get(layer), - IF s.@{community_id_attribute_name} != t.@{community_id_attribute_name} THEN - @@source_target_k_in_map += (s.@{community_id_attribute_name} -> (t.@{community_id_attribute_name} -> weight)) - END - POST-ACCUM - IF @@source_target_k_in_map.containsKey(s) THEN - FOREACH (target_community, k_in) IN @@source_target_k_in_map.get(s) DO - //f_links_to.println(s.uniq_id, target_community, k_in, new_layer) - INSERT INTO {links_to_edge_name} VALUES (s,target_community, (new_layer -> k_in)) - END - END - ; - LOG(TRUE, @@source_target_k_in_map.size()); - @@source_target_k_in_map.clear(); - PRINT @@links_to_check; - LOG(TRUE, "Query finished!"); -}} diff --git a/common/gsql/graphRAG/louvain_old/louvain_3_final_community.gsql b/common/gsql/graphRAG/louvain_old/louvain_3_final_community.gsql deleted file mode 
100644 index 75cbad7e..00000000 --- a/common/gsql/graphRAG/louvain_old/louvain_3_final_community.gsql +++ /dev/null @@ -1,44 +0,0 @@ -USE GRAPH {graph_name} -DROP QUERY {query_name} -CREATE OR REPLACE DISTRIBUTED QUERY graphRAG_louvain_3( - UINT top_layer = 2 -) FOR GRAPH {graph_name} SYNTAX v1 {{ - MinAccum @{community_id_attribute_name}; // the community ID of the node - INT layer = top_layer; - - // Initialization - LOG(TRUE, "Query started!"); - All_Nodes = {{{entity_vertex_name}.*}}; - - // Top layer - Nodes = - SELECT t - FROM All_Nodes:s -(reverse_{belongs_to_edge_name}:e)- :t - WHERE layer IN e.layer_set - ACCUM t.@{community_id_attribute_name} = to_string(s.id) - ; - LOG(TRUE, layer, Nodes.size()); - - // Other layers - WHILE Nodes.size() > 0 AND layer > 0 DO - layer = layer - 1; - Nodes = - SELECT t - FROM Nodes:s -(reverse_{belongs_to_edge_name}:e)- :t - WHERE layer IN e.layer_set - ACCUM t.@{community_id_attribute_name} = s.@{community_id_attribute_name} - ; - LOG(TRUE, layer, Nodes.size()); - END; - - // Write to the file - Nodes = - SELECT s - FROM Nodes:s - POST-ACCUM - //f.println(s.uniq_id, s.@{community_id_attribute_name}) - s.{community_id_attribute_name} = s.@{community_id_attribute_name} - - ; - LOG(TRUE, "Query finished!"); -}} diff --git a/common/gsql/graphRAG/louvain_old/louvain_4_modularity_1_for_pass.gsql b/common/gsql/graphRAG/louvain_old/louvain_4_modularity_1_for_pass.gsql deleted file mode 100644 index 0058d0ee..00000000 --- a/common/gsql/graphRAG/louvain_old/louvain_4_modularity_1_for_pass.gsql +++ /dev/null @@ -1,39 +0,0 @@ -USE GRAPH {graph_name} -DROP QUERY {query_name} -CREATE OR REPLACE DISTRIBUTED QUERY graphRAG_louvain_4a( - UINT layer=0 -) FOR GRAPH {graph_name} SYNTAX v1 {{ - SumAccum @@sum_weight; // the sum of the weights of all the links in the network - MapAccum, SumAccum> @@community_total_weight_map; // community ID C -> the sum of the weights of the links incident to nodes in C - MapAccum, SumAccum> @@community_in_weight_map; // community ID -> the sum of the weights of the links inside the community - SumAccum @@modularity; - - All_Nodes = {{{entity_vertex_name}.*}}; - All_Nodes = - SELECT s - FROM All_Nodes:s -({links_to_edge_name}:e)- :t - WHERE e.layer_weight_map.containsKey(layer) - ACCUM DOUBLE weight = e.layer_weight_map.get(layer), - IF s == t THEN - @@community_in_weight_map += (s -> weight) - END, - @@community_total_weight_map += (s -> weight), - @@sum_weight += weight - ; - LOG(TRUE, All_Nodes.size()); - @@modularity = 0; - FOREACH (community, total_weight) IN @@community_total_weight_map DO - DOUBLE in_weight = 0; - IF @@community_in_weight_map.containsKey(community) THEN - in_weight = @@community_in_weight_map.get(community); - END; - @@modularity += in_weight / @@sum_weight - pow(total_weight / @@sum_weight, 2); - END; - // PRINT @@modularity, @@community_in_weight_map, @@community_total_weight_map, @@sum_weight; - PRINT layer; - PRINT @@modularity AS modularity; - PRINT @@community_total_weight_map.size() AS community_number; - PRINT All_Nodes.size(); - @@community_in_weight_map.clear(); - @@community_total_weight_map.clear(); -}} diff --git a/common/gsql/graphRAG/louvain_old/louvain_4_modularity_2_final.gsql b/common/gsql/graphRAG/louvain_old/louvain_4_modularity_2_final.gsql deleted file mode 100644 index 31ba4d0b..00000000 --- a/common/gsql/graphRAG/louvain_old/louvain_4_modularity_2_final.gsql +++ /dev/null @@ -1,52 +0,0 @@ -USE GRAPH {graph_name} -DROP QUERY {query_name} -CREATE OR REPLACE DISTRIBUTED QUERY 
graphRAG_louvain_4b( -) FOR GRAPH {graph_name} SYNTAX v1 {{ - SumAccum @@sum_weight; // the sum of the weights of all the links in the network - MapAccum> @@community_total_weight_map; // community ID C -> the sum of the weights of the links incident to nodes in C - MapAccum> @@community_in_weight_map; // community ID -> the sum of the weights of the links inside the community - SumAccum @@modularity; - MapAccum> @@Community_sizes; - MapAccum> @@count_of_sizes; - AvgAccum @@avg_community_size; - - DOUBLE wt = 1.0; - All_Nodes = {{{entity_vertex_name}.*}}; - Nodes = - SELECT s - FROM All_Nodes:s -({relation_edge_name}:e)- :t - ACCUM IF s.{community_id_attribute_name} == t.{community_id_attribute_name} THEN - @@community_in_weight_map += (s.{community_id_attribute_name} -> wt) - END, - @@community_total_weight_map += (s.{community_id_attribute_name} -> wt), - @@sum_weight += wt - ; - @@modularity = 0; - FOREACH (community, total_weight) IN @@community_total_weight_map DO - DOUBLE in_weight = 0; - IF @@community_in_weight_map.containsKey(community) THEN - in_weight = @@community_in_weight_map.get(community); - END; - @@modularity += in_weight / @@sum_weight - pow(total_weight / @@sum_weight, 2); - END; - - _tmp = - SELECT s - FROM All_Nodes:s - POST-ACCUM - @@Community_sizes += (s.{community_id_attribute_name} -> 1); - - FOREACH (comm, cnt) IN @@Community_sizes DO - @@count_of_sizes += (cnt -> 1); - @@avg_community_size += cnt; - END; - - // PRINT @@modularity, @@community_in_weight_map, @@community_total_weight_map, @@sum_weight; - PRINT @@modularity AS modularity; - PRINT @@community_total_weight_map.size() AS community_number; - PRINT @@count_of_sizes AS num_communities_by_size; - PRINT @@avg_community_size AS avg_community_size; - - @@community_in_weight_map.clear(); - @@community_total_weight_map.clear(); -}} diff --git a/common/gsql/graphRAG/louvain_old/louvain_5_reset.gsql b/common/gsql/graphRAG/louvain_old/louvain_5_reset.gsql deleted file mode 100644 index 7590935a..00000000 --- a/common/gsql/graphRAG/louvain_old/louvain_5_reset.gsql +++ /dev/null @@ -1,13 +0,0 @@ -USE GRAPH {graph_name} -DROP QUERY {query_name} -CREATE OR REPLACE DISTRIBUTED QUERY graphRAG_louvain_5_reset( -) FOR GRAPH {graph_name} SYNTAX v1 {{ - - // Initialization - Nodes = {{{entity_vertex_name}.*}}; - - // Top layer - DELETE e - FROM Nodes:s -(({belongs_to_edge_name}|{links_to_edge_name}):e)- :t - ; -}} diff --git a/copilot/docs/notebooks/graphrag.ipynb b/copilot/docs/notebooks/graphrag.ipynb index e915f392..411f5d62 100644 --- a/copilot/docs/notebooks/graphrag.ipynb +++ b/copilot/docs/notebooks/graphrag.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 16, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -24,7 +24,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -33,7 +33,7 @@ "'The graph GraphRAG_pytgdocs is created.'" ] }, - "execution_count": 17, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -45,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -54,18 +54,18 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'host_name': 'https://algotesting.i.tgcloud.io',\n", - " 'schema_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: 
[add_supportai_schema].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_schema\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 0)\\\\nTrying to add local vertex \\'DocumentChunk\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Document\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Concept\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Entity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Relationship\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'DocumentCollection\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Content\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Community\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'ResolvedEntity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CONTENT\\' and its reverse edge \\'reverse_HAS_CONTENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_CHILD_OF\\' and its reverse edge \\'reverse_IS_CHILD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_HEAD_OF\\' and its reverse edge \\'reverse_IS_HEAD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_TAIL\\' and its reverse edge \\'reverse_HAS_TAIL\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_RELATIONSHIP\\' and its reverse edge \\'reverse_DESCRIBES_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_ENTITY\\' and its reverse edge \\'reverse_DESCRIBES_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_ENTITY\\' and its reverse edge \\'reverse_CONTAINS_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'MENTIONS_RELATIONSHIP\\' and its reverse edge \\'reverse_MENTIONS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_AFTER\\' and its reverse edge \\'reverse_IS_AFTER\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CHILD\\' and its reverse edge \\'reverse_HAS_CHILD\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_RELATIONSHIP\\' and its reverse edge \\'reverse_HAS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_DOCUMENT\\' and its reverse edge \\'reverse_CONTAINS_DOCUMENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RELATIONSHIP\\' and its reverse edge \\'reverse_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVES_TO\\' and its reverse edge \\'reverse_RESOLVES_TO\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVED_RELATIONSHIP\\' and its reverse edge \\'reverse_RESOLVED_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IN_COMMUNITY\\' and its reverse edge \\'reverse_IN_COMMUNITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'LINKS_TO\\' and its reverse edge \\'reverse_LINKS_TO\\' to the graph 
\\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_PARENT\\' and its reverse edge \\'reverse_HAS_PARENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 1\\\\nThe job add_supportai_schema completes in 1.043 seconds!\\\\nLocal schema change succeeded.\"',\n", - " 'index_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_indexes].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_indexes\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 1)\\\\nTrying to add index \\'doc_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_indexepoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 2\\\\nThe job add_supportai_indexes completes in 1.066 seconds!\\\\nLocal schema change succeeded.\"'}" + " 'schema_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_schema].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. 
To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_schema\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 0)\\\\nTrying to add local vertex \\'DocumentChunk\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Document\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Concept\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Entity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Relationship\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'DocumentCollection\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Content\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Community\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'ResolvedEntity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CONTENT\\' and its reverse edge \\'reverse_HAS_CONTENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_CHILD_OF\\' and its reverse edge \\'reverse_IS_CHILD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_HEAD_OF\\' and its reverse edge \\'reverse_IS_HEAD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_TAIL\\' and its reverse edge \\'reverse_HAS_TAIL\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_RELATIONSHIP\\' and its reverse edge \\'reverse_DESCRIBES_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_ENTITY\\' and its reverse edge \\'reverse_DESCRIBES_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_ENTITY\\' and its reverse edge \\'reverse_CONTAINS_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'MENTIONS_RELATIONSHIP\\' and its reverse edge \\'reverse_MENTIONS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_AFTER\\' and its reverse edge \\'reverse_IS_AFTER\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CHILD\\' and its reverse edge \\'reverse_HAS_CHILD\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_RELATIONSHIP\\' and its reverse edge \\'reverse_HAS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_DOCUMENT\\' and its reverse edge \\'reverse_CONTAINS_DOCUMENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RELATIONSHIP\\' and its reverse edge \\'reverse_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVES_TO\\' and its reverse edge \\'reverse_RESOLVES_TO\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVED_RELATIONSHIP\\' and its reverse edge \\'reverse_RESOLVED_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IN_COMMUNITY\\' and its reverse edge \\'reverse_IN_COMMUNITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'LINKS_TO\\' and its reverse edge \\'reverse_LINKS_TO\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_PARENT\\' and its reverse edge \\'reverse_HAS_PARENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 
1\\\\nThe job add_supportai_schema completes in 1.845 seconds!\\\\nLocal schema change succeeded.\"',\n", + " 'index_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_indexes].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_indexes\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 1)\\\\nTrying to add index \\'doc_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_indexepoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 2\\\\nThe job add_supportai_indexes completes in 1.085 seconds!\\\\nLocal schema change succeeded.\"'}" ] }, - "execution_count": 19, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -79,7 +79,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -95,18 +95,18 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'job_name': 'load_documents_content_json_852f54bfd00a475fa4efc3ba9319f0ac',\n", - " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_852f54bfd00a475fa4efc3ba9319f0ac.stream.SupportAI_GraphRAG_pytgdocs_6a6331e3e5e248eaae389788c9bab325.1723217024268',\n", - " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_852f54bfd00a475fa4efc3ba9319f0ac.stream.SupportAI_GraphRAG_pytgdocs_6a6331e3e5e248eaae389788c9bab325.1723217024268'}" + "{'job_name': 'load_documents_content_json_cde7e4db979b4ba8a0b6ec5eb927f875',\n", + " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_cde7e4db979b4ba8a0b6ec5eb927f875.stream.SupportAI_GraphRAG_pytgdocs_48ee36da7b7644e4995722a6e057d446.1723494758507',\n", + " 'log_location': 
'/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_cde7e4db979b4ba8a0b6ec5eb927f875.stream.SupportAI_GraphRAG_pytgdocs_48ee36da7b7644e4995722a6e057d446.1723494758507'}" ] }, - "execution_count": 21, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -121,7 +121,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -145,7 +145,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -155,7 +155,7 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[23], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43masdf\u001b[49m\n", + "Cell \u001b[0;32mIn[32], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43masdf\u001b[49m\n", "\u001b[0;31mNameError\u001b[0m: name 'asdf' is not defined" ] } @@ -166,7 +166,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -183,22 +183,28 @@ "execution_count": null, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sleep\n" + ] + }, { "data": { "text/plain": [ - "{'job_name': 'load_documents_content_json_3e62fb87723945ea9a0380956694b7ec',\n", - " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_3e62fb87723945ea9a0380956694b7ec.stream.SupportAI_GraphRAG_pytgdocs_cc751adab29643b28af1b7bf13b6515b.1723213722186',\n", - " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_3e62fb87723945ea9a0380956694b7ec.stream.SupportAI_GraphRAG_pytgdocs_cc751adab29643b28af1b7bf13b6515b.1723213722186'}" + "{'job_name': 'load_documents_content_json_8a4ea730f21c43abbb58d818b9dd4d5a',\n", + " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_8a4ea730f21c43abbb58d818b9dd4d5a.stream.SupportAI_GraphRAG_pytgdocs_7aed8a01c9c1432b8026ea6c708bf08b.1723490129603',\n", + " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_8a4ea730f21c43abbb58d818b9dd4d5a.stream.SupportAI_GraphRAG_pytgdocs_7aed8a01c9c1432b8026ea6c708bf08b.1723490129603'}" ] }, - "execution_count": 11, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# for v in [\"Document\", \"Content\", \"DocumentChunk\", \"Entity\",\"ResolvedEntity\"]:\n", - "# for v in [\"ResolvedEntity\"]:\n", + "for v in [\"Document\", \"Content\", \"DocumentChunk\", \"Entity\",\"ResolvedEntity\",\"Community\"]:\n", "# for v in [\"ResolvedEntity\"]:\n", " try:\n", " conn.delVertices(v)\n", @@ -207,6 +213,7 @@ "\n", "import time\n", "\n", + "print('sleep')\n", "time.sleep(3)\n", "conn.ai.runDocumentIngest(\n", " res[\"load_job_id\"],\n", @@ -273,6 +280,33 @@ "r[\"results\"][0][\"attributes\"][\"description\"]" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "def check_vertex_has_desc(conn, comm: str):\n", + " headers = make_headers(conn)\n", + " with httpx.Client(timeout=None) as client:\n", + " resp = client.get(\n", + " f\"{conn.restppUrl}/graph/{conn.graphname}/vertices/Community/{comm}\",\n", + " headers=headers,\n", + " )\n", + " resp.raise_for_status()\n", + "\n", + " print(json.dumps(resp.json(),indent=2))\n", + " desc = 
resp.json()[\"results\"][0][\"attributes\"][\"description\"]\n", + " print(f\">>>*****{comm}:{desc}********\", flush=True)\n", + "\n", + " return len(desc) > 0\n", + "check_vertex_has_desc(conn,'Value_Property_1_2')\n", + "conn.upsertVertex(\"Community\",\"Rmse_1_2\",{\n", + " \"description\":\"asdf\"\n", + "})" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/eventual-consistency-service/app/graphrag/community_summarizer.py b/eventual-consistency-service/app/graphrag/community_summarizer.py index d250b1f3..2bef4095 100644 --- a/eventual-consistency-service/app/graphrag/community_summarizer.py +++ b/eventual-consistency-service/app/graphrag/community_summarizer.py @@ -1,7 +1,5 @@ -import json +import re -from langchain.output_parsers import PydanticOutputParser -from langchain.prompts import ChatPromptTemplate from langchain_core.prompts import PromptTemplate from common.llm_services import LLM_Model @@ -12,17 +10,17 @@ You are a helpful assistant responsible for generating a comprehensive summary of the data provided below. Given one or two entities, and a list of descriptions, all related to the same entity or group of entities. Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions. -If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary. +If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary, but do not add any information that is not in the description. Make sure it is written in third person, and include the entity names so we the have full context. ####### -Data- -Entities: {entity_name} +Commuinty Title: {entity_name} Description List: {description_list} -####### -Output: """) +id_pat = re.compile(r"[_\d]*") + class CommunitySummarizer: def __init__( @@ -31,108 +29,16 @@ def __init__( ): self.llm_service = llm_service - def _extract_kg_from_doc(self, doc, chain, parser): - try: - out = chain.invoke( - {"input": doc, "format_instructions": parser.get_format_instructions()} - ) - except Exception as e: - print("Error: ", e) - return {"nodes": [], "rels": []} - try: - if "```json" not in out.content: - json_out = json.loads(out.content.strip("content=")) - else: - json_out = json.loads( - out.content.split("```")[1].strip("```").strip("json").strip() - ) - - formatted_rels = [] - for rels in json_out["rels"]: - if isinstance(rels["source"], str) and isinstance(rels["target"], str): - formatted_rels.append( - { - "source": rels["source"], - "target": rels["target"], - "type": rels["relation_type"].replace(" ", "_").upper(), - "definition": rels["definition"], - } - ) - elif isinstance(rels["source"], dict) and isinstance( - rels["target"], str - ): - formatted_rels.append( - { - "source": rels["source"]["id"], - "target": rels["target"], - "type": rels["relation_type"].replace(" ", "_").upper(), - "definition": rels["definition"], - } - ) - elif isinstance(rels["source"], str) and isinstance( - rels["target"], dict - ): - formatted_rels.append( - { - "source": rels["source"], - "target": rels["target"]["id"], - "type": rels["relation_type"].replace(" ", "_").upper(), - "definition": rels["definition"], - } - ) - elif isinstance(rels["source"], dict) and isinstance( - rels["target"], dict - ): - formatted_rels.append( - { - "source": rels["source"]["id"], - "target": rels["target"]["id"], - "type": rels["relation_type"].replace(" ", "_").upper(), - 
"definition": rels["definition"], - } - ) - else: - raise Exception("Relationship parsing error") - formatted_nodes = [] - for node in json_out["nodes"]: - formatted_nodes.append( - { - "id": node["id"], - "type": node["node_type"].replace(" ", "_").capitalize(), - "definition": node["definition"], - } - ) - - # filter relationships and nodes based on allowed types - if self.strict_mode: - if self.allowed_vertex_types: - formatted_nodes = [ - node - for node in formatted_nodes - if node["type"] in self.allowed_vertex_types - ] - if self.allowed_edge_types: - formatted_rels = [ - rel - for rel in formatted_rels - if rel["type"] in self.allowed_edge_types - ] - return {"nodes": formatted_nodes, "rels": formatted_rels} - except: - print("Error Processing: ", out) - return {"nodes": [], "rels": []} - async def summarize(self, name: str, text: list[str]) -> CommunitySummary: - # parser = PydanticOutputParser(pydantic_object=CommunitySummary) structured_llm = self.llm_service.model.with_structured_output(CommunitySummary) chain = SUMMARIZE_PROMPT | structured_llm + + # remove iteration tags from name + name = id_pat.sub("", name) summary = await chain.ainvoke( { "entity_name": name, "description_list": text, - # "format_instructions": parser.get_format_instructions(), } ) - # summary = self._extract_kg_from_doc(text, chain, parser) - # summary = None return summary.summary diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py index d4e3a7d6..d1e7fdc0 100644 --- a/eventual-consistency-service/app/graphrag/graph_rag.py +++ b/eventual-consistency-service/app/graphrag/graph_rag.py @@ -6,7 +6,13 @@ import httpx from aiochannel import Channel from graphrag import workers -from graphrag.util import http_timeout, init, make_headers, stream_ids +from graphrag.util import ( + check_vertex_has_desc, + http_timeout, + init, + make_headers, + stream_ids, +) from pyTigerGraph import TigerGraphConnection from common.config import embedding_service @@ -210,7 +216,6 @@ async def resolve_entities( async with asyncio.TaskGroup() as grp: # for every entity async for entity_id in entity_chan: - print(f"***Entity ID from chan {entity_id}", flush=True) grp.create_task( workers.resolve_entity(conn, upsert_chan, emb_store, entity_id) ) @@ -227,7 +232,7 @@ async def resolve_entities( res.raise_for_status() -async def communities(conn: TigerGraphConnection, community_chan: Channel): +async def communities(conn: TigerGraphConnection, comm_process_chan: Channel): """ Run louvain """ @@ -250,23 +255,21 @@ async def communities(conn: TigerGraphConnection, community_chan: Channel): ) res.raise_for_status() mod = res.json()["results"][0]["mod"] - print(f"****mod 1: {mod}", flush=True) - await community_chan.put(1) + logger.info(f"****mod pass 1: {mod}") + await stream_communities(conn, 1, comm_process_chan) # nth pass: Iterate on Resolved Entities until modularity stops increasing prev_mod = -10 i = 0 - # for _ in range(1, 5): - prev_mod = 0 while abs(prev_mod - mod) > 0.0000001 and prev_mod != 0: prev_mod = mod - logger.info(f"Running louvain on Communities (iteration: {i})") i += 1 + logger.info(f"Running louvain on Communities (iteration: {i})") # louvain pass async with httpx.AsyncClient(timeout=None) as client: res = await client.get( f"{conn.restppUrl}/query/{conn.graphname}/graphrag_louvain_communities", - params={"n_batches": 1}, + params={"n_batches": 1, "iteration": i}, headers=headers, ) @@ -281,20 +284,20 @@ async def communities(conn: 
TigerGraphConnection, community_chan: Channel): ) res.raise_for_status() mod = res.json()["results"][0]["mod"] - print(f"*** mod {i+1}: {mod}", flush=True) - print(f"****** mod diff: {abs(prev_mod - mod)}", flush=True) + logger.info(f"*** mod pass {i+1}: {mod} (diff= {abs(prev_mod - mod)})") # write iter to chan for layer to be processed - await community_chan.put(i + 1) + await stream_communities(conn, i + 1, comm_process_chan) # TODO: erase last run since it's ∆q to the run before it will be small logger.info("closing communities chan") - community_chan.close() + comm_process_chan.close() async def stream_communities( conn: TigerGraphConnection, - community_chan: Channel, + # community_chan: Channel, + i: int, comm_process_chan: Channel, ): """ @@ -305,37 +308,48 @@ async def stream_communities( headers = make_headers(conn) # TODO: # can only do one layer at a time to ensure that every child community has their descriptions - async for i in community_chan: - # get the community from that layer - async with httpx.AsyncClient(timeout=None) as client: - resp = await client.get( - f"{conn.restppUrl}/query/{conn.graphname}/stream_community", - params={"iter": i}, - headers=headers, - ) - resp.raise_for_status() - comms = resp.json()["results"][0]["Comms"] - for c in comms: - await comm_process_chan.put((i, c["v_id"])) + # async for i in community_chan: + # get the community from that layer + async with httpx.AsyncClient(timeout=None) as client: + resp = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/stream_community", + params={"iter": i}, + headers=headers, + ) + resp.raise_for_status() + comms = resp.json()["results"][0]["Comms"] + + for c in comms: + await comm_process_chan.put((i, c["v_id"])) + + # Wait for all communities for layer i to be processed before doing next layer + # all community descriptions must be populated before the next layer can be processed + if len(comms) > 0: + while not await check_vertex_has_desc(conn, i): + logger.info(f"Waiting for layer{i} to finish processing") + await asyncio.sleep(5) + await asyncio.sleep(3) logger.info("stream_communities done") logger.info("closing comm_process_chan") - comm_process_chan.close() + # comm_process_chan.close() async def summarize_communities( conn: TigerGraphConnection, comm_process_chan: Channel, upsert_chan: Channel, + embed_chan: Channel, ): async with asyncio.TaskGroup() as tg: async for c in comm_process_chan: - tg.create_task(workers.process_community(conn, upsert_chan, *c)) - break + tg.create_task(workers.process_community(conn, upsert_chan, embed_chan, *c)) + # break logger.info("closing upsert_chan") upsert_chan.close() + embed_chan.close() async def run(graphname: str, conn: TigerGraphConnection): @@ -347,14 +361,17 @@ async def run(graphname: str, conn: TigerGraphConnection): - embeddings - entities/relationships (and their embeddings) - upsert everything to the graph + - Resolve Entities + Ex: "Vincent van Gogh" and "van Gogh" should be resolved to "Vincent van Gogh" """ extractor, index_stores = await init(conn) init_start = time.perf_counter() - abc = True - abc = False - if abc: + doc_process_switch = True + entity_resolution_switch = True + community_detection_switch = True + if doc_process_switch: logger.info("Doc Processing Start") docs_chan = Channel(1) embed_chan = Channel(100) @@ -381,7 +398,7 @@ async def run(graphname: str, conn: TigerGraphConnection): # Entity Resolution entity_start = time.perf_counter() - if abc: + if entity_resolution_switch: logger.info("Entity Processing Start") 
entities_chan = Channel(100) upsert_chan = Channel(100) @@ -401,7 +418,7 @@ async def run(graphname: str, conn: TigerGraphConnection): # Community Detection community_start = time.perf_counter() - if True: + if community_detection_switch: # FIXME: delete community delete for v in ["Community"]: try: @@ -409,20 +426,22 @@ async def run(graphname: str, conn: TigerGraphConnection): except: pass logger.info("Community Processing Start") - communities_chan = Channel(1) upsert_chan = Channel(10) comm_process_chan = Channel(100) upsert_chan = Channel(100) + embed_chan = Channel(100) async with asyncio.TaskGroup() as grp: # run louvain - grp.create_task(communities(conn, communities_chan)) + # grp.create_task(communities(conn, communities_chan)) + grp.create_task(communities(conn, comm_process_chan)) # get the communities + # grp.create_task( stream_communities(conn, communities_chan, comm_process_chan)) + # summarize each community grp.create_task( - stream_communities(conn, communities_chan, comm_process_chan) + summarize_communities(conn, comm_process_chan, upsert_chan, embed_chan) ) - # summarize each community - grp.create_task(summarize_communities(conn, comm_process_chan, upsert_chan)) grp.create_task(upsert(upsert_chan)) + grp.create_task(embed(embed_chan, index_stores, graphname)) community_end = time.perf_counter() logger.info("Community Processing End") diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index 6876b5de..bcf1befe 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -2,6 +2,7 @@ import base64 import json import logging +import re import traceback from glob import glob @@ -65,6 +66,7 @@ async def init( "common/gsql/graphRAG/SetEpochProcessing", "common/gsql/graphRAG/ResolveRelationships", "common/gsql/graphRAG/get_community_children", + "common/gsql/graphRAG/communities_have_desc", "common/gsql/graphRAG/louvain/graphrag_louvain_init", "common/gsql/graphRAG/louvain/graphrag_louvain_communities", "common/gsql/graphRAG/louvain/modularity", @@ -91,6 +93,7 @@ async def init( "Entity", "Relationship", # "Concept", + "Community", ], ) index_stores = {} @@ -108,7 +111,7 @@ async def init( vector_field=milvus_config.get("vector_field", "document_vector"), text_field=milvus_config.get("text_field", "document_content"), vertex_field=vertex_field, - drop_old=False, + drop_old=True, ) LogWriter.info(f"Initializing {name}") @@ -174,6 +177,10 @@ def map_attrs(attributes: dict): def process_id(v_id: str): v_id = v_id.replace(" ", "_").replace("/", "") + + has_func = re.compile(r"(.*)\(").findall(v_id) + if len(has_func) > 0: + v_id = has_func[0] if v_id == "''" or v_id == '""': return "" @@ -186,6 +193,7 @@ async def upsert_vertex( vertex_id: str, attributes: dict, ): + logger.info(f"Upsert vertex: {vertex_type} {vertex_id}") vertex_id = vertex_id.replace(" ", "_") attrs = map_attrs(attributes) data = json.dumps({"vertices": {vertex_type: {vertex_id: attrs}}}) @@ -260,10 +268,26 @@ async def get_commuinty_children(conn, i: int, c: str): descrs = [] for d in resp.json()["results"][0]["children"]: desc = d["attributes"]["description"] - if len(desc) == 0: + if i == 1 and all(len(x) == 0 for x in desc): + desc = [d["v_id"]] + elif len(desc) == 0: desc = d["v_id"] descrs.append(desc) - print(">>>", descrs, flush=True) return descrs + + +async def check_vertex_has_desc(conn, i: int): + headers = make_headers(conn) + async with httpx.AsyncClient(timeout=None) as 
client: + resp = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/communities_have_desc", + params={"iter": i}, + headers=headers, + ) + resp.raise_for_status() + + res = resp.json()["results"][0]["all_have_desc"] + + return res diff --git a/eventual-consistency-service/app/graphrag/workers.py b/eventual-consistency-service/app/graphrag/workers.py index 22980d96..77f3d6d8 100644 --- a/eventual-consistency-service/app/graphrag/workers.py +++ b/eventual-consistency-service/app/graphrag/workers.py @@ -299,9 +299,6 @@ async def resolve_entity( f"aget_k_closest should, minimally, return the entity itself.\n{results}" ) raise Exception() - # FIXME: deleteme - # if entity_id == "Dataframe": - # print("result:", entity_id, results) # merge all entities into the ResolvedEntity vertex # use the longest v_id as the resolved entity's v_id @@ -346,8 +343,9 @@ async def resolve_entity( async def process_community( conn: TigerGraphConnection, upsert_chan: Channel, + embed_chan: Channel, i: int, - c: str, + comm_id: str, ): """ https://github.com/microsoft/graphrag/blob/main/graphrag/prompt_tune/template/community_report_summarization.py @@ -357,33 +355,39 @@ async def process_community( embed summaries """ - print(i, c, flush=True) + logger.info(f"Processing Community: {comm_id}") # get the children of the community - children = await util.get_commuinty_children(conn, i, c) + children = await util.get_commuinty_children(conn, i, comm_id) if i == 1: tmp = [] for c in children: tmp.extend(c) children = list(filter(lambda x: len(x) > 0, tmp)) - print(">>>", children, flush=True) - llm = ecc_util.get_llm_service() - summarizer = community_summarizer.CommunitySummarizer(llm) - summary = await summarizer.summarize(c, children) - await upsert_chan.put((upsert_summary, (conn,summary))) - - -async def upsert_summary(conn: TigerGraphConnection, summary: str): - print(f"SUMMARY:> {summary}", flush=True) - - # vertex_id = vertex_id.replace(" ", "_") - # attrs = map_attrs(attributes) - # data = json.dumps({"vertices": {vertex_type: {vertex_id: attrs}}}) - # headers = make_headers(conn) - # async with httpx.AsyncClient(timeout=http_timeout) as client: - # res = await client.post( - # f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers - # ) - # - # res.raise_for_status() - # + comm_id = util.process_id(comm_id) + + # if the community only has one child, use its description + if len(children) == 1: + summary = children[0] + else: + llm = ecc_util.get_llm_service() + summarizer = community_summarizer.CommunitySummarizer(llm) + summary = await summarizer.summarize(comm_id, children) + + await upsert_chan.put( + ( + util.upsert_vertex, # func to call + ( + conn, + "Community", # v_type + comm_id, # v_id + { # attrs + "description": summary, + "iteration": i, + }, + ), + ) + ) + + # (v_id, content, index_name) + await embed_chan.put((comm_id, summary, "Community")) From 08aca044b071352020cb3bfec3e743e8e178aaa0 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Mon, 12 Aug 2024 16:54:16 -0400 Subject: [PATCH 11/91] cleanup --- .../app/graphrag/graph_rag.py | 15 +-------------- .../app/graphrag/workers.py | 3 +-- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py index d1e7fdc0..86f172b8 100644 --- a/eventual-consistency-service/app/graphrag/graph_rag.py +++ b/eventual-consistency-service/app/graphrag/graph_rag.py 
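process_community above hands each write to the upsert worker as a (function, args) tuple; what eventually runs is a single REST++ POST whose body nests vertex type, vertex id, and attribute values. A rough sketch of that request — assuming the same payload shape as util.upsert_vertex/map_attrs and a bearer token already present on the connection (util.make_headers also covers basic auth):

import json

import httpx


async def upsert_community(conn, comm_id: str, description: str, iteration: int):
    # REST++ vertex upsert payload:
    # {"vertices": {<vertex type>: {<vertex id>: {<attribute>: {"value": ...}}}}}
    data = json.dumps(
        {
            "vertices": {
                "Community": {
                    comm_id: {
                        "description": {"value": description},
                        "iteration": {"value": iteration},
                    }
                }
            }
        }
    )
    headers = {"Authorization": f"Bearer {conn.apiToken}"}  # simplified; see make_headers
    async with httpx.AsyncClient(timeout=None) as client:
        res = await client.post(
            f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers
        )
    res.raise_for_status()
    return res.json()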
@@ -19,8 +19,6 @@ from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore from common.extractors.BaseExtractor import BaseExtractor -# http_logs = logging.getLogger("httpx") -# http_logs.setLevel(logging.WARNING) logger = logging.getLogger(__name__) consistency_checkers = {} @@ -192,8 +190,6 @@ async def stream_entities( for i in ids["ids"]: if len(i) > 0: await entity_chan.put(i) - # break - # break # one batch logger.info("stream_enities done") # close the docs chan -- this function is the only sender @@ -296,7 +292,6 @@ async def communities(conn: TigerGraphConnection, comm_process_chan: Channel): async def stream_communities( conn: TigerGraphConnection, - # community_chan: Channel, i: int, comm_process_chan: Channel, ): @@ -333,7 +328,6 @@ async def stream_communities( logger.info("stream_communities done") logger.info("closing comm_process_chan") - # comm_process_chan.close() async def summarize_communities( @@ -345,7 +339,6 @@ async def summarize_communities( async with asyncio.TaskGroup() as tg: async for c in comm_process_chan: tg.create_task(workers.process_community(conn, upsert_chan, embed_chan, *c)) - # break logger.info("closing upsert_chan") upsert_chan.close() @@ -369,7 +362,7 @@ async def run(graphname: str, conn: TigerGraphConnection): init_start = time.perf_counter() doc_process_switch = True - entity_resolution_switch = True + entity_resolution_switch = True community_detection_switch = True if doc_process_switch: logger.info("Doc Processing Start") @@ -419,12 +412,6 @@ async def run(graphname: str, conn: TigerGraphConnection): # Community Detection community_start = time.perf_counter() if community_detection_switch: - # FIXME: delete community delete - for v in ["Community"]: - try: - conn.delVertices(v) - except: - pass logger.info("Community Processing Start") upsert_chan = Channel(10) comm_process_chan = Channel(100) diff --git a/eventual-consistency-service/app/graphrag/workers.py b/eventual-consistency-service/app/graphrag/workers.py index 77f3d6d8..755b1085 100644 --- a/eventual-consistency-service/app/graphrag/workers.py +++ b/eventual-consistency-service/app/graphrag/workers.py @@ -151,7 +151,7 @@ async def get_vert_desc(conn, v_id, node: Node): exists = await util.check_vertex_exists(conn, v_id) # if vertex exists, get description content and append this description to it if not exists["error"]: - # dedup descriptions + # deduplicate descriptions desc.extend(exists["results"][0]["attributes"]["description"]) desc = list(set(desc)) return desc @@ -316,7 +316,6 @@ async def resolve_entity( "ResolvedEntity", # v_type resolved_entity_id, # v_id { # attrs - # "id": resolved_entity_id, }, ), ) From f2828406ee10fd71b486a32fb0e704d6db895cb3 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Mon, 12 Aug 2024 17:01:26 -0400 Subject: [PATCH 12/91] fmt after merge conflicts --- common/embeddings/milvus_embedding_store.py | 8 +++----- eventual-consistency-service/app/main.py | 7 +++++-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/common/embeddings/milvus_embedding_store.py b/common/embeddings/milvus_embedding_store.py index ae352c9e..9302f6f8 100644 --- a/common/embeddings/milvus_embedding_store.py +++ b/common/embeddings/milvus_embedding_store.py @@ -5,9 +5,10 @@ import Levenshtein as lev from asyncer import asyncify -from langchain_milvus.vectorstores import Milvus +from langchain_community.vectorstores import Milvus from langchain_core.documents.base import Document -from 
pymilvus import connections, utility +from langchain_milvus.vectorstores import Milvus +from pymilvus import MilvusException, connections, utility from pymilvus.exceptions import MilvusException from common.embeddings.base_embedding_store import EmbeddingStore @@ -15,9 +16,6 @@ from common.logs.log import req_id_cv from common.logs.logwriter import LogWriter from common.metrics.prometheus_metrics import metrics -from langchain_community.vectorstores import Milvus -from langchain_core.documents.base import Document -from pymilvus import MilvusException, connections, utility logger = logging.getLogger(__name__) diff --git a/eventual-consistency-service/app/main.py b/eventual-consistency-service/app/main.py index ce7a2e04..701e363e 100644 --- a/eventual-consistency-service/app/main.py +++ b/eventual-consistency-service/app/main.py @@ -96,7 +96,7 @@ def initialize_eventual_consistency_checker( vector_field=milvus_config.get("vector_field", "document_vector"), text_field=milvus_config.get("text_field", "document_content"), vertex_field=vertex_field, - alias=milvus_config.get("alias", "default") + alias=milvus_config.get("alias", "default"), ) chunker = ecc_util.get_chunker() @@ -190,7 +190,10 @@ def consistency_status( background.add_task(graphrag.run, graphname, conn) # asyncio.run(graphrag.run(graphname, conn)) import time - ecc_status = f"hi from graph rag ecc: {conn.graphname} ({graphname}) {time.ctime()}" + + ecc_status = ( + f"hi from graph rag ecc: {conn.graphname} ({graphname}) {time.ctime()}" + ) case _: response.status_code = status.HTTP_404_NOT_FOUND return f"Method unsupported, must be {SupportAIMethod.SUPPORTAI}, {SupportAIMethod.GRAPHRAG}" From 50a4fd516cd4195a2693f4693dbbc545e5524326 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Mon, 12 Aug 2024 20:27:30 -0400 Subject: [PATCH 13/91] rm clang dotfiles --- common/gsql/graphRAG/.clang-format | 269 ----------------------------- common/gsql/graphRAG/.clangd | 2 - 2 files changed, 271 deletions(-) delete mode 100644 common/gsql/graphRAG/.clang-format delete mode 100644 common/gsql/graphRAG/.clangd diff --git a/common/gsql/graphRAG/.clang-format b/common/gsql/graphRAG/.clang-format deleted file mode 100644 index f0dcec6c..00000000 --- a/common/gsql/graphRAG/.clang-format +++ /dev/null @@ -1,269 +0,0 @@ ---- -Language: Cpp -# BasedOnStyle: Google -AccessModifierOffset: -1 -AlignAfterOpenBracket: Align -AlignArrayOfStructures: None -AlignConsecutiveAssignments: - Enabled: false - AcrossEmptyLines: false - AcrossComments: false - AlignCompound: false - PadOperators: true -AlignConsecutiveBitFields: - Enabled: false - AcrossEmptyLines: false - AcrossComments: false - AlignCompound: false - PadOperators: false -AlignConsecutiveDeclarations: - Enabled: false - AcrossEmptyLines: false - AcrossComments: false - AlignCompound: false - PadOperators: false -AlignConsecutiveMacros: - Enabled: false - AcrossEmptyLines: false - AcrossComments: false - AlignCompound: false - PadOperators: false -AlignConsecutiveShortCaseStatements: - Enabled: false - AcrossEmptyLines: false - AcrossComments: false - AlignCaseColons: false -AlignEscapedNewlines: Left -AlignOperands: Align -AlignTrailingComments: - Kind: Always - OverEmptyLines: 0 -AllowAllArgumentsOnNextLine: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortBlocksOnASingleLine: Never -AllowShortCaseLabelsOnASingleLine: false -AllowShortEnumsOnASingleLine: true -AllowShortFunctionsOnASingleLine: All 
-AllowShortIfStatementsOnASingleLine: WithoutElse -AllowShortLambdasOnASingleLine: All -AllowShortLoopsOnASingleLine: true -AlwaysBreakAfterDefinitionReturnType: None -AlwaysBreakAfterReturnType: None -AlwaysBreakBeforeMultilineStrings: true -AlwaysBreakTemplateDeclarations: Yes -AttributeMacros: - - __capability -BinPackArguments: true -BinPackParameters: true -BitFieldColonSpacing: Both -BraceWrapping: - AfterCaseLabel: false - AfterClass: false - AfterControlStatement: Never - AfterEnum: false - AfterExternBlock: false - AfterFunction: false - AfterNamespace: false - AfterObjCDeclaration: false - AfterStruct: false - AfterUnion: false - BeforeCatch: false - BeforeElse: false - BeforeLambdaBody: false - BeforeWhile: false - IndentBraces: false - SplitEmptyFunction: true - SplitEmptyRecord: true - SplitEmptyNamespace: true -BreakAfterAttributes: Never -BreakAfterJavaFieldAnnotations: false -BreakArrays: true -BreakBeforeBinaryOperators: None -BreakBeforeConceptDeclarations: Always -BreakBeforeBraces: Attach -BreakBeforeInlineASMColon: OnlyMultiline -BreakBeforeTernaryOperators: true -BreakConstructorInitializers: BeforeColon -BreakInheritanceList: BeforeColon -BreakStringLiterals: true -ColumnLimit: 80 -CommentPragmas: '^ IWYU pragma:' -CompactNamespaces: false -ConstructorInitializerIndentWidth: 4 -ContinuationIndentWidth: 4 -Cpp11BracedListStyle: true -DerivePointerAlignment: true -DisableFormat: false -EmptyLineAfterAccessModifier: Never -EmptyLineBeforeAccessModifier: LogicalBlock -ExperimentalAutoDetectBinPacking: false -FixNamespaceComments: true -ForEachMacros: - - foreach - - Q_FOREACH - - BOOST_FOREACH -IfMacros: - - KJ_IF_MAYBE -IncludeBlocks: Regroup -IncludeCategories: - - Regex: '^' - Priority: 2 - SortPriority: 0 - CaseSensitive: false - - Regex: '^<.*\.h>' - Priority: 1 - SortPriority: 0 - CaseSensitive: false - - Regex: '^<.*' - Priority: 2 - SortPriority: 0 - CaseSensitive: false - - Regex: '.*' - Priority: 3 - SortPriority: 0 - CaseSensitive: false -IncludeIsMainRegex: '([-_](test|unittest))?$' -IncludeIsMainSourceRegex: '' -IndentAccessModifiers: false -IndentCaseBlocks: false -IndentCaseLabels: true -IndentExternBlock: AfterExternBlock -IndentGotoLabels: true -IndentPPDirectives: None -IndentRequiresClause: true -IndentWidth: 4 -IndentWrappedFunctionNames: false -InsertBraces: false -InsertNewlineAtEOF: false -InsertTrailingCommas: None -IntegerLiteralSeparator: - Binary: 0 - BinaryMinDigits: 0 - Decimal: 0 - DecimalMinDigits: 0 - Hex: 0 - HexMinDigits: 0 -JavaScriptQuotes: Leave -JavaScriptWrapImports: true -KeepEmptyLinesAtTheStartOfBlocks: false -KeepEmptyLinesAtEOF: false -LambdaBodyIndentation: Signature -LineEnding: DeriveLF -MacroBlockBegin: '' -MacroBlockEnd: '' -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -ObjCBinPackProtocolList: Never -ObjCBlockIndentWidth: 4 -ObjCBreakBeforeNestedBlockParam: true -ObjCSpaceAfterProperty: false -ObjCSpaceBeforeProtocolList: true -PackConstructorInitializers: NextLine -PenaltyBreakAssignment: 2 -PenaltyBreakBeforeFirstCallParameter: 1 -PenaltyBreakComment: 300 -PenaltyBreakFirstLessLess: 120 -PenaltyBreakOpenParenthesis: 0 -PenaltyBreakString: 1000 -PenaltyBreakTemplateDeclaration: 10 -PenaltyExcessCharacter: 1000000 -PenaltyIndentedWhitespace: 0 -PenaltyReturnTypeOnItsOwnLine: 200 -PointerAlignment: Left -PPIndentWidth: -1 -QualifierAlignment: Leave -RawStringFormats: - - Language: Cpp - Delimiters: - - cc - - CC - - cpp - - Cpp - - CPP - - 'c++' - - 'C++' - CanonicalDelimiter: '' - BasedOnStyle: google - - Language: 
TextProto - Delimiters: - - pb - - PB - - proto - - PROTO - EnclosingFunctions: - - EqualsProto - - EquivToProto - - PARSE_PARTIAL_TEXT_PROTO - - PARSE_TEST_PROTO - - PARSE_TEXT_PROTO - - ParseTextOrDie - - ParseTextProtoOrDie - - ParseTestProto - - ParsePartialTestProto - CanonicalDelimiter: pb - BasedOnStyle: google -ReferenceAlignment: Pointer -ReflowComments: true -RemoveBracesLLVM: false -RemoveParentheses: Leave -RemoveSemicolon: false -RequiresClausePosition: OwnLine -RequiresExpressionIndentation: OuterScope -SeparateDefinitionBlocks: Leave -ShortNamespaceLines: 1 -SortIncludes: CaseSensitive -SortJavaStaticImport: Before -SortUsingDeclarations: LexicographicNumeric -SpaceAfterCStyleCast: false -SpaceAfterLogicalNot: false -SpaceAfterTemplateKeyword: true -SpaceAroundPointerQualifiers: Default -SpaceBeforeAssignmentOperators: true -SpaceBeforeCaseColon: false -SpaceBeforeCpp11BracedList: false -SpaceBeforeCtorInitializerColon: true -SpaceBeforeInheritanceColon: true -SpaceBeforeJsonColon: false -SpaceBeforeParens: ControlStatements -SpaceBeforeParensOptions: - AfterControlStatements: true - AfterForeachMacros: true - AfterFunctionDefinitionName: false - AfterFunctionDeclarationName: false - AfterIfMacros: true - AfterOverloadedOperator: false - AfterRequiresInClause: false - AfterRequiresInExpression: false - BeforeNonEmptyParentheses: false -SpaceBeforeRangeBasedForLoopColon: true -SpaceBeforeSquareBrackets: false -SpaceInEmptyBlock: false -SpacesBeforeTrailingComments: 1 -SpacesInAngles: Never -SpacesInContainerLiterals: true -SpacesInLineCommentPrefix: - Minimum: 1 - Maximum: -1 -SpacesInParens: Never -SpacesInParensOptions: - InCStyleCasts: false - InConditionalStatements: false - InEmptyParentheses: false - Other: false -SpacesInSquareBrackets: false -Standard: Auto -StatementAttributeLikeMacros: - - Q_EMIT -StatementMacros: - - Q_UNUSED - - QT_REQUIRE_VERSION -TabWidth: 8 -UseTab: Never -VerilogBreakBetweenInstancePorts: true -WhitespaceSensitiveMacros: - - BOOST_PP_STRINGIZE - - CF_SWIFT_NAME - - NS_SWIFT_NAME - - PP_STRINGIZE - - STRINGIZE -... 
diff --git a/common/gsql/graphRAG/.clangd b/common/gsql/graphRAG/.clangd deleted file mode 100644 index ec3be0d8..00000000 --- a/common/gsql/graphRAG/.clangd +++ /dev/null @@ -1,2 +0,0 @@ -CompileFlags: - Add: [ -std=c++23 ] From f007c8aac453d7e00009d10d69fb0b49ed174acf Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 10:12:05 -0400 Subject: [PATCH 14/91] final cleanup --- common/embeddings/milvus_embedding_store.py | 5 +- common/extractors/GraphExtractor.py | 3 +- .../louvain/graphrag_louvain_communities.gsql | 5 +- .../louvain/graphrag_louvain_init.gsql | 17 +- common/gsql/supportai/Scan_For_Updates.gsql | 8 +- common/gsql/supportai/SupportAI_Schema.gsql | 2 +- common/logs/logwriter.py | 2 +- common/py_schemas/schemas.py | 4 - copilot/docs/notebooks/graphrag.ipynb | 398 ------------------ eventual-consistency-service/app/main.py | 3 +- 10 files changed, 17 insertions(+), 430 deletions(-) delete mode 100644 copilot/docs/notebooks/graphrag.ipynb diff --git a/common/embeddings/milvus_embedding_store.py b/common/embeddings/milvus_embedding_store.py index 9302f6f8..7169379e 100644 --- a/common/embeddings/milvus_embedding_store.py +++ b/common/embeddings/milvus_embedding_store.py @@ -596,7 +596,7 @@ def query(self, expr: str, output_fields: List[str]): return query_result - def edit_dist_check(self, a: str, b: str, edit_dist_threshold: float, p=False): + def edit_dist_check(self, a: str, b: str, edit_dist_threshold: float): a = a.lower() b = b.lower() # if the words are short, they should be the same @@ -605,8 +605,6 @@ def edit_dist_check(self, a: str, b: str, edit_dist_threshold: float, p=False): # edit_dist_threshold (as a percent) of word must match threshold = int(min(len(a), len(b)) * (1 - edit_dist_threshold)) - if p: - print(a, b, threshold, lev.distance(a, b)) return lev.distance(a, b) < threshold async def aget_k_closest( @@ -641,7 +639,6 @@ async def aget_k_closest( doc.metadata["vertex_id"], v_id, edit_dist_threshold_pct, - # v_id == "Dataframe", ) # don't have to merge verts with the same id (they're the same) and doc.metadata["vertex_id"] != v_id diff --git a/common/extractors/GraphExtractor.py b/common/extractors/GraphExtractor.py index 282729a4..2a7ba505 100644 --- a/common/extractors/GraphExtractor.py +++ b/common/extractors/GraphExtractor.py @@ -40,8 +40,7 @@ def extract(self, text) -> list[GraphDocument]: """ doc = Document(page_content=text) graph_docs = self.transformer.convert_to_graph_documents([doc]) - translated_docs = self.translate(graph_docs) - return translated_docs + return graph_docs async def aextract(self, text:str) -> list[GraphDocument]: """ diff --git a/common/gsql/graphRAG/louvain/graphrag_louvain_communities.gsql b/common/gsql/graphRAG/louvain/graphrag_louvain_communities.gsql index 366b7ea7..4137ca68 100644 --- a/common/gsql/graphRAG/louvain/graphrag_louvain_communities.gsql +++ b/common/gsql/graphRAG/louvain/graphrag_louvain_communities.gsql @@ -166,14 +166,13 @@ CREATE DISTRIBUTED QUERY graphrag_louvain_communities(UINT iteration=1, UINT max @@community_sum_in_map += (s.@community_id -> e.weight) ELSE // get LINKS_TO edge weights (how many edges are between communities) - // s.@community_k_in_map += (t.@community_id -> 1) @@source_target_k_in_map += (s.@community_vid -> (t.@community_vid -> e.weight)) END, t.@has_parent += TRUE // Used to help find unattached partitions POST-ACCUM // Write the results to a new community vertex (iteration + 1) // ID , iter, edges within the 
community - INSERT INTO Community VALUES (s.id+"_"+to_string(iteration+1), iteration+1, s.k_in + @@community_sum_in_map.get(s.@community_id), ""), + INSERT INTO Community VALUES (s.id+"_"+to_string(iteration+1), iteration+1), ""), INSERT INTO HAS_PARENT VALUES (s, s.@community_vid+"_"+to_string(iteration+1)) // link Community's child/parent community ; @@ -183,7 +182,7 @@ CREATE DISTRIBUTED QUERY graphrag_louvain_communities(UINT iteration=1, UINT max AND NOT s.@has_parent POST-ACCUM // if s is a part of an unattached partition, add to its community hierarchy to maintain parity with rest of graph - INSERT INTO Community VALUES (s.id+"_"+to_string(iteration+1), iteration+1, s.k_in + @@community_sum_in_map.get(s.@community_id), ""), + INSERT INTO Community VALUES (s.id+"_"+to_string(iteration+1), iteration+1, ""), INSERT INTO HAS_PARENT VALUES (s, s.id+"_"+to_string(iteration+1)) // link Community's child/parent community ; diff --git a/common/gsql/graphRAG/louvain/graphrag_louvain_init.gsql b/common/gsql/graphRAG/louvain/graphrag_louvain_init.gsql index 2ccbaf2c..42e9108d 100644 --- a/common/gsql/graphRAG/louvain/graphrag_louvain_init.gsql +++ b/common/gsql/graphRAG/louvain/graphrag_louvain_init.gsql @@ -26,11 +26,11 @@ CREATE DISTRIBUTED QUERY graphrag_louvain_init(UINT max_hop = 10, UINT n_batches DOUBLE wt = 1.0; // prevent multiple init runs - // z = SELECT s FROM AllNodes:s -(_)-> Community:t; - // IF z.size() > 0 THEN - // EXCEPTION reinit(400001); - // RAISE reinit("ERROR: the hierarchical communities have already been initialized"); - // END; + z = SELECT s FROM AllNodes:s -(_)-> Community:t; + IF z.size() > 0 THEN + EXCEPTION reinit(400001); + RAISE reinit("ERROR: the hierarchical communities have already been initialized"); + END; // init z = SELECT s FROM AllNodes:s @@ -42,11 +42,6 @@ CREATE DISTRIBUTED QUERY graphrag_louvain_init(UINT max_hop = 10, UINT n_batches z = SELECT s FROM AllNodes:s -(_)-> ResolvedEntity:t ACCUM s.@k += wt, @@m += 1; - // POST-ACCUM - // s.@community_id = s, // assign node to its own community - // s.@community_vid = s.id, // external id - // s.@vid = getvid(s), // internal id (used in batching) - // s.@batch_id = s.@vid % n_batches; // get batch number PRINT z.size(); PRINT z; @@ -166,7 +161,7 @@ CREATE DISTRIBUTED QUERY graphrag_louvain_init(UINT max_hop = 10, UINT n_batches END POST-ACCUM // ID , iter, edges within the community - INSERT INTO Community VALUES (s.@community_vid+"_1", 1, @@community_sum_in_map.get(s.@community_id), ""), + INSERT INTO Community VALUES (s.@community_vid+"_1", 1, ""), INSERT INTO IN_COMMUNITY VALUES (s, s.@community_vid+"_1") // link entity to it's first community ; diff --git a/common/gsql/supportai/Scan_For_Updates.gsql b/common/gsql/supportai/Scan_For_Updates.gsql index 7d9d1b83..ba5444bd 100644 --- a/common/gsql/supportai/Scan_For_Updates.gsql +++ b/common/gsql/supportai/Scan_For_Updates.gsql @@ -24,10 +24,10 @@ CREATE DISTRIBUTED QUERY Scan_For_Updates(STRING v_type = "Document", res = SELECT s FROM start:s -(HAS_CONTENT)-> Content:c ACCUM @@v_and_text += (s.id -> c.text) POST-ACCUM s.epoch_processing = datetime_to_epoch(now()); - // ELSE IF v_type == "Concept" THEN - // res = SELECT s FROM start:s - // POST-ACCUM @@v_and_text += (s.id -> s.description), - // s.epoch_processing = datetime_to_epoch(now()); + ELSE IF v_type == "Concept" THEN + res = SELECT s FROM start:s + POST-ACCUM @@v_and_text += (s.id -> s.description), + s.epoch_processing = datetime_to_epoch(now()); ELSE IF v_type == "Entity" THEN res = SELECT s 
FROM start:s POST-ACCUM @@v_and_text += (s.id -> s.definition), diff --git a/common/gsql/supportai/SupportAI_Schema.gsql b/common/gsql/supportai/SupportAI_Schema.gsql index 3e127d82..718ab1a7 100644 --- a/common/gsql/supportai/SupportAI_Schema.gsql +++ b/common/gsql/supportai/SupportAI_Schema.gsql @@ -20,7 +20,7 @@ CREATE SCHEMA_CHANGE JOB add_supportai_schema { ADD DIRECTED EDGE CONTAINS_DOCUMENT(FROM DocumentCollection, TO Document) WITH REVERSE_EDGE="reverse_CONTAINS_DOCUMENT"; // GraphRAG - ADD VERTEX Community (PRIMARY_ID id STRING, iteration UINT, k_in UINT, description STRING) WITH PRIMARY_ID_AS_ATTRIBUTE="true"; + ADD VERTEX Community (PRIMARY_ID id STRING, iteration UINT, description STRING) WITH PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX ResolvedEntity(PRIMARY_ID id STRING, entity_type STRING) WITH PRIMARY_ID_AS_ATTRIBUTE="true"; ADD DIRECTED EDGE RELATIONSHIP(FROM Entity, TO Entity, relation_type STRING) WITH REVERSE_EDGE="reverse_RELATIONSHIP"; diff --git a/common/logs/logwriter.py b/common/logs/logwriter.py index f75be00c..ff13feed 100644 --- a/common/logs/logwriter.py +++ b/common/logs/logwriter.py @@ -142,7 +142,7 @@ def log(level, message, mask_pii=True, **kwargs): LogWriter.general_logger.info(message) @staticmethod - def info(message, mask_pii=False, **kwargs): + def info(message, mask_pii=True, **kwargs): LogWriter.log("info", message, mask_pii, **kwargs) @staticmethod diff --git a/common/py_schemas/schemas.py b/common/py_schemas/schemas.py index 07a2113f..a58d4660 100644 --- a/common/py_schemas/schemas.py +++ b/common/py_schemas/schemas.py @@ -20,10 +20,6 @@ class SupportAIMethod(enum.StrEnum): GRAPHRAG = enum.auto() -class EccConfig(BaseModel): - method: SupportAIMethod = SupportAIMethod.SUPPORTAI - - class GSQLQueryInfo(BaseModel): function_header: str description: str diff --git a/copilot/docs/notebooks/graphrag.ipynb b/copilot/docs/notebooks/graphrag.ipynb deleted file mode 100644 index 411f5d62..00000000 --- a/copilot/docs/notebooks/graphrag.ipynb +++ /dev/null @@ -1,398 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "from pyTigerGraph import TigerGraphConnection\n", - "from dotenv import load_dotenv\n", - "\n", - "load_dotenv()\n", - "# We first create a connection to the database\n", - "host = os.environ[\"HOST\"]\n", - "username = os.getenv(\"USERNAME\", \"tigergraph\")\n", - "password = os.getenv(\"PASS\", \"tigergraph\")\n", - "conn = TigerGraphConnection(\n", - " host=host,\n", - " username=username,\n", - " password=password,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'The graph GraphRAG_pytgdocs is created.'" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "conn.graphname = \"GraphRAG_pytgdocs\"\n", - "conn.gsql(\"\"\"CREATE GRAPH GraphRAG_pytgdocs()\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "_ = conn.getToken()" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'host_name': 'https://algotesting.i.tgcloud.io',\n", - " 'schema_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_schema].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and 
the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_schema\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 0)\\\\nTrying to add local vertex \\'DocumentChunk\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Document\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Concept\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Entity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Relationship\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'DocumentCollection\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Content\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Community\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'ResolvedEntity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CONTENT\\' and its reverse edge \\'reverse_HAS_CONTENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_CHILD_OF\\' and its reverse edge \\'reverse_IS_CHILD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_HEAD_OF\\' and its reverse edge \\'reverse_IS_HEAD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_TAIL\\' and its reverse edge \\'reverse_HAS_TAIL\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_RELATIONSHIP\\' and its reverse edge \\'reverse_DESCRIBES_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_ENTITY\\' and its reverse edge \\'reverse_DESCRIBES_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_ENTITY\\' and its reverse edge \\'reverse_CONTAINS_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'MENTIONS_RELATIONSHIP\\' and its reverse edge \\'reverse_MENTIONS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_AFTER\\' and its reverse edge \\'reverse_IS_AFTER\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CHILD\\' and its reverse edge \\'reverse_HAS_CHILD\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_RELATIONSHIP\\' and its reverse edge \\'reverse_HAS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_DOCUMENT\\' and its reverse edge \\'reverse_CONTAINS_DOCUMENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RELATIONSHIP\\' and its reverse edge \\'reverse_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVES_TO\\' and its reverse edge \\'reverse_RESOLVES_TO\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVED_RELATIONSHIP\\' and its reverse edge \\'reverse_RESOLVED_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IN_COMMUNITY\\' and its reverse edge \\'reverse_IN_COMMUNITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'LINKS_TO\\' and its reverse edge \\'reverse_LINKS_TO\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_PARENT\\' and its reverse edge \\'reverse_HAS_PARENT\\' to 
the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 1\\\\nThe job add_supportai_schema completes in 1.845 seconds!\\\\nLocal schema change succeeded.\"',\n", - " 'index_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_indexes].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_indexes\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 1)\\\\nTrying to add index \\'doc_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_indexepoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 2\\\\nThe job add_supportai_indexes completes in 1.085 seconds!\\\\nLocal schema change succeeded.\"'}" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# # And then add CoPilot's address to the connection. 
This address\n", - "# # is the host's address where the CoPilot container is running.\n", - "conn.ai.configureCoPilotHost(\"http://localhost:8000\")\n", - "conn.ai.initializeSupportAI()" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "access = os.environ[\"AWS_ACCESS_KEY_ID\"]\n", - "sec = os.environ[\"AWS_SECRET_ACCESS_KEY\"]\n", - "res = conn.ai.createDocumentIngest(\n", - " data_source=\"s3\",\n", - " data_source_config={\"aws_access_key\": access, \"aws_secret_key\": sec},\n", - " loader_config={\"doc_id_field\": \"url\", \"content_field\": \"content\"},\n", - " file_format=\"json\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'job_name': 'load_documents_content_json_cde7e4db979b4ba8a0b6ec5eb927f875',\n", - " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_cde7e4db979b4ba8a0b6ec5eb927f875.stream.SupportAI_GraphRAG_pytgdocs_48ee36da7b7644e4995722a6e057d446.1723494758507',\n", - " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_cde7e4db979b4ba8a0b6ec5eb927f875.stream.SupportAI_GraphRAG_pytgdocs_48ee36da7b7644e4995722a6e057d446.1723494758507'}" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "conn.ai.runDocumentIngest(\n", - " res[\"load_job_id\"],\n", - " res[\"data_source_id\"],\n", - " \"s3://tg-documentation/pytg_current/pytg_current.jsonl\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "# import httpx\n", - "# import base64\n", - "\n", - "\n", - "# def make_headers(conn: TigerGraphConnection):\n", - "# tkn = base64.b64encode(f\"{conn.username}:{conn.password}\".encode()).decode()\n", - "# headers = {\"Authorization\": f\"Basic {tkn}\"}\n", - "# return headers\n", - "\n", - "\n", - "# httpx.get(\n", - "# \"http://localhost:8001/GraphRAG_pytgdocs/consistency_status/graphrag\",\n", - "# headers=make_headers(conn),\n", - "# timeout=None,\n", - "# )\n", - "# # conn.ai.forceConsistencyUpdate()" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'asdf' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[32], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43masdf\u001b[49m\n", - "\u001b[0;31mNameError\u001b[0m: name 'asdf' is not defined" - ] - } - ], - "source": [ - "asdf" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for v in [\"Community\"]:\n", - " try:\n", - " conn.delVertices(v)\n", - " except:\n", - " pass\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sleep\n" - ] - }, - { - "data": { - "text/plain": [ - "{'job_name': 'load_documents_content_json_8a4ea730f21c43abbb58d818b9dd4d5a',\n", - " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_8a4ea730f21c43abbb58d818b9dd4d5a.stream.SupportAI_GraphRAG_pytgdocs_7aed8a01c9c1432b8026ea6c708bf08b.1723490129603',\n", - " 'log_location': 
'/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_8a4ea730f21c43abbb58d818b9dd4d5a.stream.SupportAI_GraphRAG_pytgdocs_7aed8a01c9c1432b8026ea6c708bf08b.1723490129603'}" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "for v in [\"Document\", \"Content\", \"DocumentChunk\", \"Entity\",\"ResolvedEntity\",\"Community\"]:\n", - "# for v in [\"ResolvedEntity\"]:\n", - " try:\n", - " conn.delVertices(v)\n", - " except:\n", - " pass\n", - "\n", - "import time\n", - "\n", - "print('sleep')\n", - "time.sleep(3)\n", - "conn.ai.runDocumentIngest(\n", - " res[\"load_job_id\"],\n", - " res[\"data_source_id\"],\n", - " \"s3://tg-documentation/pytg_current/pytg_current.jsonl\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "conn.gsql(f\"\"\"\n", - "USE GRAPH {conn.graphname}\n", - "DROP QUERY ResolveRelationships\n", - "\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import base64\n", - "import json\n", - "import httpx\n", - "import logging\n", - "\n", - "_ = logging.getLogger(__name__)\n", - "\n", - "\n", - "http_timeout = None\n", - "\n", - "\n", - "def make_headers(conn: TigerGraphConnection):\n", - " if conn.apiToken is None or conn.apiToken == \"\":\n", - " tkn = base64.b64encode(f\"{conn.username}:{conn.password}\".encode()).decode()\n", - " headers = {\"Authorization\": f\"Basic {tkn}\"}\n", - " else:\n", - " headers = {\"Authorization\": f\"Bearer {conn.apiToken}\"}\n", - "\n", - " return headers\n", - "\n", - "\n", - "def check_vertex_exists(conn, id):\n", - " headers = make_headers(conn)\n", - " with httpx.Client(timeout=http_timeout) as client:\n", - " res = client.get(\n", - " f\"{conn.restppUrl}/graph/{conn.graphname}/vertices/Entity/{id}\",\n", - " headers=headers,\n", - " )\n", - "\n", - " res.raise_for_status()\n", - " return res.json()\n", - "\n", - "\n", - "# r = check_vertex_exists(conn, \"asdfTigergraphexception\")\n", - "# print(json.dumps(r, indent=2), r[\"error\"])\n", - "r = check_vertex_exists(conn, \"Tigergraphexception\")\n", - "print(json.dumps(r, indent=2), r[\"error\"])\n", - "r[\"results\"][0][\"attributes\"][\"description\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "def check_vertex_has_desc(conn, comm: str):\n", - " headers = make_headers(conn)\n", - " with httpx.Client(timeout=None) as client:\n", - " resp = client.get(\n", - " f\"{conn.restppUrl}/graph/{conn.graphname}/vertices/Community/{comm}\",\n", - " headers=headers,\n", - " )\n", - " resp.raise_for_status()\n", - "\n", - " print(json.dumps(resp.json(),indent=2))\n", - " desc = resp.json()[\"results\"][0][\"attributes\"][\"description\"]\n", - " print(f\">>>*****{comm}:{desc}********\", flush=True)\n", - "\n", - " return len(desc) > 0\n", - "check_vertex_has_desc(conn,'Value_Property_1_2')\n", - "conn.upsertVertex(\"Community\",\"Rmse_1_2\",{\n", - " \"description\":\"asdf\"\n", - "})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def map_attrs(attributes: dict):\n", - " # map attrs\n", - " attrs = {}\n", - " for k, v in attributes.items():\n", - " if isinstance(v, tuple):\n", - " attrs[k] = {\"value\": v[0], \"op\": v[1]}\n", - " elif isinstance(v, dict):\n", - " attrs[k] = {\n", - " \"value\": 
{\"keylist\": list(v.keys()), \"valuelist\": list(v.values())}\n", - " }\n", - " else:\n", - " attrs[k] = {\"value\": v}\n", - " return attrs\n", - "\n", - "\n", - "def process_id(v_id: str):\n", - " return v_id.replace(\" \", \"_\").replace(\"/\", \"\")\n", - "\n", - "\n", - "def a(vertex_id=\"Post /Requesttoken\"):\n", - " vertex_id = process_id(vertex_id)\n", - " attributes = { # attrs\n", - " \"description\": [\"test\"],\n", - " \"epoch_added\": int(time.time()),\n", - " }\n", - "\n", - " vertex_id = vertex_id.replace(\" \", \"_\")\n", - " attrs = map_attrs(attributes)\n", - " data = json.dumps({\"vertices\": {\"Entity\": {vertex_id: attrs}}})\n", - " headers = make_headers(conn)\n", - " with httpx.Client(timeout=http_timeout) as client:\n", - " res = client.post(\n", - " f\"{conn.restppUrl}/graph/{conn.graphname}\", data=data, headers=headers\n", - " )\n", - "\n", - " res.raise_for_status()\n", - "\n", - " return res.json()\n", - "\n", - "\n", - "a()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from urllib import parse\n", - "\n", - "v_id = \"Post_/Requesttoken\"\n", - "v_id = process_id(v_id)\n", - "print(v_id)\n", - "\n", - "r = check_vertex_exists(conn, v_id)\n", - "print(json.dumps(r, indent=2), r[\"error\"])\n", - "r[\"results\"][0][\"attributes\"][\"description\"]" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "ml", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/eventual-consistency-service/app/main.py b/eventual-consistency-service/app/main.py index 701e363e..34403f1e 100644 --- a/eventual-consistency-service/app/main.py +++ b/eventual-consistency-service/app/main.py @@ -188,11 +188,10 @@ def consistency_status( LogWriter.info(f"Returning consistency status for {graphname}: {status}") case SupportAIMethod.GRAPHRAG: background.add_task(graphrag.run, graphname, conn) - # asyncio.run(graphrag.run(graphname, conn)) import time ecc_status = ( - f"hi from graph rag ecc: {conn.graphname} ({graphname}) {time.ctime()}" + f"GraphRAG initialization: {conn.graphname} ({graphname}) {time.ctime()}" ) case _: response.status_code = status.HTTP_404_NOT_FOUND From 2d1e98b16a759100686e115f35c3c479ad537ddb Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 12:34:16 -0400 Subject: [PATCH 15/91] reqs to fix unit tests --- copilot/requirements.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index 7a8bd83f..03157f17 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -68,15 +68,15 @@ joblib==1.3.2 jq==1.6.0 jsonpatch==1.33 jsonpointer==2.4 -langchain==0.2.11 -langchain-community==0.2.10 -langchain-core==0.2.25 -langchain-experimental==0.0.63 +langchain==0.2.12 +langchain-community==0.2.11 +langchain-core==0.2.29 +langchain-experimental==0.0.64 langchain-groq==0.1.8 langchain-ibm==0.1.11 langchain-text-splitters==0.2.2 langchain_milvus==0.1.3 -langchain_openai==0.1.19 +langchain-openai==0.1.20 langchainhub==0.1.20 langdetect==1.0.9 langgraph==0.1.16 @@ -152,4 +152,4 @@ wandb==0.15.12 watchfiles==0.20.0 websockets==11.0.3 
yarl==1.9.2 -zipp==3.19.2 \ No newline at end of file +zipp==3.19.2 From e0065ee60b85b42b483e28ce0603c4ef2451c05b Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 12:43:42 -0400 Subject: [PATCH 16/91] reqs to fix unit test --- copilot/requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index 03157f17..3035d7c1 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -70,7 +70,6 @@ jsonpatch==1.33 jsonpointer==2.4 langchain==0.2.12 langchain-community==0.2.11 -langchain-core==0.2.29 langchain-experimental==0.0.64 langchain-groq==0.1.8 langchain-ibm==0.1.11 From 2a5434abd2dcffac69e689d097e232f25be1ca09 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 12:47:01 -0400 Subject: [PATCH 17/91] reqs to fix unit test --- copilot/requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index 3035d7c1..302c9b44 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -79,7 +79,6 @@ langchain-openai==0.1.20 langchainhub==0.1.20 langdetect==1.0.9 langgraph==0.1.16 -langsmith==0.1.94 lomond==0.3.3 lxml==4.9.3 marshmallow==3.20.1 From a43490a852729c076a7aa0a11d685298ce66c3da Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 12:51:28 -0400 Subject: [PATCH 18/91] reqs to fix unit test --- copilot/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index 302c9b44..7df43165 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -68,17 +68,17 @@ joblib==1.3.2 jq==1.6.0 jsonpatch==1.33 jsonpointer==2.4 -langchain==0.2.12 langchain-community==0.2.11 langchain-experimental==0.0.64 langchain-groq==0.1.8 langchain-ibm==0.1.11 langchain-text-splitters==0.2.2 langchain_milvus==0.1.3 -langchain-openai==0.1.20 langchainhub==0.1.20 langdetect==1.0.9 langgraph==0.1.16 +langchain==0.2.12 +langchain-openai==0.1.20 lomond==0.3.3 lxml==4.9.3 marshmallow==3.20.1 From 4b76e73d5a284b90bb923707e52e72dae3c1d040 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 12:57:39 -0400 Subject: [PATCH 19/91] reqs to fix unit test --- copilot/requirements.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index 7df43165..ba1f04e3 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -68,17 +68,19 @@ joblib==1.3.2 jq==1.6.0 jsonpatch==1.33 jsonpointer==2.4 +langchain==0.2.12 langchain-community==0.2.11 +langchain-core==0.2.3 langchain-experimental==0.0.64 langchain-groq==0.1.8 langchain-ibm==0.1.11 langchain-text-splitters==0.2.2 langchain_milvus==0.1.3 +langchain-openai==0.1.20 langchainhub==0.1.20 langdetect==1.0.9 langgraph==0.1.16 -langchain==0.2.12 -langchain-openai==0.1.20 +langsmith==0.1.94 lomond==0.3.3 lxml==4.9.3 marshmallow==3.20.1 From 115b1b3f9f5c046b1f8d03761dccdf43a9b32320 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 13:02:42 -0400 Subject: [PATCH 20/91] reqs to fix unit test --- copilot/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/copilot/requirements.txt 
b/copilot/requirements.txt index ba1f04e3..5e475767 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -68,7 +68,7 @@ joblib==1.3.2 jq==1.6.0 jsonpatch==1.33 jsonpointer==2.4 -langchain==0.2.12 +langchain==0.2.13 langchain-community==0.2.11 langchain-core==0.2.3 langchain-experimental==0.0.64 From 58b5cbe6694f24f46f5e669e85b2e3abde0a1598 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 13:04:33 -0400 Subject: [PATCH 21/91] reqs to fix unit test --- copilot/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index 5e475767..7b30e5b5 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -70,7 +70,7 @@ jsonpatch==1.33 jsonpointer==2.4 langchain==0.2.13 langchain-community==0.2.11 -langchain-core==0.2.3 +langchain-core==0.2.30 langchain-experimental==0.0.64 langchain-groq==0.1.8 langchain-ibm==0.1.11 From fa960394b2acb3f88ef9171218445c5c57915b84 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 13:11:50 -0400 Subject: [PATCH 22/91] reqs to fix unit test --- copilot/requirements.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index 7b30e5b5..632a6eba 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -68,15 +68,15 @@ joblib==1.3.2 jq==1.6.0 jsonpatch==1.33 jsonpointer==2.4 -langchain==0.2.13 -langchain-community==0.2.11 -langchain-core==0.2.30 +langchain==0.2.11 +langchain-community==0.2.10 +langchain-core==0.2.25 langchain-experimental==0.0.64 langchain-groq==0.1.8 langchain-ibm==0.1.11 langchain-text-splitters==0.2.2 langchain_milvus==0.1.3 -langchain-openai==0.1.20 +langchain_openai==0.1.19 langchainhub==0.1.20 langdetect==1.0.9 langgraph==0.1.16 From 905d5cfa324d373af3dd7f9266c6d795ec122b1c Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 13:30:37 -0400 Subject: [PATCH 23/91] reqs to fix unit test --- copilot/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index 632a6eba..e69f2be6 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -70,7 +70,7 @@ jsonpatch==1.33 jsonpointer==2.4 langchain==0.2.11 langchain-community==0.2.10 -langchain-core==0.2.25 +langchain-core==0.2.29 langchain-experimental==0.0.64 langchain-groq==0.1.8 langchain-ibm==0.1.11 From 5e8b0aeaf569ffa9570ac871fd804dce21b89414 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 14:09:27 -0400 Subject: [PATCH 24/91] reqs to fix unit test --- copilot/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index e69f2be6..e6fb3718 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -81,6 +81,7 @@ langchainhub==0.1.20 langdetect==1.0.9 langgraph==0.1.16 langsmith==0.1.94 +Levenshtein==0.25.1 lomond==0.3.3 lxml==4.9.3 marshmallow==3.20.1 @@ -118,7 +119,7 @@ pyTigerDriver==1.0.15 pyTigerGraph==1.6.2 pytz==2023.3.post1 PyYAML==6.0.1 -rapidfuzz==3.4.0 +rapidfuzz==3.8.0 regex==2023.10.3 requests==2.32.2 rsa==4.9 From be0177e9b5d9dd50231d07fb43c8c5b6dd69b377 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG 
<165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 14:29:45 -0400 Subject: [PATCH 25/91] reqs to fix unit test --- copilot/requirements.txt | 237 +++++++++++++++++++++------------------ 1 file changed, 129 insertions(+), 108 deletions(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index e6fb3718..af45c357 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -1,156 +1,177 @@ -aiohttp==3.9.3 +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 aiosignal==1.3.1 -annotated-types==0.5.0 -anyio==3.7.1 +annotated-types==0.7.0 +anyio==4.4.0 appdirs==1.4.4 argon2-cffi==23.1.0 argon2-cffi-bindings==21.2.0 async-timeout==4.0.3 asyncer==0.0.7 -attrs==23.1.0 -azure-core==1.30.1 -azure-storage-blob==12.19.1 +attrs==24.2.0 +azure-core==1.30.2 +azure-storage-blob==12.22.0 backoff==2.2.1 -beautifulsoup4==4.12.2 -boto3==1.28.83 -botocore==1.31.83 -cachetools==5.3.2 -certifi==2023.7.22 -cffi==1.16.0 +beautifulsoup4==4.12.3 +boto3==1.34.159 +botocore==1.34.159 +cachetools==5.4.0 +certifi==2024.7.4 +cffi==1.17.0 chardet==5.2.0 -charset-normalizer==3.2.0 +charset-normalizer==3.3.2 click==8.1.7 -cryptography==42.0.5 -dataclasses-json==0.5.14 -distro==1.8.0 +contourpy==1.2.1 +cryptography==43.0.0 +cycler==0.12.1 +dataclasses-json==0.6.7 +deepdiff==7.0.1 +distro==1.9.0 docker-pycreds==0.4.0 docstring_parser==0.16 -emoji==2.8.0 +emoji==2.12.1 environs==9.5.0 -exceptiongroup==1.1.3 -fastapi==0.103.1 +exceptiongroup==1.2.2 +fastapi==0.112.0 filelock==3.15.4 filetype==1.2.0 -frozenlist==1.4.0 -fsspec==2024.6.0 +fonttools==4.53.1 +frozenlist==1.4.1 +fsspec==2024.6.1 gitdb==4.0.11 -GitPython==3.1.40 -google-api-core==2.14.0 -google-auth==2.23.4 -google-cloud-aiplatform==1.52.0 -google-cloud-bigquery==3.13.0 -google-cloud-core==2.3.3 -google-cloud-resource-manager==1.10.4 -google-cloud-storage==2.13.0 +GitPython==3.1.43 +google-api-core==2.19.1 +google-auth==2.33.0 +google-cloud-aiplatform==1.61.0 +google-cloud-bigquery==3.25.0 +google-cloud-core==2.4.1 +google-cloud-resource-manager==1.12.5 +google-cloud-storage==2.18.2 google-crc32c==1.5.0 -google-resumable-media==2.6.0 -googleapis-common-protos==1.61.0 -greenlet==2.0.2 -groq==0.5.0 -grpc-google-iam-v1==0.12.7 -grpcio==1.59.2 -grpcio-status==1.59.2 +google-resumable-media==2.7.2 +googleapis-common-protos==1.63.2 +greenlet==3.0.3 +groq==0.9.0 +grpc-google-iam-v1==0.13.1 +grpcio==1.63.0 +grpcio-status==1.63.0 h11==0.14.0 -httpcore==0.18.0 -httptools==0.6.0 -httpx==0.25.0 -huggingface-hub==0.23.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +huggingface-hub==0.24.5 ibm-cos-sdk==2.13.6 ibm-cos-sdk-core==2.13.6 ibm-cos-sdk-s3transfer==2.13.6 -ibm_watsonx_ai==1.0.11 -idna==3.4 -importlib_metadata==8.0.0 +ibm_watsonx_ai==1.1.5 +idna==3.7 +importlib_metadata==8.2.0 iniconfig==2.0.0 isodate==0.6.1 +jiter==0.5.0 jmespath==1.0.1 -joblib==1.3.2 -jq==1.6.0 +joblib==1.4.2 +jq==1.7.0 jsonpatch==1.33 -jsonpointer==2.4 -langchain==0.2.11 -langchain-community==0.2.10 -langchain-core==0.2.29 +jsonpath-python==1.0.6 +jsonpointer==3.0.0 +kiwisolver==1.4.5 +langchain==0.2.13 +langchain-community==0.2.12 +langchain-core==0.2.30 langchain-experimental==0.0.64 -langchain-groq==0.1.8 -langchain-ibm==0.1.11 +langchain-groq==0.1.9 +langchain-ibm==0.1.12 +langchain-milvus==0.1.4 +langchain-openai==0.1.21 langchain-text-splitters==0.2.2 -langchain_milvus==0.1.3 -langchain_openai==0.1.19 -langchainhub==0.1.20 +langchainhub==0.1.21 langdetect==1.0.9 -langgraph==0.1.16 -langsmith==0.1.94 +langgraph==0.2.3 +langgraph-checkpoint==1.0.2 
+langsmith==0.1.99 Levenshtein==0.25.1 lomond==0.3.3 -lxml==4.9.3 -marshmallow==3.20.1 -matplotlib==3.9.1 -minio==7.2.5 -multidict==6.0.4 +lxml==5.3.0 +marshmallow==3.21.3 +matplotlib==3.9.2 +milvus-lite==2.4.9 +minio==7.2.7 +multidict==6.0.5 mypy-extensions==1.0.0 -nltk==3.8.1 +nest-asyncio==1.6.0 +nltk==3.8.2 numpy==1.26.4 -openai==1.37.1 -orjson==3.9.15 -packaging==23.2 -pandas==2.1.1 +openai==1.40.6 +ordered-set==4.1.0 +orjson==3.10.7 +packaging==24.1 +pandas==2.1.4 pathtools==0.1.2 +pillow==10.4.0 +platformdirs==4.2.2 pluggy==1.5.0 prometheus_client==0.20.0 -proto-plus==1.22.3 -protobuf==4.24.4 -psutil==5.9.6 -pyarrow==15.0.1 -pyasn1==0.5.0 -pyasn1-modules==0.3.0 -pycparser==2.21 +proto-plus==1.24.0 +protobuf==5.27.3 +psutil==6.0.0 +pyarrow==17.0.0 +pyasn1==0.6.0 +pyasn1_modules==0.4.0 +pycparser==2.22 pycryptodome==3.20.0 -pydantic==2.3.0 -pydantic_core==2.6.3 -pygit2==1.13.2 -pymilvus==2.4.4 -pytest==8.2.0 +pydantic==2.8.2 +pydantic_core==2.20.1 +pygit2==1.15.1 +pymilvus==2.4.5 +pyparsing==3.1.2 +pypdf==4.3.1 +pytest==8.3.2 python-dateutil==2.9.0.post0 -python-dotenv==1.0.0 -python-iso639==2023.6.15 +python-dotenv==1.0.1 +python-iso639==2024.4.27 python-magic==0.4.27 pyTigerDriver==1.0.15 -pyTigerGraph==1.6.2 -pytz==2023.3.post1 -PyYAML==6.0.1 -rapidfuzz==3.8.0 -regex==2023.10.3 +pyTigerGraph==1.6.5 +pytz==2024.1 +PyYAML==6.0.2 +rapidfuzz==3.9.6 +regex==2024.7.24 requests==2.32.2 +requests-toolbelt==1.0.0 rsa==4.9 -s3transfer==0.7.0 +s3transfer==0.10.2 scikit-learn==1.5.1 -sentry-sdk==1.32.0 +scipy==1.14.0 +sentry-sdk==2.13.0 setproctitle==1.3.3 -shapely==2.0.2 +shapely==2.0.5 six==1.16.0 smmap==5.0.1 -sniffio==1.3.0 -soupsieve==2.5 -SQLAlchemy==2.0.20 -starlette==0.27.0 +sniffio==1.3.1 +soupsieve==2.6 +SQLAlchemy==2.0.32 +starlette==0.37.2 tabulate==0.9.0 -tenacity==8.2.3 +tenacity==8.5.0 +threadpoolctl==3.5.0 tiktoken==0.7.0 -tqdm==4.66.1 -types-requests==2.31.0.6 +tqdm==4.66.5 +types-requests==2.32.0.20240712 types-urllib3==1.26.25.14 typing-inspect==0.9.0 -typing_extensions==4.8.0 -tzdata==2023.3 -ujson==5.9.0 -unstructured==0.10.23 -urllib3==1.26.18 -uvicorn==0.23.2 -uvloop==0.17.0 -validators==0.22.0 -wandb==0.15.12 -watchfiles==0.20.0 -websockets==11.0.3 -yarl==1.9.2 -zipp==3.19.2 +typing_extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +unstructured==0.15.1 +unstructured-client==0.25.5 +urllib3==2.2.2 +uvicorn==0.30.6 +uvloop==0.19.0 +validators==0.33.0 +wandb==0.17.6 +watchfiles==0.23.0 +websockets==12.0 +wrapt==1.16.0 +yarl==1.9.4 +zipp==3.20.0 From cb43815468caf756311d087c03b25dc2395184fb Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 14:35:57 -0400 Subject: [PATCH 26/91] reqs to fix unit test --- common/requirements.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/common/requirements.txt b/common/requirements.txt index bb20e5b9..a8cc3d51 100644 --- a/common/requirements.txt +++ b/common/requirements.txt @@ -70,15 +70,14 @@ joblib==1.3.2 jq==1.6.0 jsonpatch==1.33 jsonpointer==2.4 -langchain==0.2.12 +langchain==0.2.13 langchain-community==0.2.11 -langchain-core==0.2.29 +langchain-core==0.2.3 langchain-experimental==0.0.64 langchain-openai==0.1.20 langchain-text-splitters==0.2.2 langsmith==0.1.98 Levenshtein==0.25.1 -langchain==0.2.11 langchain-groq==0.1.8 langchain-ibm==0.1.11 langchain_milvus==0.1.3 From ac6d3fe8d910eee102af6bab204437fc45626486 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 
14:52:41 -0400 Subject: [PATCH 27/91] reqs to fix unit test --- .github/workflows/pull-test-merge.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pull-test-merge.yaml b/.github/workflows/pull-test-merge.yaml index 654703d8..3a61ecaf 100644 --- a/.github/workflows/pull-test-merge.yaml +++ b/.github/workflows/pull-test-merge.yaml @@ -33,7 +33,7 @@ jobs: python -m venv venv source venv/bin/activate python -m pip install --upgrade pip - pip install -r copilot/requirements.txt + pip install --no-cache-dir -r copilot/requirements.txt pip install pytest - name: Create db config From 60aa569ef12749af9b36c09684c62b12fda7231a Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 15:07:02 -0400 Subject: [PATCH 28/91] reqs to fix unit test --- copilot/requirements.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index af45c357..7ee3073f 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -78,7 +78,6 @@ jsonpointer==3.0.0 kiwisolver==1.4.5 langchain==0.2.13 langchain-community==0.2.12 -langchain-core==0.2.30 langchain-experimental==0.0.64 langchain-groq==0.1.9 langchain-ibm==0.1.12 @@ -89,7 +88,6 @@ langchainhub==0.1.21 langdetect==1.0.9 langgraph==0.2.3 langgraph-checkpoint==1.0.2 -langsmith==0.1.99 Levenshtein==0.25.1 lomond==0.3.3 lxml==5.3.0 From 2d377569d5aadac4396abe456320de39d4106966 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 15:17:56 -0400 Subject: [PATCH 29/91] reqs to fix unit test --- copilot/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index 7ee3073f..7f6269f2 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -82,7 +82,7 @@ langchain-experimental==0.0.64 langchain-groq==0.1.9 langchain-ibm==0.1.12 langchain-milvus==0.1.4 -langchain-openai==0.1.21 +langchain-openai langchain-text-splitters==0.2.2 langchainhub==0.1.21 langdetect==1.0.9 From 1929aa22cba052b004a335d61941fe7f5deb0d9b Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 15:51:26 -0400 Subject: [PATCH 30/91] reqs to fix unit test --- .github/workflows/pull-test-merge.yaml | 2 +- common/requirements.txt | 4 +- copilot/requirements.txt | 239 ++++++++++++------------- 3 files changed, 113 insertions(+), 132 deletions(-) diff --git a/.github/workflows/pull-test-merge.yaml b/.github/workflows/pull-test-merge.yaml index 3a61ecaf..654703d8 100644 --- a/.github/workflows/pull-test-merge.yaml +++ b/.github/workflows/pull-test-merge.yaml @@ -33,7 +33,7 @@ jobs: python -m venv venv source venv/bin/activate python -m pip install --upgrade pip - pip install --no-cache-dir -r copilot/requirements.txt + pip install -r copilot/requirements.txt pip install pytest - name: Create db config diff --git a/common/requirements.txt b/common/requirements.txt index a8cc3d51..2d9a90ba 100644 --- a/common/requirements.txt +++ b/common/requirements.txt @@ -70,9 +70,9 @@ joblib==1.3.2 jq==1.6.0 jsonpatch==1.33 jsonpointer==2.4 -langchain==0.2.13 +langchain==0.2.12 langchain-community==0.2.11 -langchain-core==0.2.3 +langchain-core==0.2.29 langchain-experimental==0.0.64 langchain-openai==0.1.20 langchain-text-splitters==0.2.2 diff --git a/copilot/requirements.txt b/copilot/requirements.txt index 7f6269f2..df06f401 
100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -1,175 +1,156 @@ -aiohappyeyeballs==2.3.5 -aiohttp==3.10.3 +aiohttp==3.9.3 aiosignal==1.3.1 -annotated-types==0.7.0 -anyio==4.4.0 +annotated-types==0.5.0 +anyio==3.7.1 appdirs==1.4.4 argon2-cffi==23.1.0 argon2-cffi-bindings==21.2.0 async-timeout==4.0.3 asyncer==0.0.7 -attrs==24.2.0 -azure-core==1.30.2 -azure-storage-blob==12.22.0 +attrs==23.1.0 +azure-core==1.30.1 +azure-storage-blob==12.19.1 backoff==2.2.1 -beautifulsoup4==4.12.3 -boto3==1.34.159 -botocore==1.34.159 -cachetools==5.4.0 -certifi==2024.7.4 -cffi==1.17.0 +beautifulsoup4==4.12.2 +boto3==1.28.83 +botocore==1.31.83 +cachetools==5.3.2 +certifi==2023.7.22 +cffi==1.16.0 chardet==5.2.0 -charset-normalizer==3.3.2 +charset-normalizer==3.2.0 click==8.1.7 -contourpy==1.2.1 -cryptography==43.0.0 -cycler==0.12.1 -dataclasses-json==0.6.7 -deepdiff==7.0.1 -distro==1.9.0 +cryptography==42.0.5 +dataclasses-json==0.5.14 +distro==1.8.0 docker-pycreds==0.4.0 docstring_parser==0.16 -emoji==2.12.1 +emoji==2.8.0 environs==9.5.0 -exceptiongroup==1.2.2 -fastapi==0.112.0 +exceptiongroup==1.1.3 +fastapi==0.103.1 filelock==3.15.4 filetype==1.2.0 -fonttools==4.53.1 -frozenlist==1.4.1 -fsspec==2024.6.1 +frozenlist==1.4.0 +fsspec==2024.6.0 gitdb==4.0.11 -GitPython==3.1.43 -google-api-core==2.19.1 -google-auth==2.33.0 -google-cloud-aiplatform==1.61.0 -google-cloud-bigquery==3.25.0 -google-cloud-core==2.4.1 -google-cloud-resource-manager==1.12.5 -google-cloud-storage==2.18.2 +GitPython==3.1.40 +google-api-core==2.14.0 +google-auth==2.23.4 +google-cloud-aiplatform==1.52.0 +google-cloud-bigquery==3.13.0 +google-cloud-core==2.3.3 +google-cloud-resource-manager==1.10.4 +google-cloud-storage==2.13.0 google-crc32c==1.5.0 -google-resumable-media==2.7.2 -googleapis-common-protos==1.63.2 -greenlet==3.0.3 -groq==0.9.0 -grpc-google-iam-v1==0.13.1 -grpcio==1.63.0 -grpcio-status==1.63.0 +google-resumable-media==2.6.0 +googleapis-common-protos==1.61.0 +greenlet==2.0.2 +groq==0.5.0 +grpc-google-iam-v1==0.12.7 +grpcio==1.59.2 +grpcio-status==1.59.2 h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -huggingface-hub==0.24.5 +httpcore==0.18.0 +httptools==0.6.0 +httpx==0.25.0 +huggingface-hub==0.23.0 ibm-cos-sdk==2.13.6 ibm-cos-sdk-core==2.13.6 ibm-cos-sdk-s3transfer==2.13.6 -ibm_watsonx_ai==1.1.5 -idna==3.7 -importlib_metadata==8.2.0 +ibm_watsonx_ai==1.0.11 +idna==3.4 +importlib_metadata==8.0.0 iniconfig==2.0.0 isodate==0.6.1 -jiter==0.5.0 jmespath==1.0.1 -joblib==1.4.2 -jq==1.7.0 +joblib==1.3.2 +jq==1.6.0 jsonpatch==1.33 -jsonpath-python==1.0.6 -jsonpointer==3.0.0 -kiwisolver==1.4.5 -langchain==0.2.13 -langchain-community==0.2.12 -langchain-experimental==0.0.64 -langchain-groq==0.1.9 -langchain-ibm==0.1.12 -langchain-milvus==0.1.4 -langchain-openai +jsonpointer==2.4 +langchain==0.2.11 +langchain-community==0.2.10 +langchain-core==0.2.25 +langchain-experimental==0.0.63 +langchain-groq==0.1.8 +langchain-ibm==0.1.11 langchain-text-splitters==0.2.2 -langchainhub==0.1.21 -langdetect==1.0.9 -langgraph==0.2.3 -langgraph-checkpoint==1.0.2 +langchain_milvus==0.1.3 +langchain_openai==0.1.19 +langchainhub==0.1.20 Levenshtein==0.25.1 +langdetect==1.0.9 +langgraph==0.1.16 +langsmith==0.1.94 lomond==0.3.3 -lxml==5.3.0 -marshmallow==3.21.3 -matplotlib==3.9.2 -milvus-lite==2.4.9 -minio==7.2.7 -multidict==6.0.5 +lxml==4.9.3 +marshmallow==3.20.1 +matplotlib==3.9.1 +minio==7.2.5 +multidict==6.0.4 mypy-extensions==1.0.0 -nest-asyncio==1.6.0 -nltk==3.8.2 +nltk==3.8.1 numpy==1.26.4 -openai==1.40.6 
-ordered-set==4.1.0 -orjson==3.10.7 -packaging==24.1 -pandas==2.1.4 +openai==1.37.1 +orjson==3.9.15 +packaging==23.2 +pandas==2.1.1 pathtools==0.1.2 -pillow==10.4.0 -platformdirs==4.2.2 pluggy==1.5.0 prometheus_client==0.20.0 -proto-plus==1.24.0 -protobuf==5.27.3 -psutil==6.0.0 -pyarrow==17.0.0 -pyasn1==0.6.0 -pyasn1_modules==0.4.0 -pycparser==2.22 +proto-plus==1.22.3 +protobuf==4.24.4 +psutil==5.9.6 +pyarrow==15.0.1 +pyasn1==0.5.0 +pyasn1-modules==0.3.0 +pycparser==2.21 pycryptodome==3.20.0 -pydantic==2.8.2 -pydantic_core==2.20.1 -pygit2==1.15.1 -pymilvus==2.4.5 -pyparsing==3.1.2 -pypdf==4.3.1 -pytest==8.3.2 +pydantic==2.3.0 +pydantic_core==2.6.3 +pygit2==1.13.2 +pymilvus==2.4.4 +pytest==8.2.0 python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-iso639==2024.4.27 +python-dotenv==1.0.0 +python-iso639==2023.6.15 python-magic==0.4.27 pyTigerDriver==1.0.15 -pyTigerGraph==1.6.5 -pytz==2024.1 -PyYAML==6.0.2 -rapidfuzz==3.9.6 -regex==2024.7.24 +pyTigerGraph==1.6.2 +pytz==2023.3.post1 +PyYAML==6.0.1 +rapidfuzz==3.4.0 +regex==2023.10.3 requests==2.32.2 -requests-toolbelt==1.0.0 rsa==4.9 -s3transfer==0.10.2 +s3transfer==0.7.0 scikit-learn==1.5.1 -scipy==1.14.0 -sentry-sdk==2.13.0 +sentry-sdk==1.32.0 setproctitle==1.3.3 -shapely==2.0.5 +shapely==2.0.2 six==1.16.0 smmap==5.0.1 -sniffio==1.3.1 -soupsieve==2.6 -SQLAlchemy==2.0.32 -starlette==0.37.2 +sniffio==1.3.0 +soupsieve==2.5 +SQLAlchemy==2.0.20 +starlette==0.27.0 tabulate==0.9.0 -tenacity==8.5.0 -threadpoolctl==3.5.0 +tenacity==8.2.3 tiktoken==0.7.0 -tqdm==4.66.5 -types-requests==2.32.0.20240712 +tqdm==4.66.1 +types-requests==2.31.0.6 types-urllib3==1.26.25.14 typing-inspect==0.9.0 -typing_extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -unstructured==0.15.1 -unstructured-client==0.25.5 -urllib3==2.2.2 -uvicorn==0.30.6 -uvloop==0.19.0 -validators==0.33.0 -wandb==0.17.6 -watchfiles==0.23.0 -websockets==12.0 -wrapt==1.16.0 -yarl==1.9.4 -zipp==3.20.0 +typing_extensions==4.8.0 +tzdata==2023.3 +ujson==5.9.0 +unstructured==0.10.23 +urllib3==1.26.18 +uvicorn==0.23.2 +uvloop==0.17.0 +validators==0.22.0 +wandb==0.15.12 +watchfiles==0.20.0 +websockets==11.0.3 +yarl==1.9.2 +zipp==3.19.2 From f33ddef95e855bf1b400966ff4615ae64b89cdbc Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 15:52:49 -0400 Subject: [PATCH 31/91] reqs to fix unit test --- copilot/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index df06f401..fad0e729 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -119,7 +119,7 @@ pyTigerDriver==1.0.15 pyTigerGraph==1.6.2 pytz==2023.3.post1 PyYAML==6.0.1 -rapidfuzz==3.4.0 +rapidfuzz==3.9.6 regex==2023.10.3 requests==2.32.2 rsa==4.9 From 1a971813609fb9eca7849826025f1d04ea1e85b4 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 15:57:19 -0400 Subject: [PATCH 32/91] langchain-openai conflicts --- common/llm_services/openai_service.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/llm_services/openai_service.py b/common/llm_services/openai_service.py index 22b32380..7b166398 100644 --- a/common/llm_services/openai_service.py +++ b/common/llm_services/openai_service.py @@ -1,6 +1,7 @@ import logging import os -from langchain_openai import ChatOpenAI + +from langchain_community.chat_models.openai import ChatOpenAI from common.llm_services import LLM_Model from 
common.logs.log import req_id_cv From e9f7468e44ec311e1621fb91d4abe7b4665137f1 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 16:31:30 -0400 Subject: [PATCH 33/91] reqs to fix unit test --- common/requirements.txt | 8 +- copilot/requirements.txt | 239 +++++++++++++++++++++------------------ 2 files changed, 134 insertions(+), 113 deletions(-) diff --git a/common/requirements.txt b/common/requirements.txt index 2d9a90ba..122b1b73 100644 --- a/common/requirements.txt +++ b/common/requirements.txt @@ -70,11 +70,11 @@ joblib==1.3.2 jq==1.6.0 jsonpatch==1.33 jsonpointer==2.4 -langchain==0.2.12 -langchain-community==0.2.11 -langchain-core==0.2.29 +langchain==0.2.13 +langchain-community==0.2.12 +langchain-core==0.2.30 langchain-experimental==0.0.64 -langchain-openai==0.1.20 +langchain-openai==0.1.21 langchain-text-splitters==0.2.2 langsmith==0.1.98 Levenshtein==0.25.1 diff --git a/copilot/requirements.txt b/copilot/requirements.txt index fad0e729..af45c357 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -1,156 +1,177 @@ -aiohttp==3.9.3 +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 aiosignal==1.3.1 -annotated-types==0.5.0 -anyio==3.7.1 +annotated-types==0.7.0 +anyio==4.4.0 appdirs==1.4.4 argon2-cffi==23.1.0 argon2-cffi-bindings==21.2.0 async-timeout==4.0.3 asyncer==0.0.7 -attrs==23.1.0 -azure-core==1.30.1 -azure-storage-blob==12.19.1 +attrs==24.2.0 +azure-core==1.30.2 +azure-storage-blob==12.22.0 backoff==2.2.1 -beautifulsoup4==4.12.2 -boto3==1.28.83 -botocore==1.31.83 -cachetools==5.3.2 -certifi==2023.7.22 -cffi==1.16.0 +beautifulsoup4==4.12.3 +boto3==1.34.159 +botocore==1.34.159 +cachetools==5.4.0 +certifi==2024.7.4 +cffi==1.17.0 chardet==5.2.0 -charset-normalizer==3.2.0 +charset-normalizer==3.3.2 click==8.1.7 -cryptography==42.0.5 -dataclasses-json==0.5.14 -distro==1.8.0 +contourpy==1.2.1 +cryptography==43.0.0 +cycler==0.12.1 +dataclasses-json==0.6.7 +deepdiff==7.0.1 +distro==1.9.0 docker-pycreds==0.4.0 docstring_parser==0.16 -emoji==2.8.0 +emoji==2.12.1 environs==9.5.0 -exceptiongroup==1.1.3 -fastapi==0.103.1 +exceptiongroup==1.2.2 +fastapi==0.112.0 filelock==3.15.4 filetype==1.2.0 -frozenlist==1.4.0 -fsspec==2024.6.0 +fonttools==4.53.1 +frozenlist==1.4.1 +fsspec==2024.6.1 gitdb==4.0.11 -GitPython==3.1.40 -google-api-core==2.14.0 -google-auth==2.23.4 -google-cloud-aiplatform==1.52.0 -google-cloud-bigquery==3.13.0 -google-cloud-core==2.3.3 -google-cloud-resource-manager==1.10.4 -google-cloud-storage==2.13.0 +GitPython==3.1.43 +google-api-core==2.19.1 +google-auth==2.33.0 +google-cloud-aiplatform==1.61.0 +google-cloud-bigquery==3.25.0 +google-cloud-core==2.4.1 +google-cloud-resource-manager==1.12.5 +google-cloud-storage==2.18.2 google-crc32c==1.5.0 -google-resumable-media==2.6.0 -googleapis-common-protos==1.61.0 -greenlet==2.0.2 -groq==0.5.0 -grpc-google-iam-v1==0.12.7 -grpcio==1.59.2 -grpcio-status==1.59.2 +google-resumable-media==2.7.2 +googleapis-common-protos==1.63.2 +greenlet==3.0.3 +groq==0.9.0 +grpc-google-iam-v1==0.13.1 +grpcio==1.63.0 +grpcio-status==1.63.0 h11==0.14.0 -httpcore==0.18.0 -httptools==0.6.0 -httpx==0.25.0 -huggingface-hub==0.23.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +huggingface-hub==0.24.5 ibm-cos-sdk==2.13.6 ibm-cos-sdk-core==2.13.6 ibm-cos-sdk-s3transfer==2.13.6 -ibm_watsonx_ai==1.0.11 -idna==3.4 -importlib_metadata==8.0.0 +ibm_watsonx_ai==1.1.5 +idna==3.7 +importlib_metadata==8.2.0 iniconfig==2.0.0 isodate==0.6.1 +jiter==0.5.0 jmespath==1.0.1 -joblib==1.3.2 
-jq==1.6.0 +joblib==1.4.2 +jq==1.7.0 jsonpatch==1.33 -jsonpointer==2.4 -langchain==0.2.11 -langchain-community==0.2.10 -langchain-core==0.2.25 -langchain-experimental==0.0.63 -langchain-groq==0.1.8 -langchain-ibm==0.1.11 +jsonpath-python==1.0.6 +jsonpointer==3.0.0 +kiwisolver==1.4.5 +langchain==0.2.13 +langchain-community==0.2.12 +langchain-core==0.2.30 +langchain-experimental==0.0.64 +langchain-groq==0.1.9 +langchain-ibm==0.1.12 +langchain-milvus==0.1.4 +langchain-openai==0.1.21 langchain-text-splitters==0.2.2 -langchain_milvus==0.1.3 -langchain_openai==0.1.19 -langchainhub==0.1.20 -Levenshtein==0.25.1 +langchainhub==0.1.21 langdetect==1.0.9 -langgraph==0.1.16 -langsmith==0.1.94 +langgraph==0.2.3 +langgraph-checkpoint==1.0.2 +langsmith==0.1.99 +Levenshtein==0.25.1 lomond==0.3.3 -lxml==4.9.3 -marshmallow==3.20.1 -matplotlib==3.9.1 -minio==7.2.5 -multidict==6.0.4 +lxml==5.3.0 +marshmallow==3.21.3 +matplotlib==3.9.2 +milvus-lite==2.4.9 +minio==7.2.7 +multidict==6.0.5 mypy-extensions==1.0.0 -nltk==3.8.1 +nest-asyncio==1.6.0 +nltk==3.8.2 numpy==1.26.4 -openai==1.37.1 -orjson==3.9.15 -packaging==23.2 -pandas==2.1.1 +openai==1.40.6 +ordered-set==4.1.0 +orjson==3.10.7 +packaging==24.1 +pandas==2.1.4 pathtools==0.1.2 +pillow==10.4.0 +platformdirs==4.2.2 pluggy==1.5.0 prometheus_client==0.20.0 -proto-plus==1.22.3 -protobuf==4.24.4 -psutil==5.9.6 -pyarrow==15.0.1 -pyasn1==0.5.0 -pyasn1-modules==0.3.0 -pycparser==2.21 +proto-plus==1.24.0 +protobuf==5.27.3 +psutil==6.0.0 +pyarrow==17.0.0 +pyasn1==0.6.0 +pyasn1_modules==0.4.0 +pycparser==2.22 pycryptodome==3.20.0 -pydantic==2.3.0 -pydantic_core==2.6.3 -pygit2==1.13.2 -pymilvus==2.4.4 -pytest==8.2.0 +pydantic==2.8.2 +pydantic_core==2.20.1 +pygit2==1.15.1 +pymilvus==2.4.5 +pyparsing==3.1.2 +pypdf==4.3.1 +pytest==8.3.2 python-dateutil==2.9.0.post0 -python-dotenv==1.0.0 -python-iso639==2023.6.15 +python-dotenv==1.0.1 +python-iso639==2024.4.27 python-magic==0.4.27 pyTigerDriver==1.0.15 -pyTigerGraph==1.6.2 -pytz==2023.3.post1 -PyYAML==6.0.1 +pyTigerGraph==1.6.5 +pytz==2024.1 +PyYAML==6.0.2 rapidfuzz==3.9.6 -regex==2023.10.3 +regex==2024.7.24 requests==2.32.2 +requests-toolbelt==1.0.0 rsa==4.9 -s3transfer==0.7.0 +s3transfer==0.10.2 scikit-learn==1.5.1 -sentry-sdk==1.32.0 +scipy==1.14.0 +sentry-sdk==2.13.0 setproctitle==1.3.3 -shapely==2.0.2 +shapely==2.0.5 six==1.16.0 smmap==5.0.1 -sniffio==1.3.0 -soupsieve==2.5 -SQLAlchemy==2.0.20 -starlette==0.27.0 +sniffio==1.3.1 +soupsieve==2.6 +SQLAlchemy==2.0.32 +starlette==0.37.2 tabulate==0.9.0 -tenacity==8.2.3 +tenacity==8.5.0 +threadpoolctl==3.5.0 tiktoken==0.7.0 -tqdm==4.66.1 -types-requests==2.31.0.6 +tqdm==4.66.5 +types-requests==2.32.0.20240712 types-urllib3==1.26.25.14 typing-inspect==0.9.0 -typing_extensions==4.8.0 -tzdata==2023.3 -ujson==5.9.0 -unstructured==0.10.23 -urllib3==1.26.18 -uvicorn==0.23.2 -uvloop==0.17.0 -validators==0.22.0 -wandb==0.15.12 -watchfiles==0.20.0 -websockets==11.0.3 -yarl==1.9.2 -zipp==3.19.2 +typing_extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +unstructured==0.15.1 +unstructured-client==0.25.5 +urllib3==2.2.2 +uvicorn==0.30.6 +uvloop==0.19.0 +validators==0.33.0 +wandb==0.17.6 +watchfiles==0.23.0 +websockets==12.0 +wrapt==1.16.0 +yarl==1.9.4 +zipp==3.20.0 From c8248d72e614d055f4fc1e5969373fa25d134f1e Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 16:48:48 -0400 Subject: [PATCH 34/91] reqs to fix unit test --- copilot/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/copilot/requirements.txt b/copilot/requirements.txt index af45c357..e1a28c91 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -82,7 +82,7 @@ langchain-core==0.2.30 langchain-experimental==0.0.64 langchain-groq==0.1.9 langchain-ibm==0.1.12 -langchain-milvus==0.1.4 +langchain-milvus==0.1.3 langchain-openai==0.1.21 langchain-text-splitters==0.2.2 langchainhub==0.1.21 From 210d0fc74c57bb7919f36798822286461402fd95 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 16:58:24 -0400 Subject: [PATCH 35/91] reqs to fix unit test --- common/requirements.txt | 223 ++++++++++++++++++++++------------------ 1 file changed, 121 insertions(+), 102 deletions(-) diff --git a/common/requirements.txt b/common/requirements.txt index 122b1b73..af45c357 100644 --- a/common/requirements.txt +++ b/common/requirements.txt @@ -1,158 +1,177 @@ -aiochannel==1.2.1 -aiohttp==3.9.3 +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 aiosignal==1.3.1 -annotated-types==0.5.0 -anyio==3.7.1 +annotated-types==0.7.0 +anyio==4.4.0 appdirs==1.4.4 argon2-cffi==23.1.0 argon2-cffi-bindings==21.2.0 async-timeout==4.0.3 asyncer==0.0.7 -attrs==23.1.0 -azure-core==1.30.1 -azure-storage-blob==12.19.1 +attrs==24.2.0 +azure-core==1.30.2 +azure-storage-blob==12.22.0 backoff==2.2.1 -beautifulsoup4==4.12.2 -boto3==1.28.83 -botocore==1.31.83 -cachetools==5.3.2 -certifi==2023.7.22 -cffi==1.16.0 +beautifulsoup4==4.12.3 +boto3==1.34.159 +botocore==1.34.159 +cachetools==5.4.0 +certifi==2024.7.4 +cffi==1.17.0 chardet==5.2.0 -charset-normalizer==3.2.0 +charset-normalizer==3.3.2 click==8.1.7 -cryptography==42.0.5 -dataclasses-json==0.5.14 -distro==1.8.0 +contourpy==1.2.1 +cryptography==43.0.0 +cycler==0.12.1 +dataclasses-json==0.6.7 +deepdiff==7.0.1 +distro==1.9.0 docker-pycreds==0.4.0 docstring_parser==0.16 -emoji==2.8.0 +emoji==2.12.1 environs==9.5.0 -exceptiongroup==1.1.3 -fastapi==0.103.1 +exceptiongroup==1.2.2 +fastapi==0.112.0 filelock==3.15.4 filetype==1.2.0 -frozenlist==1.4.0 +fonttools==4.53.1 +frozenlist==1.4.1 fsspec==2024.6.1 gitdb==4.0.11 -GitPython==3.1.40 -google-api-core==2.14.0 -google-auth==2.23.4 -google-cloud-aiplatform==1.52.0 -google-cloud-bigquery==3.13.0 -google-cloud-core==2.3.3 -google-cloud-resource-manager==1.10.4 -google-cloud-storage==2.13.0 +GitPython==3.1.43 +google-api-core==2.19.1 +google-auth==2.33.0 +google-cloud-aiplatform==1.61.0 +google-cloud-bigquery==3.25.0 +google-cloud-core==2.4.1 +google-cloud-resource-manager==1.12.5 +google-cloud-storage==2.18.2 google-crc32c==1.5.0 -google-resumable-media==2.6.0 -googleapis-common-protos==1.61.0 -greenlet==2.0.2 -groq==0.5.0 -grpc-google-iam-v1==0.12.7 -grpcio==1.59.2 -grpcio-status==1.59.2 +google-resumable-media==2.7.2 +googleapis-common-protos==1.63.2 +greenlet==3.0.3 +groq==0.9.0 +grpc-google-iam-v1==0.13.1 +grpcio==1.63.0 +grpcio-status==1.63.0 h11==0.14.0 -httpcore==0.18.0 -httptools==0.6.0 -httpx==0.25.0 -huggingface-hub==0.23.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +huggingface-hub==0.24.5 ibm-cos-sdk==2.13.6 ibm-cos-sdk-core==2.13.6 ibm-cos-sdk-s3transfer==2.13.6 -ibm_watsonx_ai==1.0.11 -idna==3.4 -importlib_metadata==8.0.0 +ibm_watsonx_ai==1.1.5 +idna==3.7 +importlib_metadata==8.2.0 iniconfig==2.0.0 isodate==0.6.1 jiter==0.5.0 jmespath==1.0.1 -joblib==1.3.2 -jq==1.6.0 +joblib==1.4.2 +jq==1.7.0 jsonpatch==1.33 -jsonpointer==2.4 +jsonpath-python==1.0.6 +jsonpointer==3.0.0 +kiwisolver==1.4.5 langchain==0.2.13 langchain-community==0.2.12 langchain-core==0.2.30 
langchain-experimental==0.0.64 +langchain-groq==0.1.9 +langchain-ibm==0.1.12 +langchain-milvus==0.1.4 langchain-openai==0.1.21 langchain-text-splitters==0.2.2 -langsmith==0.1.98 -Levenshtein==0.25.1 -langchain-groq==0.1.8 -langchain-ibm==0.1.11 -langchain_milvus==0.1.3 -langchainhub==0.1.20 +langchainhub==0.1.21 langdetect==1.0.9 -langgraph==0.1.16 +langgraph==0.2.3 +langgraph-checkpoint==1.0.2 +langsmith==0.1.99 +Levenshtein==0.25.1 lomond==0.3.3 -lxml==4.9.3 -marshmallow==3.20.1 -matplotlib==3.9.1 -minio==7.2.5 -multidict==6.0.4 +lxml==5.3.0 +marshmallow==3.21.3 +matplotlib==3.9.2 +milvus-lite==2.4.9 +minio==7.2.7 +multidict==6.0.5 mypy-extensions==1.0.0 -nltk==3.8.1 +nest-asyncio==1.6.0 +nltk==3.8.2 numpy==1.26.4 -openai==1.40.2 -orjson==3.9.15 -packaging==23.2 -pandas==2.1.1 +openai==1.40.6 +ordered-set==4.1.0 +orjson==3.10.7 +packaging==24.1 +pandas==2.1.4 pathtools==0.1.2 +pillow==10.4.0 +platformdirs==4.2.2 pluggy==1.5.0 prometheus_client==0.20.0 -proto-plus==1.22.3 -protobuf==4.24.4 -psutil==5.9.6 -pyarrow==15.0.1 -pyasn1==0.5.0 -pyasn1-modules==0.3.0 -pycparser==2.21 +proto-plus==1.24.0 +protobuf==5.27.3 +psutil==6.0.0 +pyarrow==17.0.0 +pyasn1==0.6.0 +pyasn1_modules==0.4.0 +pycparser==2.22 pycryptodome==3.20.0 -pydantic==2.3.0 -pydantic_core==2.6.3 -pygit2==1.13.2 -pymilvus==2.4.4 -pytest==8.2.0 +pydantic==2.8.2 +pydantic_core==2.20.1 +pygit2==1.15.1 +pymilvus==2.4.5 +pyparsing==3.1.2 +pypdf==4.3.1 +pytest==8.3.2 python-dateutil==2.9.0.post0 -python-dotenv==1.0.0 -python-iso639==2023.6.15 +python-dotenv==1.0.1 +python-iso639==2024.4.27 python-magic==0.4.27 pyTigerDriver==1.0.15 pyTigerGraph==1.6.5 -pytz==2023.3.post1 -PyYAML==6.0.1 +pytz==2024.1 +PyYAML==6.0.2 rapidfuzz==3.9.6 -regex==2023.10.3 +regex==2024.7.24 requests==2.32.2 +requests-toolbelt==1.0.0 rsa==4.9 -s3transfer==0.7.0 +s3transfer==0.10.2 scikit-learn==1.5.1 -sentry-sdk==1.32.0 +scipy==1.14.0 +sentry-sdk==2.13.0 setproctitle==1.3.3 -shapely==2.0.2 +shapely==2.0.5 six==1.16.0 smmap==5.0.1 -sniffio==1.3.0 -soupsieve==2.5 -SQLAlchemy==2.0.20 -starlette==0.27.0 +sniffio==1.3.1 +soupsieve==2.6 +SQLAlchemy==2.0.32 +starlette==0.37.2 tabulate==0.9.0 -tenacity==8.2.3 +tenacity==8.5.0 +threadpoolctl==3.5.0 tiktoken==0.7.0 -tqdm==4.66.1 -types-requests==2.31.0.6 +tqdm==4.66.5 +types-requests==2.32.0.20240712 types-urllib3==1.26.25.14 typing-inspect==0.9.0 typing_extensions==4.12.2 -tzdata==2023.3 -ujson==5.9.0 -unstructured==0.10.23 -urllib3==1.26.18 -uvicorn==0.23.2 -uvloop==0.17.0 -validators==0.22.0 -wandb==0.15.12 -watchfiles==0.20.0 -websockets==11.0.3 -yarl==1.9.2 -zipp==3.19.2 +tzdata==2024.1 +ujson==5.10.0 +unstructured==0.15.1 +unstructured-client==0.25.5 +urllib3==2.2.2 +uvicorn==0.30.6 +uvloop==0.19.0 +validators==0.33.0 +wandb==0.17.6 +watchfiles==0.23.0 +websockets==12.0 +wrapt==1.16.0 +yarl==1.9.4 +zipp==3.20.0 From 9c8b183273e5649d36c4a6e8bdd0f42c198df77c Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 17:06:59 -0400 Subject: [PATCH 36/91] reqs to fix unit test --- common/requirements.txt | 2 -- copilot/requirements.txt | 2 -- 2 files changed, 4 deletions(-) diff --git a/common/requirements.txt b/common/requirements.txt index af45c357..97fe5736 100644 --- a/common/requirements.txt +++ b/common/requirements.txt @@ -95,7 +95,6 @@ lomond==0.3.3 lxml==5.3.0 marshmallow==3.21.3 matplotlib==3.9.2 -milvus-lite==2.4.9 minio==7.2.7 multidict==6.0.5 mypy-extensions==1.0.0 @@ -123,7 +122,6 @@ pycryptodome==3.20.0 pydantic==2.8.2 
pydantic_core==2.20.1 pygit2==1.15.1 -pymilvus==2.4.5 pyparsing==3.1.2 pypdf==4.3.1 pytest==8.3.2 diff --git a/copilot/requirements.txt b/copilot/requirements.txt index e1a28c91..d2426a03 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -95,7 +95,6 @@ lomond==0.3.3 lxml==5.3.0 marshmallow==3.21.3 matplotlib==3.9.2 -milvus-lite==2.4.9 minio==7.2.7 multidict==6.0.5 mypy-extensions==1.0.0 @@ -123,7 +122,6 @@ pycryptodome==3.20.0 pydantic==2.8.2 pydantic_core==2.20.1 pygit2==1.15.1 -pymilvus==2.4.5 pyparsing==3.1.2 pypdf==4.3.1 pytest==8.3.2 From e4d8168dfe4d3c44ba57844a3e6abbe2472ac8a2 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 17:11:28 -0400 Subject: [PATCH 37/91] reqs to fix unit test --- common/requirements.txt | 2 ++ copilot/requirements.txt | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/common/requirements.txt b/common/requirements.txt index 97fe5736..af45c357 100644 --- a/common/requirements.txt +++ b/common/requirements.txt @@ -95,6 +95,7 @@ lomond==0.3.3 lxml==5.3.0 marshmallow==3.21.3 matplotlib==3.9.2 +milvus-lite==2.4.9 minio==7.2.7 multidict==6.0.5 mypy-extensions==1.0.0 @@ -122,6 +123,7 @@ pycryptodome==3.20.0 pydantic==2.8.2 pydantic_core==2.20.1 pygit2==1.15.1 +pymilvus==2.4.5 pyparsing==3.1.2 pypdf==4.3.1 pytest==8.3.2 diff --git a/copilot/requirements.txt b/copilot/requirements.txt index d2426a03..af45c357 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -82,7 +82,7 @@ langchain-core==0.2.30 langchain-experimental==0.0.64 langchain-groq==0.1.9 langchain-ibm==0.1.12 -langchain-milvus==0.1.3 +langchain-milvus==0.1.4 langchain-openai==0.1.21 langchain-text-splitters==0.2.2 langchainhub==0.1.21 @@ -95,6 +95,7 @@ lomond==0.3.3 lxml==5.3.0 marshmallow==3.21.3 matplotlib==3.9.2 +milvus-lite==2.4.9 minio==7.2.7 multidict==6.0.5 mypy-extensions==1.0.0 @@ -122,6 +123,7 @@ pycryptodome==3.20.0 pydantic==2.8.2 pydantic_core==2.20.1 pygit2==1.15.1 +pymilvus==2.4.5 pyparsing==3.1.2 pypdf==4.3.1 pytest==8.3.2 From 538653f2f09c3abd0d1df456d501758b776e9f57 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 17:24:20 -0400 Subject: [PATCH 38/91] reqs to fix unit tests --- common/embeddings/milvus_embedding_store.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/embeddings/milvus_embedding_store.py b/common/embeddings/milvus_embedding_store.py index 7169379e..de7812fd 100644 --- a/common/embeddings/milvus_embedding_store.py +++ b/common/embeddings/milvus_embedding_store.py @@ -7,7 +7,8 @@ from asyncer import asyncify from langchain_community.vectorstores import Milvus from langchain_core.documents.base import Document -from langchain_milvus.vectorstores import Milvus +# from langchain_milvus.vectorstores import Milvus +from langchain_community.vectorstores.milvus import Milvus from pymilvus import MilvusException, connections, utility from pymilvus.exceptions import MilvusException From a63d3768971f6ddd01dfc59e7cd33a15ec073aa4 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 17:28:32 -0400 Subject: [PATCH 39/91] reqs to fix unit tests --- common/embeddings/embedding_services.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/embeddings/embedding_services.py b/common/embeddings/embedding_services.py index 13c2cfd0..8020b97f 100644 --- 
a/common/embeddings/embedding_services.py +++ b/common/embeddings/embedding_services.py @@ -134,7 +134,8 @@ def __init__(self, config): super().__init__( config, model_name=config.get("model_name", "OpenAI gpt-4-0613") ) - from langchain_openai import OpenAIEmbeddings + # from langchain_openai import OpenAIEmbeddings + from langchain_community.embeddings.openai import OpenAIEmbeddings self.embeddings = OpenAIEmbeddings() From fe6643c79af599316cc6bf397c3ffac4053fa361 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 19:55:04 -0400 Subject: [PATCH 40/91] smoke test --- .../graphRAG/louvain/graphrag_louvain_communities.gsql | 2 +- common/gsql/supportai/Scan_For_Updates.gsql | 8 ++++---- common/llm_services/openai_service.py | 2 +- copilot/app/routers/supportai.py | 7 ------- copilot/requirements.txt | 1 + eventual-consistency-service/app/graphrag/util.py | 2 +- 6 files changed, 8 insertions(+), 14 deletions(-) diff --git a/common/gsql/graphRAG/louvain/graphrag_louvain_communities.gsql b/common/gsql/graphRAG/louvain/graphrag_louvain_communities.gsql index 4137ca68..241ccaf0 100644 --- a/common/gsql/graphRAG/louvain/graphrag_louvain_communities.gsql +++ b/common/gsql/graphRAG/louvain/graphrag_louvain_communities.gsql @@ -172,7 +172,7 @@ CREATE DISTRIBUTED QUERY graphrag_louvain_communities(UINT iteration=1, UINT max POST-ACCUM // Write the results to a new community vertex (iteration + 1) // ID , iter, edges within the community - INSERT INTO Community VALUES (s.id+"_"+to_string(iteration+1), iteration+1), ""), + INSERT INTO Community VALUES (s.id+"_"+to_string(iteration+1), iteration+1, ""), INSERT INTO HAS_PARENT VALUES (s, s.@community_vid+"_"+to_string(iteration+1)) // link Community's child/parent community ; diff --git a/common/gsql/supportai/Scan_For_Updates.gsql b/common/gsql/supportai/Scan_For_Updates.gsql index ba5444bd..7d9d1b83 100644 --- a/common/gsql/supportai/Scan_For_Updates.gsql +++ b/common/gsql/supportai/Scan_For_Updates.gsql @@ -24,10 +24,10 @@ CREATE DISTRIBUTED QUERY Scan_For_Updates(STRING v_type = "Document", res = SELECT s FROM start:s -(HAS_CONTENT)-> Content:c ACCUM @@v_and_text += (s.id -> c.text) POST-ACCUM s.epoch_processing = datetime_to_epoch(now()); - ELSE IF v_type == "Concept" THEN - res = SELECT s FROM start:s - POST-ACCUM @@v_and_text += (s.id -> s.description), - s.epoch_processing = datetime_to_epoch(now()); + // ELSE IF v_type == "Concept" THEN + // res = SELECT s FROM start:s + // POST-ACCUM @@v_and_text += (s.id -> s.description), + // s.epoch_processing = datetime_to_epoch(now()); ELSE IF v_type == "Entity" THEN res = SELECT s FROM start:s POST-ACCUM @@v_and_text += (s.id -> s.definition), diff --git a/common/llm_services/openai_service.py b/common/llm_services/openai_service.py index 7b166398..4f70b8cf 100644 --- a/common/llm_services/openai_service.py +++ b/common/llm_services/openai_service.py @@ -1,7 +1,7 @@ import logging import os -from langchain_community.chat_models.openai import ChatOpenAI +from langchain_openai.chat_models import ChatOpenAI from common.llm_services import LLM_Model from common.logs.log import req_id_cv diff --git a/copilot/app/routers/supportai.py b/copilot/app/routers/supportai.py index 7b09acc9..0eff3c41 100644 --- a/copilot/app/routers/supportai.py +++ b/copilot/app/routers/supportai.py @@ -18,13 +18,6 @@ HNSWSiblingRetriever, ) -from common.config import ( - db_config, - embedding_service, - embedding_store, - get_llm_service, - llm_config, -) from 
common.config import ( db_config, embedding_service, diff --git a/copilot/requirements.txt b/copilot/requirements.txt index af45c357..d287660f 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -1,3 +1,4 @@ +aiochannel==1.2.1 aiohappyeyeballs==2.3.5 aiohttp==3.10.3 aiosignal==1.3.1 diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index bcf1befe..186ab11a 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -111,7 +111,7 @@ async def init( vector_field=milvus_config.get("vector_field", "document_vector"), text_field=milvus_config.get("text_field", "document_content"), vertex_field=vertex_field, - drop_old=True, + drop_old=False, ) LogWriter.info(f"Initializing {name}") From 64b3998e3d1a3838e46848eb9d69954ccf12b763 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 20:03:03 -0400 Subject: [PATCH 41/91] smoke test --- .github/workflows/pull-test-merge.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pull-test-merge.yaml b/.github/workflows/pull-test-merge.yaml index 654703d8..a7c93c7e 100644 --- a/.github/workflows/pull-test-merge.yaml +++ b/.github/workflows/pull-test-merge.yaml @@ -34,6 +34,7 @@ jobs: source venv/bin/activate python -m pip install --upgrade pip pip install -r copilot/requirements.txt + pip install -U langchain-core pip install pytest - name: Create db config From e08d42a5d498615679b9859e93b3f67e94d70d0f Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 20:07:55 -0400 Subject: [PATCH 42/91] smoke test --- .github/workflows/pull-test-merge.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pull-test-merge.yaml b/.github/workflows/pull-test-merge.yaml index a7c93c7e..e7cdd5a1 100644 --- a/.github/workflows/pull-test-merge.yaml +++ b/.github/workflows/pull-test-merge.yaml @@ -34,7 +34,7 @@ jobs: source venv/bin/activate python -m pip install --upgrade pip pip install -r copilot/requirements.txt - pip install -U langchain-core + pip install -U langchain-core langchain pip install pytest - name: Create db config From 17b09df8611321363012db71a75d3fc404ee0e54 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 20:12:16 -0400 Subject: [PATCH 43/91] smoke test --- copilot/requirements.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index d287660f..662ec077 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -79,12 +79,11 @@ jsonpointer==3.0.0 kiwisolver==1.4.5 langchain==0.2.13 langchain-community==0.2.12 -langchain-core==0.2.30 langchain-experimental==0.0.64 langchain-groq==0.1.9 langchain-ibm==0.1.12 langchain-milvus==0.1.4 -langchain-openai==0.1.21 +langchain-openai==0.1.20 langchain-text-splitters==0.2.2 langchainhub==0.1.21 langdetect==1.0.9 From 6ce885f341bef5ae1fa5b1216d7dfa7254d3a17c Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 20:22:12 -0400 Subject: [PATCH 44/91] smoke test --- .github/workflows/pull-test-merge.yaml | 1 - copilot/requirements.txt | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pull-test-merge.yaml 
b/.github/workflows/pull-test-merge.yaml index e7cdd5a1..654703d8 100644 --- a/.github/workflows/pull-test-merge.yaml +++ b/.github/workflows/pull-test-merge.yaml @@ -34,7 +34,6 @@ jobs: source venv/bin/activate python -m pip install --upgrade pip pip install -r copilot/requirements.txt - pip install -U langchain-core langchain pip install pytest - name: Create db config diff --git a/copilot/requirements.txt b/copilot/requirements.txt index 662ec077..e4da4613 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -87,6 +87,7 @@ langchain-openai==0.1.20 langchain-text-splitters==0.2.2 langchainhub==0.1.21 langdetect==1.0.9 +langchain-core==0.2.29 langgraph==0.2.3 langgraph-checkpoint==1.0.2 langsmith==0.1.99 From 442564bde03c69974ed8a953c5baa04d68681964 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 20:25:14 -0400 Subject: [PATCH 45/91] smoke test --- copilot/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index e4da4613..56b5f71f 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -77,7 +77,7 @@ jsonpatch==1.33 jsonpath-python==1.0.6 jsonpointer==3.0.0 kiwisolver==1.4.5 -langchain==0.2.13 +langchain==0.2.12 langchain-community==0.2.12 langchain-experimental==0.0.64 langchain-groq==0.1.9 From 2d8675eb540caa4fca57283f24f4ded62ec90752 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 20:32:40 -0400 Subject: [PATCH 46/91] smoke test --- copilot/requirements.txt | 240 ++++++++++++++++++--------------------- 1 file changed, 109 insertions(+), 131 deletions(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index 56b5f71f..f737ed1f 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -1,178 +1,156 @@ -aiochannel==1.2.1 -aiohappyeyeballs==2.3.5 -aiohttp==3.10.3 +aiohttp==3.9.3 aiosignal==1.3.1 -annotated-types==0.7.0 -anyio==4.4.0 +annotated-types==0.5.0 +anyio==3.7.1 appdirs==1.4.4 argon2-cffi==23.1.0 argon2-cffi-bindings==21.2.0 async-timeout==4.0.3 asyncer==0.0.7 -attrs==24.2.0 -azure-core==1.30.2 -azure-storage-blob==12.22.0 +attrs==23.1.0 +azure-core==1.30.1 +azure-storage-blob==12.19.1 backoff==2.2.1 -beautifulsoup4==4.12.3 -boto3==1.34.159 -botocore==1.34.159 -cachetools==5.4.0 -certifi==2024.7.4 -cffi==1.17.0 +beautifulsoup4==4.12.2 +boto3==1.28.83 +botocore==1.31.83 +cachetools==5.3.2 +certifi==2023.7.22 +cffi==1.16.0 chardet==5.2.0 -charset-normalizer==3.3.2 +charset-normalizer==3.2.0 click==8.1.7 -contourpy==1.2.1 -cryptography==43.0.0 -cycler==0.12.1 -dataclasses-json==0.6.7 -deepdiff==7.0.1 -distro==1.9.0 +cryptography==42.0.5 +dataclasses-json==0.5.14 +distro==1.8.0 docker-pycreds==0.4.0 docstring_parser==0.16 -emoji==2.12.1 +emoji==2.8.0 environs==9.5.0 -exceptiongroup==1.2.2 -fastapi==0.112.0 +exceptiongroup==1.1.3 +fastapi==0.103.1 filelock==3.15.4 filetype==1.2.0 -fonttools==4.53.1 -frozenlist==1.4.1 -fsspec==2024.6.1 +frozenlist==1.4.0 +fsspec==2024.6.0 gitdb==4.0.11 -GitPython==3.1.43 -google-api-core==2.19.1 -google-auth==2.33.0 -google-cloud-aiplatform==1.61.0 -google-cloud-bigquery==3.25.0 -google-cloud-core==2.4.1 -google-cloud-resource-manager==1.12.5 -google-cloud-storage==2.18.2 +GitPython==3.1.40 +google-api-core==2.14.0 +google-auth==2.23.4 +google-cloud-aiplatform==1.52.0 +google-cloud-bigquery==3.13.0 +google-cloud-core==2.3.3 
+google-cloud-resource-manager==1.10.4 +google-cloud-storage==2.13.0 google-crc32c==1.5.0 -google-resumable-media==2.7.2 -googleapis-common-protos==1.63.2 -greenlet==3.0.3 -groq==0.9.0 -grpc-google-iam-v1==0.13.1 -grpcio==1.63.0 -grpcio-status==1.63.0 +google-resumable-media==2.6.0 +googleapis-common-protos==1.61.0 +greenlet==2.0.2 +groq==0.5.0 +grpc-google-iam-v1==0.12.7 +grpcio==1.59.2 +grpcio-status==1.59.2 h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -huggingface-hub==0.24.5 +httpcore==0.18.0 +httptools==0.6.0 +httpx==0.25.0 +huggingface-hub==0.23.0 ibm-cos-sdk==2.13.6 ibm-cos-sdk-core==2.13.6 ibm-cos-sdk-s3transfer==2.13.6 -ibm_watsonx_ai==1.1.5 -idna==3.7 -importlib_metadata==8.2.0 +ibm_watsonx_ai==1.0.11 +idna==3.4 +importlib_metadata==8.0.0 iniconfig==2.0.0 isodate==0.6.1 -jiter==0.5.0 jmespath==1.0.1 -joblib==1.4.2 -jq==1.7.0 +joblib==1.3.2 +jq==1.6.0 jsonpatch==1.33 -jsonpath-python==1.0.6 -jsonpointer==3.0.0 -kiwisolver==1.4.5 -langchain==0.2.12 -langchain-community==0.2.12 -langchain-experimental==0.0.64 -langchain-groq==0.1.9 -langchain-ibm==0.1.12 -langchain-milvus==0.1.4 -langchain-openai==0.1.20 +jsonpointer==2.4 +langchain==0.2.11 +langchain-community==0.2.10 +langchain-core==0.2.25 +langchain-experimental==0.0.63 +langchain-groq==0.1.8 +langchain-ibm==0.1.11 langchain-text-splitters==0.2.2 -langchainhub==0.1.21 +langchain_milvus==0.1.3 +langchain_openai==0.1.19 +langchainhub==0.1.20 langdetect==1.0.9 -langchain-core==0.2.29 -langgraph==0.2.3 -langgraph-checkpoint==1.0.2 -langsmith==0.1.99 +langgraph==0.1.16 +langsmith==0.1.94 Levenshtein==0.25.1 lomond==0.3.3 -lxml==5.3.0 -marshmallow==3.21.3 -matplotlib==3.9.2 -milvus-lite==2.4.9 -minio==7.2.7 -multidict==6.0.5 +lxml==4.9.3 +marshmallow==3.20.1 +matplotlib==3.9.1 +minio==7.2.5 +multidict==6.0.4 mypy-extensions==1.0.0 -nest-asyncio==1.6.0 -nltk==3.8.2 +nltk==3.8.1 numpy==1.26.4 -openai==1.40.6 -ordered-set==4.1.0 -orjson==3.10.7 -packaging==24.1 -pandas==2.1.4 +openai==1.37.1 +orjson==3.9.15 +packaging==23.2 +pandas==2.1.1 pathtools==0.1.2 -pillow==10.4.0 -platformdirs==4.2.2 pluggy==1.5.0 prometheus_client==0.20.0 -proto-plus==1.24.0 -protobuf==5.27.3 -psutil==6.0.0 -pyarrow==17.0.0 -pyasn1==0.6.0 -pyasn1_modules==0.4.0 -pycparser==2.22 +proto-plus==1.22.3 +protobuf==4.24.4 +psutil==5.9.6 +pyarrow==15.0.1 +pyasn1==0.5.0 +pyasn1-modules==0.3.0 +pycparser==2.21 pycryptodome==3.20.0 -pydantic==2.8.2 -pydantic_core==2.20.1 -pygit2==1.15.1 -pymilvus==2.4.5 -pyparsing==3.1.2 -pypdf==4.3.1 -pytest==8.3.2 +pydantic==2.3.0 +pydantic_core==2.6.3 +pygit2==1.13.2 +pymilvus==2.4.4 +pytest==8.2.0 python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-iso639==2024.4.27 +python-dotenv==1.0.0 +python-iso639==2023.6.15 python-magic==0.4.27 pyTigerDriver==1.0.15 -pyTigerGraph==1.6.5 -pytz==2024.1 -PyYAML==6.0.2 -rapidfuzz==3.9.6 -regex==2024.7.24 +pyTigerGraph==1.6.2 +pytz==2023.3.post1 +PyYAML==6.0.1 +rapidfuzz==3.8.0 +regex==2023.10.3 requests==2.32.2 -requests-toolbelt==1.0.0 rsa==4.9 -s3transfer==0.10.2 +s3transfer==0.7.0 scikit-learn==1.5.1 -scipy==1.14.0 -sentry-sdk==2.13.0 +sentry-sdk==1.32.0 setproctitle==1.3.3 -shapely==2.0.5 +shapely==2.0.2 six==1.16.0 smmap==5.0.1 -sniffio==1.3.1 -soupsieve==2.6 -SQLAlchemy==2.0.32 -starlette==0.37.2 +sniffio==1.3.0 +soupsieve==2.5 +SQLAlchemy==2.0.20 +starlette==0.27.0 tabulate==0.9.0 -tenacity==8.5.0 -threadpoolctl==3.5.0 +tenacity==8.2.3 tiktoken==0.7.0 -tqdm==4.66.5 -types-requests==2.32.0.20240712 +tqdm==4.66.1 +types-requests==2.31.0.6 types-urllib3==1.26.25.14 
typing-inspect==0.9.0 -typing_extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -unstructured==0.15.1 -unstructured-client==0.25.5 -urllib3==2.2.2 -uvicorn==0.30.6 -uvloop==0.19.0 -validators==0.33.0 -wandb==0.17.6 -watchfiles==0.23.0 -websockets==12.0 -wrapt==1.16.0 -yarl==1.9.4 -zipp==3.20.0 +typing_extensions==4.8.0 +tzdata==2023.3 +ujson==5.9.0 +unstructured==0.10.23 +urllib3==1.26.18 +uvicorn==0.23.2 +uvloop==0.17.0 +validators==0.22.0 +wandb==0.15.12 +watchfiles==0.20.0 +websockets==11.0.3 +yarl==1.9.2 +zipp==3.19.2 From e9f5e9d2719e68022f742f5dd2d50943669eb051 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 20:42:56 -0400 Subject: [PATCH 47/91] smoke test --- common/embeddings/milvus_embedding_store.py | 4 ++-- copilot/requirements.txt | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/common/embeddings/milvus_embedding_store.py b/common/embeddings/milvus_embedding_store.py index de7812fd..c60a8e2e 100644 --- a/common/embeddings/milvus_embedding_store.py +++ b/common/embeddings/milvus_embedding_store.py @@ -7,8 +7,8 @@ from asyncer import asyncify from langchain_community.vectorstores import Milvus from langchain_core.documents.base import Document -# from langchain_milvus.vectorstores import Milvus -from langchain_community.vectorstores.milvus import Milvus +from langchain_milvus.vectorstores import Milvus +# from langchain_community.vectorstores.milvus import Milvus from pymilvus import MilvusException, connections, utility from pymilvus.exceptions import MilvusException diff --git a/copilot/requirements.txt b/copilot/requirements.txt index f737ed1f..98af8b4b 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -70,13 +70,13 @@ jsonpatch==1.33 jsonpointer==2.4 langchain==0.2.11 langchain-community==0.2.10 -langchain-core==0.2.25 +# langchain-core==0.2.25 langchain-experimental==0.0.63 langchain-groq==0.1.8 langchain-ibm==0.1.11 langchain-text-splitters==0.2.2 langchain_milvus==0.1.3 -langchain_openai==0.1.19 +langchain_openai==0.1.20 langchainhub==0.1.20 langdetect==1.0.9 langgraph==0.1.16 From 0ca73a31dfb653b8cab07111a4db019e000171d9 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 20:50:02 -0400 Subject: [PATCH 48/91] smoke test --- .github/workflows/pull-test-merge.yaml | 2 ++ copilot/requirements.txt | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pull-test-merge.yaml b/.github/workflows/pull-test-merge.yaml index 654703d8..20024b34 100644 --- a/.github/workflows/pull-test-merge.yaml +++ b/.github/workflows/pull-test-merge.yaml @@ -64,6 +64,8 @@ jobs: - name: Run pytest run: | source venv/bin/activate + pip install -r copilot/requirements.txt + pip install -U langchain langchain-core cp -r copilot/tests/*test* copilot/tests/create_wandb_report.py copilot/app/ cd copilot/app python -m pytest --disable-warnings diff --git a/copilot/requirements.txt b/copilot/requirements.txt index 98af8b4b..ac0a6ba6 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -70,7 +70,6 @@ jsonpatch==1.33 jsonpointer==2.4 langchain==0.2.11 langchain-community==0.2.10 -# langchain-core==0.2.25 langchain-experimental==0.0.63 langchain-groq==0.1.8 langchain-ibm==0.1.11 From 8252c1ecb193f11d570f27263afa1ff990814806 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 20:58:48 -0400 Subject: 
[PATCH 49/91] smoke test --- copilot/requirements.txt | 247 +++++++++++++++++++++------------------ 1 file changed, 135 insertions(+), 112 deletions(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index ac0a6ba6..5aed6147 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -1,155 +1,178 @@ -aiohttp==3.9.3 +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 aiosignal==1.3.1 -annotated-types==0.5.0 -anyio==3.7.1 +annotated-types==0.7.0 +anyio==4.4.0 appdirs==1.4.4 argon2-cffi==23.1.0 argon2-cffi-bindings==21.2.0 async-timeout==4.0.3 asyncer==0.0.7 -attrs==23.1.0 -azure-core==1.30.1 -azure-storage-blob==12.19.1 +attrs==24.2.0 +azure-core==1.30.2 +azure-storage-blob==12.22.0 backoff==2.2.1 -beautifulsoup4==4.12.2 -boto3==1.28.83 -botocore==1.31.83 -cachetools==5.3.2 -certifi==2023.7.22 -cffi==1.16.0 +beautifulsoup4==4.12.3 +boto3==1.34.160 +botocore==1.34.160 +cachetools==5.4.0 +certifi==2024.7.4 +cffi==1.17.0 chardet==5.2.0 -charset-normalizer==3.2.0 +charset-normalizer==3.3.2 click==8.1.7 -cryptography==42.0.5 -dataclasses-json==0.5.14 -distro==1.8.0 +contourpy==1.2.1 +cryptography==43.0.0 +cycler==0.12.1 +dataclasses-json==0.6.7 +deepdiff==7.0.1 +distro==1.9.0 docker-pycreds==0.4.0 -docstring_parser==0.16 -emoji==2.8.0 +docstring-parser==0.16 +emoji==2.12.1 environs==9.5.0 -exceptiongroup==1.1.3 -fastapi==0.103.1 +exceptiongroup==1.2.2 +fastapi==0.112.0 filelock==3.15.4 filetype==1.2.0 -frozenlist==1.4.0 -fsspec==2024.6.0 +fonttools==4.53.1 +frozenlist==1.4.1 +fsspec==2024.6.1 gitdb==4.0.11 -GitPython==3.1.40 -google-api-core==2.14.0 -google-auth==2.23.4 -google-cloud-aiplatform==1.52.0 -google-cloud-bigquery==3.13.0 -google-cloud-core==2.3.3 -google-cloud-resource-manager==1.10.4 -google-cloud-storage==2.13.0 +gitpython==3.1.43 +google-api-core==2.19.1 +google-auth==2.33.0 +google-cloud-aiplatform==1.62.0 +google-cloud-bigquery==3.25.0 +google-cloud-core==2.4.1 +google-cloud-resource-manager==1.12.5 +google-cloud-storage==2.18.2 google-crc32c==1.5.0 -google-resumable-media==2.6.0 -googleapis-common-protos==1.61.0 -greenlet==2.0.2 -groq==0.5.0 -grpc-google-iam-v1==0.12.7 -grpcio==1.59.2 -grpcio-status==1.59.2 +google-resumable-media==2.7.2 +googleapis-common-protos==1.63.2 +greenlet==3.0.3 +groq==0.9.0 +grpc-google-iam-v1==0.13.1 +grpcio==1.63.0 +grpcio-status==1.63.0 h11==0.14.0 -httpcore==0.18.0 -httptools==0.6.0 -httpx==0.25.0 -huggingface-hub==0.23.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +huggingface-hub==0.24.5 ibm-cos-sdk==2.13.6 ibm-cos-sdk-core==2.13.6 ibm-cos-sdk-s3transfer==2.13.6 -ibm_watsonx_ai==1.0.11 -idna==3.4 -importlib_metadata==8.0.0 +ibm-watsonx-ai==1.1.5 +idna==3.7 +importlib-metadata==8.2.0 iniconfig==2.0.0 isodate==0.6.1 +jiter==0.5.0 jmespath==1.0.1 -joblib==1.3.2 -jq==1.6.0 +joblib==1.4.2 +jq==1.7.0 jsonpatch==1.33 -jsonpointer==2.4 -langchain==0.2.11 -langchain-community==0.2.10 -langchain-experimental==0.0.63 -langchain-groq==0.1.8 -langchain-ibm==0.1.11 +jsonpath-python==1.0.6 +jsonpointer==3.0.0 +kiwisolver==1.4.5 +langchain==0.2.13 +langchain-community==0.2.12 +langchain-core==0.2.30 +langchain-experimental==0.0.64 +langchain-groq==0.1.9 +langchain-ibm==0.1.12 +langchain-milvus==0.1.4 +langchain-openai==0.1.21 langchain-text-splitters==0.2.2 -langchain_milvus==0.1.3 -langchain_openai==0.1.20 -langchainhub==0.1.20 +langchainhub==0.1.21 langdetect==1.0.9 -langgraph==0.1.16 -langsmith==0.1.94 -Levenshtein==0.25.1 +langgraph==0.2.3 +langgraph-checkpoint==1.0.2 +langsmith==0.1.99 +levenshtein==0.25.1 lomond==0.3.3 
-lxml==4.9.3 -marshmallow==3.20.1 -matplotlib==3.9.1 -minio==7.2.5 -multidict==6.0.4 +lxml==5.3.0 +marshmallow==3.21.3 +matplotlib==3.9.2 +milvus-lite==2.4.9 +minio==7.2.7 +multidict==6.0.5 mypy-extensions==1.0.0 -nltk==3.8.1 +nest-asyncio==1.6.0 +nltk==3.8.2 numpy==1.26.4 -openai==1.37.1 -orjson==3.9.15 -packaging==23.2 -pandas==2.1.1 +openai==1.40.6 +ordered-set==4.1.0 +orjson==3.10.7 +packaging==24.1 +pandas==2.1.4 pathtools==0.1.2 +pillow==10.4.0 +platformdirs==4.2.2 pluggy==1.5.0 -prometheus_client==0.20.0 -proto-plus==1.22.3 -protobuf==4.24.4 -psutil==5.9.6 -pyarrow==15.0.1 -pyasn1==0.5.0 -pyasn1-modules==0.3.0 -pycparser==2.21 +prometheus-client==0.20.0 +proto-plus==1.24.0 +protobuf==5.27.3 +psutil==6.0.0 +pyarrow==17.0.0 +pyasn1==0.6.0 +pyasn1-modules==0.4.0 +pycparser==2.22 pycryptodome==3.20.0 -pydantic==2.3.0 -pydantic_core==2.6.3 -pygit2==1.13.2 -pymilvus==2.4.4 -pytest==8.2.0 +pydantic==2.8.2 +pydantic-core==2.20.1 +pygit2==1.15.1 +pymilvus==2.4.5 +pyparsing==3.1.2 +pypdf==4.3.1 +pytest==8.3.2 python-dateutil==2.9.0.post0 -python-dotenv==1.0.0 -python-iso639==2023.6.15 +python-dotenv==1.0.1 +python-iso639==2024.4.27 python-magic==0.4.27 -pyTigerDriver==1.0.15 -pyTigerGraph==1.6.2 -pytz==2023.3.post1 -PyYAML==6.0.1 -rapidfuzz==3.8.0 -regex==2023.10.3 +pytigerdriver==1.0.15 +pytigergraph==1.6.5 +pytz==2024.1 +pyyaml==6.0.2 +rapidfuzz==3.9.6 +regex==2024.7.24 requests==2.32.2 +requests-toolbelt==1.0.0 rsa==4.9 -s3transfer==0.7.0 +s3transfer==0.10.2 scikit-learn==1.5.1 -sentry-sdk==1.32.0 +scipy==1.14.0 +sentry-sdk==2.13.0 setproctitle==1.3.3 -shapely==2.0.2 +setuptools==72.2.0 +shapely==2.0.5 six==1.16.0 smmap==5.0.1 -sniffio==1.3.0 -soupsieve==2.5 -SQLAlchemy==2.0.20 -starlette==0.27.0 +sniffio==1.3.1 +soupsieve==2.6 +sqlalchemy==2.0.32 +starlette==0.37.2 tabulate==0.9.0 -tenacity==8.2.3 +tenacity==8.5.0 +threadpoolctl==3.5.0 tiktoken==0.7.0 -tqdm==4.66.1 -types-requests==2.31.0.6 +tqdm==4.66.5 +types-requests==2.32.0.20240712 types-urllib3==1.26.25.14 +typing-extensions==4.12.2 typing-inspect==0.9.0 -typing_extensions==4.8.0 -tzdata==2023.3 -ujson==5.9.0 -unstructured==0.10.23 -urllib3==1.26.18 -uvicorn==0.23.2 -uvloop==0.17.0 -validators==0.22.0 -wandb==0.15.12 -watchfiles==0.20.0 -websockets==11.0.3 -yarl==1.9.2 -zipp==3.19.2 +tzdata==2024.1 +ujson==5.10.0 +unstructured==0.15.1 +unstructured-client==0.25.5 +urllib3==2.2.2 +uvicorn==0.30.6 +uvloop==0.19.0 +validators==0.33.0 +wandb==0.17.6 +watchfiles==0.23.0 +websockets==12.0 +wrapt==1.16.0 +yarl==1.9.4 +zipp==3.20.0 From 8777b3c0927348a5021b437da0614ee10de02c00 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 21:35:37 -0400 Subject: [PATCH 50/91] smoke test --- .github/workflows/pull-test-merge.yaml | 16 ++++++++-------- common/embeddings/milvus_embedding_store.py | 1 - common/llm_services/openai_service.py | 5 ++++- eventual-consistency-service/app/main.py | 7 ++++--- 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/.github/workflows/pull-test-merge.yaml b/.github/workflows/pull-test-merge.yaml index 20024b34..2c032524 100644 --- a/.github/workflows/pull-test-merge.yaml +++ b/.github/workflows/pull-test-merge.yaml @@ -12,12 +12,12 @@ jobs: test: runs-on: [ self-hosted, dind ] - services: - milvus: - image: milvusdb/milvus:latest - ports: - - 19530:19530 - - 19121:19121 + # services: + # milvus: + # image: milvusdb/milvus:latest + # ports: + # - 19530:19530 + # - 19121:19121 steps: - name: Checkout code @@ -30,6 +30,8 @@ jobs: - name: 
Install and Check Python Setup run: | + pip install uv + alias pip='uv pip' python -m venv venv source venv/bin/activate python -m pip install --upgrade pip @@ -64,8 +66,6 @@ jobs: - name: Run pytest run: | source venv/bin/activate - pip install -r copilot/requirements.txt - pip install -U langchain langchain-core cp -r copilot/tests/*test* copilot/tests/create_wandb_report.py copilot/app/ cd copilot/app python -m pytest --disable-warnings diff --git a/common/embeddings/milvus_embedding_store.py b/common/embeddings/milvus_embedding_store.py index c60a8e2e..7169379e 100644 --- a/common/embeddings/milvus_embedding_store.py +++ b/common/embeddings/milvus_embedding_store.py @@ -8,7 +8,6 @@ from langchain_community.vectorstores import Milvus from langchain_core.documents.base import Document from langchain_milvus.vectorstores import Milvus -# from langchain_community.vectorstores.milvus import Milvus from pymilvus import MilvusException, connections, utility from pymilvus.exceptions import MilvusException diff --git a/common/llm_services/openai_service.py b/common/llm_services/openai_service.py index 4f70b8cf..aad5d44f 100644 --- a/common/llm_services/openai_service.py +++ b/common/llm_services/openai_service.py @@ -1,7 +1,10 @@ import logging import os -from langchain_openai.chat_models import ChatOpenAI +if os.getenv("ECC"): + from langchain_openai.chat_models import ChatOpenAI +else: + from langchain_community.chat_models import ChatOpenAI from common.llm_services import LLM_Model from common.logs.log import req_id_cv diff --git a/eventual-consistency-service/app/main.py b/eventual-consistency-service/app/main.py index 34403f1e..2c308074 100644 --- a/eventual-consistency-service/app/main.py +++ b/eventual-consistency-service/app/main.py @@ -1,3 +1,6 @@ +import os + +os.environ["ECC"] = True import json import logging from contextlib import asynccontextmanager @@ -190,9 +193,7 @@ def consistency_status( background.add_task(graphrag.run, graphname, conn) import time - ecc_status = ( - f"GraphRAG initialization: {conn.graphname} ({graphname}) {time.ctime()}" - ) + ecc_status = f"GraphRAG initialization: {conn.graphname} ({graphname}) {time.ctime()}" case _: response.status_code = status.HTTP_404_NOT_FOUND return f"Method unsupported, must be {SupportAIMethod.SUPPORTAI}, {SupportAIMethod.GRAPHRAG}" From 69a7db449135d2a9413d53238b3a23906043da54 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 21:38:32 -0400 Subject: [PATCH 51/91] smoke test --- .github/workflows/pull-test-merge.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/pull-test-merge.yaml b/.github/workflows/pull-test-merge.yaml index 2c032524..19e1ab08 100644 --- a/.github/workflows/pull-test-merge.yaml +++ b/.github/workflows/pull-test-merge.yaml @@ -12,12 +12,12 @@ jobs: test: runs-on: [ self-hosted, dind ] - # services: - # milvus: - # image: milvusdb/milvus:latest - # ports: - # - 19530:19530 - # - 19121:19121 + services: + milvus: + image: milvusdb/milvus:latest + ports: + - 19530:19530 + - 19121:19121 steps: - name: Checkout code From 4dfa51cefb6b5364894920efb58ecedc54760ef6 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 21:41:41 -0400 Subject: [PATCH 52/91] smoke test --- .github/workflows/pull-test-merge.yaml | 2 -- common/embeddings/milvus_embedding_store.py | 3 ++- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git 
a/.github/workflows/pull-test-merge.yaml b/.github/workflows/pull-test-merge.yaml index 19e1ab08..654703d8 100644 --- a/.github/workflows/pull-test-merge.yaml +++ b/.github/workflows/pull-test-merge.yaml @@ -30,8 +30,6 @@ jobs: - name: Install and Check Python Setup run: | - pip install uv - alias pip='uv pip' python -m venv venv source venv/bin/activate python -m pip install --upgrade pip diff --git a/common/embeddings/milvus_embedding_store.py b/common/embeddings/milvus_embedding_store.py index 7169379e..de7812fd 100644 --- a/common/embeddings/milvus_embedding_store.py +++ b/common/embeddings/milvus_embedding_store.py @@ -7,7 +7,8 @@ from asyncer import asyncify from langchain_community.vectorstores import Milvus from langchain_core.documents.base import Document -from langchain_milvus.vectorstores import Milvus +# from langchain_milvus.vectorstores import Milvus +from langchain_community.vectorstores.milvus import Milvus from pymilvus import MilvusException, connections, utility from pymilvus.exceptions import MilvusException From 56f8e16bc72fa5dbb0985db81bef71419c274ec2 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 22:10:57 -0400 Subject: [PATCH 53/91] working --- copilot/requirements.txt | 1 + eventual-consistency-service/app/graphrag/graph_rag.py | 2 +- eventual-consistency-service/app/main.py | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index 5aed6147..4a5ac3d1 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -1,3 +1,4 @@ +aiochannel==1.2.1 aiohappyeyeballs==2.3.5 aiohttp==3.10.3 aiosignal==1.3.1 diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py index 86f172b8..ecca36b2 100644 --- a/eventual-consistency-service/app/graphrag/graph_rag.py +++ b/eventual-consistency-service/app/graphrag/graph_rag.py @@ -437,5 +437,5 @@ async def run(graphname: str, conn: TigerGraphConnection): end = time.perf_counter() logger.info(f"DONE. graphrag system initializer dT: {init_end-init_start}") logger.info(f"DONE. graphrag entity resolution dT: {entity_end-entity_start}") - logger.info(f"DONE. graphrag initializer dT: {community_end-community_start}") + logger.info(f"DONE. graphrag community initializer dT: {community_end-community_start}") logger.info(f"DONE. 
graphrag.run() total time elaplsed: {end-init_start}") diff --git a/eventual-consistency-service/app/main.py b/eventual-consistency-service/app/main.py index 2c308074..2ccc10e2 100644 --- a/eventual-consistency-service/app/main.py +++ b/eventual-consistency-service/app/main.py @@ -1,6 +1,6 @@ import os -os.environ["ECC"] = True +os.environ["ECC"] = "true" import json import logging from contextlib import asynccontextmanager @@ -193,7 +193,7 @@ def consistency_status( background.add_task(graphrag.run, graphname, conn) import time - ecc_status = f"GraphRAG initialization: {conn.graphname} ({graphname}) {time.ctime()}" + ecc_status = f"GraphRAG initialization on {conn.graphname} {time.ctime()}" case _: response.status_code = status.HTTP_404_NOT_FOUND return f"Method unsupported, must be {SupportAIMethod.SUPPORTAI}, {SupportAIMethod.GRAPHRAG}" From 1c66d9937b6e258656ba38c8bc242974aa576b00 Mon Sep 17 00:00:00 2001 From: Lu Zhou Date: Wed, 14 Aug 2024 17:43:03 -0700 Subject: [PATCH 54/91] GML-1852 supportai init --- common/requirements.txt | 1 + docker-compose.yml | 2 +- .../app/graphrag/graph_rag.py | 2 +- eventual-consistency-service/app/main.py | 25 +- .../app/supportai/__init__.py | 1 + .../app/supportai/supportai_init.py | 210 ++++++++++++ .../app/supportai/util.py | 244 ++++++++++++++ .../app/supportai/workers.py | 309 ++++++++++++++++++ eventual-consistency-service/ecc_util.py | 54 +++ 9 files changed, 835 insertions(+), 13 deletions(-) create mode 100644 eventual-consistency-service/app/supportai/__init__.py create mode 100644 eventual-consistency-service/app/supportai/supportai_init.py create mode 100644 eventual-consistency-service/app/supportai/util.py create mode 100644 eventual-consistency-service/app/supportai/workers.py create mode 100644 eventual-consistency-service/ecc_util.py diff --git a/common/requirements.txt b/common/requirements.txt index af45c357..d287660f 100644 --- a/common/requirements.txt +++ b/common/requirements.txt @@ -1,3 +1,4 @@ +aiochannel==1.2.1 aiohappyeyeballs==2.3.5 aiohttp==3.10.3 aiosignal==1.3.1 diff --git a/docker-compose.yml b/docker-compose.yml index 058c0d77..1d4b01ad 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -21,7 +21,7 @@ services: eventual-consistency-service: image: tigergraphml/ecc:latest - # container_name: eventual-consistency-service + container_name: eventual-consistency-service build: context: . dockerfile: eventual-consistency-service/Dockerfile diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py index ecca36b2..1a979b4f 100644 --- a/eventual-consistency-service/app/graphrag/graph_rag.py +++ b/eventual-consistency-service/app/graphrag/graph_rag.py @@ -53,7 +53,7 @@ async def stream_docs( # continue to the next doc. # This doc will not be marked as processed, so the ecc will process it eventually. 
continue - logger.info("steam_docs writes to docs") + logger.info("stream_docs writes to docs") await docs_chan.put(res.json()["results"][0]["DocContent"][0]) except Exception as e: exc = traceback.format_exc() diff --git a/eventual-consistency-service/app/main.py b/eventual-consistency-service/app/main.py index 2ccc10e2..c018dd43 100644 --- a/eventual-consistency-service/app/main.py +++ b/eventual-consistency-service/app/main.py @@ -2,6 +2,7 @@ os.environ["ECC"] = "true" import json +import time import logging from contextlib import asynccontextmanager from threading import Thread @@ -9,6 +10,7 @@ import ecc_util import graphrag +import supportai from eventual_consistency_checker import EventualConsistencyChecker from fastapi import BackgroundTasks, Depends, FastAPI, Response, status from fastapi.security.http import HTTPBase @@ -179,19 +181,20 @@ def consistency_status( ) match ecc_method: case SupportAIMethod.SUPPORTAI: - if graphname in consistency_checkers: - ecc = consistency_checkers[graphname] - ecc_status = json.dumps(ecc.get_status()) - else: - start_ecc_in_thread(graphname, conn) - ecc_status = ( - f"Eventual consistency checker started for graph {graphname}" - ) - - LogWriter.info(f"Returning consistency status for {graphname}: {status}") + # if graphname in consistency_checkers: + # ecc = consistency_checkers[graphname] + # ecc_status = json.dumps(ecc.get_status()) + # else: + # start_ecc_in_thread(graphname, conn) + background.add_task(supportai.run, graphname, conn) + # ecc_status = ( + # f"Eventual consistency checker started for graph {graphname} {time.ctime()}" + # ) + ecc_status = f"SupportAI initialization on {graphname} {time.ctime()}" + + # LogWriter.info(f"Returning consistency status for {graphname}: {status}") case SupportAIMethod.GRAPHRAG: background.add_task(graphrag.run, graphname, conn) - import time ecc_status = f"GraphRAG initialization on {conn.graphname} {time.ctime()}" case _: diff --git a/eventual-consistency-service/app/supportai/__init__.py b/eventual-consistency-service/app/supportai/__init__.py new file mode 100644 index 00000000..4483d1ee --- /dev/null +++ b/eventual-consistency-service/app/supportai/__init__.py @@ -0,0 +1 @@ +from .supportai_init import * \ No newline at end of file diff --git a/eventual-consistency-service/app/supportai/supportai_init.py b/eventual-consistency-service/app/supportai/supportai_init.py new file mode 100644 index 00000000..0f23985e --- /dev/null +++ b/eventual-consistency-service/app/supportai/supportai_init.py @@ -0,0 +1,210 @@ +import asyncio +import logging +import time +import traceback +import httpx + +from aiochannel import Channel +from pyTigerGraph import TigerGraphConnection + +from common.config import embedding_service +from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore +from common.extractors.BaseExtractor import BaseExtractor +from supportai import workers +from supportai.util import ( + init, + make_headers, + http_timeout, + stream_ids +) + +logger = logging.getLogger(__name__) + +consistency_checkers = {} + + +async def stream_docs( + conn: TigerGraphConnection, + docs_chan: Channel, + ttl_batches: int = 10 +): + """ + Streams the document contents into the docs_chan + """ + logger.info("streaming docs") + headers = make_headers(conn) + async with httpx.AsyncClient(timeout=http_timeout) as client: + for i in range(ttl_batches): + doc_ids = await stream_ids(conn, "Document", i, ttl_batches) + if doc_ids["error"]: + continue + + for d in doc_ids["ids"]: + try: + res = await 
client.get( + f"{conn.restppUrl}/query/{conn.graphname}/StreamDocContent/", + params={"doc": d}, + headers=headers, + ) + if res.status_code != 200: + continue + logger.info("stream_docs writes to docs") + await docs_chan.put(res.json()["results"][0]["DocContent"][0]) + except Exception as e: + exc = traceback.format_exc() + logger.error(f"Error retrieveing doc: {d} --> {e}\n{exc}") + continue + logger.info("stream_docs done") + logger.info("closing docs chan") + docs_chan.close() + + +async def chunk_docs( + conn: TigerGraphConnection, + docs_chan: Channel, + embed_chan: Channel, + upsert_chan: Channel, + extract_chan: Channel +): + """ + Creates and starts one worker for each document + in the docs channel. + """ + logger.info("Reading form docs channel") + doc_task = [] + async with asyncio.TaskGroup() as sp: + async for content in docs_chan: + v_id = content["v_id"] + txt = content["attributes"]["text"] + + logger.info("chunk writes to extract") + await embed_chan.put((v_id, txt, "Document")) + + task = sp.create_task( + workers.chunk_doc(conn, content, upsert_chan, embed_chan, extract_chan) + ) + doc_task.append(task) + + logger.info("chunk_docs done") + logger.info("closing extract_chan") + extract_chan.close() + + +async def upsert( + upsert_chan: Channel +): + """ + Creates and starts one worker for each upsert job + chan expects: + (func, args) <- q.get() + """ + + logger.info("Reading from upsert channel") + # consume task queue + async with asyncio.TaskGroup() as sp: + async for func, args in upsert_chan: + logger.info(f"{func.__name__}, {args[1]}") + # execute the task + sp.create_task(func(*args)) + + logger.info(f"upsert done") + + +async def embed( + embed_chan: Channel, + index_stores: dict[str, MilvusEmbeddingStore], + graphname: str +): + """ + Creates and starts one worker for each embed job + chan expects: + (v_id, content, index_name) <- q.get() + """ + logger.info("Reading from embed channel") + async with asyncio.TaskGroup() as grp: + # consume task queue + async for v_id, content, index_name in embed_chan: + embedding_store = index_stores[f"{graphname}_{index_name}"] + logger.info(f"Embed to {graphname}_{index_name}: {v_id}") + grp.create_task( + workers.embed( + embedding_service, + embedding_store, + v_id, + content, + ) + ) + + logger.info(f"embed done") + + +async def extract( + extract_chan: Channel, + upsert_chan: Channel, + embed_chan: Channel, + extractor: BaseExtractor, + conn: TigerGraphConnection +): + """ + Creates and starts one worker for each extract job + chan expects: + (chunk , chunk_id) <- q.get() + """ + logger.info("Reading from extract channel") + # consume task queue + async with asyncio.TaskGroup() as grp: + async for item in extract_chan: + grp.create_task( + workers.extract(upsert_chan, embed_chan, extractor, conn, *item) + ) + + logger.info(f"extract done") + + logger.info("closing upsert and embed chan") + upsert_chan.close() + embed_chan.close() + + +async def run( + graphname: str, + conn: TigerGraphConnection +): + """ + Set up SupportAI: + - Install necessary queries. 
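+    - Initialize the vector store collections (one Milvus collection per index).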
+ - Process the documents into: + - chuncks + - embeddings + - entities/relationshio (and their embeddings) + - upsert everything to the graph + """ + + extractor, index_stores = await init(conn) + init_start = time.perf_counter() + + doc_process_switch = True + + if doc_process_switch: + logger.info("Doc Processing Start") + docs_chan = Channel(1) + embed_chan = Channel(100) + upsert_chan = Channel(100) + extract_chan = Channel(100) + async with asyncio.TaskGroup() as sp: + # Get docs + sp.create_task(stream_docs(conn, docs_chan, 10)) + # Process docs + sp.create_task( + chunk_docs(conn, docs_chan, embed_chan, upsert_chan, extract_chan) + ) + # Upsert chunks + sp.create_task(upsert(upsert_chan)) + # Embed + sp.create_task(embed(embed_chan, index_stores, graphname)) + # Extract entities + sp.create_task( + extract(extract_chan, upsert_chan, embed_chan, extractor, conn) + ) + init_end = time.perf_counter() + logger.info("Doc Processing End") + logger.info(f"DONE. supportai system initializer dT: {init_end-init_start}") \ No newline at end of file diff --git a/eventual-consistency-service/app/supportai/util.py b/eventual-consistency-service/app/supportai/util.py new file mode 100644 index 00000000..120e2059 --- /dev/null +++ b/eventual-consistency-service/app/supportai/util.py @@ -0,0 +1,244 @@ +import asyncio +import base64 +import json +import logging +import re +import traceback +from glob import glob + +import httpx +from supportai import workers +from pyTigerGraph import TigerGraphConnection + +from common.config import ( + doc_processing_config, + embedding_service, + get_llm_service, + llm_config, + milvus_config, +) +from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore +from common.extractors import GraphExtractor, LLMEntityRelationshipExtractor +from common.extractors.BaseExtractor import BaseExtractor +from common.logs.logwriter import LogWriter + +logger = logging.getLogger(__name__) +http_timeout = httpx.Timeout(15.0) + + +async def install_queries( + requried_queries: list[str], + conn: TigerGraphConnection, +): + # queries that are currently installed + installed_queries = [q.split("/")[-1] for q in conn.getEndpoints(dynamic=True)] + + # doesn't need to be parallel since tg only does it one at a time + for q in requried_queries: + # only install n queries at a time (n=n_workers) + q_name = q.split("/")[-1] + # if the query is not installed, install it + if q_name not in installed_queries: + res = await workers.install_query(conn, q) + # stop system if a required query doesn't install + if res["error"]: + raise Exception(res["message"]) + + +async def init_embedding_index(s: MilvusEmbeddingStore, vertex_field: str): + content = "init" + vec = embedding_service.embed_query(content) + await s.aadd_embeddings([(content, vec)], [{vertex_field: content}]) + s.remove_embeddings(expr=f"{vertex_field} in ['{content}']") + + +async def init( + conn: TigerGraphConnection, +) -> tuple[BaseExtractor, dict[str, MilvusEmbeddingStore]]: + # install requried queries + requried_queries = [ + "common/gsql/supportai/Scan_For_Updates", + "common/gsql/supportai/Update_Vertices_Processing_Status", + "common/gsql/supportai/ECC_Status", + "common/gsql/supportai/Check_Nonexistent_Vertices", + "common/gsql/graphRAG/StreamIds", + "common/gsql/graphRAG/StreamDocContent", + # "common/gsql/graphRAG/SetEpochProcessing", + ] + await install_queries(requried_queries, conn) + + # extractor + if doc_processing_config.get("extractor") == "graphrag": + extractor = GraphExtractor() + elif 
doc_processing_config.get("extractor") == "llm": + extractor = LLMEntityRelationshipExtractor(get_llm_service(llm_config)) + else: + raise ValueError("Invalid extractor type") + vertex_field = milvus_config.get("vertex_field", "vertex_id") + index_names = milvus_config.get( + "indexes", + [ + "Document", + "DocumentChunk", + "Entity", + "Relationship" + ], + ) + index_stores = {} + async with asyncio.TaskGroup() as tg: + for index_name in index_names: + name = conn.graphname + "_" + index_name + s = MilvusEmbeddingStore( + embedding_service, + host=milvus_config["host"], + port=milvus_config["port"], + support_ai_instance=True, + collection_name=name, + username=milvus_config.get("username", ""), + password=milvus_config.get("password", ""), + vector_field=milvus_config.get("vector_field", "document_vector"), + text_field=milvus_config.get("text_field", "document_content"), + vertex_field=vertex_field, + drop_old=False, + ) + + LogWriter.info(f"Initializing {name}") + # init collection if it doesn't exist + if not s.check_collection_exists(): + tg.create_task(init_embedding_index(s, vertex_field)) + + index_stores[name] = s + + return extractor, index_stores + + +def make_headers(conn: TigerGraphConnection): + if conn.apiToken is None or conn.apiToken == "": + tkn = base64.b64encode(f"{conn.username}:{conn.password}".encode()).decode() + headers = {"Authorization": f"Basic {tkn}"} + else: + headers = {"Authorization": f"Bearer {conn.apiToken}"} + + return headers + + +async def stream_ids( + conn: TigerGraphConnection, v_type: str, current_batch: int, ttl_batches: int +) -> dict[str, str | list[str]]: + headers = make_headers(conn) + + try: + async with httpx.AsyncClient(timeout=http_timeout) as client: + res = await client.post( + f"{conn.restppUrl}/query/{conn.graphname}/StreamIds", + params={ + "current_batch": current_batch, + "ttl_batches": ttl_batches, + "v_type": v_type, + }, + headers=headers, + ) + ids = res.json()["results"][0]["@@ids"] + return {"error": False, "ids": ids} + + except Exception as e: + exc = traceback.format_exc() + LogWriter.error(f"/{conn.graphname}/query/StreamIds\nException Trace:\n{exc}") + + return {"error": True, "message": str(e)} + + +def map_attrs(attributes: dict): + # map attrs + attrs = {} + for k, v in attributes.items(): + if isinstance(v, tuple): + attrs[k] = {"value": v[0], "op": v[1]} + elif isinstance(v, dict): + attrs[k] = { + "value": {"keylist": list(v.keys()), "valuelist": list(v.values())} + } + else: + attrs[k] = {"value": v} + return attrs + + +def process_id(v_id: str): + v_id = v_id.replace(" ", "_").replace("/", "") + + has_func = re.compile(r"(.*)\(").findall(v_id) + if len(has_func) > 0: + v_id = has_func[0] + if v_id == "''" or v_id == '""': + return "" + + return v_id + + +async def upsert_vertex( + conn: TigerGraphConnection, + vertex_type: str, + vertex_id: str, + attributes: dict, +): + logger.info(f"Upsert vertex: {vertex_type} {vertex_id}") + vertex_id = vertex_id.replace(" ", "_") + attrs = map_attrs(attributes) + data = json.dumps({"vertices": {vertex_type: {vertex_id: attrs}}}) + headers = make_headers(conn) + async with httpx.AsyncClient(timeout=http_timeout) as client: + res = await client.post( + f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers + ) + + res.raise_for_status() + + +async def check_vertex_exists(conn, v_id: str): + headers = make_headers(conn) + async with httpx.AsyncClient(timeout=http_timeout) as client: + res = await client.get( + 
f"{conn.restppUrl}/graph/{conn.graphname}/vertices/Entity/{v_id}", + headers=headers, + ) + + res.raise_for_status() + return res.json() + + +async def upsert_edge( + conn: TigerGraphConnection, + src_v_type: str, + src_v_id: str, + edge_type: str, + tgt_v_type: str, + tgt_v_id: str, + attributes: dict = None, +): + if attributes is None: + attrs = {} + else: + attrs = map_attrs(attributes) + src_v_id = src_v_id.replace(" ", "_") + tgt_v_id = tgt_v_id.replace(" ", "_") + data = json.dumps( + { + "edges": { + src_v_type: { + src_v_id: { + edge_type: { + tgt_v_type: { + tgt_v_id: attrs, + } + } + }, + } + } + } + ) + headers = make_headers(conn) + async with httpx.AsyncClient(timeout=http_timeout) as client: + res = await client.post( + f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers + ) + res.raise_for_status() diff --git a/eventual-consistency-service/app/supportai/workers.py b/eventual-consistency-service/app/supportai/workers.py new file mode 100644 index 00000000..39e7e9f4 --- /dev/null +++ b/eventual-consistency-service/app/supportai/workers.py @@ -0,0 +1,309 @@ +import base64 +import time +import logging +import httpx +from urllib.parse import quote_plus + +import ecc_util + +from aiochannel import Channel +from supportai import util +from pyTigerGraph import TigerGraphConnection +from common.config import milvus_config +from langchain_community.graphs.graph_document import GraphDocument, Node +from common.embeddings.embedding_services import EmbeddingModel +from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore +from common.extractors.BaseExtractor import BaseExtractor +from common.logs.logwriter import LogWriter + + +vertex_field = milvus_config.get("vertex_field", "vertex_id") + +logger = logging.getLogger(__name__) + + +async def install_query( + conn: TigerGraphConnection, query_path: str +) -> dict[str, httpx.Response | str | None]: + LogWriter.info(f"Installing query {query_path}") + with open(f"{query_path}.gsql", "r") as f: + query = f.read() + + query_name = query_path.split("/")[-1] + query = f"""\ +USE GRAPH {conn.graphname} +{query} +INSTALL QUERY {query_name}""" + tkn = base64.b64encode(f"{conn.username}:{conn.password}".encode()).decode() + headers = {"Authorization": f"Basic {tkn}"} + + async with httpx.AsyncClient(timeout=None) as client: + res = await client.post( + conn.gsUrl + "/gsqlserver/gsql/file", + data=quote_plus(query.encode("utf-8")), + headers=headers, + ) + + if "error" in res.text.lower(): + LogWriter.error(res.text) + return { + "result": None, + "error": True, + "message": f"Failed to install query {query_name}", + } + + return {"result": res, "error": False} + + +async def chunk_doc( + conn: TigerGraphConnection, + doc: dict[str, str], + upsert_chan: Channel, + embed_chan: Channel, + extract_chan: Channel, +): + """ + Chunks a document. 
+ Places the resulting chunks into the upsert channel (to be upserted to TG) + and the embed channel (to be embedded and written to the vector store) + """ + chunker = ecc_util.get_chunker() + chunks = chunker.chunk(doc["attributes"]["text"]) + v_id = util.process_id(doc["v_id"]) + logger.info(f"Chunking {v_id}") + for i, chunk in enumerate(chunks): + chunk_id = f"{v_id}_chunk_{i}" + # send chunks to be upserted (func, args) + logger.info("chunk writes to upsert_chan") + await upsert_chan.put((upsert_chunk, (conn, v_id, chunk_id, chunk))) + + # send chunks to be embedded + logger.info("chunk writes to embed_chan") + await embed_chan.put((v_id, chunk, "DocumentChunk")) + + # send chunks to have entities extracted + logger.info("chunk writes to extract_chan") + await extract_chan.put((chunk, chunk_id)) + + return doc["v_id"] + + +async def upsert_chunk(conn: TigerGraphConnection, doc_id, chunk_id, chunk): + logger.info(f"Upserting chunk {chunk_id}") + date_added = int(time.time()) + await util.upsert_vertex( + conn, + "DocumentChunk", + chunk_id, + attributes={"epoch_added": date_added, "idx": int(chunk_id.split("_")[-1])}, + ) + await util.upsert_vertex( + conn, + "Content", + chunk_id, + attributes={"text": chunk, "epoch_added": date_added}, + ) + await util.upsert_edge( + conn, "DocumentChunk", chunk_id, "HAS_CONTENT", "Content", chunk_id + ) + await util.upsert_edge( + conn, "Document", doc_id, "HAS_CHILD", "DocumentChunk", chunk_id + ) + if int(chunk_id.split("_")[-1]) > 0: + await util.upsert_edge( + conn, + "DocumentChunk", + chunk_id, + "IS_AFTER", + "DocumentChunk", + doc_id + "_chunk_" + str(int(chunk_id.split("_")[-1]) - 1), + ) + + +async def embed( + embed_svc: EmbeddingModel, + embed_store: MilvusEmbeddingStore, + v_id: str, + content: str, +): + """ + Args: + graphname: str + the name of the graph the documents are in + embed_svc: EmbeddingModel + The class used to vectorize text + embed_store: + The class used to store the vectore to a vector DB + v_id: str + the vertex id that will be embedded + content: str + the content of the document/chunk + index_name: str + the vertex index to write to + """ + logger.info(f"Embedding {v_id}") + + vec = await embed_svc.aembed_query(content) + await embed_store.aadd_embeddings([(content, vec)], [{vertex_field: v_id}]) + +async def get_vert_desc(conn, v_id, node: Node): + desc = [node.properties.get("definition", "")] + exists = await util.check_vertex_exists(conn, v_id) + # if vertex exists, get description content and append this description to it + if not exists["error"]: + # deduplicate descriptions + desc.extend(exists["results"][0]["attributes"]["definition"]) + desc = list(set(desc)) + return desc + + +async def extract( + upsert_chan: Channel, + embed_chan: Channel, + extractor: BaseExtractor, + conn: TigerGraphConnection, + chunk: str, + chunk_id: str, +): + logger.info(f"Extracting chunk: {chunk_id}") + extracted: list[GraphDocument] = await extractor.aextract(chunk) + # upsert nodes and edges to the graph + for doc in extracted: + for node in doc.nodes: + logger.info(f"extract writes entity vert to upsert\nNode: {node.id}") + v_id = util.process_id(str(node.id)) + if len(v_id) == 0: + continue + desc = await get_vert_desc(conn, v_id, node) + + # embed the entity + # embed with the v_id if the description is blank + if len(desc[0]): + await embed_chan.put((v_id, v_id, "Entity")) + else: + # (v_id, content, index_name) + await embed_chan.put((v_id, desc[0], "Entity")) + + await upsert_chan.put( + ( + util.upsert_vertex, # func to 
call + ( + conn, + "Entity", # v_type + v_id, # v_id + { # attrs + "definition": desc, + "epoch_added": int(time.time()), + }, + ), + ) + ) + + # link the entity to the chunk it came from + logger.info("extract writes contains edge to upsert") + await upsert_chan.put( + ( + util.upsert_edge, + ( + conn, + "DocumentChunk", # src_type + chunk_id, # src_id + "CONTAINS_ENTITY", # edge_type + "Entity", # tgt_type + v_id, # tgt_id + None, # attributes + ), + ) + ) + + for edge in doc.relationships: + # logger.info( + # f"extract writes relates edge to upsert\n{edge.source.id} -({edge.type})-> {edge.target.id}" + # ) + # upsert verts first to make sure their ID becomes an attr + v_id = util.process_id(edge.type) # edge type + if len(v_id) == 0: + continue + desc = await get_vert_desc(conn, v_id, edge.type) + await upsert_chan.put( + ( + util.upsert_vertex, # func to call + ( + conn, + "Relationship", # v_type + v_id, + { # attrs + "definition": desc, + "epoch_added": int(time.time()), + }, + ), + ) + ) + v_id = util.process_id(edge.source.id) # source id + if len(v_id) == 0: + continue + desc = await get_vert_desc(conn, v_id, edge.source) + await upsert_chan.put( + ( + util.upsert_vertex, # func to call + ( + conn, + "Entity", # v_type + v_id, + { # attrs + "definition": desc, + "epoch_added": int(time.time()), + }, + ), + ) + ) + v_id = util.process_id(edge.target.id) # target id + if len(v_id) == 0: + continue + desc = await get_vert_desc(conn, v_id, edge.target) + await upsert_chan.put( + ( + util.upsert_vertex, # func to call + ( + conn, + "Entity", # v_type + v_id, # src_id + { # attrs + "description": desc, + "epoch_added": int(time.time()), + }, + ), + ) + ) + + # upsert the edge between the two entities + await upsert_chan.put( + ( + util.upsert_edge, + ( + conn, + "Entity", # src_type + util.process_id(edge.source.id), # src_id + "IS_HEAD_OF", # edgeType + "Relationship", # tgt_type + util.process_id(edge.type), # tgt_id + # {"relation_type": edge.type}, # attributes + ), + ) + ) + await upsert_chan.put( + ( + util.upsert_edge, + ( + conn, + "Relationship", # src_type + util.process_id(edge.type), # src_id + "HAS_TAIL", # edgeType + "Entity", # tgt_type + util.process_id(edge.target.id), # tgt_id + # {"relation_type": edge.type}, # attributes + ), + ) + ) + # embed "Relationship" + # (v_id, content, index_name) diff --git a/eventual-consistency-service/ecc_util.py b/eventual-consistency-service/ecc_util.py new file mode 100644 index 00000000..01685b43 --- /dev/null +++ b/eventual-consistency-service/ecc_util.py @@ -0,0 +1,54 @@ +from common.chunkers import character_chunker, regex_chunker, semantic_chunker +from common.config import doc_processing_config, embedding_service, llm_config +from common.llm_services import ( + AWS_SageMaker_Endpoint, + AWSBedrock, + AzureOpenAI, + GoogleVertexAI, + Groq, + HuggingFaceEndpoint, + Ollama, + OpenAI, +) + +def get_chunker(): + if doc_processing_config.get("chunker") == "semantic": + chunker = semantic_chunker.SemanticChunker( + embedding_service, + doc_processing_config["chunker_config"].get("method", "percentile"), + doc_processing_config["chunker_config"].get("threshold", 0.95), + ) + elif doc_processing_config.get("chunker") == "regex": + chunker = regex_chunker.RegexChunker( + pattern=doc_processing_config["chunker_config"].get("pattern", "\\r?\\n") + ) + elif doc_processing_config.get("chunker") == "character": + chunker = character_chunker.CharacterChunker( + chunk_size=doc_processing_config["chunker_config"].get("chunk_size", 1024), + 
overlap_size=doc_processing_config["chunker_config"].get("overlap_size", 0), + ) + else: + raise ValueError("Invalid chunker type") + + return chunker + + +def get_llm_service(): + if llm_config["completion_service"]["llm_service"].lower() == "openai": + llm_provider = OpenAI(llm_config["completion_service"]) + elif llm_config["completion_service"]["llm_service"].lower() == "azure": + llm_provider = AzureOpenAI(llm_config["completion_service"]) + elif llm_config["completion_service"]["llm_service"].lower() == "sagemaker": + llm_provider = AWS_SageMaker_Endpoint(llm_config["completion_service"]) + elif llm_config["completion_service"]["llm_service"].lower() == "vertexai": + llm_provider = GoogleVertexAI(llm_config["completion_service"]) + elif llm_config["completion_service"]["llm_service"].lower() == "bedrock": + llm_provider = AWSBedrock(llm_config["completion_service"]) + elif llm_config["completion_service"]["llm_service"].lower() == "groq": + llm_provider = Groq(llm_config["completion_service"]) + elif llm_config["completion_service"]["llm_service"].lower() == "ollama": + llm_provider = Ollama(llm_config["completion_service"]) + elif llm_config["completion_service"]["llm_service"].lower() == "huggingface": + llm_provider = HuggingFaceEndpoint(llm_config["completion_service"]) + + return llm_provider From 90ebd8c4f36d6e1a7d566d7cadc430fbd04192b7 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Thu, 15 Aug 2024 12:20:39 -0400 Subject: [PATCH 55/91] init --- common/logs/log.py | 2 +- .../app/graphrag/graph_rag.py | 90 ++++++++++--------- .../app/graphrag/util.py | 71 ++++++++------- .../app/graphrag/workers.py | 11 +-- 4 files changed, 97 insertions(+), 77 deletions(-) diff --git a/common/logs/log.py b/common/logs/log.py index b4f11b77..ecff8a43 100644 --- a/common/logs/log.py +++ b/common/logs/log.py @@ -64,7 +64,7 @@ def logToRoot(message, *args, **kwargs): addLoggingLevel("DEBUG_PII", logging.DEBUG - 5) log_config = get_log_config() -LOGLEVEL = os.environ.get("LOGLEVEL", "INFO").upper() +LOGLEVEL = os.environ.get("LOGLEVEL", logging.INFO).upper() log_directory = log_config.get("log_file_path", "/tmp/logs") os.makedirs(log_directory, exist_ok=True) diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py index ecca36b2..29f03dce 100644 --- a/eventual-consistency-service/app/graphrag/graph_rag.py +++ b/eventual-consistency-service/app/graphrag/graph_rag.py @@ -12,6 +12,7 @@ init, make_headers, stream_ids, + tg_sem, ) from pyTigerGraph import TigerGraphConnection @@ -44,11 +45,12 @@ async def stream_docs( for d in doc_ids["ids"]: try: - res = await client.get( - f"{conn.restppUrl}/query/{conn.graphname}/StreamDocContent/", - params={"doc": d}, - headers=headers, - ) + async with tg_sem: + res = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/StreamDocContent/", + params={"doc": d}, + headers=headers, + ) if res.status_code != 200: # continue to the next doc. # This doc will not be marked as processed, so the ecc will process it eventually. 
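The hunk above, and the matching ones in util.py and workers.py, wrap each REST++ call in "async with tg_sem", a module-level asyncio.Semaphore sized 100 here, so the async pipeline cannot open an unbounded number of simultaneous connections to TigerGraph. A minimal, self-contained sketch of the same throttling pattern (illustrative names only, not code from this patch) looks like this:

    import asyncio
    import httpx

    # cap concurrent requests to the database (100 mirrors the semaphore size used above)
    tg_sem = asyncio.Semaphore(100)

    async def fetch_one(client: httpx.AsyncClient, url: str) -> dict:
        async with tg_sem:          # wait here if 100 requests are already in flight
            res = await client.get(url)
        res.raise_for_status()      # raised after the semaphore slot is released
        return res.json()

    async def fetch_many(urls: list[str]) -> list[dict]:
        async with httpx.AsyncClient(timeout=httpx.Timeout(15.0)) as client:
            # launch everything at once; the semaphore, not the task count, bounds concurrency
            return await asyncio.gather(*(fetch_one(client, u) for u in urls))

Because each helper in the patch opens its own short-lived AsyncClient, the shared semaphore is what actually bounds the pressure on the REST++ endpoint.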
@@ -85,7 +87,7 @@ async def chunk_docs( txt = content["attributes"]["text"] # send the document to be embedded logger.info("chunk writes to extract") - await embed_chan.put((v_id, txt, "Document")) + # await embed_chan.put((v_id, txt, "Document")) task = grp.create_task( workers.chunk_doc(conn, content, upsert_chan, embed_chan, extract_chan) @@ -221,10 +223,11 @@ async def resolve_entities( # Copy RELATIONSHIP edges to RESOLVED_RELATIONSHIP headers = make_headers(conn) async with httpx.AsyncClient(timeout=http_timeout) as client: - res = await client.get( - f"{conn.restppUrl}/query/{conn.graphname}/ResolveRelationships/", - headers=headers, - ) + async with tg_sem: + res = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/ResolveRelationships/", + headers=headers, + ) res.raise_for_status() @@ -236,19 +239,21 @@ async def communities(conn: TigerGraphConnection, comm_process_chan: Channel): logger.info("Initializing Communities (first louvain pass)") headers = make_headers(conn) async with httpx.AsyncClient(timeout=None) as client: - res = await client.get( - f"{conn.restppUrl}/query/{conn.graphname}/graphrag_louvain_init", - params={"n_batches": 1}, - headers=headers, - ) + async with tg_sem: + res = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/graphrag_louvain_init", + params={"n_batches": 1}, + headers=headers, + ) res.raise_for_status() # get the modularity async with httpx.AsyncClient(timeout=None) as client: - res = await client.get( - f"{conn.restppUrl}/query/{conn.graphname}/modularity", - params={"iteration": 1, "batch_num": 1}, - headers=headers, - ) + async with tg_sem: + res = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/modularity", + params={"iteration": 1, "batch_num": 1}, + headers=headers, + ) res.raise_for_status() mod = res.json()["results"][0]["mod"] logger.info(f"****mod pass 1: {mod}") @@ -263,21 +268,23 @@ async def communities(conn: TigerGraphConnection, comm_process_chan: Channel): logger.info(f"Running louvain on Communities (iteration: {i})") # louvain pass async with httpx.AsyncClient(timeout=None) as client: - res = await client.get( - f"{conn.restppUrl}/query/{conn.graphname}/graphrag_louvain_communities", - params={"n_batches": 1, "iteration": i}, - headers=headers, - ) + async with tg_sem: + res = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/graphrag_louvain_communities", + params={"n_batches": 1, "iteration": i}, + headers=headers, + ) res.raise_for_status() # get the modularity async with httpx.AsyncClient(timeout=None) as client: - res = await client.get( - f"{conn.restppUrl}/query/{conn.graphname}/modularity", - params={"iteration": i + 1, "batch_num": 1}, - headers=headers, - ) + async with tg_sem: + res = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/modularity", + params={"iteration": i + 1, "batch_num": 1}, + headers=headers, + ) res.raise_for_status() mod = res.json()["results"][0]["mod"] logger.info(f"*** mod pass {i+1}: {mod} (diff= {abs(prev_mod - mod)})") @@ -307,11 +314,12 @@ async def stream_communities( # async for i in community_chan: # get the community from that layer async with httpx.AsyncClient(timeout=None) as client: - resp = await client.get( - f"{conn.restppUrl}/query/{conn.graphname}/stream_community", - params={"iter": i}, - headers=headers, - ) + async with tg_sem: + resp = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/stream_community", + params={"iter": i}, + headers=headers, + ) resp.raise_for_status() comms = 
resp.json()["results"][0]["Comms"] @@ -345,7 +353,7 @@ async def summarize_communities( embed_chan.close() -async def run(graphname: str, conn: TigerGraphConnection): +async def run(graphname: str, conn: TigerGraphConnection, upsert_limit=100): """ Set up GraphRAG: - Install necessary queries. @@ -362,8 +370,8 @@ async def run(graphname: str, conn: TigerGraphConnection): init_start = time.perf_counter() doc_process_switch = True - entity_resolution_switch = True - community_detection_switch = True + entity_resolution_switch =True + community_detection_switch =True if doc_process_switch: logger.info("Doc Processing Start") docs_chan = Channel(1) @@ -378,7 +386,7 @@ async def run(graphname: str, conn: TigerGraphConnection): chunk_docs(conn, docs_chan, embed_chan, upsert_chan, extract_chan) ) # upsert chunks - grp.create_task(upsert(upsert_chan)) + grp.create_task(upsert( upsert_chan)) # embed grp.create_task(embed(embed_chan, index_stores, graphname)) # extract entities @@ -437,5 +445,7 @@ async def run(graphname: str, conn: TigerGraphConnection): end = time.perf_counter() logger.info(f"DONE. graphrag system initializer dT: {init_end-init_start}") logger.info(f"DONE. graphrag entity resolution dT: {entity_end-entity_start}") - logger.info(f"DONE. graphrag community initializer dT: {community_end-community_start}") + logger.info( + f"DONE. graphrag community initializer dT: {community_end-community_start}" + ) logger.info(f"DONE. graphrag.run() total time elaplsed: {end-init_start}") diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index 186ab11a..a934f272 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -5,6 +5,7 @@ import re import traceback from glob import glob +from typing import Callable import httpx from graphrag import workers @@ -23,7 +24,9 @@ from common.logs.logwriter import LogWriter logger = logging.getLogger(__name__) -http_timeout = httpx.Timeout(15.0) +http_timeout = httpx.Timeout(15.0) + +tg_sem = asyncio.Semaphore(100) async def install_queries( @@ -111,7 +114,7 @@ async def init( vector_field=milvus_config.get("vector_field", "document_vector"), text_field=milvus_config.get("text_field", "document_content"), vertex_field=vertex_field, - drop_old=False, + drop_old=True, ) LogWriter.info(f"Initializing {name}") @@ -141,15 +144,16 @@ async def stream_ids( try: async with httpx.AsyncClient(timeout=http_timeout) as client: - res = await client.post( - f"{conn.restppUrl}/query/{conn.graphname}/StreamIds", - params={ - "current_batch": current_batch, - "ttl_batches": ttl_batches, - "v_type": v_type, - }, - headers=headers, - ) + async with tg_sem: + res = await client.post( + f"{conn.restppUrl}/query/{conn.graphname}/StreamIds", + params={ + "current_batch": current_batch, + "ttl_batches": ttl_batches, + "v_type": v_type, + }, + headers=headers, + ) ids = res.json()["results"][0]["@@ids"] return {"error": False, "ids": ids} @@ -199,9 +203,10 @@ async def upsert_vertex( data = json.dumps({"vertices": {vertex_type: {vertex_id: attrs}}}) headers = make_headers(conn) async with httpx.AsyncClient(timeout=http_timeout) as client: - res = await client.post( - f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers - ) + async with tg_sem: + res = await client.post( + f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers + ) res.raise_for_status() @@ -209,10 +214,11 @@ async def upsert_vertex( async def 
check_vertex_exists(conn, v_id: str): headers = make_headers(conn) async with httpx.AsyncClient(timeout=http_timeout) as client: - res = await client.get( - f"{conn.restppUrl}/graph/{conn.graphname}/vertices/Entity/{v_id}", - headers=headers, - ) + async with tg_sem: + res = await client.get( + f"{conn.restppUrl}/graph/{conn.graphname}/vertices/Entity/{v_id}", + headers=headers, + ) res.raise_for_status() return res.json() @@ -250,20 +256,22 @@ async def upsert_edge( ) headers = make_headers(conn) async with httpx.AsyncClient(timeout=http_timeout) as client: - res = await client.post( - f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers - ) + async with tg_sem: + res = await client.post( + f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers + ) res.raise_for_status() async def get_commuinty_children(conn, i: int, c: str): headers = make_headers(conn) async with httpx.AsyncClient(timeout=None) as client: - resp = await client.get( - f"{conn.restppUrl}/query/{conn.graphname}/get_community_children", - params={"comm": c, "iter": i}, - headers=headers, - ) + async with tg_sem: + resp = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/get_community_children", + params={"comm": c, "iter": i}, + headers=headers, + ) resp.raise_for_status() descrs = [] for d in resp.json()["results"][0]["children"]: @@ -281,11 +289,12 @@ async def get_commuinty_children(conn, i: int, c: str): async def check_vertex_has_desc(conn, i: int): headers = make_headers(conn) async with httpx.AsyncClient(timeout=None) as client: - resp = await client.get( - f"{conn.restppUrl}/query/{conn.graphname}/communities_have_desc", - params={"iter": i}, - headers=headers, - ) + async with tg_sem: + resp = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/communities_have_desc", + params={"iter": i}, + headers=headers, + ) resp.raise_for_status() res = resp.json()["results"][0]["all_have_desc"] diff --git a/eventual-consistency-service/app/graphrag/workers.py b/eventual-consistency-service/app/graphrag/workers.py index 755b1085..9d8df3c8 100644 --- a/eventual-consistency-service/app/graphrag/workers.py +++ b/eventual-consistency-service/app/graphrag/workers.py @@ -37,11 +37,12 @@ async def install_query( headers = {"Authorization": f"Basic {tkn}"} async with httpx.AsyncClient(timeout=None) as client: - res = await client.post( - conn.gsUrl + "/gsqlserver/gsql/file", - data=quote_plus(query.encode("utf-8")), - headers=headers, - ) + async with util.tg_sem: + res = await client.post( + conn.gsUrl + "/gsqlserver/gsql/file", + data=quote_plus(query.encode("utf-8")), + headers=headers, + ) if "error" in res.text.lower(): LogWriter.error(res.text) From 722f4c5ff300c96c8868059f874d8602f6c59879 Mon Sep 17 00:00:00 2001 From: Lu Zhou Date: Fri, 16 Aug 2024 15:00:15 -0700 Subject: [PATCH 56/91] update relationship vertex --- common/requirements.txt | 2 +- .../notebooks/SupportAIDocIngestion.ipynb | 323 ++++++++++++++++++ copilot/requirements.txt | 2 +- .../app/graphrag/util.py | 67 ++-- .../app/supportai/supportai_init.py | 17 +- .../app/supportai/util.py | 44 +-- .../app/supportai/workers.py | 63 ++-- 7 files changed, 441 insertions(+), 77 deletions(-) create mode 100644 copilot/docs/notebooks/SupportAIDocIngestion.ipynb diff --git a/common/requirements.txt b/common/requirements.txt index d287660f..86bdc50c 100644 --- a/common/requirements.txt +++ b/common/requirements.txt @@ -101,7 +101,7 @@ minio==7.2.7 multidict==6.0.5 mypy-extensions==1.0.0 nest-asyncio==1.6.0 -nltk==3.8.2 
+nltk==3.8.1 numpy==1.26.4 openai==1.40.6 ordered-set==4.1.0 diff --git a/copilot/docs/notebooks/SupportAIDocIngestion.ipynb b/copilot/docs/notebooks/SupportAIDocIngestion.ipynb new file mode 100644 index 00000000..7a380db5 --- /dev/null +++ b/copilot/docs/notebooks/SupportAIDocIngestion.ipynb @@ -0,0 +1,323 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from pyTigerGraph import TigerGraphConnection\n", + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv()\n", + "# We first create a connection to the database\n", + "host = os.environ[\"HOST\"]\n", + "username = os.getenv(\"USERNAME\", \"tigergraph\")\n", + "password = os.getenv(\"PASS\", \"tigergraph\")\n", + "conn = TigerGraphConnection(\n", + " host=host,\n", + " username=username,\n", + " password=password,\n", + ")\n", + "# conn.getToken()\n", + "\n", + "# And then add CoPilot's address to the connection. This address\n", + "# is the host's address where the CoPilot container is running.\n", + "# conn.ai.configureCoPilotHost(\"http://localhost:8000\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "conn.gsql(\"\"\"CREATE GRAPH SupportAIDocIngestion()\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'l11vnumq77c33f0aa2ss5m0th5hqdj14'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.graphname = \"SupportAIDocIngestion\"\n", + "conn.getToken()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "conn.ai.configureCoPilotHost(\"http://localhost:8000\")\n", + "# Create Graph Schema\n", + "# Install GSQL queries\n", + "# conn.ai.initializeSupportAI()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "access = os.environ[\"AWS_ACCESS_KEY_ID\"]\n", + "sec = os.environ[\"AWS_SECRET_ACCESS_KEY\"]\n", + "res = conn.ai.createDocumentIngest(\n", + " data_source=\"s3\",\n", + " data_source_config={\"aws_access_key\": access, \"aws_secret_key\": sec},\n", + " loader_config={\"doc_id_field\": \"url\", \"content_field\": \"content\"},\n", + " file_format=\"json\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "conn.ai.runDocumentIngest(res[\"load_job_id\"], res[\"data_source_id\"], \"s3://tg-documentation/pytg_current/pytg_current.jsonl\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'status': 'submitted'}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.ai.forceConsistencyUpdate(method=\"supportai\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query = \"How do I get a count of vertices in Python?\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### HNSW Index Overlap in Graph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "conn.ai.searchDocuments(query,\n", + " method=\"hnswoverlap\",\n", + " method_parameters = {\"indices\": [\"Document\", \"DocumentChunk\", \"Entity\", \"Relationship\"],\n", + " \"top_k\": 2,\n", 
+ " \"num_hops\": 2,\n", + " \"num_seen_min\": 2})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Document Chunk Vector Search" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "conn.ai.searchDocuments(query,\n", + " method=\"vdb\",\n", + " method_parameters={\"index\": \"DocumentChunk\",\n", + " \"top_k\": 5,\n", + " \"withHyDE\": False})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sibling Document Chunk Vector Search" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "conn.ai.searchDocuments(query,\n", + " method=\"sibling\",\n", + " method_parameters={\"index\": \"DocumentChunk\",\n", + " \"top_k\": 5,\n", + " \"lookahead\": 3,\n", + " \"lookback\": 3,\n", + " \"withHyDE\": False})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Comparing LLM Generated Responses\n", + "\n", + "TigerGraph CoPilot provides a way to generate the response to the user's query using a LLM, based on the search results from the methods above. You can compare the responses generated by the LLM for each of the search methods to see which one is the most relevant to the user's query. In this example, we can see that the HNSW Overlap method generates the most relevant response to the user's query. While none of the responses were wrong, the HNSW Overlap method generated the most relevant response to the user's query, by suggesting to use the `getVertexCount()` function to get the number of vertices in the graph." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "resp = conn.ai.answerQuestion(query,\n", + " method=\"hnswoverlap\",\n", + " method_parameters = {\"indices\": [\"Document\", \"DocumentChunk\", \"Entity\", \"Relationship\"],\n", + " \"top_k\": 2,\n", + " \"num_hops\": 2,\n", + " \"num_seen_min\": 2})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(resp[\"response\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(resp[\"retrieved\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "resp = conn.ai.answerQuestion(query,\n", + " method=\"vdb\",\n", + " method_parameters={\"index\": \"DocumentChunk\",\n", + " \"top_k\": 5,\n", + " \"withHyDE\": False})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(resp[\"response\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(resp[\"retrieved\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "resp = conn.ai.answerQuestion(query,\n", + " method=\"sibling\",\n", + " method_parameters={\"index\": \"DocumentChunk\",\n", + " \"top_k\": 5,\n", + " \"lookahead\": 3,\n", + " \"lookback\": 3,\n", + " \"withHyDE\": False})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(resp[\"response\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(resp[\"retrieved\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, 
+ "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pytg_dev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/copilot/requirements.txt b/copilot/requirements.txt index 4a5ac3d1..e057eb90 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -101,7 +101,7 @@ minio==7.2.7 multidict==6.0.5 mypy-extensions==1.0.0 nest-asyncio==1.6.0 -nltk==3.8.2 +nltk==3.8.1 numpy==1.26.4 openai==1.40.6 ordered-set==4.1.0 diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index 186ab11a..fba184ee 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -25,6 +25,7 @@ logger = logging.getLogger(__name__) http_timeout = httpx.Timeout(15.0) +tg_sem = asyncio.Semaphore(100) async def install_queries( requried_queries: list[str], @@ -141,15 +142,16 @@ async def stream_ids( try: async with httpx.AsyncClient(timeout=http_timeout) as client: - res = await client.post( - f"{conn.restppUrl}/query/{conn.graphname}/StreamIds", - params={ - "current_batch": current_batch, - "ttl_batches": ttl_batches, - "v_type": v_type, - }, - headers=headers, - ) + async with tg_sem: + res = await client.post( + f"{conn.restppUrl}/query/{conn.graphname}/StreamIds", + params={ + "current_batch": current_batch, + "ttl_batches": ttl_batches, + "v_type": v_type, + }, + headers=headers, + ) ids = res.json()["results"][0]["@@ids"] return {"error": False, "ids": ids} @@ -199,20 +201,22 @@ async def upsert_vertex( data = json.dumps({"vertices": {vertex_type: {vertex_id: attrs}}}) headers = make_headers(conn) async with httpx.AsyncClient(timeout=http_timeout) as client: - res = await client.post( - f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers - ) + async with tg_sem: + res = await client.post( + f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers + ) - res.raise_for_status() + res.raise_for_status() async def check_vertex_exists(conn, v_id: str): headers = make_headers(conn) async with httpx.AsyncClient(timeout=http_timeout) as client: - res = await client.get( - f"{conn.restppUrl}/graph/{conn.graphname}/vertices/Entity/{v_id}", - headers=headers, - ) + async with tg_sem: + res = await client.get( + f"{conn.restppUrl}/graph/{conn.graphname}/vertices/Entity/{v_id}", + headers=headers, + ) res.raise_for_status() return res.json() @@ -250,20 +254,22 @@ async def upsert_edge( ) headers = make_headers(conn) async with httpx.AsyncClient(timeout=http_timeout) as client: - res = await client.post( - f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers - ) + async with tg_sem: + res = await client.post( + f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers + ) res.raise_for_status() async def get_commuinty_children(conn, i: int, c: str): headers = make_headers(conn) async with httpx.AsyncClient(timeout=None) as client: - resp = await client.get( - f"{conn.restppUrl}/query/{conn.graphname}/get_community_children", - params={"comm": c, "iter": i}, - headers=headers, - ) + async with tg_sem: + resp = await client.get( + 
f"{conn.restppUrl}/query/{conn.graphname}/get_community_children", + params={"comm": c, "iter": i}, + headers=headers, + ) resp.raise_for_status() descrs = [] for d in resp.json()["results"][0]["children"]: @@ -281,11 +287,12 @@ async def get_commuinty_children(conn, i: int, c: str): async def check_vertex_has_desc(conn, i: int): headers = make_headers(conn) async with httpx.AsyncClient(timeout=None) as client: - resp = await client.get( - f"{conn.restppUrl}/query/{conn.graphname}/communities_have_desc", - params={"iter": i}, - headers=headers, - ) + async with tg_sem: + resp = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/communities_have_desc", + params={"iter": i}, + headers=headers, + ) resp.raise_for_status() res = resp.json()["results"][0]["all_have_desc"] diff --git a/eventual-consistency-service/app/supportai/supportai_init.py b/eventual-consistency-service/app/supportai/supportai_init.py index 0f23985e..9993376f 100644 --- a/eventual-consistency-service/app/supportai/supportai_init.py +++ b/eventual-consistency-service/app/supportai/supportai_init.py @@ -15,7 +15,8 @@ init, make_headers, http_timeout, - stream_ids + stream_ids, + tg_sem ) logger = logging.getLogger(__name__) @@ -41,11 +42,12 @@ async def stream_docs( for d in doc_ids["ids"]: try: - res = await client.get( - f"{conn.restppUrl}/query/{conn.graphname}/StreamDocContent/", - params={"doc": d}, - headers=headers, - ) + async with tg_sem: + res = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/StreamDocContent/", + params={"doc": d}, + headers=headers, + ) if res.status_code != 200: continue logger.info("stream_docs writes to docs") @@ -167,7 +169,8 @@ async def extract( async def run( graphname: str, - conn: TigerGraphConnection + conn: TigerGraphConnection, + upsert_limit=10 ): """ Set up SupportAI: diff --git a/eventual-consistency-service/app/supportai/util.py b/eventual-consistency-service/app/supportai/util.py index 120e2059..b1684b44 100644 --- a/eventual-consistency-service/app/supportai/util.py +++ b/eventual-consistency-service/app/supportai/util.py @@ -5,6 +5,7 @@ import re import traceback from glob import glob +from typing import Callable import httpx from supportai import workers @@ -25,6 +26,7 @@ logger = logging.getLogger(__name__) http_timeout = httpx.Timeout(15.0) +tg_sem = asyncio.Semaphore(10) async def install_queries( requried_queries: list[str], @@ -129,15 +131,16 @@ async def stream_ids( try: async with httpx.AsyncClient(timeout=http_timeout) as client: - res = await client.post( - f"{conn.restppUrl}/query/{conn.graphname}/StreamIds", - params={ - "current_batch": current_batch, - "ttl_batches": ttl_batches, - "v_type": v_type, - }, - headers=headers, - ) + async with tg_sem: + res = await client.post( + f"{conn.restppUrl}/query/{conn.graphname}/StreamIds", + params={ + "current_batch": current_batch, + "ttl_batches": ttl_batches, + "v_type": v_type, + }, + headers=headers, + ) ids = res.json()["results"][0]["@@ids"] return {"error": False, "ids": ids} @@ -187,9 +190,10 @@ async def upsert_vertex( data = json.dumps({"vertices": {vertex_type: {vertex_id: attrs}}}) headers = make_headers(conn) async with httpx.AsyncClient(timeout=http_timeout) as client: - res = await client.post( - f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers - ) + async with tg_sem: + res = await client.post( + f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers + ) res.raise_for_status() @@ -197,10 +201,11 @@ async def upsert_vertex( async def 
check_vertex_exists(conn, v_id: str): headers = make_headers(conn) async with httpx.AsyncClient(timeout=http_timeout) as client: - res = await client.get( - f"{conn.restppUrl}/graph/{conn.graphname}/vertices/Entity/{v_id}", - headers=headers, - ) + async with tg_sem: + res = await client.get( + f"{conn.restppUrl}/graph/{conn.graphname}/vertices/Entity/{v_id}", + headers=headers, + ) res.raise_for_status() return res.json() @@ -238,7 +243,8 @@ async def upsert_edge( ) headers = make_headers(conn) async with httpx.AsyncClient(timeout=http_timeout) as client: - res = await client.post( - f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers - ) + async with tg_sem: + res = await client.post( + f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers + ) res.raise_for_status() diff --git a/eventual-consistency-service/app/supportai/workers.py b/eventual-consistency-service/app/supportai/workers.py index 39e7e9f4..ac72636e 100644 --- a/eventual-consistency-service/app/supportai/workers.py +++ b/eventual-consistency-service/app/supportai/workers.py @@ -38,11 +38,12 @@ async def install_query( headers = {"Authorization": f"Basic {tkn}"} async with httpx.AsyncClient(timeout=None) as client: - res = await client.post( - conn.gsUrl + "/gsqlserver/gsql/file", - data=quote_plus(query.encode("utf-8")), - headers=headers, - ) + async with util.tg_sem: + res = await client.post( + conn.gsUrl + "/gsqlserver/gsql/file", + data=quote_plus(query.encode("utf-8")), + headers=headers, + ) if "error" in res.text.lower(): LogWriter.error(res.text) @@ -192,7 +193,7 @@ async def extract( "Entity", # v_type v_id, # v_id { # attrs - "definition": desc, + "description": desc, "epoch_added": int(time.time()), }, ), @@ -221,10 +222,17 @@ async def extract( # f"extract writes relates edge to upsert\n{edge.source.id} -({edge.type})-> {edge.target.id}" # ) # upsert verts first to make sure their ID becomes an attr - v_id = util.process_id(edge.type) # edge type - if len(v_id) == 0: - continue - desc = await get_vert_desc(conn, v_id, edge.type) + # v_id = util.process_id(edge.type) # edge type + v_id = edge.type + # if len(v_id) == 0: + # continue + # desc = await get_vert_desc(conn, v_id, node) + # embed "Relationship" + # if len(desc[0]): + await embed_chan.put((v_id, v_id, "Relationship")) + # else: + # # (v_id, content, index_name) + # await embed_chan.put((v_id, desc[0], "Relationship")) await upsert_chan.put( ( util.upsert_vertex, # func to call @@ -233,7 +241,7 @@ async def extract( "Relationship", # v_type v_id, { # attrs - "definition": desc, + # "description": desc, "epoch_added": int(time.time()), }, ), @@ -242,7 +250,7 @@ async def extract( v_id = util.process_id(edge.source.id) # source id if len(v_id) == 0: continue - desc = await get_vert_desc(conn, v_id, edge.source) + # desc = await get_vert_desc(conn, v_id, edge.source) await upsert_chan.put( ( util.upsert_vertex, # func to call @@ -251,7 +259,7 @@ async def extract( "Entity", # v_type v_id, { # attrs - "definition": desc, + # "description": desc, "epoch_added": int(time.time()), }, ), @@ -260,7 +268,7 @@ async def extract( v_id = util.process_id(edge.target.id) # target id if len(v_id) == 0: continue - desc = await get_vert_desc(conn, v_id, edge.target) + # desc = await get_vert_desc(conn, v_id, edge.target) await upsert_chan.put( ( util.upsert_vertex, # func to call @@ -269,7 +277,7 @@ async def extract( "Entity", # v_type v_id, # src_id { # attrs - "description": desc, + # "description": desc, "epoch_added": 
int(time.time()), }, ), @@ -286,7 +294,8 @@ async def extract( util.process_id(edge.source.id), # src_id "IS_HEAD_OF", # edgeType "Relationship", # tgt_type - util.process_id(edge.type), # tgt_id + # util.process_id(edge.type), # tgt_id + edge.type, # tgt_id # {"relation_type": edge.type}, # attributes ), ) @@ -297,7 +306,8 @@ async def extract( ( conn, "Relationship", # src_type - util.process_id(edge.type), # src_id + # util.process_id(edge.type), # src_id + edge.type, # src_id "HAS_TAIL", # edgeType "Entity", # tgt_type util.process_id(edge.target.id), # tgt_id @@ -305,5 +315,20 @@ async def extract( ), ) ) - # embed "Relationship" - # (v_id, content, index_name) + + # link the relationship to the chunk it came from + logger.info("extract writes mentions edge to upsert") + await upsert_chan.put( + ( + util.upsert_edge, + ( + conn, + "DocumentChunk", # src_type + chunk_id, # src_id + "MENTIONS_RELATIONSHIP", # edge_type + "Relationship", # tgt_type + edge.type, # tgt_id + # None, # attributes + ), + ) + ) \ No newline at end of file From 45a6b02823c6f9fd37c3ef71b760ed1096e64f65 Mon Sep 17 00:00:00 2001 From: Parker Erickson Date: Mon, 19 Aug 2024 11:15:38 -0500 Subject: [PATCH 57/91] initial commits --- .../GraphRAG_Community_Retriever.gsql | 7 ++ .../retrievers/HNSW_Search_Content.gsql | 2 + common/logs/log.py | 2 +- common/requirements.txt | 3 +- copilot/app/routers/supportai.py | 17 +++- copilot/app/supportai/retrievers/GraphRAG.py | 88 +++++++++++++++++++ copilot/app/supportai/retrievers/__init__.py | 1 + copilot/docs/notebooks/SupportAIDemo.ipynb | 4 +- eventual-consistency-service/.dockerignore | 1 + .../app/graphrag/util.py | 2 +- .../app/graphrag/workers.py | 2 +- 11 files changed, 122 insertions(+), 7 deletions(-) create mode 100644 common/gsql/supportai/retrievers/GraphRAG_Community_Retriever.gsql create mode 100644 copilot/app/supportai/retrievers/GraphRAG.py diff --git a/common/gsql/supportai/retrievers/GraphRAG_Community_Retriever.gsql b/common/gsql/supportai/retrievers/GraphRAG_Community_Retriever.gsql new file mode 100644 index 00000000..2d6ef9b0 --- /dev/null +++ b/common/gsql/supportai/retrievers/GraphRAG_Community_Retriever.gsql @@ -0,0 +1,7 @@ +CREATE DISTRIBUTED QUERY GraphRAG_CommunityRetriever(INT community_level=2) FOR GRAPH pyTigerGraphRAG { + comms = {Community.*}; + + selected_comms = SELECT c FROM comms:c WHERE c.iteration == community_level; + + PRINT selected_comms; +} \ No newline at end of file diff --git a/common/gsql/supportai/retrievers/HNSW_Search_Content.gsql b/common/gsql/supportai/retrievers/HNSW_Search_Content.gsql index 9de116b7..a0f7d009 100644 --- a/common/gsql/supportai/retrievers/HNSW_Search_Content.gsql +++ b/common/gsql/supportai/retrievers/HNSW_Search_Content.gsql @@ -24,6 +24,8 @@ CREATE DISTRIBUTED QUERY HNSW_Search_Content(STRING v_type, STRING milvus_host, POST-ACCUM IF s.type == "Relationship" OR s.type == "Entity" OR s.type == "Concept" THEN @@final_retrieval += (s.id -> s.definition) + ELSE IF s.type == "Community" THEN + @@final_retrieval += (s.id -> s.description) END; PRINT @@final_retrieval; diff --git a/common/logs/log.py b/common/logs/log.py index ecff8a43..0f974d77 100644 --- a/common/logs/log.py +++ b/common/logs/log.py @@ -64,7 +64,7 @@ def logToRoot(message, *args, **kwargs): addLoggingLevel("DEBUG_PII", logging.DEBUG - 5) log_config = get_log_config() -LOGLEVEL = os.environ.get("LOGLEVEL", logging.INFO).upper() +LOGLEVEL = os.environ.get("LOGLEVEL", logging.INFO) log_directory = log_config.get("log_file_path", "/tmp/logs") 
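# Dropping .upper() avoids an AttributeError when LOGLEVEL is unset (the int
# default logging.INFO has no .upper()), but lowercase values such as "debug"
# from the environment are then passed through unchanged. A minimal sketch that
# handles both cases, assuming os and logging are imported as in this module
# and the same LOGLEVEL variable and default are kept:
_raw_level = os.environ.get("LOGLEVEL", logging.INFO)
LOGLEVEL = _raw_level.upper() if isinstance(_raw_level, str) else _raw_level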
os.makedirs(log_directory, exist_ok=True) diff --git a/common/requirements.txt b/common/requirements.txt index af45c357..86bdc50c 100644 --- a/common/requirements.txt +++ b/common/requirements.txt @@ -1,3 +1,4 @@ +aiochannel==1.2.1 aiohappyeyeballs==2.3.5 aiohttp==3.10.3 aiosignal==1.3.1 @@ -100,7 +101,7 @@ minio==7.2.7 multidict==6.0.5 mypy-extensions==1.0.0 nest-asyncio==1.6.0 -nltk==3.8.2 +nltk==3.8.1 numpy==1.26.4 openai==1.40.6 ordered-set==4.1.0 diff --git a/copilot/app/routers/supportai.py b/copilot/app/routers/supportai.py index 0eff3c41..0b67b2b8 100644 --- a/copilot/app/routers/supportai.py +++ b/copilot/app/routers/supportai.py @@ -16,6 +16,7 @@ HNSWOverlapRetriever, HNSWRetriever, HNSWSiblingRetriever, + GraphRAG ) from common.config import ( @@ -175,7 +176,11 @@ def search( embedding_service, embedding_store, get_llm_service(llm_config), conn ) res = retriever.search(query.question, query.method_params["top_k"]) - + elif query.method.lower() == "graphrag": + retriever = GraphRAG( + embedding_service, embedding_store, get_llm_service(llm_config), conn + ) + res = retriever.search(query.question, query.method_params["community_level"]) return res @@ -232,6 +237,16 @@ def answer_question( embedding_service, embedding_store, get_llm_service(llm_config), conn ) res = retriever.retrieve_answer(query.question, query.method_params["top_k"]) + + elif query.method.lower() == "graphrag": + retriever = GraphRAG( + embedding_service, embedding_store, get_llm_service(llm_config), conn + ) + res = retriever.retrieve_answer( + query.question, + query.method_params["community_level"], + query.method_params["top_k_summaries"] + ) else: raise Exception("Method not implemented") diff --git a/copilot/app/supportai/retrievers/GraphRAG.py b/copilot/app/supportai/retrievers/GraphRAG.py new file mode 100644 index 00000000..abd22790 --- /dev/null +++ b/copilot/app/supportai/retrievers/GraphRAG.py @@ -0,0 +1,88 @@ +from supportai.retrievers import BaseRetriever +import asyncio +from concurrent.futures import ThreadPoolExecutor + +from common.metrics.tg_proxy import TigerGraphConnectionProxy + +from langchain_core.output_parsers import PydanticOutputParser +from langchain_core.prompts import PromptTemplate +from langchain_core.pydantic_v1 import BaseModel, Field, validator + +from common.llm_services import LLM_Model + + +class CommunityAnswer(BaseModel): + answer: str = Field(description="The answer to the question, based off of the context provided.") + quality_score: int = Field(description="The quality of the answer, based on how well it answers the question. Rate the answer from 0 (poor) to 100 (excellent).") + +output_parser = PydanticOutputParser(pydantic_object=CommunityAnswer) + +ANSWER_PROMPT = PromptTemplate(template = """ +You are a helpful assistant responsible for generating an answer to the question below using the data provided. +Include a quality score for the answer, based on how well it answers the question. The quality score should be between 0 (poor) and 100 (excellent). 
+ +Question: {question} +Context: {context} + +{format_instructions} +""", +input_variables=["question", "context"], +partial_variables={"format_instructions": output_parser.get_format_instructions()} +) + + +class GraphRAG(BaseRetriever): + def __init__( + self, + embedding_service, + embedding_store, + llm_service: LLM_Model, + connection: TigerGraphConnectionProxy, + ): + super().__init__(embedding_service, embedding_store, llm_service, connection) + self._check_query_install("GraphRAG_CommunityRetriever") + + def search(self, question, community_level: int): + res = self.conn.runInstalledQuery("GraphRAG_CommunityRetriever", {"community_level": community_level}, usePost=True) + return res + + async def _generate_candidate(self, question, context): + model = self.llm_service.model + + + + chain = ANSWER_PROMPT | model | output_parser + + answer = await chain.ainvoke( + { + "question": question, + "context": context, + } + ) + return answer + + def gather_candidates(self, question, context): + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + tasks = [self._generate_candidate(question, c) for c in context] + res = loop.run_until_complete(asyncio.gather(*tasks)) + loop.close() + return res + + def retrieve_answer(self, + question: str, + community_level: int, + top_k_summaries: int = 1): + retrieved = self.search(question, community_level) + context = [x["attributes"] for x in retrieved[0]["selected_comms"]] + + with ThreadPoolExecutor() as executor: + res = executor.submit(self.gather_candidates, question, context).result() + + # sort list by quality score + res.sort(key=lambda x: x.quality_score, reverse=True) + + new_context = [{"candidate_answer": x.answer, + "score": x.quality_score} for x in res[:top_k_summaries]] + + return self._generate_response(question, new_context) diff --git a/copilot/app/supportai/retrievers/__init__.py b/copilot/app/supportai/retrievers/__init__.py index 9bdcefa4..aa6cd324 100644 --- a/copilot/app/supportai/retrievers/__init__.py +++ b/copilot/app/supportai/retrievers/__init__.py @@ -3,3 +3,4 @@ from .HNSWOverlapRetriever import HNSWOverlapRetriever from .HNSWSiblingRetriever import HNSWSiblingRetriever from .EntityRelationshipRetriever import EntityRelationshipRetriever +from .GraphRAG import GraphRAG diff --git a/copilot/docs/notebooks/SupportAIDemo.ipynb b/copilot/docs/notebooks/SupportAIDemo.ipynb index 29519463..c5a11c34 100644 --- a/copilot/docs/notebooks/SupportAIDemo.ipynb +++ b/copilot/docs/notebooks/SupportAIDemo.ipynb @@ -159,7 +159,7 @@ } ], "source": [ - "conn.ai.forceConsistencyUpdate()" + "conn.ai.forceConsistencyUpdate(method=\"graphrag\")" ] }, { @@ -546,7 +546,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.9.16" } }, "nbformat": 4, diff --git a/eventual-consistency-service/.dockerignore b/eventual-consistency-service/.dockerignore index 5b04df42..2bf1da45 100644 --- a/eventual-consistency-service/.dockerignore +++ b/eventual-consistency-service/.dockerignore @@ -3,3 +3,4 @@ Dockerfile.tests docs tests udfs +__pycache__ \ No newline at end of file diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index a934f272..04e15afb 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -114,7 +114,7 @@ async def init( vector_field=milvus_config.get("vector_field", "document_vector"), text_field=milvus_config.get("text_field", 
"document_content"), vertex_field=vertex_field, - drop_old=True, + drop_old=False, ) LogWriter.info(f"Initializing {name}") diff --git a/eventual-consistency-service/app/graphrag/workers.py b/eventual-consistency-service/app/graphrag/workers.py index 9d8df3c8..37786aee 100644 --- a/eventual-consistency-service/app/graphrag/workers.py +++ b/eventual-consistency-service/app/graphrag/workers.py @@ -79,7 +79,7 @@ async def chunk_doc( # send chunks to be embedded logger.info("chunk writes to embed_chan") - await embed_chan.put((v_id, chunk, "DocumentChunk")) + await embed_chan.put((chunk_id, chunk, "DocumentChunk")) # send chunks to have entities extracted logger.info("chunk writes to extract_chan") From ec2e86fdb28306024fad0644885973fd3f4b9427 Mon Sep 17 00:00:00 2001 From: Parker Erickson Date: Mon, 19 Aug 2024 14:06:56 -0500 Subject: [PATCH 58/91] fix parameter names --- copilot/app/routers/supportai.py | 2 +- copilot/app/supportai/retrievers/GraphRAG.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/copilot/app/routers/supportai.py b/copilot/app/routers/supportai.py index 0b67b2b8..fac6601d 100644 --- a/copilot/app/routers/supportai.py +++ b/copilot/app/routers/supportai.py @@ -245,7 +245,7 @@ def answer_question( res = retriever.retrieve_answer( query.question, query.method_params["community_level"], - query.method_params["top_k_summaries"] + query.method_params["top_k_answer_candidates"] ) else: raise Exception("Method not implemented") diff --git a/copilot/app/supportai/retrievers/GraphRAG.py b/copilot/app/supportai/retrievers/GraphRAG.py index abd22790..442f8fcb 100644 --- a/copilot/app/supportai/retrievers/GraphRAG.py +++ b/copilot/app/supportai/retrievers/GraphRAG.py @@ -72,7 +72,7 @@ def gather_candidates(self, question, context): def retrieve_answer(self, question: str, community_level: int, - top_k_summaries: int = 1): + top_k_answer_candidates: int = 1): retrieved = self.search(question, community_level) context = [x["attributes"] for x in retrieved[0]["selected_comms"]] @@ -83,6 +83,6 @@ def retrieve_answer(self, res.sort(key=lambda x: x.quality_score, reverse=True) new_context = [{"candidate_answer": x.answer, - "score": x.quality_score} for x in res[:top_k_summaries]] + "score": x.quality_score} for x in res[:top_k_answer_candidates]] return self._generate_response(question, new_context) From 75b3e53760435d791647abd58cac725ba6fb4bce Mon Sep 17 00:00:00 2001 From: Lu Zhou Date: Mon, 19 Aug 2024 13:37:34 -0700 Subject: [PATCH 59/91] add more error handling --- .../app/supportai/supportai_init.py | 10 +-- .../app/supportai/util.py | 67 ++++++++++++++----- .../app/supportai/workers.py | 5 +- 3 files changed, 60 insertions(+), 22 deletions(-) diff --git a/eventual-consistency-service/app/supportai/supportai_init.py b/eventual-consistency-service/app/supportai/supportai_init.py index 9993376f..ac5f49aa 100644 --- a/eventual-consistency-service/app/supportai/supportai_init.py +++ b/eventual-consistency-service/app/supportai/supportai_init.py @@ -80,7 +80,7 @@ async def chunk_docs( txt = content["attributes"]["text"] logger.info("chunk writes to extract") - await embed_chan.put((v_id, txt, "Document")) + # await embed_chan.put((v_id, txt, "Document")) task = sp.create_task( workers.chunk_doc(conn, content, upsert_chan, embed_chan, extract_chan) @@ -123,12 +123,12 @@ async def embed( (v_id, content, index_name) <- q.get() """ logger.info("Reading from embed channel") - async with asyncio.TaskGroup() as grp: + async with asyncio.TaskGroup() as sp: # consume task 
queue async for v_id, content, index_name in embed_chan: embedding_store = index_stores[f"{graphname}_{index_name}"] logger.info(f"Embed to {graphname}_{index_name}: {v_id}") - grp.create_task( + sp.create_task( workers.embed( embedding_service, embedding_store, @@ -154,9 +154,9 @@ async def extract( """ logger.info("Reading from extract channel") # consume task queue - async with asyncio.TaskGroup() as grp: + async with asyncio.TaskGroup() as sp: async for item in extract_chan: - grp.create_task( + sp.create_task( workers.extract(upsert_chan, embed_chan, extractor, conn, *item) ) diff --git a/eventual-consistency-service/app/supportai/util.py b/eventual-consistency-service/app/supportai/util.py index b1684b44..f20b4599 100644 --- a/eventual-consistency-service/app/supportai/util.py +++ b/eventual-consistency-service/app/supportai/util.py @@ -101,7 +101,7 @@ async def init( vector_field=milvus_config.get("vector_field", "document_vector"), text_field=milvus_config.get("text_field", "document_content"), vertex_field=vertex_field, - drop_old=False, + drop_old=True, ) LogWriter.info(f"Initializing {name}") @@ -191,24 +191,49 @@ async def upsert_vertex( headers = make_headers(conn) async with httpx.AsyncClient(timeout=http_timeout) as client: async with tg_sem: - res = await client.post( - f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers - ) + try: + res = await client.post( + f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers + ) - res.raise_for_status() + res.raise_for_status() + except httpx.RequestError as exc: + logger.error(f"An error occurred while requesting {exc.request.url!r}.") + logger.error(f"Request body: {data}") + logger.error(f"Details: {exc}") + # Check if the exception has a response attribute + if hasattr(exc, 'response') and exc.response is not None: + logger.error(f"Response content: {exc.response.content}") + except httpx.HTTPStatusError as exc: + logger.error(f"Error response {exc.response.status_code} while requesting {exc.request.url!r}.") + logger.error(f"Response content: {exc.response.content}") + logger.error(f"Request body: {data}") async def check_vertex_exists(conn, v_id: str): headers = make_headers(conn) async with httpx.AsyncClient(timeout=http_timeout) as client: async with tg_sem: - res = await client.get( - f"{conn.restppUrl}/graph/{conn.graphname}/vertices/Entity/{v_id}", - headers=headers, - ) + try: + res = await client.get( + f"{conn.restppUrl}/graph/{conn.graphname}/vertices/Entity/{v_id}", + headers=headers, + ) + + res.raise_for_status() + return res.json() + except httpx.RequestError as exc: + logger.error(f"An error occurred while requesting {exc.request.url!r}.") + logger.error(f"Details: {exc}") + # Check if the exception has a response attribute + if hasattr(exc, 'response') and exc.response is not None: + logger.error(f"Response content: {exc.response.content}") + return {"error": "Request failed"} + except httpx.HTTPStatusError as exc: + logger.error(f"Error response {exc.response.status_code} while requesting {exc.request.url!r}.") + logger.error(f"Response content: {exc.response.content}") + return {"error": f"HTTP status error {exc.response.status_code}"} - res.raise_for_status() - return res.json() async def upsert_edge( @@ -244,7 +269,19 @@ async def upsert_edge( headers = make_headers(conn) async with httpx.AsyncClient(timeout=http_timeout) as client: async with tg_sem: - res = await client.post( - f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers - ) - 
res.raise_for_status() + try: + res = await client.post( + f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers + ) + res.raise_for_status() + except httpx.RequestError as exc: + logger.error(f"An error occurred while requesting {exc.request.url!r}.") + logger.error(f"Request body: {data}") + logger.error(f"Details: {exc}") + # Check if the exception has a response attribute + if hasattr(exc, 'response') and exc.response is not None: + logger.error(f"Response content: {exc.response.content}") + except httpx.HTTPStatusError as exc: + logger.error(f"Error response {exc.response.status_code} while requesting {exc.request.url!r}.") + logger.error(f"Response content: {exc.response.content}") + logger.error(f"Request body: {data}") diff --git a/eventual-consistency-service/app/supportai/workers.py b/eventual-consistency-service/app/supportai/workers.py index ac72636e..74b18658 100644 --- a/eventual-consistency-service/app/supportai/workers.py +++ b/eventual-consistency-service/app/supportai/workers.py @@ -147,13 +147,14 @@ async def embed( vec = await embed_svc.aembed_query(content) await embed_store.aadd_embeddings([(content, vec)], [{vertex_field: v_id}]) + async def get_vert_desc(conn, v_id, node: Node): - desc = [node.properties.get("definition", "")] + desc = [node.properties.get("description", "")] exists = await util.check_vertex_exists(conn, v_id) # if vertex exists, get description content and append this description to it if not exists["error"]: # deduplicate descriptions - desc.extend(exists["results"][0]["attributes"]["definition"]) + desc.extend(exists["results"][0]["attributes"]["description"]) desc = list(set(desc)) return desc From e80d882342f6c1537b20406490866a957d39186c Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Mon, 19 Aug 2024 18:03:06 -0400 Subject: [PATCH 60/91] batched loader --- common/config.py | 4 - .../gsql/graphRAG/communities_have_desc.gsql | 8 +- .../graphRAG/entities_have_resolution.gsql | 10 ++ common/gsql/graphRAG/loaders/tmp.gsql | 26 +++ .../louvain/graphrag_louvain_init.gsql | 13 +- common/requirements.txt | 5 +- copilot/requirements.txt | 4 +- .../app/graphrag/graph_rag.py | 99 ++++++++++-- .../app/graphrag/reusable_channel.py | 37 +++++ .../app/graphrag/util.py | 148 +++++++++++++----- .../app/graphrag/workers.py | 12 +- 11 files changed, 299 insertions(+), 67 deletions(-) create mode 100644 common/gsql/graphRAG/entities_have_resolution.gsql create mode 100644 common/gsql/graphRAG/loaders/tmp.gsql create mode 100644 eventual-consistency-service/app/graphrag/reusable_channel.py diff --git a/common/config.py b/common/config.py index 8812016c..9f1d3cab 100644 --- a/common/config.py +++ b/common/config.py @@ -92,8 +92,6 @@ "MILVUS_CONFIG must be a .json file or a JSON string, failed with error: " + str(e) ) - - if llm_config["embedding_service"]["embedding_model_service"].lower() == "openai": embedding_service = OpenAI_Embedding(llm_config["embedding_service"]) elif llm_config["embedding_service"]["embedding_model_service"].lower() == "azure": @@ -128,11 +126,9 @@ def get_llm_service(llm_config) -> LLM_Model: else: raise Exception("LLM Completion Service Not Supported") - LogWriter.info( f"Milvus enabled for host {milvus_config['host']} at port {milvus_config['port']}" ) - if os.getenv("INIT_EMBED_STORE", "true")=="true": LogWriter.info("Setting up Milvus embedding store for InquiryAI") try: diff --git a/common/gsql/graphRAG/communities_have_desc.gsql 
b/common/gsql/graphRAG/communities_have_desc.gsql index f5cda70e..75abcef5 100644 --- a/common/gsql/graphRAG/communities_have_desc.gsql +++ b/common/gsql/graphRAG/communities_have_desc.gsql @@ -1,4 +1,4 @@ -CREATE DISTRIBUTED QUERY communities_have_desc(UINT iter) SYNTAX V2{ +CREATE DISTRIBUTED QUERY communities_have_desc(UINT iter, BOOL p=False) SYNTAX V2{ SumAccum @@descrs; Comms = {Community.*}; Comms = SELECT c FROM Comms:c @@ -11,4 +11,10 @@ CREATE DISTRIBUTED QUERY communities_have_desc(UINT iter) SYNTAX V2{ PRINT (@@descrs == Comms.size()) as all_have_desc; PRINT @@descrs, Comms.size(); + + IF p THEN + Comms = SELECT c FROM Comms:c + WHERE c.iteration == iter and length(c.description) == 0; + PRINT Comms; + END; } diff --git a/common/gsql/graphRAG/entities_have_resolution.gsql b/common/gsql/graphRAG/entities_have_resolution.gsql new file mode 100644 index 00000000..ebc8442f --- /dev/null +++ b/common/gsql/graphRAG/entities_have_resolution.gsql @@ -0,0 +1,10 @@ +CREATE DISTRIBUTED QUERY entities_have_resolution() SYNTAX V2{ + SumAccum @@resolved; + Ents = {Entity.*}; + Ents = SELECT s FROM Ents:s -(RESOLVES_TO>)- ResolvedEntity + ACCUM @@resolved += 1; + + + PRINT (@@resolved >= Ents.size()) as all_resolved; + PRINT @@resolved, Ents.size(); +} diff --git a/common/gsql/graphRAG/loaders/tmp.gsql b/common/gsql/graphRAG/loaders/tmp.gsql new file mode 100644 index 00000000..e8d8d417 --- /dev/null +++ b/common/gsql/graphRAG/loaders/tmp.gsql @@ -0,0 +1,26 @@ +CREATE LOADING load_entity@uuid@ { + DEFINE FILENAME Content; + LOAD DocumentContent TO VERTEX Document VALUES() USING SEPARATOR="|", HEADER="true", EOL="\n", QUOTE="double"; +} + + +CREATE LOADING load_ResolvedEntity@uuid@ { + DEFINE FILENAME Content; + LOAD DocumentContent TO VERTEX Document VALUES() USING SEPARATOR="|", HEADER="true", EOL="\n", QUOTE="double"; +} +CREATE LOADING load_ asdfasdf @uuid@ { + DEFINE FILENAME Content; + LOAD DocumentContent TO VERTEX Document VALUES() USING SEPARATOR="|", HEADER="true", EOL="\n", QUOTE="double"; +} +CREATE LOADING load_ asdfasdf @uuid@ { + DEFINE FILENAME Content; + LOAD DocumentContent TO VERTEX Document VALUES() USING SEPARATOR="|", HEADER="true", EOL="\n", QUOTE="double"; +} +CREATE LOADING load_ asdfasdf @uuid@ { + DEFINE FILENAME Content; + LOAD DocumentContent TO VERTEX Document VALUES() USING SEPARATOR="|", HEADER="true", EOL="\n", QUOTE="double"; +} +CREATE LOADING load_ asdfasdf @uuid@ { + DEFINE FILENAME Content; + LOAD DocumentContent TO VERTEX Document VALUES() USING SEPARATOR="|", HEADER="true", EOL="\n", QUOTE="double"; +} diff --git a/common/gsql/graphRAG/louvain/graphrag_louvain_init.gsql b/common/gsql/graphRAG/louvain/graphrag_louvain_init.gsql index 42e9108d..a22c3565 100644 --- a/common/gsql/graphRAG/louvain/graphrag_louvain_init.gsql +++ b/common/gsql/graphRAG/louvain/graphrag_louvain_init.gsql @@ -18,7 +18,7 @@ CREATE DISTRIBUTED QUERY graphrag_louvain_init(UINT max_hop = 10, UINT n_batches MaxAccum @best_move; // best move of the node with the highest delta Q to move the isolated node into the new community MaxAccum @@min_double; // used to reset the @best_move SumAccum @@move_cnt; - OrAccum @to_change_community; + OrAccum @to_change_community, @has_relations; SumAccum @batch_id; MinAccum @vid; @@ -152,6 +152,8 @@ CREATE DISTRIBUTED QUERY graphrag_louvain_init(UINT max_hop = 10, UINT n_batches @@community_sum_total_map.clear(); Tmp = SELECT s FROM AllNodes:s -(_:e)-> ResolvedEntity:t ACCUM + s.@has_relations += TRUE, + t.@has_relations += TRUE, IF s.@community_id == 
t.@community_id THEN // keep track of how many edges are within the community @@community_sum_in_map += (s.@community_id -> wt) @@ -165,7 +167,14 @@ CREATE DISTRIBUTED QUERY graphrag_louvain_init(UINT max_hop = 10, UINT n_batches INSERT INTO IN_COMMUNITY VALUES (s, s.@community_vid+"_1") // link entity to it's first community ; - PRINT @@source_target_k_in_map; + // Continue community hierarchy for unattached partitions + Tmp = SELECT s FROM AllNodes:s + WHERE NOT s.@has_relations + POST-ACCUM + // if s is a part of an unattached partition, add to its community hierarchy to maintain parity with rest of graph + INSERT INTO Community VALUES (s.id+"_1", 1, ""), + INSERT INTO IN_COMMUNITY VALUES (s, s.id+"_1"); // link entity to it's first community + @@community_sum_total_map.clear(); // link communities diff --git a/common/requirements.txt b/common/requirements.txt index af45c357..9912b4a8 100644 --- a/common/requirements.txt +++ b/common/requirements.txt @@ -1,3 +1,4 @@ +aiochannel==1.2.1 aiohappyeyeballs==2.3.5 aiohttp==3.10.3 aiosignal==1.3.1 @@ -83,7 +84,7 @@ langchain-experimental==0.0.64 langchain-groq==0.1.9 langchain-ibm==0.1.12 langchain-milvus==0.1.4 -langchain-openai==0.1.21 +langchain-openai==0.1.22 langchain-text-splitters==0.2.2 langchainhub==0.1.21 langdetect==1.0.9 @@ -100,7 +101,7 @@ minio==7.2.7 multidict==6.0.5 mypy-extensions==1.0.0 nest-asyncio==1.6.0 -nltk==3.8.2 +nltk==3.8.1 numpy==1.26.4 openai==1.40.6 ordered-set==4.1.0 diff --git a/copilot/requirements.txt b/copilot/requirements.txt index 4a5ac3d1..cd9bb7bc 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -84,7 +84,7 @@ langchain-experimental==0.0.64 langchain-groq==0.1.9 langchain-ibm==0.1.12 langchain-milvus==0.1.4 -langchain-openai==0.1.21 +langchain-openai==0.1.22 langchain-text-splitters==0.2.2 langchainhub==0.1.21 langdetect==1.0.9 @@ -101,7 +101,7 @@ minio==7.2.7 multidict==6.0.5 mypy-extensions==1.0.0 nest-asyncio==1.6.0 -nltk==3.8.2 +nltk==3.8.1 numpy==1.26.4 openai==1.40.6 ordered-set==4.1.0 diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py index 29f03dce..54e47f26 100644 --- a/eventual-consistency-service/app/graphrag/graph_rag.py +++ b/eventual-consistency-service/app/graphrag/graph_rag.py @@ -2,17 +2,21 @@ import logging import time import traceback +from collections import defaultdict import httpx from aiochannel import Channel from graphrag import workers from graphrag.util import ( + check_all_ents_resolved, check_vertex_has_desc, http_timeout, init, + load_q, make_headers, stream_ids, tg_sem, + upsert_batch, ) from pyTigerGraph import TigerGraphConnection @@ -83,8 +87,8 @@ async def chunk_docs( doc_tasks = [] async with asyncio.TaskGroup() as grp: async for content in docs_chan: - v_id = content["v_id"] - txt = content["attributes"]["text"] + # v_id = content["v_id"] + # txt = content["attributes"]["text"] # send the document to be embedded logger.info("chunk writes to extract") # await embed_chan.put((v_id, txt, "Document")) @@ -117,7 +121,65 @@ async def upsert(upsert_chan: Channel): # execute the task grp.create_task(func(*args)) - logger.info(f"upsert done") + logger.info("upsert done") + logger.info("closing load_q chan") + load_q.close() + + +async def load(conn: TigerGraphConnection): + logger.info("Reading from load_q") + dd = lambda: defaultdict(dd) # infinite default dict + batch_size = 250 + # while the load q is still open or has contents + while not load_q.closed() or not load_q.empty(): + 
if load_q.closed(): + logger.info( + f"load queue closed. Flushing load queue (final load for this stage)" + ) + # if there's `batch_size` entities in the channel, load it + # or if the channel is closed, flush it + if load_q.qsize() >= batch_size or load_q.closed() or load_q.should_flush(): + batch = { + "vertices": defaultdict(dict[str, any]), + "edges": dd(), + } + n_verts = 0 + n_edges = 0 + size = ( + load_q.qsize() + if load_q.closed() or load_q.should_flush() + else batch_size + ) + for _ in range(size): + t, elem = await load_q.get() + if t == "FLUSH": + logger.debug(f"flush recieved: {t}") + load_q._should_flush = False + break + match t: + case "vertices": + vt, v_id, attr = elem + batch[t][vt][v_id] = attr + n_verts += 1 + case "edges": + src_v_type, src_v_id, edge_type, tgt_v_type, tgt_v_id, attrs = ( + elem + ) + batch[t][src_v_type][src_v_id][edge_type][tgt_v_type][ + tgt_v_id + ] = attrs + n_edges += 1 + + logger.info( + f"Upserting batch size of {size}. ({n_verts} verts | {n_edges} edges)" + ) + await upsert_batch(conn, batch) + else: + await asyncio.sleep(1) + + # TODO: flush q if it's not empty + if not load_q.empty(): + raise Exception(f"load_q not empty: {load_q.qsize()}", flush=True) async def embed( @@ -132,6 +194,7 @@ async def embed( async with asyncio.TaskGroup() as grp: # consume task queue async for v_id, content, index_name in embed_chan: + continue embedding_store = index_stores[f"{graphname}_{index_name}"] logger.info(f"Embed to {graphname}_{index_name}: {v_id}") grp.create_task( @@ -288,6 +351,8 @@ async def communities(conn: TigerGraphConnection, comm_process_chan: Channel): res.raise_for_status() mod = res.json()["results"][0]["mod"] logger.info(f"*** mod pass {i+1}: {mod} (diff= {abs(prev_mod - mod)})") + if mod == 0: + break # write iter to chan for layer to be processed await stream_communities(conn, i + 1, comm_process_chan) @@ -329,13 +394,18 @@ async def stream_communities( # Wait for all communities for layer i to be processed before doing next layer # all community descriptions must be populated before the next layer can be processed if len(comms) > 0: + n_waits = 0 while not await check_vertex_has_desc(conn, i): logger.info(f"Waiting for layer{i} to finish processing") await asyncio.sleep(5) + n_waits += 1 + if n_waits > 3: + logger.info("Flushing load_q") + await load_q.flush(("FLUSH", None)) await asyncio.sleep(3) - logger.info("stream_communities done") - logger.info("closing comm_process_chan") + # logger.info("stream_communities done") + # logger.info("closing comm_process_chan") async def summarize_communities( @@ -353,7 +423,7 @@ async def summarize_communities( embed_chan.close() -async def run(graphname: str, conn: TigerGraphConnection, upsert_limit=100): +async def run(graphname: str, conn: TigerGraphConnection): """ Set up GraphRAG: - Install necessary queries. 
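The load() coroutine above is an accumulate-and-flush consumer: it drains (kind, element) tuples from load_q, groups them into a single vertices/edges payload of up to batch_size items, and hands that payload to upsert_batch for one REST++ request instead of one request per element. A minimal, self-contained sketch of the same pattern follows; drain_in_batches, flush, and the None sentinel are illustrative names, not from this patch, which uses ReuseableChannel.closed()/should_flush() rather than a sentinel.

import asyncio
from collections import defaultdict


async def drain_in_batches(q: asyncio.Queue, flush, batch_size: int = 250):
    # Accumulate (kind, element) tuples from q and flush them in groups,
    # mirroring the payload shape load() builds for upsert_batch().
    batch = {"vertices": defaultdict(dict), "edges": []}
    pending = 0
    while True:
        item = await q.get()
        if item is None:  # sentinel: producers are done
            break
        kind, elem = item
        if kind == "vertices":
            v_type, v_id, attrs = elem
            batch["vertices"][v_type][v_id] = attrs
        else:  # "edges"
            batch["edges"].append(elem)
        pending += 1
        if pending >= batch_size:
            await flush(batch)  # e.g. POST the combined payload to REST++
            batch = {"vertices": defaultdict(dict), "edges": []}
            pending = 0
    if pending:  # flush the final partial batch
        await flush(batch)

Compared with the earlier per-vertex POSTs in upsert_vertex/upsert_edge, this keeps the number of round trips proportional to the number of batches rather than the number of elements.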
@@ -369,9 +439,9 @@ async def run(graphname: str, conn: TigerGraphConnection, upsert_limit=100): extractor, index_stores = await init(conn) init_start = time.perf_counter() - doc_process_switch = True - entity_resolution_switch =True - community_detection_switch =True + doc_process_switch = False + entity_resolution_switch = False + community_detection_switch = True if doc_process_switch: logger.info("Doc Processing Start") docs_chan = Channel(1) @@ -386,7 +456,8 @@ async def run(graphname: str, conn: TigerGraphConnection, upsert_limit=100): chunk_docs(conn, docs_chan, embed_chan, upsert_chan, extract_chan) ) # upsert chunks - grp.create_task(upsert( upsert_chan)) + grp.create_task(upsert(upsert_chan)) + grp.create_task(load(conn)) # embed grp.create_task(embed(embed_chan, index_stores, graphname)) # extract entities @@ -403,6 +474,7 @@ async def run(graphname: str, conn: TigerGraphConnection, upsert_limit=100): logger.info("Entity Processing Start") entities_chan = Channel(100) upsert_chan = Channel(100) + load_q.reopen() async with asyncio.TaskGroup() as grp: grp.create_task(stream_entities(conn, entities_chan, 50)) grp.create_task( @@ -414,8 +486,12 @@ async def run(graphname: str, conn: TigerGraphConnection, upsert_limit=100): ) ) grp.create_task(upsert(upsert_chan)) + grp.create_task(load(conn)) entity_end = time.perf_counter() logger.info("Entity Processing End") + while not await check_all_ents_resolved(conn): + logger.info(f"Waiting for resolved entites to finish loading") + await asyncio.sleep(1) # Community Detection community_start = time.perf_counter() @@ -425,9 +501,9 @@ async def run(graphname: str, conn: TigerGraphConnection, upsert_limit=100): comm_process_chan = Channel(100) upsert_chan = Channel(100) embed_chan = Channel(100) + load_q.reopen() async with asyncio.TaskGroup() as grp: # run louvain - # grp.create_task(communities(conn, communities_chan)) grp.create_task(communities(conn, comm_process_chan)) # get the communities # grp.create_task( stream_communities(conn, communities_chan, comm_process_chan)) @@ -436,6 +512,7 @@ async def run(graphname: str, conn: TigerGraphConnection, upsert_limit=100): summarize_communities(conn, comm_process_chan, upsert_chan, embed_chan) ) grp.create_task(upsert(upsert_chan)) + grp.create_task(load(conn)) grp.create_task(embed(embed_chan, index_stores, graphname)) community_end = time.perf_counter() diff --git a/eventual-consistency-service/app/graphrag/reusable_channel.py b/eventual-consistency-service/app/graphrag/reusable_channel.py new file mode 100644 index 00000000..54ec62c9 --- /dev/null +++ b/eventual-consistency-service/app/graphrag/reusable_channel.py @@ -0,0 +1,37 @@ +from asyncio import Queue + + +class ReuseableChannel: + def __init__(self, maxsize=0) -> None: + self.maxsize = maxsize + self.q = Queue(maxsize) + self._closed = False + self._should_flush = False + + async def put(self, item: any) -> None: + await self.q.put(item) + + async def get(self) -> any: + return await self.q.get() + + def closed(self): + return self._closed + + def should_flush(self): + return self._should_flush + + async def flush(self, flush_msg=None): + self._should_flush = True + await self.put(flush_msg) + + def empty(self): + return self.q.empty() + + def close(self): + self._closed = True + + def qsize(self) -> int: + return self.q.qsize() + + def reopen(self): + self._closed = False diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index a934f272..2c4dce98 100644 --- 
a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -5,10 +5,9 @@ import re import traceback from glob import glob -from typing import Callable import httpx -from graphrag import workers +from graphrag import reusable_channel, workers from pyTigerGraph import TigerGraphConnection from common.config import ( @@ -24,9 +23,10 @@ from common.logs.logwriter import LogWriter logger = logging.getLogger(__name__) -http_timeout = httpx.Timeout(15.0) +http_timeout = httpx.Timeout(15.0) tg_sem = asyncio.Semaphore(100) +load_q = reusable_channel.ReuseableChannel() async def install_queries( @@ -114,7 +114,7 @@ async def init( vector_field=milvus_config.get("vector_field", "document_vector"), text_field=milvus_config.get("text_field", "document_content"), vertex_field=vertex_field, - drop_old=True, + drop_old=False, ) LogWriter.info(f"Initializing {name}") @@ -200,15 +200,37 @@ async def upsert_vertex( logger.info(f"Upsert vertex: {vertex_type} {vertex_id}") vertex_id = vertex_id.replace(" ", "_") attrs = map_attrs(attributes) - data = json.dumps({"vertices": {vertex_type: {vertex_id: attrs}}}) + await load_q.put(("vertices", (vertex_type, vertex_id, attrs))) + # data = json.dumps({"vertices": {vertex_type: {vertex_id: attrs}}}) + # headers = make_headers(conn) + # async with httpx.AsyncClient(timeout=http_timeout) as client: + # async with tg_sem: + # res = await client.post( + # f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers + # ) + # + # try: + # res.raise_for_status() + # except Exception as e: + # logger.error(f"Upsert err: {vertex_type} {vertex_id}\n{e}") + + +async def upsert_batch(conn: TigerGraphConnection, batch): + # logger.info(f"Upsert vertex: {vertex_type} {vertex_id}") + # vertex_id = vertex_id.replace(" ", "_") + # attrs = map_attrs(attributes) + # await load_q.put(('vertices')) + data = json.dumps(batch) headers = make_headers(conn) async with httpx.AsyncClient(timeout=http_timeout) as client: async with tg_sem: res = await client.post( f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers ) - + # try: res.raise_for_status() + # except Exception as e: + # logger.error(f"Upsert err: {vertex_type} {vertex_id}\n{e}") async def check_vertex_exists(conn, v_id: str): @@ -220,8 +242,12 @@ async def check_vertex_exists(conn, v_id: str): headers=headers, ) - res.raise_for_status() - return res.json() + try: + res.raise_for_status() + return res.json() + except Exception as e: + logger.error(f"Check err:\n{e}\n{res.text}") + return {"error": True} async def upsert_edge( @@ -239,28 +265,45 @@ async def upsert_edge( attrs = map_attrs(attributes) src_v_id = src_v_id.replace(" ", "_") tgt_v_id = tgt_v_id.replace(" ", "_") - data = json.dumps( - { - "edges": { - src_v_type: { - src_v_id: { - edge_type: { - tgt_v_type: { - tgt_v_id: attrs, - } - } - }, - } - } - } + # data = json.dumps( + # { + # "edges": { + # src_v_type: { + # src_v_id: { + # edge_type: { + # tgt_v_type: { + # tgt_v_id: attrs, + # } + # } + # }, + # } + # } + # } + # ) + await load_q.put( + ( + "edges", + ( + src_v_type, + src_v_id, + edge_type, + tgt_v_type, + tgt_v_id, + attrs, + ), + ) ) - headers = make_headers(conn) - async with httpx.AsyncClient(timeout=http_timeout) as client: - async with tg_sem: - res = await client.post( - f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers - ) - res.raise_for_status() + + # headers = make_headers(conn) + # async with httpx.AsyncClient(timeout=http_timeout) as client: + # 
async with tg_sem: + # res = await client.post( + # f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers + # ) + # try: + # res.raise_for_status() + # except Exception as e: + # logger.error(f"Upsert Edge err:\n{e}") async def get_commuinty_children(conn, i: int, c: str): @@ -272,20 +315,47 @@ async def get_commuinty_children(conn, i: int, c: str): params={"comm": c, "iter": i}, headers=headers, ) - resp.raise_for_status() + try: + resp.raise_for_status() + except Exception as e: + logger.error(f"Get Children err:\n{e}") descrs = [] for d in resp.json()["results"][0]["children"]: desc = d["attributes"]["description"] - if i == 1 and all(len(x) == 0 for x in desc): - desc = [d["v_id"]] - elif len(desc) == 0: - desc = d["v_id"] - - descrs.append(desc) + # if it's the entity iteration + if i == 1: + # filter out empty strings + desc = list(filter(lambda x: len(x) > 0, desc)) + # if there are no descriptions, make it the v_id + if len(desc) == 0: + desc.append(d["v_id"]) + descrs.extend(desc) + else: + descrs.append(desc) + print(f"Comm: {c} --> {descrs}", flush=True) return descrs +async def check_all_ents_resolved(conn): + headers = make_headers(conn) + async with httpx.AsyncClient(timeout=None) as client: + async with tg_sem: + resp = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/entities_have_resolution", + headers=headers, + ) + try: + resp.raise_for_status() + except Exception as e: + logger.error(f"Check Vert Desc err:\n{e}") + + res = resp.json()["results"][0]["all_resolved"] + logger.info(resp.json()["results"]) + + return res + + async def check_vertex_has_desc(conn, i: int): headers = make_headers(conn) async with httpx.AsyncClient(timeout=None) as client: @@ -295,8 +365,12 @@ async def check_vertex_has_desc(conn, i: int): params={"iter": i}, headers=headers, ) - resp.raise_for_status() + try: + resp.raise_for_status() + except Exception as e: + logger.error(f"Check Vert Desc err:\n{e}") res = resp.json()["results"][0]["all_have_desc"] + logger.info(resp.json()["results"]) return res diff --git a/eventual-consistency-service/app/graphrag/workers.py b/eventual-consistency-service/app/graphrag/workers.py index 9d8df3c8..1fb5a743 100644 --- a/eventual-consistency-service/app/graphrag/workers.py +++ b/eventual-consistency-service/app/graphrag/workers.py @@ -79,7 +79,7 @@ async def chunk_doc( # send chunks to be embedded logger.info("chunk writes to embed_chan") - await embed_chan.put((v_id, chunk, "DocumentChunk")) + await embed_chan.put((chunk_id, chunk, "DocumentChunk")) # send chunks to have entities extracted logger.info("chunk writes to extract_chan") @@ -179,7 +179,7 @@ async def extract( # embed the entity # embed with the v_id if the description is blank - if len(desc[0]): + if len(desc[0]) == 0: await embed_chan.put((v_id, v_id, "Entity")) else: # (v_id, content, index_name) @@ -219,7 +219,7 @@ async def extract( for edge in doc.relationships: logger.info( - f"extract writes relates edge to upsert\n{edge.source.id} -({edge.type})-> {edge.target.id}" + f"extract writes relates edge to upsert:{edge.source.id} -({edge.type})-> {edge.target.id}" ) # upsert verts first to make sure their ID becomes an attr v_id = util.process_id(edge.source.id) # src_id @@ -359,11 +359,6 @@ async def process_community( logger.info(f"Processing Community: {comm_id}") # get the children of the community children = await util.get_commuinty_children(conn, i, comm_id) - if i == 1: - tmp = [] - for c in children: - tmp.extend(c) - children = list(filter(lambda x: len(x) > 
0, tmp)) comm_id = util.process_id(comm_id) # if the community only has one child, use its description @@ -374,6 +369,7 @@ async def process_community( summarizer = community_summarizer.CommunitySummarizer(llm) summary = await summarizer.summarize(comm_id, children) + print(f"*******>{comm_id}: {children}, {summary}", flush=True) await upsert_chan.put( ( util.upsert_vertex, # func to call From 19f2973860600ad162af0e2d860656fbe79e2b9b Mon Sep 17 00:00:00 2001 From: Lu Zhou Date: Mon, 19 Aug 2024 15:06:52 -0700 Subject: [PATCH 61/91] add logger and error handling for install queries --- .../app/supportai/util.py | 27 ++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/eventual-consistency-service/app/supportai/util.py b/eventual-consistency-service/app/supportai/util.py index f20b4599..579463a8 100644 --- a/eventual-consistency-service/app/supportai/util.py +++ b/eventual-consistency-service/app/supportai/util.py @@ -33,18 +33,34 @@ async def install_queries( conn: TigerGraphConnection, ): # queries that are currently installed + logger.info("Fetching currently installed queries...") installed_queries = [q.split("/")[-1] for q in conn.getEndpoints(dynamic=True)] + logger.info(f"Installed queries: {installed_queries}") # doesn't need to be parallel since tg only does it one at a time for q in requried_queries: # only install n queries at a time (n=n_workers) q_name = q.split("/")[-1] + logger.info(f"Processing query: {q_name}") # if the query is not installed, install it if q_name not in installed_queries: - res = await workers.install_query(conn, q) - # stop system if a required query doesn't install - if res["error"]: - raise Exception(res["message"]) + logger.info(f"Query '{q_name}' not found in installed queries. Attempting to install...") + try: + res = await workers.install_query(conn, q) + # stop system if a required query doesn't install + if res["error"]: + logger.error(f"Failed to install query '{q_name}'. 
Error: {res['message']}") + raise Exception(f"Installation of query '{q_name}' failed with message: {res['message']}") + else: + logger.info(f"Successfully installed query '{q_name}'.") + + except Exception as e: + logger.critical(f"Critical error during installation of query '{q_name}': {e}") + raise e + else: + logger.info(f"Query '{q_name}' is already installed.") + + logger.info("Finished processing all required queries.") async def init_embedding_index(s: MilvusEmbeddingStore, vertex_field: str): @@ -64,8 +80,7 @@ async def init( "common/gsql/supportai/ECC_Status", "common/gsql/supportai/Check_Nonexistent_Vertices", "common/gsql/graphRAG/StreamIds", - "common/gsql/graphRAG/StreamDocContent", - # "common/gsql/graphRAG/SetEpochProcessing", + "common/gsql/graphRAG/StreamDocContent" ] await install_queries(requried_queries, conn) From 75f482800da42d6f48094c2db6908ab02e14fcc7 Mon Sep 17 00:00:00 2001 From: Lu Zhou Date: Mon, 19 Aug 2024 15:15:27 -0700 Subject: [PATCH 62/91] remove ingestion test notebook --- copilot/docs/notebooks/SupportAIDemo.ipynb | 287 ++-------------- .../notebooks/SupportAIDocIngestion.ipynb | 323 ------------------ 2 files changed, 35 insertions(+), 575 deletions(-) delete mode 100644 copilot/docs/notebooks/SupportAIDocIngestion.ipynb diff --git a/copilot/docs/notebooks/SupportAIDemo.ipynb b/copilot/docs/notebooks/SupportAIDemo.ipynb index 29519463..09312bc1 100644 --- a/copilot/docs/notebooks/SupportAIDemo.ipynb +++ b/copilot/docs/notebooks/SupportAIDemo.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -29,7 +29,6 @@ " username=username,\n", " password=password,\n", ")\n", - "conn.getToken()\n", "\n", "# And then add CoPilot's address to the connection. 
This address\n", "# is the host's address where the CoPilot container is running.\n", @@ -55,58 +54,35 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'The graph pyTigerGraphRAG is created.'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "conn.gsql(\"\"\"CREATE GRAPH pyTigerGraphRAG()\"\"\")" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "conn.graphname = \"pyTigerGraphRAG\"" + "conn.graphname = \"pyTigerGraphRAG\"\n", + "conn.getToken()" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'host_name': 'https://tg-26bfd0cd-6582-414e-937e-e2c83ecb5a79.us-east-1.i.tgcloud.io',\n", - " 'schema_creation_status': '\"Using graph \\'pyTigerGraphRAG\\'\\\\nSuccessfully created schema change jobs: [add_supportai_schema].\\\\nKick off schema change job add_supportai_schema\\\\nDoing schema change on graph \\'pyTigerGraphRAG\\' (current version: 0)\\\\nTrying to add local vertex \\'DocumentChunk\\' to the graph \\'pyTigerGraphRAG\\'.\\\\nTrying to add local vertex \\'Document\\' to the graph \\'pyTigerGraphRAG\\'.\\\\nTrying to add local vertex \\'Concept\\' to the graph \\'pyTigerGraphRAG\\'.\\\\nTrying to add local vertex \\'Entity\\' to the graph \\'pyTigerGraphRAG\\'.\\\\nTrying to add local vertex \\'Relationship\\' to the graph \\'pyTigerGraphRAG\\'.\\\\nTrying to add local vertex \\'DocumentCollection\\' to the graph \\'pyTigerGraphRAG\\'.\\\\nTrying to add local vertex \\'Content\\' to the graph \\'pyTigerGraphRAG\\'.\\\\nTrying to add local edge \\'HAS_CONTENT\\' and its reverse edge \\'reverse_HAS_CONTENT\\' to the graph \\'pyTigerGraphRAG\\'.\\\\nTrying to add local edge \\'IS_CHILD_OF\\' and its reverse edge \\'reverse_IS_CHILD_OF\\' to the graph \\'pyTigerGraphRAG\\'.\\\\nTrying to add local edge \\'IS_HEAD_OF\\' and its reverse edge \\'reverse_IS_HEAD_OF\\' to the graph \\'pyTigerGraphRAG\\'.\\\\nTrying to add local edge \\'HAS_TAIL\\' and its reverse edge \\'reverse_HAS_TAIL\\' to the graph \\'pyTigerGraphRAG\\'.\\\\nTrying to add local edge \\'DESCRIBES_RELATIONSHIP\\' and its reverse edge \\'reverse_DESCRIBES_RELATIONSHIP\\' to the graph \\'pyTigerGraphRAG\\'.\\\\nTrying to add local edge \\'DESCRIBES_ENTITY\\' and its reverse edge \\'reverse_DESCRIBES_ENTITY\\' to the graph \\'pyTigerGraphRAG\\'.\\\\nTrying to add local edge \\'CONTAINS_ENTITY\\' and its reverse edge \\'reverse_CONTAINS_ENTITY\\' to the graph \\'pyTigerGraphRAG\\'.\\\\nTrying to add local edge \\'MENTIONS_RELATIONSHIP\\' and its reverse edge \\'reverse_MENTIONS_RELATIONSHIP\\' to the graph \\'pyTigerGraphRAG\\'.\\\\nTrying to add local edge \\'IS_AFTER\\' and its reverse edge \\'reverse_IS_AFTER\\' to the graph \\'pyTigerGraphRAG\\'.\\\\nTrying to add local edge \\'HAS_CHILD\\' and its reverse edge \\'reverse_HAS_CHILD\\' to the graph \\'pyTigerGraphRAG\\'.\\\\nTrying to add local edge \\'HAS_RELATIONSHIP\\' and its reverse edge \\'reverse_HAS_RELATIONSHIP\\' to the graph \\'pyTigerGraphRAG\\'.\\\\nTrying to add local edge \\'CONTAINS_DOCUMENT\\' and its reverse edge \\'reverse_CONTAINS_DOCUMENT\\' to the graph \\'pyTigerGraphRAG\\'.\\\\n\\\\nGraph pyTigerGraphRAG updated to new version 1\\\\nThe job add_supportai_schema completes in 82.016 seconds!\\\\nLocal schema 
change succeeded.\"',\n", - " 'index_creation_status': '\"Using graph \\'pyTigerGraphRAG\\'\\\\nSuccessfully created schema change jobs: [add_supportai_indexes].\\\\nKick off schema change job add_supportai_indexes\\\\nDoing schema change on graph \\'pyTigerGraphRAG\\' (current version: 1)\\\\nTrying to add index \\'doc_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Document\\' on the graph \\'pyTigerGraphRAG\\'.\\\\nTrying to add index \\'doc_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Document\\' on the graph \\'pyTigerGraphRAG\\'.\\\\nTrying to add index \\'doc_epoch_processing_indexepoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Document\\' on the graph \\'pyTigerGraphRAG\\'.\\\\nTrying to add index \\'doc_chunk_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'DocumentChunk\\' on the graph \\'pyTigerGraphRAG\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'DocumentChunk\\' on the graph \\'pyTigerGraphRAG\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'DocumentChunk\\' on the graph \\'pyTigerGraphRAG\\'.\\\\nTrying to add index \\'concept_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Concept\\' on the graph \\'pyTigerGraphRAG\\'.\\\\nTrying to add index \\'concept_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Concept\\' on the graph \\'pyTigerGraphRAG\\'.\\\\nTrying to add index \\'concept_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Concept\\' on the graph \\'pyTigerGraphRAG\\'.\\\\n\\\\nGraph pyTigerGraphRAG updated to new version 2\\\\nThe job add_supportai_indexes completes in 56.328 seconds!\\\\nLocal schema change succeeded.\"'}" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "conn.ai.initializeSupportAI()" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -122,42 +98,18 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'job_name': 'load_documents_content_json_e91e2502f0df4367bc8bdc22adea9f86',\n", - " 'job_id': 'pyTigerGraphRAG.load_documents_content_json_e91e2502f0df4367bc8bdc22adea9f86.stream.SupportAI_pyTigerGraphRAG_76597053df1f42819bbe326506ca77cc.1719320708053',\n", - " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/pyTigerGraphRAG.load_documents_content_json_e91e2502f0df4367bc8bdc22adea9f86.stream.SupportAI_pyTigerGraphRAG_76597053df1f42819bbe326506ca77cc.1719320708053'}" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "conn.ai.runDocumentIngest(res[\"load_job_id\"], res[\"data_source_id\"], \"s3://tg-documentation/pytg_current/pytg_current.jsonl\")" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'status': 'submitted'}" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "conn.ai.forceConsistencyUpdate()" ] @@ -178,7 +130,7 @@ }, { "cell_type": "code", - "execution_count": 3, + 
"execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -194,33 +146,9 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'@@final_retrieval': {'EntityInfo': {'getVertexType': 'Function to retrieve the details of the specified vertex type.',\n", - " 'getVertexStats': 'Function to return vertex attribute statistics.',\n", - " 'getVertexCount': 'Function to retrieve the number of vertices of the specified type.',\n", - " 'pyTigerGraph': 'Python package for connecting to TigerGraph databases.'},\n", - " 'RelationshipInfo': {'pyTigerGraph:HAS_FUNCTION:getVertexType': 'Defines the relationship between the package and the function'},\n", - " 'DocumentChunkInfo': {'https://docs.tigergraph.com/pytigergraph/current/contributing/_chunk_0': \"Contributing to pyTigerGraph\\\\nDownload the Repository\\\\nYou can download the repository from GitHub by:\\\\ngit clone https://github.com/tigergraph/pyTigerGraph.git\\\\nInstall the Local Version of the Package\\\\nOnce downloaded, you can install the local version of the package (without GDS support) by:\\\\npip install '.'\\\\nWithin the\\\\npyTigerGraph\\\\ndirectory, you can run the following command to install the GDS version of the package:\\\\npip install '.[gds]'\\\\nMake Changes\\\\nMake the changes you want to make to the package.\",\n", - " 'https://docs.tigergraph.com/pytigergraph/current/contributing/_chunk_3': 'The pull request will be evaluated by our team and must have three parts:\\\\n1) Unit tests written in the\\\\ntests\\\\ndirectory.\\\\n2) Documentation in the form of docstrings and comments in the code.\\\\n3) A short writeup of the changes you are making and why (this is what will be displayed on the pull request page).\\\\nAfter that, we may reach out with further changes or suggestions to improve your pull request.',\n", - " 'https://docs.tigergraph.com/pytigergraph/current/contributing/_chunk_4': 'Once your pull request is accepted, it will be merged into the master branch for deployment in the next release of the package.',\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/pyg_transforms_chunk_1': 'It assumes that there are datetime attributes on vertices and edges.',\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/pyg_transforms_chunk_2': 'If vertex attributes change over time, children vertex attributes are moved to the appropriate parent, and then the children are removed from the graph.\\\\n_init_\\\\n()\\\\ninit\\\\n(vertex_start_attrs: dict, vertex_end_attrs: dict, edge_start_attrs: dict, edge_end_attrs: dict, start_dt: int, end_dt: int, feature_transforms: dict, timestep: int = 86400)\\\\nInstantiate a TemporalPyGTransform.\\\\nParameters:\\\\nvertex_start_attrs (str, dict)\\\\n: If using on a homogeneous graph, string of the attribute storing the timestamp of when a vertex becomes valid to include.\\\\nIf using on a heterogenous graph, dictionary that describes the attribute storing the timestamp of when a vertex becomes a valid vertex to include in the graph.\\\\nIn the format of {\\\\\"VERTEX_TYPE\\\\\": \\\\\"attribute_name\\\\\"}.\\\\nvertex_end_attrs (str, dict)\\\\n: If using on a homogeneous graph, string of the attribute storing the timestamp of when a vertex stops being valid to include.\\\\nIf using on a heterogenous graph, dictionary that describes the attribute storing the timestamp of when a vertex stops being a valid vertex to include in the graph.\\\\nIn the format of 
{\\\\\"VERTEX_TYPE\\\\\": \\\\\"attribute_name\\\\\"}\\\\nedge_start_attrs (str, dict)\\\\n: If using on a homogeneous graph, string of the attribute storing the timestamp of when an edge becomes valid to include.\\\\nIf using on a heterogenous graph, dictionary that describes the attribute storing the timestamp of when an edge becomes a valid edge to include in the graph.\\\\nUses the PyG edge format of (\\\\\"SourceVertexType\\\\\", \\\\\"EdgeName\\\\\", \\\\\"DestinationVertexType\\\\\").\\\\nIn the format of {(\\\\\"SourceVertexType\\\\\", \\\\\"EdgeName\\\\\", \\\\\"DestinationVertexType\\\\\"): \\\\\"attribute_name\\\\\"}.\\\\nedge_end_attrs (str, dict)\\\\n: If using on a homogeneous graph, string of the attribute storing the timestamp of when an edge stops being valid to include.\\\\nIf using on a heterogenous graph, dictionary that describes the attribute storing the timestamp of when an edge stops being a valid edge to include in the graph.\\\\nUses the PyG edge format of (\\\\\"SourceVertexType\\\\\", \\\\\"EdgeName\\\\\", \\\\\"DestinationVertexType\\\\\").\\\\nIn the format of {(\\\\\"SourceVertexType\\\\\", \\\\\"EdgeName\\\\\", \\\\\"DestinationVertexType\\\\\"): \\\\\"attribute_name\\\\\"}\\\\nstart_dt (int)\\\\n: The UNIX epoch time to start generating the sequence of subgraphs.\\\\nend_dt (int)\\\\n: The UNIX epoch time to stop generating the sequence of subgraphs.\\\\nfeature_transforms (dict, optional)\\\\n: Only available on heterogeneous graphs. Moves temporally dynamic features from \\\\\"children\\\\\" vertices to \\\\\"parent\\\\\" vertices when\\\\nmodelling temporal attributes in TigerGraph.\\\\nThe key of the dictionary is the edge to move the attributes from the child type to the parent type, and the value is a list of attributes to move.\\\\nIn the fromat of {(\\\\\"ItemInstance\\\\\", \\\\\"reverse_DESCRIBED_BY\\\\\", \\\\\"Item\\\\\"): [\\\\\"x\\\\\"]}\\\\ntimestep (int, optional)\\\\n: The number of seconds to use in between timesteps.',\n", - " 'https://docs.tigergraph.com/pytigergraph/current/contributing/_chunk_1': 'Make sure to document your changes in the code with docstrings and comments.',\n", - " 'https://docs.tigergraph.com/pytigergraph/current/contributing/_chunk_5': 'Steps 1 and 2 may be skipped when making changes such as fixing typos or improving documentation, although existing unittests will be ran against your changes to ensure they pass.',\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/pyg_transforms_chunk_0': 'PyTorch Geometric Transforms\\\\nTemporalPyGTransform\\\\nThe TemporalPyGTransform creates a sequence of subgraph batches out of a single batch of data produced by a NeighborLoader or HGTLoader.',\n", - " 'https://docs.tigergraph.com/pytigergraph/current/contributing/_chunk_2': 'The docstrings should be formatted as follows:\\\\ndef getVertexType(self, vertexType: str, force: bool = False) -> dict:\\\\n \\\\\"\\\\\"\\\\\"Returns the details of the specified vertex type.\\\\n\\\\n Args:\\\\n vertexType:\\\\n The name of of the vertex type.\\\\n force:\\\\n If `True`, forces the retrieval the schema metadata again, otherwise returns a\\\\n cached copy of vertex type details (if they were already fetched previously).\\\\n\\\\n Returns:\\\\n The metadata of the vertex type.\\\\n \\\\\"\\\\\"\\\\\"\\\\nAdditionally, all function definitions should have type hinting for both the arguments and the return value.\\\\nFinally, unit tests should be written to test the funcitonality of the change. 
See the\\\\ntesting documentation\\\\nfor more information.\\\\nSubmit a Pull Request\\\\nOnce you have made the changes you want to make, you can submit a pull request to the repository.'}}}]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "conn.ai.searchDocuments(query,\n", " method=\"hnswoverlap\",\n", @@ -239,24 +167,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'@@final_retrieval': {'https://docs.tigergraph.com/pytigergraph/current/core-functions/schema_chunk_2': 'If the value is\\\\nFalse\\\\n, the request will always insert new\\\\nedges and create the necessary vertices with default values for their attributes.\\\\nNote that this parameter does not affect vertices.\\\\nupdateVertexOnly\\\\n: If\\\\nTrue\\\\n, the request will only update existing vertices and not insert new\\\\nvertices.\\\\nReturns:\\\\nThe result of upsert (number of vertices and edges accepted/upserted).\\\\nEndpoint:\\\\nPOST /graph/{graph_name}\\\\nSee\\\\nUpsert data to graph\\\\ngetEndpoints()\\\\ngetEndpoints(builtin: bool = False, dynamic: bool = False, static: bool = False) \\\\u2192 dict\\\\nLists the REST++ endpoints and their parameters.\\\\nParameters:\\\\nbuiltin\\\\n: List the TigerGraph-provided REST++ endpoints.\\\\ndynamic\\\\n: List endpoints for user-installed queries.\\\\nstatic\\\\n: List static endpoints.\\\\nIf none of the above arguments are specified, all endpoints are listed.\\\\nEndpoint:\\\\nGET /endpoints/{graph_name}\\\\nSee\\\\nList all endpoints',\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_52': 'Defaults to None.\\\\nbatch_size (int, optional)\\\\n: Number of vertices in each batch.\\\\nDefaults to None.\\\\nnum_batches (int, optional)\\\\n: Number of batches to split the vertices.\\\\nDefaults to 1.\\\\nshuffle (bool, optional)\\\\n: Whether to shuffle the vertices before loading data.\\\\nDefaults to False.\\\\nfilter_by (str, optional)\\\\n: A boolean attribute used to indicate which vertices\\\\ncan be included.',\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/pyg_transforms_chunk_1': 'It assumes that there are datetime attributes on vertices and edges.',\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_49': 'If you load all vertices at once (\\\\nnum_batches=1\\\\n),\\\\nthere will be only one batch (of all the vertices) in the iterator.\\\\nYou can access the\\\\ndata\\\\nproperty of the class directly.',\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_93': 'Each boolean attribute\\\\nindicates which part a vertex belongs to.\\\\nUsage:\\\\nA random 60% of vertices will have their attribute\\\\nattr_name\\\\nset to True, and\\\\nothers False.\\\\nattr_name\\\\ncan be any attribute that exists in the database (same below).\\\\nExample:\\\\nconn = TigerGraphConnection(...)\\\\nsplitter = RandomVertexSplitter(conn, timeout, attr_name=0.6)\\\\nsplitter.run()\\\\nA random 60% of vertices will have their attribute \\\\\"attr_name\\\\\" set to True, and a\\\\nrandom 20% of vertices will have their attribute \\\\\"attr_name2\\\\\" set to True.'}}]" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "conn.ai.searchDocuments(query,\n", " method=\"vdb\",\n", @@ -274,77 +187,9 @@ }, { "cell_type": "code", - 
"execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'@@sibling_set': {'https://docs.tigergraph.com/pytigergraph/current/core-functions/schema_chunk_2': {'https://docs.tigergraph.com/pytigergraph/current/core-functions/schema_chunk_1': {'distance': '-1',\n", - " 'content': 'Otherwise, the request will return immediately after RESTPP processes the POST.\\\\nnewVertexOnly\\\\n: If\\\\nTrue\\\\n, the request will only insert new vertices and not update existing ones.\\\\nvertexMustExist\\\\n: If\\\\nTrue\\\\n, the request will only insert an edge if both the\\\\nFROM\\\\nand\\\\nTO\\\\nvertices\\\\nof the edge already exist.'},\n", - " 'https://docs.tigergraph.com/pytigergraph/current/core-functions/schema_chunk_2': {'distance': '0',\n", - " 'content': 'If the value is\\\\nFalse\\\\n, the request will always insert new\\\\nedges and create the necessary vertices with default values for their attributes.\\\\nNote that this parameter does not affect vertices.\\\\nupdateVertexOnly\\\\n: If\\\\nTrue\\\\n, the request will only update existing vertices and not insert new\\\\nvertices.\\\\nReturns:\\\\nThe result of upsert (number of vertices and edges accepted/upserted).\\\\nEndpoint:\\\\nPOST /graph/{graph_name}\\\\nSee\\\\nUpsert data to graph\\\\ngetEndpoints()\\\\ngetEndpoints(builtin: bool = False, dynamic: bool = False, static: bool = False) \\\\u2192 dict\\\\nLists the REST++ endpoints and their parameters.\\\\nParameters:\\\\nbuiltin\\\\n: List the TigerGraph-provided REST++ endpoints.\\\\ndynamic\\\\n: List endpoints for user-installed queries.\\\\nstatic\\\\n: List static endpoints.\\\\nIf none of the above arguments are specified, all endpoints are listed.\\\\nEndpoint:\\\\nGET /endpoints/{graph_name}\\\\nSee\\\\nList all endpoints'},\n", - " 'https://docs.tigergraph.com/pytigergraph/current/core-functions/schema_chunk_0': {'distance': '-2',\n", - " 'content': 'Schema Functions\\\\nThe functions in this page retrieve information about the graph schema.\\\\nAll functions in this module are called as methods on a\\\\nTigerGraphConnection\\\\nobject\\\\n.\\\\ngetSchema()\\\\ngetSchema(udts: bool = True, force: bool = False) \\\\u2192 dict\\\\nRetrieves the schema metadata (of all vertex and edge type and, if not disabled, the\\\\nUser-Defined Type details) of the graph.\\\\nParameters:\\\\nudts\\\\n: If\\\\nTrue\\\\n, the output includes User-Defined Types in the schema details.\\\\nforce\\\\n: If\\\\nTrue\\\\n, retrieves the schema metadata again, otherwise returns a cached copy of\\\\nthe schema metadata (if they were already fetched previously).\\\\nReturns:\\\\nThe schema metadata.\\\\nEndpoint:\\\\nGET /gsqlserver/gsql/schema\\\\nSee\\\\nShow graph schema metadata\\\\nupsertData()\\\\nupsertData(data: Union[str, object], atomic: bool = False, ackAll: bool = False, newVertexOnly: bool = False, vertexMustExist: bool = False, updateVertexOnly: bool = False) \\\\u2192 dict\\\\nUpserts data (vertices and edges) from a JSON file or a file with equivalent object structure.\\\\nParameters:\\\\ndata\\\\n: The data of vertex and edge instances, in a specific format.\\\\natomic\\\\n: The request is an atomic transaction. 
An atomic transaction means that updates to\\\\nthe database contained in the request are all-or-nothing: either all changes are\\\\nsuccessful, or none are successful.\\\\nackAll\\\\n: If\\\\nTrue\\\\n, the request will return after all GPE instances have acknowledged the\\\\nPOST.'}},\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_52': {'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_52': {'distance': '3',\n", - " 'content': 'Defaults to None.\\\\nbatch_size (int, optional)\\\\n: Number of vertices in each batch.\\\\nDefaults to None.\\\\nnum_batches (int, optional)\\\\n: Number of batches to split the vertices.\\\\nDefaults to 1.\\\\nshuffle (bool, optional)\\\\n: Whether to shuffle the vertices before loading data.\\\\nDefaults to False.\\\\nfilter_by (str, optional)\\\\n: A boolean attribute used to indicate which vertices\\\\ncan be included.'},\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_50': {'distance': '-1',\n", - " 'content': 'If there is\\\\nonly one batch of data to load, it will give you the batch directly instead\\\\nof an iterator, which might make more sense in that case.'},\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_49': {'distance': '-3',\n", - " 'content': 'If you load all vertices at once (\\\\nnum_batches=1\\\\n),\\\\nthere will be only one batch (of all the vertices) in the iterator.\\\\nYou can access the\\\\ndata\\\\nproperty of the class directly.'},\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_55': {'distance': '3',\n", - " 'content': 'Defaults to \\\\\"dataframe\\\\\".\\\\nloader_id (str, optional)\\\\n: An identifier of the loader which can be any string.'},\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_51': {'distance': '1',\n", - " 'content': 'If there are\\\\nmultiple batches of data to load, it will return the loader again.\\\\nParameters:\\\\nattributes (list, optional)\\\\n: Vertex attributes to be included.'},\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_53': {'distance': '1',\n", - " 'content': 'Defaults to None.\\\\noutput_format (str, optional)\\\\n: Format of the output data of the loader.'},\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_54': {'distance': '2',\n", - " 'content': 'Only\\\\n\\\\\"dataframe\\\\\" is supported.'}},\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_49': {'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_47': {'distance': '-2',\n", - " 'content': 'Defaults to 300000.\\\\nSee\\\\nthe ML Workbench edge loader tutorial notebook\\\\nfor examples.\\\\nvertexLoader()\\\\nvertexLoader(attributes: Union[list, dict] = None, batch_size: int = None, num_batches: int = 1, shuffle: bool = False, filter_by: str = None, output_format: str = \\\\\"dataframe\\\\\", loader_id: str = None, buffer_size: int = 4, kafka_address: str = None, kafka_max_msg_size: int = 104857600, kafka_num_partitions: int = 1, kafka_replica_factor: int = 1, kafka_retention_ms: int = 60000, kafka_auto_del_topic: bool = True, kafka_address_consumer: str = None, kafka_address_producer: str = None, timeout: int = 300000) \\\\u2192 VertexLoader\\\\nReturns a\\\\nVertexLoader\\\\ninstance.\\\\nA\\\\nVertexLoader\\\\ncan load all vertices of a graph in batches.\\\\nIt divides vertices 
into\\\\nnum_batches\\\\nand returns each batch separately.\\\\nThe boolean attribute provided to\\\\nfilter_by\\\\nindicates which vertices are included.\\\\nIf you need random batches, set\\\\nshuffle\\\\nto True.\\\\nWhen you initialize the loader on a graph for the first time,\\\\nthe initialization might take a minute as it installs the corresponding\\\\nquery to the database.'},\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_52': {'distance': '3',\n", - " 'content': 'Defaults to None.\\\\nbatch_size (int, optional)\\\\n: Number of vertices in each batch.\\\\nDefaults to None.\\\\nnum_batches (int, optional)\\\\n: Number of batches to split the vertices.\\\\nDefaults to 1.\\\\nshuffle (bool, optional)\\\\n: Whether to shuffle the vertices before loading data.\\\\nDefaults to False.\\\\nfilter_by (str, optional)\\\\n: A boolean attribute used to indicate which vertices\\\\ncan be included.'},\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_50': {'distance': '1',\n", - " 'content': 'If there is\\\\nonly one batch of data to load, it will give you the batch directly instead\\\\nof an iterator, which might make more sense in that case.'},\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_48': {'distance': '-1',\n", - " 'content': 'However, the query installation only\\\\nneeds to be done once, so it will take no time when you initialize the loader\\\\non the same graph again.\\\\nThere are two ways to use the data loader:\\\\nIt can be used as an iterable, which means you can loop through\\\\nit to get every batch of data.'},\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_51': {'distance': '2',\n", - " 'content': 'If there are\\\\nmultiple batches of data to load, it will return the loader again.\\\\nParameters:\\\\nattributes (list, optional)\\\\n: Vertex attributes to be included.'},\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_46': {'distance': '-3',\n", - " 'content': 'Defaults to be the same as\\\\nkafkaAddress\\\\n.\\\\ntimeout (int, optional)\\\\n: Timeout value for GSQL queries, in ms.'},\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_49': {'distance': '0',\n", - " 'content': 'If you load all vertices at once (\\\\nnum_batches=1\\\\n),\\\\nthere will be only one batch (of all the vertices) in the iterator.\\\\nYou can access the\\\\ndata\\\\nproperty of the class directly.'}},\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_93': {'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_93': {'distance': '0',\n", - " 'content': 'Each boolean attribute\\\\nindicates which part a vertex belongs to.\\\\nUsage:\\\\nA random 60% of vertices will have their attribute\\\\nattr_name\\\\nset to True, and\\\\nothers False.\\\\nattr_name\\\\ncan be any attribute that exists in the database (same below).\\\\nExample:\\\\nconn = TigerGraphConnection(...)\\\\nsplitter = RandomVertexSplitter(conn, timeout, attr_name=0.6)\\\\nsplitter.run()\\\\nA random 60% of vertices will have their attribute \\\\\"attr_name\\\\\" set to True, and a\\\\nrandom 20% of vertices will have their attribute \\\\\"attr_name2\\\\\" set to True.'},\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_90': {'distance': '-3',\n", - " 'content': 'Defaults to be the same 
as\\\\nkafkaAddress\\\\n.\\\\nkafka_address_producer (str, optional)\\\\n: Address of the kafka broker that a producer\\\\nshould use.'},\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_92': {'distance': '-1',\n", - " 'content': 'Defaults to 300000.\\\\nSee\\\\nthe ML Workbench tutorial notebook for graph loaders\\\\nfor examples.\\\\nfeaturizer()\\\\nfeaturizer() \\\\u2192 Featurizer\\\\nGet a featurizer.\\\\nReturns:\\\\nFeaturizer\\\\nvertexSplitter()\\\\nvertexSplitter(timeout: int = 600000)\\\\nGet a vertex splitter that splits vertices into at most 3 parts randomly.\\\\nThe split results are stored in the provided vertex attributes.'},\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_91': {'distance': '-2',\n", - " 'content': 'Defaults to be the same as\\\\nkafkaAddress\\\\n.\\\\ntimeout (int, optional)\\\\n: Timeout value for GSQL queries, in ms.'},\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_94': {'distance': '1',\n", - " 'content': 'The two\\\\nparts are disjoint.'},\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_95': {'distance': '2',\n", - " 'content': 'Example:\\\\nconn = TigerGraphConnection(...)\\\\nsplitter = RandomVertexSplitter(conn, timeout, attr_name=0.6, attr_name2=0.2)\\\\nsplitter.run()\\\\nA random 60% of vertices will have their attribute \\\\\"attr_name\\\\\" set to True, a\\\\nrandom 20% of vertices will have their attribute \\\\\"attr_name2\\\\\" set to True, and\\\\nanother random 20% of vertices will have their attribute \\\\\"attr_name3\\\\\" set to True.\\\\nThe three parts are disjoint.'},\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_96': {'distance': '3',\n", - " 'content': 'Example:\\\\nconn = TigerGraphConnection(...)\\\\nsplitter = RandomVertexSplitter(conn, timeout, attr_name=0.6, attr_name2=0.2, attr_name3=0.2)\\\\nsplitter.run()\\\\nParameter:\\\\ntimeout (int, optional)\\\\n: Timeout value for the operation.'}},\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/pyg_transforms_chunk_1': {'https://docs.tigergraph.com/pytigergraph/current/gds/pyg_transforms_chunk_0': {'distance': '-1',\n", - " 'content': 'PyTorch Geometric Transforms\\\\nTemporalPyGTransform\\\\nThe TemporalPyGTransform creates a sequence of subgraph batches out of a single batch of data produced by a NeighborLoader or HGTLoader.'},\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/pyg_transforms_chunk_1': {'distance': '0',\n", - " 'content': 'It assumes that there are datetime attributes on vertices and edges.'},\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/pyg_transforms_chunk_4': {'distance': '3',\n", - " 'content': 'Returns a list of PyTorch Geometric data objects, a sequence of snapshots in time of the graph.\\\\nEdges are removed between vertices that do not have connections at the given time.'},\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/pyg_transforms_chunk_3': {'distance': '2',\n", - " 'content': 'Defaults to 86400 seconds (1 day).\\\\ncall\\\\n()\\\\ncall\\\\n(data) \\\\u2192 list\\\\nPerform the transform.'},\n", - " 'https://docs.tigergraph.com/pytigergraph/current/gds/pyg_transforms_chunk_2': {'distance': '1',\n", - " 'content': 'If vertex attributes change over time, children vertex attributes are moved to the appropriate parent, and then the children are removed from the 
graph.\\\\n_init_\\\\n()\\\\ninit\\\\n(vertex_start_attrs: dict, vertex_end_attrs: dict, edge_start_attrs: dict, edge_end_attrs: dict, start_dt: int, end_dt: int, feature_transforms: dict, timestep: int = 86400)\\\\nInstantiate a TemporalPyGTransform.\\\\nParameters:\\\\nvertex_start_attrs (str, dict)\\\\n: If using on a homogeneous graph, string of the attribute storing the timestamp of when a vertex becomes valid to include.\\\\nIf using on a heterogenous graph, dictionary that describes the attribute storing the timestamp of when a vertex becomes a valid vertex to include in the graph.\\\\nIn the format of {\\\\\"VERTEX_TYPE\\\\\": \\\\\"attribute_name\\\\\"}.\\\\nvertex_end_attrs (str, dict)\\\\n: If using on a homogeneous graph, string of the attribute storing the timestamp of when a vertex stops being valid to include.\\\\nIf using on a heterogenous graph, dictionary that describes the attribute storing the timestamp of when a vertex stops being a valid vertex to include in the graph.\\\\nIn the format of {\\\\\"VERTEX_TYPE\\\\\": \\\\\"attribute_name\\\\\"}\\\\nedge_start_attrs (str, dict)\\\\n: If using on a homogeneous graph, string of the attribute storing the timestamp of when an edge becomes valid to include.\\\\nIf using on a heterogenous graph, dictionary that describes the attribute storing the timestamp of when an edge becomes a valid edge to include in the graph.\\\\nUses the PyG edge format of (\\\\\"SourceVertexType\\\\\", \\\\\"EdgeName\\\\\", \\\\\"DestinationVertexType\\\\\").\\\\nIn the format of {(\\\\\"SourceVertexType\\\\\", \\\\\"EdgeName\\\\\", \\\\\"DestinationVertexType\\\\\"): \\\\\"attribute_name\\\\\"}.\\\\nedge_end_attrs (str, dict)\\\\n: If using on a homogeneous graph, string of the attribute storing the timestamp of when an edge stops being valid to include.\\\\nIf using on a heterogenous graph, dictionary that describes the attribute storing the timestamp of when an edge stops being a valid edge to include in the graph.\\\\nUses the PyG edge format of (\\\\\"SourceVertexType\\\\\", \\\\\"EdgeName\\\\\", \\\\\"DestinationVertexType\\\\\").\\\\nIn the format of {(\\\\\"SourceVertexType\\\\\", \\\\\"EdgeName\\\\\", \\\\\"DestinationVertexType\\\\\"): \\\\\"attribute_name\\\\\"}\\\\nstart_dt (int)\\\\n: The UNIX epoch time to start generating the sequence of subgraphs.\\\\nend_dt (int)\\\\n: The UNIX epoch time to stop generating the sequence of subgraphs.\\\\nfeature_transforms (dict, optional)\\\\n: Only available on heterogeneous graphs. 
Moves temporally dynamic features from \\\\\"children\\\\\" vertices to \\\\\"parent\\\\\" vertices when\\\\nmodelling temporal attributes in TigerGraph.\\\\nThe key of the dictionary is the edge to move the attributes from the child type to the parent type, and the value is a list of attributes to move.\\\\nIn the fromat of {(\\\\\"ItemInstance\\\\\", \\\\\"reverse_DESCRIBED_BY\\\\\", \\\\\"Item\\\\\"): [\\\\\"x\\\\\"]}\\\\ntimestep (int, optional)\\\\n: The number of seconds to use in between timesteps.'}}}}]" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "conn.ai.searchDocuments(query,\n", " method=\"sibling\",\n", @@ -366,7 +211,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -380,41 +225,25 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "You can get a count of vertices in Python using the `getVertexCount` function from the `pyTigerGraph` package. This function retrieves the number of vertices of the specified type.\n" - ] - } - ], + "outputs": [], "source": [ "print(resp[\"response\"])" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[{'@@final_retrieval': {'EntityInfo': {'pyTigerGraph': 'Python package for connecting to TigerGraph databases.', 'getVertexType': 'Function to retrieve the details of the specified vertex type.', 'getVertexStats': 'Function to return vertex attribute statistics.', 'getVertexCount': 'Function to retrieve the number of vertices of the specified type.'}, 'RelationshipInfo': {'pyTigerGraph:HAS_FUNCTION:getVertexType': 'Defines the relationship between the package and the function'}, 'DocumentChunkInfo': {'https://docs.tigergraph.com/pytigergraph/current/contributing/_chunk_2': 'The docstrings should be formatted as follows:\\\\ndef getVertexType(self, vertexType: str, force: bool = False) -> dict:\\\\n \\\\\"\\\\\"\\\\\"Returns the details of the specified vertex type.\\\\n\\\\n Args:\\\\n vertexType:\\\\n The name of of the vertex type.\\\\n force:\\\\n If `True`, forces the retrieval the schema metadata again, otherwise returns a\\\\n cached copy of vertex type details (if they were already fetched previously).\\\\n\\\\n Returns:\\\\n The metadata of the vertex type.\\\\n \\\\\"\\\\\"\\\\\"\\\\nAdditionally, all function definitions should have type hinting for both the arguments and the return value.\\\\nFinally, unit tests should be written to test the funcitonality of the change. 
See the\\\\ntesting documentation\\\\nfor more information.\\\\nSubmit a Pull Request\\\\nOnce you have made the changes you want to make, you can submit a pull request to the repository.', 'https://docs.tigergraph.com/pytigergraph/current/gds/pyg_transforms_chunk_0': 'PyTorch Geometric Transforms\\\\nTemporalPyGTransform\\\\nThe TemporalPyGTransform creates a sequence of subgraph batches out of a single batch of data produced by a NeighborLoader or HGTLoader.', 'https://docs.tigergraph.com/pytigergraph/current/contributing/_chunk_4': 'Once your pull request is accepted, it will be merged into the master branch for deployment in the next release of the package.', 'https://docs.tigergraph.com/pytigergraph/current/contributing/_chunk_0': \"Contributing to pyTigerGraph\\\\nDownload the Repository\\\\nYou can download the repository from GitHub by:\\\\ngit clone https://github.com/tigergraph/pyTigerGraph.git\\\\nInstall the Local Version of the Package\\\\nOnce downloaded, you can install the local version of the package (without GDS support) by:\\\\npip install '.'\\\\nWithin the\\\\npyTigerGraph\\\\ndirectory, you can run the following command to install the GDS version of the package:\\\\npip install '.[gds]'\\\\nMake Changes\\\\nMake the changes you want to make to the package.\", 'https://docs.tigergraph.com/pytigergraph/current/contributing/_chunk_3': 'The pull request will be evaluated by our team and must have three parts:\\\\n1) Unit tests written in the\\\\ntests\\\\ndirectory.\\\\n2) Documentation in the form of docstrings and comments in the code.\\\\n3) A short writeup of the changes you are making and why (this is what will be displayed on the pull request page).\\\\nAfter that, we may reach out with further changes or suggestions to improve your pull request.', 'https://docs.tigergraph.com/pytigergraph/current/contributing/_chunk_1': 'Make sure to document your changes in the code with docstrings and comments.', 'https://docs.tigergraph.com/pytigergraph/current/contributing/_chunk_5': 'Steps 1 and 2 may be skipped when making changes such as fixing typos or improving documentation, although existing unittests will be ran against your changes to ensure they pass.', 'https://docs.tigergraph.com/pytigergraph/current/gds/pyg_transforms_chunk_1': 'It assumes that there are datetime attributes on vertices and edges.', 'https://docs.tigergraph.com/pytigergraph/current/gds/pyg_transforms_chunk_2': 'If vertex attributes change over time, children vertex attributes are moved to the appropriate parent, and then the children are removed from the graph.\\\\n_init_\\\\n()\\\\ninit\\\\n(vertex_start_attrs: dict, vertex_end_attrs: dict, edge_start_attrs: dict, edge_end_attrs: dict, start_dt: int, end_dt: int, feature_transforms: dict, timestep: int = 86400)\\\\nInstantiate a TemporalPyGTransform.\\\\nParameters:\\\\nvertex_start_attrs (str, dict)\\\\n: If using on a homogeneous graph, string of the attribute storing the timestamp of when a vertex becomes valid to include.\\\\nIf using on a heterogenous graph, dictionary that describes the attribute storing the timestamp of when a vertex becomes a valid vertex to include in the graph.\\\\nIn the format of {\\\\\"VERTEX_TYPE\\\\\": \\\\\"attribute_name\\\\\"}.\\\\nvertex_end_attrs (str, dict)\\\\n: If using on a homogeneous graph, string of the attribute storing the timestamp of when a vertex stops being valid to include.\\\\nIf using on a heterogenous graph, dictionary that describes the attribute storing the timestamp of when a vertex stops 
being a valid vertex to include in the graph.\\\\nIn the format of {\\\\\"VERTEX_TYPE\\\\\": \\\\\"attribute_name\\\\\"}\\\\nedge_start_attrs (str, dict)\\\\n: If using on a homogeneous graph, string of the attribute storing the timestamp of when an edge becomes valid to include.\\\\nIf using on a heterogenous graph, dictionary that describes the attribute storing the timestamp of when an edge becomes a valid edge to include in the graph.\\\\nUses the PyG edge format of (\\\\\"SourceVertexType\\\\\", \\\\\"EdgeName\\\\\", \\\\\"DestinationVertexType\\\\\").\\\\nIn the format of {(\\\\\"SourceVertexType\\\\\", \\\\\"EdgeName\\\\\", \\\\\"DestinationVertexType\\\\\"): \\\\\"attribute_name\\\\\"}.\\\\nedge_end_attrs (str, dict)\\\\n: If using on a homogeneous graph, string of the attribute storing the timestamp of when an edge stops being valid to include.\\\\nIf using on a heterogenous graph, dictionary that describes the attribute storing the timestamp of when an edge stops being a valid edge to include in the graph.\\\\nUses the PyG edge format of (\\\\\"SourceVertexType\\\\\", \\\\\"EdgeName\\\\\", \\\\\"DestinationVertexType\\\\\").\\\\nIn the format of {(\\\\\"SourceVertexType\\\\\", \\\\\"EdgeName\\\\\", \\\\\"DestinationVertexType\\\\\"): \\\\\"attribute_name\\\\\"}\\\\nstart_dt (int)\\\\n: The UNIX epoch time to start generating the sequence of subgraphs.\\\\nend_dt (int)\\\\n: The UNIX epoch time to stop generating the sequence of subgraphs.\\\\nfeature_transforms (dict, optional)\\\\n: Only available on heterogeneous graphs. Moves temporally dynamic features from \\\\\"children\\\\\" vertices to \\\\\"parent\\\\\" vertices when\\\\nmodelling temporal attributes in TigerGraph.\\\\nThe key of the dictionary is the edge to move the attributes from the child type to the parent type, and the value is a list of attributes to move.\\\\nIn the fromat of {(\\\\\"ItemInstance\\\\\", \\\\\"reverse_DESCRIBED_BY\\\\\", \\\\\"Item\\\\\"): [\\\\\"x\\\\\"]}\\\\ntimestep (int, optional)\\\\n: The number of seconds to use in between timesteps.'}}}]\n" - ] - } - ], + "outputs": [], "source": [ "print(resp[\"retrieved\"])" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -427,52 +256,25 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "To get a count of vertices in Python, you can use the following code:\n", - "\n", - "```python\n", - "# Assuming you have a list of vertices\n", - "vertices = [{'@@final_retrieval': {'https://docs.tigergraph.com/pytigergraph/current/core-functions/schema_chunk_2': 'If the value is\\\\nFalse\\\\n, the request will always insert new\\\\nedges and create the necessary vertices with default values for their attributes.\\\\nNote that this parameter does not affect vertices.\\\\nupdateVertexOnly\\\\n: If\\\\nTrue\\\\n, the request will only update existing vertices and not insert new\\\\nvertices.\\\\nReturns:\\\\nThe result of upsert (number of vertices and edges accepted/upserted).\\\\nEndpoint:\\\\nPOST /graph/{graph_name}\\\\nSee\\\\nUpsert data to graph\\\\ngetEndpoints()\\\\ngetEndpoints(builtin: bool = False, dynamic: bool = False, static: bool = False) \\\\u2192 dict\\\\nLists the REST++ endpoints and their parameters.\\\\nParameters:\\\\nbuiltin\\\\n: List the TigerGraph-provided REST++ endpoints.\\\\ndynamic\\\\n: List endpoints for user-installed 
queries.\\\\nstatic\\\\n: List static endpoints.\\\\nIf none of the above arguments are specified, all endpoints are listed.\\\\nEndpoint:\\\\nGET /endpoints/{graph_name}\\\\nSee\\\\nList all endpoints', 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_52': 'Defaults to None.\\\\nbatch_size (int, optional)\\\\n: Number of vertices in each batch.\\\\nDefaults to None.\\\\nnum_batches (int, optional)\\\\n: Number of batches to split the vertices.\\\\nDefaults to 1.\\\\nshuffle (bool, optional)\\\\n: Whether to shuffle the vertices before loading data.\\\\nDefaults to False.\\\\nfilter_by (str, optional)\\\\n: A boolean attribute used to indicate which vertices\\\\ncan be included.', 'https://docs.tigergraph.com/pytigergraph/current/gds/pyg_transforms_chunk_1': 'It assumes that there are datetime attributes on vertices and edges.', 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_93': 'Each boolean attribute\\\\nindicates which part a vertex belongs to.\\\\nUsage:\\\\nA random 60% of vertices will have their attribute\\\\nattr_name\\\\nset to True, and\\\\nothers False.\\\\nattr_name\\\\ncan be any attribute that exists in the database (same below).\\\\nExample:\\\\nconn = TigerGraphConnection(...)\\\\nsplitter = RandomVertexSplitter(conn, timeout, attr_name=0.6)\\\\nsplitter.run()\\\\nA random 60% of vertices will have their attribute \\\\\"attr_name\\\\\" set to True, and a\\\\nrandom 20% of vertices will have their attribute \\\\\"attr_name2\\\\\" set to True.', 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_49': 'If you load all vertices at once (\\\\nnum_batches=1\\\\n),\\\\nthere will be only one batch (of all the vertices) in the iterator.\\\\nYou can access the\\\\ndata\\\\nproperty of the class directly.'}}]\n", - "\n", - "# Get the count of vertices\n", - "num_vertices = len(vertices)\n", - "print(\"Number of vertices:\", num_vertices)\n", - "```\n", - "\n", - "This code will give you the count of vertices in the list of vertices.\n" - ] - } - ], + "outputs": [], "source": [ "print(resp[\"response\"])" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[{'@@final_retrieval': {'https://docs.tigergraph.com/pytigergraph/current/core-functions/schema_chunk_2': 'If the value is\\\\nFalse\\\\n, the request will always insert new\\\\nedges and create the necessary vertices with default values for their attributes.\\\\nNote that this parameter does not affect vertices.\\\\nupdateVertexOnly\\\\n: If\\\\nTrue\\\\n, the request will only update existing vertices and not insert new\\\\nvertices.\\\\nReturns:\\\\nThe result of upsert (number of vertices and edges accepted/upserted).\\\\nEndpoint:\\\\nPOST /graph/{graph_name}\\\\nSee\\\\nUpsert data to graph\\\\ngetEndpoints()\\\\ngetEndpoints(builtin: bool = False, dynamic: bool = False, static: bool = False) \\\\u2192 dict\\\\nLists the REST++ endpoints and their parameters.\\\\nParameters:\\\\nbuiltin\\\\n: List the TigerGraph-provided REST++ endpoints.\\\\ndynamic\\\\n: List endpoints for user-installed queries.\\\\nstatic\\\\n: List static endpoints.\\\\nIf none of the above arguments are specified, all endpoints are listed.\\\\nEndpoint:\\\\nGET /endpoints/{graph_name}\\\\nSee\\\\nList all endpoints', 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_52': 'Defaults to None.\\\\nbatch_size (int, 
optional)\\\\n: Number of vertices in each batch.\\\\nDefaults to None.\\\\nnum_batches (int, optional)\\\\n: Number of batches to split the vertices.\\\\nDefaults to 1.\\\\nshuffle (bool, optional)\\\\n: Whether to shuffle the vertices before loading data.\\\\nDefaults to False.\\\\nfilter_by (str, optional)\\\\n: A boolean attribute used to indicate which vertices\\\\ncan be included.', 'https://docs.tigergraph.com/pytigergraph/current/gds/pyg_transforms_chunk_1': 'It assumes that there are datetime attributes on vertices and edges.', 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_93': 'Each boolean attribute\\\\nindicates which part a vertex belongs to.\\\\nUsage:\\\\nA random 60% of vertices will have their attribute\\\\nattr_name\\\\nset to True, and\\\\nothers False.\\\\nattr_name\\\\ncan be any attribute that exists in the database (same below).\\\\nExample:\\\\nconn = TigerGraphConnection(...)\\\\nsplitter = RandomVertexSplitter(conn, timeout, attr_name=0.6)\\\\nsplitter.run()\\\\nA random 60% of vertices will have their attribute \\\\\"attr_name\\\\\" set to True, and a\\\\nrandom 20% of vertices will have their attribute \\\\\"attr_name2\\\\\" set to True.', 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_49': 'If you load all vertices at once (\\\\nnum_batches=1\\\\n),\\\\nthere will be only one batch (of all the vertices) in the iterator.\\\\nYou can access the\\\\ndata\\\\nproperty of the class directly.'}}]\n" - ] - } - ], + "outputs": [], "source": [ "print(resp[\"retrieved\"])" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -487,37 +289,18 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "To get a count of vertices in Python, you can use the following code:\n", - "\n", - "```python\n", - "data = [{'@@sibling_set': {'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_49': {'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_50': {'distance': '1', 'content': 'If there is\\\\nonly one batch of data to load, it will give you the batch directly instead\\\\nof an iterator, which might make more sense in that case.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_52': {'distance': '3', 'content': 'Defaults to None.\\\\nbatch_size (int, optional)\\\\n: Number of vertices in each batch.\\\\nDefaults to None.\\\\nnum_batches (int, optional)\\\\n: Number of batches to split the vertices.\\\\nDefaults to 1.\\\\nshuffle (bool, optional)\\\\n: Whether to shuffle the vertices before loading data.\\\\nDefaults to False.\\\\nfilter_by (str, optional)\\\\n: A boolean attribute used to indicate which vertices\\\\ncan be included.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_47': {'distance': '-2', 'content': 'Defaults to 300000.\\\\nSee\\\\nthe ML Workbench edge loader tutorial notebook\\\\nfor examples.\\\\nvertexLoader()\\\\nvertexLoader(attributes: Union[list, dict] = None, batch_size: int = None, num_batches: int = 1, shuffle: bool = False, filter_by: str = None, output_format: str = \\\\\"dataframe\\\\\", loader_id: str = None, buffer_size: int = 4, kafka_address: str = None, kafka_max_msg_size: int = 104857600, kafka_num_partitions: int = 1, kafka_replica_factor: int = 1, kafka_retention_ms: int = 60000, 
kafka_auto_del_topic: bool = True, kafka_address_consumer: str = None, kafka_address_producer: str = None, timeout: int = 300000) \\\\u2192 VertexLoader\\\\nReturns a\\\\nVertexLoader\\\\ninstance.\\\\nA\\\\nVertexLoader\\\\ncan load all vertices of a graph in batches.\\\\nIt divides vertices into\\\\nnum_batches\\\\nand returns each batch separately.\\\\nThe boolean attribute provided to\\\\nfilter_by\\\\nindicates which vertices are included.\\\\nIf you need random batches, set\\\\nshuffle\\\\nto True.\\\\nWhen you initialize the loader on a graph for the first time,\\\\nthe initialization might take a minute as it installs the corresponding\\\\nquery to the database.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_49': {'distance': '0', 'content': 'If you load all vertices at once (\\\\nnum_batches=1\\\\n),\\\\nthere will be only one batch (of all the vertices) in the iterator.\\\\nYou can access the\\\\ndata\\\\nproperty of the class directly.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_48': {'distance': '-1', 'content': 'However, the query installation only\\\\nneeds to be done once, so it will take no time when you initialize the loader\\\\non the same graph again.\\\\nThere are two ways to use the data loader:\\\\nIt can be used as an iterable, which means you can loop through\\\\nit to get every batch of data.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_46': {'distance': '-3', 'content': 'Defaults to be the same as\\\\nkafkaAddress\\\\n.\\\\ntimeout (int, optional)\\\\n: Timeout value for GSQL queries, in ms.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_51': {'distance': '2', 'content': 'If there are\\\\nmultiple batches of data to load, it will return the loader again.\\\\nParameters:\\\\nattributes (list, optional)\\\\n: Vertex attributes to be included.'}}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_52': {'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_50': {'distance': '-1', 'content': 'If there is\\\\nonly one batch of data to load, it will give you the batch directly instead\\\\nof an iterator, which might make more sense in that case.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_52': {'distance': '3', 'content': 'Defaults to None.\\\\nbatch_size (int, optional)\\\\n: Number of vertices in each batch.\\\\nDefaults to None.\\\\nnum_batches (int, optional)\\\\n: Number of batches to split the vertices.\\\\nDefaults to 1.\\\\nshuffle (bool, optional)\\\\n: Whether to shuffle the vertices before loading data.\\\\nDefaults to False.\\\\nfilter_by (str, optional)\\\\n: A boolean attribute used to indicate which vertices\\\\ncan be included.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_54': {'distance': '2', 'content': 'Only\\\\n\\\\\"dataframe\\\\\" is supported.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_53': {'distance': '1', 'content': 'Defaults to None.\\\\noutput_format (str, optional)\\\\n: Format of the output data of the loader.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_49': {'distance': '-3', 'content': 'If you load all vertices at once (\\\\nnum_batches=1\\\\n),\\\\nthere will be only one batch (of all the vertices) in the iterator.\\\\nYou can access the\\\\ndata\\\\nproperty of the class directly.'}, 
'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_51': {'distance': '1', 'content': 'If there are\\\\nmultiple batches of data to load, it will return the loader again.\\\\nParameters:\\\\nattributes (list, optional)\\\\n: Vertex attributes to be included.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_55': {'distance': '3', 'content': 'Defaults to \\\\\"dataframe\\\\\".\\\\nloader_id (str, optional)\\\\n: An identifier of the loader which can be any string.'}}, 'https://docs.tigergraph.com/pytigergraph/current/core-functions/schema_chunk_2': {'https://docs.tigergraph.com/pytigergraph/current/core-functions/schema_chunk_0': {'distance': '-2', 'content': 'Schema Functions\\\\nThe functions in this page retrieve information about the graph schema.\\\\nAll functions in this module are called as methods on a\\\\nTigerGraphConnection\\\\nobject\\\\n.\\\\ngetSchema()\\\\ngetSchema(udts: bool = True, force: bool = False) \\\\u2192 dict\\\\nRetrieves the schema metadata (of all vertex and edge type and, if not disabled, the\\\\nUser-Defined Type details) of the graph.\\\\nParameters:\\\\nudts\\\\n: If\\\\nTrue\\\\n, the output includes User-Defined Types in the schema details.\\\\nforce\\\\n: If\\\\nTrue\\\\n, retrieves the schema metadata again, otherwise returns a cached copy of\\\\nthe schema metadata (if they were already fetched previously).\\\\nReturns:\\\\nThe schema metadata.\\\\nEndpoint:\\\\nGET /gsqlserver/gsql/schema\\\\nSee\\\\nShow graph schema metadata\\\\nupsertData()\\\\nupsertData(data: Union[str, object], atomic: bool = False, ackAll: bool = False, newVertexOnly: bool = False, vertexMustExist: bool = False, updateVertexOnly: bool = False) \\\\u2192 dict\\\\nUpserts data (vertices and edges) from a JSON file or a file with equivalent object structure.\\\\nParameters:\\\\ndata\\\\n: The data of vertex and edge instances, in a specific format.\\\\natomic\\\\n: The request is an atomic transaction. 
An atomic transaction means that updates to\\\\nthe database contained in the request are all-or-nothing: either all changes are\\\\nsuccessful, or none are successful.\\\\nackAll\\\\n: If\\\\nTrue\\\\n, the request will return after all GPE instances have acknowledged the\\\\nPOST.'}, 'https://docs.tigergraph.com/pytigergraph/current/core-functions/schema_chunk_2': {'distance': '0', 'content': 'If the value is\\\\nFalse\\\\n, the request will always insert new\\\\nedges and create the necessary vertices with default values for their attributes.\\\\nNote that this parameter does not affect vertices.\\\\nupdateVertexOnly\\\\n: If\\\\nTrue\\\\n, the request will only update existing vertices and not insert new\\\\nvertices.\\\\nReturns:\\\\nThe result of upsert (number of vertices and edges accepted/upserted).\\\\nEndpoint:\\\\nPOST /graph/{graph_name}\\\\nSee\\\\nUpsert data to graph\\\\ngetEndpoints()\\\\ngetEndpoints(builtin: bool = False, dynamic: bool = False, static: bool = False) \\\\u2192 dict\\\\nLists the REST++ endpoints and their parameters.\\\\nParameters:\\\\nbuiltin\\\\n: List the TigerGraph-provided REST++ endpoints.\\\\ndynamic\\\\n: List endpoints for user-installed queries.\\\\nstatic\\\\n: List static endpoints.\\\\nIf none of the above arguments are specified, all endpoints are listed.\\\\nEndpoint:\\\\nGET /endpoints/{graph_name}\\\\nSee\\\\nList all endpoints'}, 'https://docs.tigergraph.com/pytigergraph/current/core-functions/schema_chunk_1': {'distance': '-1', 'content': 'Otherwise, the request will return immediately after RESTPP processes the POST.\\\\nnewVertexOnly\\\\n: If\\\\nTrue\\\\n, the request will only insert new vertices and not update existing ones.\\\\nvertexMustExist\\\\n: If\\\\nTrue\\\\n, the request will only insert an edge if both the\\\\nFROM\\\\nand\\\\nTO\\\\nvertices\\\\nof the edge already exist.'}}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_93': {'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_95': {'distance': '2', 'content': 'Example:\\\\nconn = TigerGraphConnection(...)\\\\nsplitter = RandomVertexSplitter(conn, timeout, attr_name=0.6, attr_name2=0.2)\\\\nsplitter.run()\\\\nA random 60% of vertices will have their attribute \\\\\"attr_name\\\\\" set to True, a\\\\nrandom 20% of vertices will have their attribute \\\\\"attr_name2\\\\\" set to True, and\\\\nanother random 20% of vertices will have their attribute \\\\\"attr_name3\\\\\" set to True.\\\\nThe three parts are disjoint.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_90': {'distance': '-3', 'content': 'Defaults to be the same as\\\\nkafkaAddress\\\\n.\\\\nkafka_address_producer (str, optional)\\\\n: Address of the kafka broker that a producer\\\\nshould use.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_92': {'distance': '-1', 'content': 'Defaults to 300000.\\\\nSee\\\\nthe ML Workbench tutorial notebook for graph loaders\\\\nfor examples.\\\\nfeaturizer()\\\\nfeaturizer() \\\\u2192 Featurizer\\\\nGet a featurizer.\\\\nReturns:\\\\nFeaturizer\\\\nvertexSplitter()\\\\nvertexSplitter(timeout: int = 600000)\\\\nGet a vertex splitter that splits vertices into at most 3 parts randomly.\\\\nThe split results are stored in the provided vertex attributes.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_96': {'distance': '3', 'content': 'Example:\\\\nconn = TigerGraphConnection(...)\\\\nsplitter = RandomVertexSplitter(conn, 
timeout, attr_name=0.6, attr_name2=0.2, attr_name3=0.2)\\\\nsplitter.run()\\\\nParameter:\\\\ntimeout (int, optional)\\\\n: Timeout value for the operation.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_93': {'distance': '0', 'content': 'Each boolean attribute\\\\nindicates which part a vertex belongs to.\\\\nUsage:\\\\nA random 60% of vertices will have their attribute\\\\nattr_name\\\\nset to True, and\\\\nothers False.\\\\nattr_name\\\\ncan be any attribute that exists in the database (same below).\\\\nExample:\\\\nconn = TigerGraphConnection(...)\\\\nsplitter = RandomVertexSplitter(conn, timeout, attr_name=0.6)\\\\nsplitter.run()\\\\nA random 60% of vertices will have their attribute \\\\\"attr_name\\\\\" set to True, and a\\\\nrandom 20% of vertices will have their attribute \\\\\"attr_name2\\\\\" set to True.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_91': {'distance': '-2', 'content': 'Defaults to be the same as\\\\nkafkaAddress\\\\n.\\\\ntimeout (int, optional)\\\\n: Timeout value for GSQL queries, in ms.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_94': {'distance': '1', 'content': 'The two\\\\nparts are disjoint.'}}, 'https://docs.tigergraph.com/pytigergraph/current/gds/pyg_transforms_chunk_1': {'https://docs.tigergraph.com/pytigergraph/current/gds/pyg_transforms_chunk_3': {'distance': '2', 'content': 'Defaults to 86400 seconds (1 day).\\\\ncall\\\\n()\\\\ncall\\\\n(data) \\\\u2192 list\\\\nPerform the transform.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/pyg_transforms_chunk_0': {'distance': '-1', 'content': 'PyTorch Geometric Transforms\\\\nTemporalPyGTransform\\\\nThe TemporalPyGTransform creates a sequence of subgraph batches out of a single batch of data produced by a NeighborLoader or HGTLoader.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/pyg_transforms_chunk_1': {'distance': '0', 'content': 'It assumes that there are datetime attributes on vertices and edges.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/pyg_transforms_chunk_2': {'distance': '1', 'content': 'If vertex attributes change over time, children vertex attributes are moved to the appropriate parent, and then the children are removed from the graph.\\\\n_init_\\\\n()\\\\ninit\\\\n(vertex_start_attrs: dict, vertex_end_attrs: dict, edge_start_attrs: dict, edge_end_attrs: dict, start_dt: int, end_dt: int, feature_transforms: dict, timestep: int = 86400)\\\\nInstantiate a TemporalPyGTransform.\\\\nParameters:\\\\nvertex_start_attrs (str, dict)\\\\n: If using on a homogeneous graph, string of the attribute storing the timestamp of when a vertex becomes valid to include.\\\\nIf using on a heterogenous graph, dictionary that describes the attribute storing the timestamp of when a vertex becomes a valid vertex to include in the graph.\\\\nIn the format of {\\\\\"VERTEX_TYPE\\\\\": \\\\\"attribute_name\\\\\"}.\\\\nvertex_end_attrs (str, dict)\\\\n: If using on a homogeneous graph, string of the attribute storing the timestamp of when a vertex stops being valid to include.\\\\nIf using on a heterogenous graph, dictionary that describes the attribute storing the timestamp of when a vertex stops being a valid vertex to include in the graph.\\\\nIn the format of {\\\\\"VERTEX_TYPE\\\\\": \\\\\"attribute_name\\\\\"}\\\\nedge_start_attrs (str, dict)\\\\n: If using on a homogeneous graph, string of the attribute storing the timestamp of when an edge becomes valid to include.\\\\nIf 
using on a heterogenous graph, dictionary that describes the attribute storing the timestamp of when an edge becomes a valid edge to include in the graph.\\\\nUses the PyG edge format of (\\\\\"SourceVertexType\\\\\", \\\\\"EdgeName\\\\\", \\\\\"DestinationVertexType\\\\\").\\\\nIn the format of {(\\\\\"SourceVertexType\\\\\", \\\\\"EdgeName\\\\\", \\\\\"DestinationVertexType\\\\\"): \\\\\"attribute_name\\\\\"}.\\\\nedge_end_attrs (str, dict)\\\\n: If using on a homogeneous graph, string of the attribute storing the timestamp of when an edge stops being valid to include.\\\\nIf using on a heterogenous graph, dictionary that describes the attribute storing the timestamp of when an edge stops being a valid edge to include in the graph.\\\\nUses the PyG edge format of (\\\\\"SourceVertexType\\\\\", \\\\\"EdgeName\\\\\", \\\\\"DestinationVertexType\\\\\").\\\\nIn the format of {(\\\\\"SourceVertexType\\\\\", \\\\\"EdgeName\\\\\", \\\\\"DestinationVertexType\\\\\"): \\\\\"attribute_name\\\\\"}\\\\nstart_dt (int)\\\\n: The UNIX epoch time to start generating the sequence of subgraphs.\\\\nend_dt (int)\\\\n: The UNIX epoch time to stop generating the sequence of subgraphs.\\\\nfeature_transforms (dict, optional)\\\\n: Only available on heterogeneous graphs. Moves temporally dynamic features from \\\\\"children\\\\\" vertices to \\\\\"parent\\\\\" vertices when\\\\nmodelling temporal attributes in TigerGraph.\\\\nThe key of the dictionary is the edge to move the attributes from the child type to the parent type, and the value is a list of attributes to move.\\\\nIn the fromat of {(\\\\\"ItemInstance\\\\\", \\\\\"reverse_DESCRIBED_BY\\\\\", \\\\\"Item\\\\\"): [\\\\\"x\\\\\"]}\\\\ntimestep (int, optional)\\\\n: The number of seconds to use in between timesteps.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/pyg_transforms_chunk_4': {'distance': '3', 'content': 'Returns a list of PyTorch Geometric data objects, a sequence of snapshots in time of the graph.\\\\nEdges are\n" - ] - } - ], + "outputs": [], "source": [ "print(resp[\"response\"])" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[{'@@sibling_set': {'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_49': {'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_50': {'distance': '1', 'content': 'If there is\\\\nonly one batch of data to load, it will give you the batch directly instead\\\\nof an iterator, which might make more sense in that case.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_52': {'distance': '3', 'content': 'Defaults to None.\\\\nbatch_size (int, optional)\\\\n: Number of vertices in each batch.\\\\nDefaults to None.\\\\nnum_batches (int, optional)\\\\n: Number of batches to split the vertices.\\\\nDefaults to 1.\\\\nshuffle (bool, optional)\\\\n: Whether to shuffle the vertices before loading data.\\\\nDefaults to False.\\\\nfilter_by (str, optional)\\\\n: A boolean attribute used to indicate which vertices\\\\ncan be included.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_47': {'distance': '-2', 'content': 'Defaults to 300000.\\\\nSee\\\\nthe ML Workbench edge loader tutorial notebook\\\\nfor examples.\\\\nvertexLoader()\\\\nvertexLoader(attributes: Union[list, dict] = None, batch_size: int = None, num_batches: int = 1, shuffle: bool = False, filter_by: str = None, 
output_format: str = \\\\\"dataframe\\\\\", loader_id: str = None, buffer_size: int = 4, kafka_address: str = None, kafka_max_msg_size: int = 104857600, kafka_num_partitions: int = 1, kafka_replica_factor: int = 1, kafka_retention_ms: int = 60000, kafka_auto_del_topic: bool = True, kafka_address_consumer: str = None, kafka_address_producer: str = None, timeout: int = 300000) \\\\u2192 VertexLoader\\\\nReturns a\\\\nVertexLoader\\\\ninstance.\\\\nA\\\\nVertexLoader\\\\ncan load all vertices of a graph in batches.\\\\nIt divides vertices into\\\\nnum_batches\\\\nand returns each batch separately.\\\\nThe boolean attribute provided to\\\\nfilter_by\\\\nindicates which vertices are included.\\\\nIf you need random batches, set\\\\nshuffle\\\\nto True.\\\\nWhen you initialize the loader on a graph for the first time,\\\\nthe initialization might take a minute as it installs the corresponding\\\\nquery to the database.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_49': {'distance': '0', 'content': 'If you load all vertices at once (\\\\nnum_batches=1\\\\n),\\\\nthere will be only one batch (of all the vertices) in the iterator.\\\\nYou can access the\\\\ndata\\\\nproperty of the class directly.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_48': {'distance': '-1', 'content': 'However, the query installation only\\\\nneeds to be done once, so it will take no time when you initialize the loader\\\\non the same graph again.\\\\nThere are two ways to use the data loader:\\\\nIt can be used as an iterable, which means you can loop through\\\\nit to get every batch of data.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_46': {'distance': '-3', 'content': 'Defaults to be the same as\\\\nkafkaAddress\\\\n.\\\\ntimeout (int, optional)\\\\n: Timeout value for GSQL queries, in ms.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_51': {'distance': '2', 'content': 'If there are\\\\nmultiple batches of data to load, it will return the loader again.\\\\nParameters:\\\\nattributes (list, optional)\\\\n: Vertex attributes to be included.'}}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_52': {'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_50': {'distance': '-1', 'content': 'If there is\\\\nonly one batch of data to load, it will give you the batch directly instead\\\\nof an iterator, which might make more sense in that case.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_52': {'distance': '3', 'content': 'Defaults to None.\\\\nbatch_size (int, optional)\\\\n: Number of vertices in each batch.\\\\nDefaults to None.\\\\nnum_batches (int, optional)\\\\n: Number of batches to split the vertices.\\\\nDefaults to 1.\\\\nshuffle (bool, optional)\\\\n: Whether to shuffle the vertices before loading data.\\\\nDefaults to False.\\\\nfilter_by (str, optional)\\\\n: A boolean attribute used to indicate which vertices\\\\ncan be included.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_54': {'distance': '2', 'content': 'Only\\\\n\\\\\"dataframe\\\\\" is supported.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_53': {'distance': '1', 'content': 'Defaults to None.\\\\noutput_format (str, optional)\\\\n: Format of the output data of the loader.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_49': 
{'distance': '-3', 'content': 'If you load all vertices at once (\\\\nnum_batches=1\\\\n),\\\\nthere will be only one batch (of all the vertices) in the iterator.\\\\nYou can access the\\\\ndata\\\\nproperty of the class directly.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_51': {'distance': '1', 'content': 'If there are\\\\nmultiple batches of data to load, it will return the loader again.\\\\nParameters:\\\\nattributes (list, optional)\\\\n: Vertex attributes to be included.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_55': {'distance': '3', 'content': 'Defaults to \\\\\"dataframe\\\\\".\\\\nloader_id (str, optional)\\\\n: An identifier of the loader which can be any string.'}}, 'https://docs.tigergraph.com/pytigergraph/current/core-functions/schema_chunk_2': {'https://docs.tigergraph.com/pytigergraph/current/core-functions/schema_chunk_0': {'distance': '-2', 'content': 'Schema Functions\\\\nThe functions in this page retrieve information about the graph schema.\\\\nAll functions in this module are called as methods on a\\\\nTigerGraphConnection\\\\nobject\\\\n.\\\\ngetSchema()\\\\ngetSchema(udts: bool = True, force: bool = False) \\\\u2192 dict\\\\nRetrieves the schema metadata (of all vertex and edge type and, if not disabled, the\\\\nUser-Defined Type details) of the graph.\\\\nParameters:\\\\nudts\\\\n: If\\\\nTrue\\\\n, the output includes User-Defined Types in the schema details.\\\\nforce\\\\n: If\\\\nTrue\\\\n, retrieves the schema metadata again, otherwise returns a cached copy of\\\\nthe schema metadata (if they were already fetched previously).\\\\nReturns:\\\\nThe schema metadata.\\\\nEndpoint:\\\\nGET /gsqlserver/gsql/schema\\\\nSee\\\\nShow graph schema metadata\\\\nupsertData()\\\\nupsertData(data: Union[str, object], atomic: bool = False, ackAll: bool = False, newVertexOnly: bool = False, vertexMustExist: bool = False, updateVertexOnly: bool = False) \\\\u2192 dict\\\\nUpserts data (vertices and edges) from a JSON file or a file with equivalent object structure.\\\\nParameters:\\\\ndata\\\\n: The data of vertex and edge instances, in a specific format.\\\\natomic\\\\n: The request is an atomic transaction. 
An atomic transaction means that updates to\\\\nthe database contained in the request are all-or-nothing: either all changes are\\\\nsuccessful, or none are successful.\\\\nackAll\\\\n: If\\\\nTrue\\\\n, the request will return after all GPE instances have acknowledged the\\\\nPOST.'}, 'https://docs.tigergraph.com/pytigergraph/current/core-functions/schema_chunk_2': {'distance': '0', 'content': 'If the value is\\\\nFalse\\\\n, the request will always insert new\\\\nedges and create the necessary vertices with default values for their attributes.\\\\nNote that this parameter does not affect vertices.\\\\nupdateVertexOnly\\\\n: If\\\\nTrue\\\\n, the request will only update existing vertices and not insert new\\\\nvertices.\\\\nReturns:\\\\nThe result of upsert (number of vertices and edges accepted/upserted).\\\\nEndpoint:\\\\nPOST /graph/{graph_name}\\\\nSee\\\\nUpsert data to graph\\\\ngetEndpoints()\\\\ngetEndpoints(builtin: bool = False, dynamic: bool = False, static: bool = False) \\\\u2192 dict\\\\nLists the REST++ endpoints and their parameters.\\\\nParameters:\\\\nbuiltin\\\\n: List the TigerGraph-provided REST++ endpoints.\\\\ndynamic\\\\n: List endpoints for user-installed queries.\\\\nstatic\\\\n: List static endpoints.\\\\nIf none of the above arguments are specified, all endpoints are listed.\\\\nEndpoint:\\\\nGET /endpoints/{graph_name}\\\\nSee\\\\nList all endpoints'}, 'https://docs.tigergraph.com/pytigergraph/current/core-functions/schema_chunk_1': {'distance': '-1', 'content': 'Otherwise, the request will return immediately after RESTPP processes the POST.\\\\nnewVertexOnly\\\\n: If\\\\nTrue\\\\n, the request will only insert new vertices and not update existing ones.\\\\nvertexMustExist\\\\n: If\\\\nTrue\\\\n, the request will only insert an edge if both the\\\\nFROM\\\\nand\\\\nTO\\\\nvertices\\\\nof the edge already exist.'}}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_93': {'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_95': {'distance': '2', 'content': 'Example:\\\\nconn = TigerGraphConnection(...)\\\\nsplitter = RandomVertexSplitter(conn, timeout, attr_name=0.6, attr_name2=0.2)\\\\nsplitter.run()\\\\nA random 60% of vertices will have their attribute \\\\\"attr_name\\\\\" set to True, a\\\\nrandom 20% of vertices will have their attribute \\\\\"attr_name2\\\\\" set to True, and\\\\nanother random 20% of vertices will have their attribute \\\\\"attr_name3\\\\\" set to True.\\\\nThe three parts are disjoint.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_90': {'distance': '-3', 'content': 'Defaults to be the same as\\\\nkafkaAddress\\\\n.\\\\nkafka_address_producer (str, optional)\\\\n: Address of the kafka broker that a producer\\\\nshould use.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_92': {'distance': '-1', 'content': 'Defaults to 300000.\\\\nSee\\\\nthe ML Workbench tutorial notebook for graph loaders\\\\nfor examples.\\\\nfeaturizer()\\\\nfeaturizer() \\\\u2192 Featurizer\\\\nGet a featurizer.\\\\nReturns:\\\\nFeaturizer\\\\nvertexSplitter()\\\\nvertexSplitter(timeout: int = 600000)\\\\nGet a vertex splitter that splits vertices into at most 3 parts randomly.\\\\nThe split results are stored in the provided vertex attributes.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_96': {'distance': '3', 'content': 'Example:\\\\nconn = TigerGraphConnection(...)\\\\nsplitter = RandomVertexSplitter(conn, 
timeout, attr_name=0.6, attr_name2=0.2, attr_name3=0.2)\\\\nsplitter.run()\\\\nParameter:\\\\ntimeout (int, optional)\\\\n: Timeout value for the operation.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_93': {'distance': '0', 'content': 'Each boolean attribute\\\\nindicates which part a vertex belongs to.\\\\nUsage:\\\\nA random 60% of vertices will have their attribute\\\\nattr_name\\\\nset to True, and\\\\nothers False.\\\\nattr_name\\\\ncan be any attribute that exists in the database (same below).\\\\nExample:\\\\nconn = TigerGraphConnection(...)\\\\nsplitter = RandomVertexSplitter(conn, timeout, attr_name=0.6)\\\\nsplitter.run()\\\\nA random 60% of vertices will have their attribute \\\\\"attr_name\\\\\" set to True, and a\\\\nrandom 20% of vertices will have their attribute \\\\\"attr_name2\\\\\" set to True.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_91': {'distance': '-2', 'content': 'Defaults to be the same as\\\\nkafkaAddress\\\\n.\\\\ntimeout (int, optional)\\\\n: Timeout value for GSQL queries, in ms.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/factory-functions_chunk_94': {'distance': '1', 'content': 'The two\\\\nparts are disjoint.'}}, 'https://docs.tigergraph.com/pytigergraph/current/gds/pyg_transforms_chunk_1': {'https://docs.tigergraph.com/pytigergraph/current/gds/pyg_transforms_chunk_3': {'distance': '2', 'content': 'Defaults to 86400 seconds (1 day).\\\\ncall\\\\n()\\\\ncall\\\\n(data) \\\\u2192 list\\\\nPerform the transform.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/pyg_transforms_chunk_0': {'distance': '-1', 'content': 'PyTorch Geometric Transforms\\\\nTemporalPyGTransform\\\\nThe TemporalPyGTransform creates a sequence of subgraph batches out of a single batch of data produced by a NeighborLoader or HGTLoader.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/pyg_transforms_chunk_1': {'distance': '0', 'content': 'It assumes that there are datetime attributes on vertices and edges.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/pyg_transforms_chunk_2': {'distance': '1', 'content': 'If vertex attributes change over time, children vertex attributes are moved to the appropriate parent, and then the children are removed from the graph.\\\\n_init_\\\\n()\\\\ninit\\\\n(vertex_start_attrs: dict, vertex_end_attrs: dict, edge_start_attrs: dict, edge_end_attrs: dict, start_dt: int, end_dt: int, feature_transforms: dict, timestep: int = 86400)\\\\nInstantiate a TemporalPyGTransform.\\\\nParameters:\\\\nvertex_start_attrs (str, dict)\\\\n: If using on a homogeneous graph, string of the attribute storing the timestamp of when a vertex becomes valid to include.\\\\nIf using on a heterogenous graph, dictionary that describes the attribute storing the timestamp of when a vertex becomes a valid vertex to include in the graph.\\\\nIn the format of {\\\\\"VERTEX_TYPE\\\\\": \\\\\"attribute_name\\\\\"}.\\\\nvertex_end_attrs (str, dict)\\\\n: If using on a homogeneous graph, string of the attribute storing the timestamp of when a vertex stops being valid to include.\\\\nIf using on a heterogenous graph, dictionary that describes the attribute storing the timestamp of when a vertex stops being a valid vertex to include in the graph.\\\\nIn the format of {\\\\\"VERTEX_TYPE\\\\\": \\\\\"attribute_name\\\\\"}\\\\nedge_start_attrs (str, dict)\\\\n: If using on a homogeneous graph, string of the attribute storing the timestamp of when an edge becomes valid to include.\\\\nIf 
using on a heterogenous graph, dictionary that describes the attribute storing the timestamp of when an edge becomes a valid edge to include in the graph.\\\\nUses the PyG edge format of (\\\\\"SourceVertexType\\\\\", \\\\\"EdgeName\\\\\", \\\\\"DestinationVertexType\\\\\").\\\\nIn the format of {(\\\\\"SourceVertexType\\\\\", \\\\\"EdgeName\\\\\", \\\\\"DestinationVertexType\\\\\"): \\\\\"attribute_name\\\\\"}.\\\\nedge_end_attrs (str, dict)\\\\n: If using on a homogeneous graph, string of the attribute storing the timestamp of when an edge stops being valid to include.\\\\nIf using on a heterogenous graph, dictionary that describes the attribute storing the timestamp of when an edge stops being a valid edge to include in the graph.\\\\nUses the PyG edge format of (\\\\\"SourceVertexType\\\\\", \\\\\"EdgeName\\\\\", \\\\\"DestinationVertexType\\\\\").\\\\nIn the format of {(\\\\\"SourceVertexType\\\\\", \\\\\"EdgeName\\\\\", \\\\\"DestinationVertexType\\\\\"): \\\\\"attribute_name\\\\\"}\\\\nstart_dt (int)\\\\n: The UNIX epoch time to start generating the sequence of subgraphs.\\\\nend_dt (int)\\\\n: The UNIX epoch time to stop generating the sequence of subgraphs.\\\\nfeature_transforms (dict, optional)\\\\n: Only available on heterogeneous graphs. Moves temporally dynamic features from \\\\\"children\\\\\" vertices to \\\\\"parent\\\\\" vertices when\\\\nmodelling temporal attributes in TigerGraph.\\\\nThe key of the dictionary is the edge to move the attributes from the child type to the parent type, and the value is a list of attributes to move.\\\\nIn the fromat of {(\\\\\"ItemInstance\\\\\", \\\\\"reverse_DESCRIBED_BY\\\\\", \\\\\"Item\\\\\"): [\\\\\"x\\\\\"]}\\\\ntimestep (int, optional)\\\\n: The number of seconds to use in between timesteps.'}, 'https://docs.tigergraph.com/pytigergraph/current/gds/pyg_transforms_chunk_4': {'distance': '3', 'content': 'Returns a list of PyTorch Geometric data objects, a sequence of snapshots in time of the graph.\\\\nEdges are removed between vertices that do not have connections at the given time.'}}}}]\n" - ] - } - ], + "outputs": [], "source": [ "print(resp[\"retrieved\"])" ] diff --git a/copilot/docs/notebooks/SupportAIDocIngestion.ipynb b/copilot/docs/notebooks/SupportAIDocIngestion.ipynb deleted file mode 100644 index 7a380db5..00000000 --- a/copilot/docs/notebooks/SupportAIDocIngestion.ipynb +++ /dev/null @@ -1,323 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "from pyTigerGraph import TigerGraphConnection\n", - "from dotenv import load_dotenv\n", - "\n", - "load_dotenv()\n", - "# We first create a connection to the database\n", - "host = os.environ[\"HOST\"]\n", - "username = os.getenv(\"USERNAME\", \"tigergraph\")\n", - "password = os.getenv(\"PASS\", \"tigergraph\")\n", - "conn = TigerGraphConnection(\n", - " host=host,\n", - " username=username,\n", - " password=password,\n", - ")\n", - "# conn.getToken()\n", - "\n", - "# And then add CoPilot's address to the connection. 
This address\n", - "# is the host's address where the CoPilot container is running.\n", - "# conn.ai.configureCoPilotHost(\"http://localhost:8000\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "conn.gsql(\"\"\"CREATE GRAPH SupportAIDocIngestion()\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'l11vnumq77c33f0aa2ss5m0th5hqdj14'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "conn.graphname = \"SupportAIDocIngestion\"\n", - "conn.getToken()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "conn.ai.configureCoPilotHost(\"http://localhost:8000\")\n", - "# Create Graph Schema\n", - "# Install GSQL queries\n", - "# conn.ai.initializeSupportAI()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "access = os.environ[\"AWS_ACCESS_KEY_ID\"]\n", - "sec = os.environ[\"AWS_SECRET_ACCESS_KEY\"]\n", - "res = conn.ai.createDocumentIngest(\n", - " data_source=\"s3\",\n", - " data_source_config={\"aws_access_key\": access, \"aws_secret_key\": sec},\n", - " loader_config={\"doc_id_field\": \"url\", \"content_field\": \"content\"},\n", - " file_format=\"json\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "conn.ai.runDocumentIngest(res[\"load_job_id\"], res[\"data_source_id\"], \"s3://tg-documentation/pytg_current/pytg_current.jsonl\")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'status': 'submitted'}" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "conn.ai.forceConsistencyUpdate(method=\"supportai\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "query = \"How do I get a count of vertices in Python?\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### HNSW Index Overlap in Graph" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "conn.ai.searchDocuments(query,\n", - " method=\"hnswoverlap\",\n", - " method_parameters = {\"indices\": [\"Document\", \"DocumentChunk\", \"Entity\", \"Relationship\"],\n", - " \"top_k\": 2,\n", - " \"num_hops\": 2,\n", - " \"num_seen_min\": 2})" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Document Chunk Vector Search" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "conn.ai.searchDocuments(query,\n", - " method=\"vdb\",\n", - " method_parameters={\"index\": \"DocumentChunk\",\n", - " \"top_k\": 5,\n", - " \"withHyDE\": False})" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Sibling Document Chunk Vector Search" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "conn.ai.searchDocuments(query,\n", - " method=\"sibling\",\n", - " method_parameters={\"index\": \"DocumentChunk\",\n", - " \"top_k\": 5,\n", - " \"lookahead\": 3,\n", - " \"lookback\": 3,\n", - " \"withHyDE\": False})" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 
Comparing LLM Generated Responses\n", - "\n", - "TigerGraph CoPilot provides a way to generate the response to the user's query using a LLM, based on the search results from the methods above. You can compare the responses generated by the LLM for each of the search methods to see which one is the most relevant to the user's query. In this example, we can see that the HNSW Overlap method generates the most relevant response to the user's query. While none of the responses were wrong, the HNSW Overlap method generated the most relevant response to the user's query, by suggesting to use the `getVertexCount()` function to get the number of vertices in the graph." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "resp = conn.ai.answerQuestion(query,\n", - " method=\"hnswoverlap\",\n", - " method_parameters = {\"indices\": [\"Document\", \"DocumentChunk\", \"Entity\", \"Relationship\"],\n", - " \"top_k\": 2,\n", - " \"num_hops\": 2,\n", - " \"num_seen_min\": 2})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(resp[\"response\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(resp[\"retrieved\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "resp = conn.ai.answerQuestion(query,\n", - " method=\"vdb\",\n", - " method_parameters={\"index\": \"DocumentChunk\",\n", - " \"top_k\": 5,\n", - " \"withHyDE\": False})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(resp[\"response\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(resp[\"retrieved\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "resp = conn.ai.answerQuestion(query,\n", - " method=\"sibling\",\n", - " method_parameters={\"index\": \"DocumentChunk\",\n", - " \"top_k\": 5,\n", - " \"lookahead\": 3,\n", - " \"lookback\": 3,\n", - " \"withHyDE\": False})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(resp[\"response\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(resp[\"retrieved\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "pytg_dev", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From bfd49bd3b794aaca97431002e93f38973b3d972e Mon Sep 17 00:00:00 2001 From: Lu Zhou Date: Mon, 19 Aug 2024 15:18:13 -0700 Subject: [PATCH 63/91] modify raise_for_status --- eventual-consistency-service/app/graphrag/util.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index fba184ee..2b237f39 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ 
b/eventual-consistency-service/app/graphrag/util.py @@ -206,7 +206,8 @@ async def upsert_vertex( f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers ) - res.raise_for_status() + res.raise_for_status() + async def check_vertex_exists(conn, v_id: str): From 3856ff79ae3279248be71db8b1a3b9554400043d Mon Sep 17 00:00:00 2001 From: Lu Zhou Date: Mon, 19 Aug 2024 15:34:43 -0700 Subject: [PATCH 64/91] clean up code --- eventual-consistency-service/app/graphrag/workers.py | 2 +- .../app/supportai/supportai_init.py | 6 +++--- eventual-consistency-service/app/supportai/util.py | 5 +---- eventual-consistency-service/app/supportai/workers.py | 2 +- 4 files changed, 6 insertions(+), 9 deletions(-) diff --git a/eventual-consistency-service/app/graphrag/workers.py b/eventual-consistency-service/app/graphrag/workers.py index 755b1085..39ed0e0d 100644 --- a/eventual-consistency-service/app/graphrag/workers.py +++ b/eventual-consistency-service/app/graphrag/workers.py @@ -78,7 +78,7 @@ async def chunk_doc( # send chunks to be embedded logger.info("chunk writes to embed_chan") - await embed_chan.put((v_id, chunk, "DocumentChunk")) + await embed_chan.put((chunk_id, chunk, "DocumentChunk")) # send chunks to have entities extracted logger.info("chunk writes to extract_chan") diff --git a/eventual-consistency-service/app/supportai/supportai_init.py b/eventual-consistency-service/app/supportai/supportai_init.py index ac5f49aa..287d367b 100644 --- a/eventual-consistency-service/app/supportai/supportai_init.py +++ b/eventual-consistency-service/app/supportai/supportai_init.py @@ -76,8 +76,8 @@ async def chunk_docs( doc_task = [] async with asyncio.TaskGroup() as sp: async for content in docs_chan: - v_id = content["v_id"] - txt = content["attributes"]["text"] + # v_id = content["v_id"] + # txt = content["attributes"]["text"] logger.info("chunk writes to extract") # await embed_chan.put((v_id, txt, "Document")) @@ -170,7 +170,7 @@ async def extract( async def run( graphname: str, conn: TigerGraphConnection, - upsert_limit=10 + upsert_limit=100 ): """ Set up SupportAI: diff --git a/eventual-consistency-service/app/supportai/util.py b/eventual-consistency-service/app/supportai/util.py index 579463a8..133dbd3a 100644 --- a/eventual-consistency-service/app/supportai/util.py +++ b/eventual-consistency-service/app/supportai/util.py @@ -26,22 +26,19 @@ logger = logging.getLogger(__name__) http_timeout = httpx.Timeout(15.0) -tg_sem = asyncio.Semaphore(10) +tg_sem = asyncio.Semaphore(100) async def install_queries( requried_queries: list[str], conn: TigerGraphConnection, ): # queries that are currently installed - logger.info("Fetching currently installed queries...") installed_queries = [q.split("/")[-1] for q in conn.getEndpoints(dynamic=True)] - logger.info(f"Installed queries: {installed_queries}") # doesn't need to be parallel since tg only does it one at a time for q in requried_queries: # only install n queries at a time (n=n_workers) q_name = q.split("/")[-1] - logger.info(f"Processing query: {q_name}") # if the query is not installed, install it if q_name not in installed_queries: logger.info(f"Query '{q_name}' not found in installed queries. 
Attempting to install...") diff --git a/eventual-consistency-service/app/supportai/workers.py b/eventual-consistency-service/app/supportai/workers.py index 74b18658..bc9f4ed7 100644 --- a/eventual-consistency-service/app/supportai/workers.py +++ b/eventual-consistency-service/app/supportai/workers.py @@ -80,7 +80,7 @@ async def chunk_doc( # send chunks to be embedded logger.info("chunk writes to embed_chan") - await embed_chan.put((v_id, chunk, "DocumentChunk")) + await embed_chan.put((chunk_id, chunk, "DocumentChunk")) # send chunks to have entities extracted logger.info("chunk writes to extract_chan") From 1c1a8930ffed075ee45c09779c09b0d114ea8a4e Mon Sep 17 00:00:00 2001 From: Lu Zhou Date: Mon, 19 Aug 2024 17:43:24 -0700 Subject: [PATCH 65/91] set drop_old to false and clean up comment out code --- eventual-consistency-service/app/main.py | 13 ++------- .../app/supportai/util.py | 2 +- .../app/supportai/workers.py | 29 +++++-------------- 3 files changed, 10 insertions(+), 34 deletions(-) diff --git a/eventual-consistency-service/app/main.py b/eventual-consistency-service/app/main.py index c018dd43..d0c9afd8 100644 --- a/eventual-consistency-service/app/main.py +++ b/eventual-consistency-service/app/main.py @@ -181,18 +181,9 @@ def consistency_status( ) match ecc_method: case SupportAIMethod.SUPPORTAI: - # if graphname in consistency_checkers: - # ecc = consistency_checkers[graphname] - # ecc_status = json.dumps(ecc.get_status()) - # else: - # start_ecc_in_thread(graphname, conn) background.add_task(supportai.run, graphname, conn) - # ecc_status = ( - # f"Eventual consistency checker started for graph {graphname} {time.ctime()}" - # ) - ecc_status = f"SupportAI initialization on {graphname} {time.ctime()}" - - # LogWriter.info(f"Returning consistency status for {graphname}: {status}") + + ecc_status = f"SupportAI initialization on {graphname} {time.ctime()}" case SupportAIMethod.GRAPHRAG: background.add_task(graphrag.run, graphname, conn) diff --git a/eventual-consistency-service/app/supportai/util.py b/eventual-consistency-service/app/supportai/util.py index 133dbd3a..b6cdc948 100644 --- a/eventual-consistency-service/app/supportai/util.py +++ b/eventual-consistency-service/app/supportai/util.py @@ -113,7 +113,7 @@ async def init( vector_field=milvus_config.get("vector_field", "document_vector"), text_field=milvus_config.get("text_field", "document_content"), vertex_field=vertex_field, - drop_old=True, + drop_old=False, ) LogWriter.info(f"Initializing {name}") diff --git a/eventual-consistency-service/app/supportai/workers.py b/eventual-consistency-service/app/supportai/workers.py index bc9f4ed7..828655e8 100644 --- a/eventual-consistency-service/app/supportai/workers.py +++ b/eventual-consistency-service/app/supportai/workers.py @@ -183,7 +183,6 @@ async def extract( if len(desc[0]): await embed_chan.put((v_id, v_id, "Entity")) else: - # (v_id, content, index_name) await embed_chan.put((v_id, desc[0], "Entity")) await upsert_chan.put( @@ -219,21 +218,13 @@ async def extract( ) for edge in doc.relationships: - # logger.info( - # f"extract writes relates edge to upsert\n{edge.source.id} -({edge.type})-> {edge.target.id}" - # ) # upsert verts first to make sure their ID becomes an attr - # v_id = util.process_id(edge.type) # edge type v_id = edge.type - # if len(v_id) == 0: - # continue - # desc = await get_vert_desc(conn, v_id, node) + if len(v_id) == 0: + continue # embed "Relationship" - # if len(desc[0]): await embed_chan.put((v_id, v_id, "Relationship")) - # else: - # # (v_id, 
content, index_name) - # await embed_chan.put((v_id, desc[0], "Relationship")) + await upsert_chan.put( ( util.upsert_vertex, # func to call @@ -242,7 +233,6 @@ async def extract( "Relationship", # v_type v_id, { # attrs - # "description": desc, "epoch_added": int(time.time()), }, ), @@ -251,7 +241,7 @@ async def extract( v_id = util.process_id(edge.source.id) # source id if len(v_id) == 0: continue - # desc = await get_vert_desc(conn, v_id, edge.source) + desc = await get_vert_desc(conn, v_id, edge.source) await upsert_chan.put( ( util.upsert_vertex, # func to call @@ -260,7 +250,7 @@ async def extract( "Entity", # v_type v_id, { # attrs - # "description": desc, + "description": desc, "epoch_added": int(time.time()), }, ), @@ -269,7 +259,7 @@ async def extract( v_id = util.process_id(edge.target.id) # target id if len(v_id) == 0: continue - # desc = await get_vert_desc(conn, v_id, edge.target) + desc = await get_vert_desc(conn, v_id, edge.target) await upsert_chan.put( ( util.upsert_vertex, # func to call @@ -278,7 +268,7 @@ async def extract( "Entity", # v_type v_id, # src_id { # attrs - # "description": desc, + "description": desc, "epoch_added": int(time.time()), }, ), @@ -295,9 +285,7 @@ async def extract( util.process_id(edge.source.id), # src_id "IS_HEAD_OF", # edgeType "Relationship", # tgt_type - # util.process_id(edge.type), # tgt_id edge.type, # tgt_id - # {"relation_type": edge.type}, # attributes ), ) ) @@ -307,12 +295,10 @@ async def extract( ( conn, "Relationship", # src_type - # util.process_id(edge.type), # src_id edge.type, # src_id "HAS_TAIL", # edgeType "Entity", # tgt_type util.process_id(edge.target.id), # tgt_id - # {"relation_type": edge.type}, # attributes ), ) ) @@ -329,7 +315,6 @@ async def extract( "MENTIONS_RELATIONSHIP", # edge_type "Relationship", # tgt_type edge.type, # tgt_id - # None, # attributes ), ) ) \ No newline at end of file From 0f7d8ebb87ac08fe855d068de996a12b0ddc1a82 Mon Sep 17 00:00:00 2001 From: Parker Erickson Date: Tue, 20 Aug 2024 10:06:28 -0500 Subject: [PATCH 66/91] back off nltk --- copilot/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index 4a5ac3d1..e057eb90 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -101,7 +101,7 @@ minio==7.2.7 multidict==6.0.5 mypy-extensions==1.0.0 nest-asyncio==1.6.0 -nltk==3.8.2 +nltk==3.8.1 numpy==1.26.4 openai==1.40.6 ordered-set==4.1.0 From b0b833eee133b0d5e6d2e3b592fba57450dd2121 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 20 Aug 2024 11:34:22 -0400 Subject: [PATCH 67/91] starting cleanup --- .../app/graphrag/graph_rag.py | 20 +++++++----------- .../app/graphrag/util.py | 21 +++++++++---------- .../app/graphrag/workers.py | 10 ++++++++- 3 files changed, 27 insertions(+), 24 deletions(-) diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py index 54e47f26..a99fc1f7 100644 --- a/eventual-consistency-service/app/graphrag/graph_rag.py +++ b/eventual-consistency-service/app/graphrag/graph_rag.py @@ -1,4 +1,5 @@ import asyncio +import json import logging import time import traceback @@ -129,7 +130,7 @@ async def upsert(upsert_chan: Channel): async def load(conn: TigerGraphConnection): logger.info("Reading from load_q") dd = lambda: defaultdict(dd) # infinite default dict - batch_size = 250 + batch_size = 1000 # while the load q is still open or has contents while 
not load_q.closed() or not load_q.empty(): if load_q.closed(): @@ -170,10 +171,11 @@ async def load(conn: TigerGraphConnection): ] = attrs n_edges += 1 + data = json.dumps(batch) logger.info( - f"Upserting batch size of {size}. ({n_verts} verts | {n_edges} edges)" + f"Upserting batch size of {size}. ({n_verts} verts | {n_edges} edges. {len(data.encode())/1000:,} kb)" ) - await upsert_batch(conn, batch) + await upsert_batch(conn, data) else: await asyncio.sleep(1) @@ -194,7 +196,6 @@ async def embed( async with asyncio.TaskGroup() as grp: # consume task queue async for v_id, content, index_name in embed_chan: - continue embedding_store = index_stores[f"{graphname}_{index_name}"] logger.info(f"Embed to {graphname}_{index_name}: {v_id}") grp.create_task( @@ -350,7 +351,7 @@ async def communities(conn: TigerGraphConnection, comm_process_chan: Channel): ) res.raise_for_status() mod = res.json()["results"][0]["mod"] - logger.info(f"*** mod pass {i+1}: {mod} (diff= {abs(prev_mod - mod)})") + logger.info(f"mod pass {i+1}: {mod} (diff= {abs(prev_mod - mod)})") if mod == 0: break @@ -373,8 +374,6 @@ async def stream_communities( logger.info("streaming communities") headers = make_headers(conn) - # TODO: - # can only do one layer at a time to ensure that every child community has their descriptions # async for i in community_chan: # get the community from that layer @@ -404,9 +403,6 @@ async def stream_communities( await load_q.flush(("FLUSH", None)) await asyncio.sleep(3) - # logger.info("stream_communities done") - # logger.info("closing comm_process_chan") - async def summarize_communities( conn: TigerGraphConnection, @@ -439,8 +435,8 @@ async def run(graphname: str, conn: TigerGraphConnection): extractor, index_stores = await init(conn) init_start = time.perf_counter() - doc_process_switch = False - entity_resolution_switch = False + doc_process_switch = True + entity_resolution_switch = True community_detection_switch = True if doc_process_switch: logger.info("Doc Processing Start") diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index 2c4dce98..5c73008e 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -10,13 +10,8 @@ from graphrag import reusable_channel, workers from pyTigerGraph import TigerGraphConnection -from common.config import ( - doc_processing_config, - embedding_service, - get_llm_service, - llm_config, - milvus_config, -) +from common.config import (doc_processing_config, embedding_service, + get_llm_service, llm_config, milvus_config) from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore from common.extractors import GraphExtractor, LLMEntityRelationshipExtractor from common.extractors.BaseExtractor import BaseExtractor @@ -25,7 +20,7 @@ logger = logging.getLogger(__name__) http_timeout = httpx.Timeout(15.0) -tg_sem = asyncio.Semaphore(100) +tg_sem = asyncio.Semaphore(20) load_q = reusable_channel.ReuseableChannel() @@ -215,12 +210,11 @@ async def upsert_vertex( # logger.error(f"Upsert err: {vertex_type} {vertex_id}\n{e}") -async def upsert_batch(conn: TigerGraphConnection, batch): +async def upsert_batch(conn: TigerGraphConnection, data: str): # logger.info(f"Upsert vertex: {vertex_type} {vertex_id}") # vertex_id = vertex_id.replace(" ", "_") # attrs = map_attrs(attributes) # await load_q.put(('vertices')) - data = json.dumps(batch) headers = make_headers(conn) async with httpx.AsyncClient(timeout=http_timeout) as client: 
async with tg_sem: @@ -237,11 +231,16 @@ async def check_vertex_exists(conn, v_id: str): headers = make_headers(conn) async with httpx.AsyncClient(timeout=http_timeout) as client: async with tg_sem: - res = await client.get( + try: + res = await client.get( f"{conn.restppUrl}/graph/{conn.graphname}/vertices/Entity/{v_id}", headers=headers, ) + except Exception as e: + logger.error(f"Check err:\n{e}") + return {"error": True} + try: res.raise_for_status() return res.json() diff --git a/eventual-consistency-service/app/graphrag/workers.py b/eventual-consistency-service/app/graphrag/workers.py index 1fb5a743..9d8267b2 100644 --- a/eventual-consistency-service/app/graphrag/workers.py +++ b/eventual-consistency-service/app/graphrag/workers.py @@ -1,6 +1,7 @@ import base64 import logging import time +import traceback from urllib.parse import quote_plus import ecc_util @@ -294,7 +295,14 @@ async def resolve_entity( mark as processed """ - results = await emb_store.aget_k_closest(entity_id) + try: + results = await emb_store.aget_k_closest(entity_id) + + except Exception: + err = traceback.format_exc() + logger.error(err) + return + if len(results) == 0: logger.error( f"aget_k_closest should, minimally, return the entity itself.\n{results}" From 2b7099b060936822cc33febc8b17e4d01a919b44 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 20 Aug 2024 12:42:38 -0400 Subject: [PATCH 68/91] starting cleanup --- .../app/graphrag/graph_rag.py | 2 +- .../app/graphrag/util.py | 67 +++---------------- .../app/graphrag/workers.py | 1 + 3 files changed, 13 insertions(+), 57 deletions(-) diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py index a99fc1f7..b69198c4 100644 --- a/eventual-consistency-service/app/graphrag/graph_rag.py +++ b/eventual-consistency-service/app/graphrag/graph_rag.py @@ -352,7 +352,7 @@ async def communities(conn: TigerGraphConnection, comm_process_chan: Channel): res.raise_for_status() mod = res.json()["results"][0]["mod"] logger.info(f"mod pass {i+1}: {mod} (diff= {abs(prev_mod - mod)})") - if mod == 0: + if mod == 0 or mod - prev_mod < -0.05: break # write iter to chan for layer to be processed diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index 5c73008e..ccf00cb2 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -1,6 +1,5 @@ import asyncio import base64 -import json import logging import re import traceback @@ -10,8 +9,13 @@ from graphrag import reusable_channel, workers from pyTigerGraph import TigerGraphConnection -from common.config import (doc_processing_config, embedding_service, - get_llm_service, llm_config, milvus_config) +from common.config import ( + doc_processing_config, + embedding_service, + get_llm_service, + llm_config, + milvus_config, +) from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore from common.extractors import GraphExtractor, LLMEntityRelationshipExtractor from common.extractors.BaseExtractor import BaseExtractor @@ -55,15 +59,12 @@ async def init( ) -> tuple[BaseExtractor, dict[str, MilvusEmbeddingStore]]: # install requried queries requried_queries = [ - # "common/gsql/supportai/Scan_For_Updates", - # "common/gsql/supportai/Update_Vertices_Processing_Status", - # "common/gsql/supportai/ECC_Status", - # "common/gsql/supportai/Check_Nonexistent_Vertices", 
"common/gsql/graphRAG/StreamIds", "common/gsql/graphRAG/StreamDocContent", "common/gsql/graphRAG/SetEpochProcessing", "common/gsql/graphRAG/ResolveRelationships", "common/gsql/graphRAG/get_community_children", + "common/gsql/graphRAG/entities_have_resolution", "common/gsql/graphRAG/communities_have_desc", "common/gsql/graphRAG/louvain/graphrag_louvain_init", "common/gsql/graphRAG/louvain/graphrag_louvain_communities", @@ -90,7 +91,6 @@ async def init( "DocumentChunk", "Entity", "Relationship", - # "Concept", "Community", ], ) @@ -196,35 +196,16 @@ async def upsert_vertex( vertex_id = vertex_id.replace(" ", "_") attrs = map_attrs(attributes) await load_q.put(("vertices", (vertex_type, vertex_id, attrs))) - # data = json.dumps({"vertices": {vertex_type: {vertex_id: attrs}}}) - # headers = make_headers(conn) - # async with httpx.AsyncClient(timeout=http_timeout) as client: - # async with tg_sem: - # res = await client.post( - # f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers - # ) - # - # try: - # res.raise_for_status() - # except Exception as e: - # logger.error(f"Upsert err: {vertex_type} {vertex_id}\n{e}") async def upsert_batch(conn: TigerGraphConnection, data: str): - # logger.info(f"Upsert vertex: {vertex_type} {vertex_id}") - # vertex_id = vertex_id.replace(" ", "_") - # attrs = map_attrs(attributes) - # await load_q.put(('vertices')) headers = make_headers(conn) async with httpx.AsyncClient(timeout=http_timeout) as client: async with tg_sem: res = await client.post( f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers ) - # try: res.raise_for_status() - # except Exception as e: - # logger.error(f"Upsert err: {vertex_type} {vertex_id}\n{e}") async def check_vertex_exists(conn, v_id: str): @@ -233,9 +214,9 @@ async def check_vertex_exists(conn, v_id: str): async with tg_sem: try: res = await client.get( - f"{conn.restppUrl}/graph/{conn.graphname}/vertices/Entity/{v_id}", - headers=headers, - ) + f"{conn.restppUrl}/graph/{conn.graphname}/vertices/Entity/{v_id}", + headers=headers, + ) except Exception as e: logger.error(f"Check err:\n{e}") @@ -264,21 +245,6 @@ async def upsert_edge( attrs = map_attrs(attributes) src_v_id = src_v_id.replace(" ", "_") tgt_v_id = tgt_v_id.replace(" ", "_") - # data = json.dumps( - # { - # "edges": { - # src_v_type: { - # src_v_id: { - # edge_type: { - # tgt_v_type: { - # tgt_v_id: attrs, - # } - # } - # }, - # } - # } - # } - # ) await load_q.put( ( "edges", @@ -293,17 +259,6 @@ async def upsert_edge( ) ) - # headers = make_headers(conn) - # async with httpx.AsyncClient(timeout=http_timeout) as client: - # async with tg_sem: - # res = await client.post( - # f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers - # ) - # try: - # res.raise_for_status() - # except Exception as e: - # logger.error(f"Upsert Edge err:\n{e}") - async def get_commuinty_children(conn, i: int, c: str): headers = make_headers(conn) diff --git a/eventual-consistency-service/app/graphrag/workers.py b/eventual-consistency-service/app/graphrag/workers.py index 9d8267b2..c1b355f2 100644 --- a/eventual-consistency-service/app/graphrag/workers.py +++ b/eventual-consistency-service/app/graphrag/workers.py @@ -277,6 +277,7 @@ async def extract( ) # embed "Relationship", # (v_id, content, index_name) + # right now, we're not embedding relationships in graphrag async def resolve_entity( From a03188d4b7276463597c82922ae90586fb443646 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: 
Tue, 20 Aug 2024 13:25:07 -0400 Subject: [PATCH 69/91] graph rag v0 --- eventual-consistency-service/app/graphrag/graph_rag.py | 5 ++--- eventual-consistency-service/app/graphrag/util.py | 7 +++---- eventual-consistency-service/app/graphrag/workers.py | 9 ++++----- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py index b69198c4..4f8ccc61 100644 --- a/eventual-consistency-service/app/graphrag/graph_rag.py +++ b/eventual-consistency-service/app/graphrag/graph_rag.py @@ -352,7 +352,7 @@ async def communities(conn: TigerGraphConnection, comm_process_chan: Channel): res.raise_for_status() mod = res.json()["results"][0]["mod"] logger.info(f"mod pass {i+1}: {mod} (diff= {abs(prev_mod - mod)})") - if mod == 0 or mod - prev_mod < -0.05: + if mod == 0 or mod - prev_mod <= -0.05: break # write iter to chan for layer to be processed @@ -500,9 +500,8 @@ async def run(graphname: str, conn: TigerGraphConnection): load_q.reopen() async with asyncio.TaskGroup() as grp: # run louvain - grp.create_task(communities(conn, comm_process_chan)) # get the communities - # grp.create_task( stream_communities(conn, communities_chan, comm_process_chan)) + grp.create_task(communities(conn, comm_process_chan)) # summarize each community grp.create_task( summarize_communities(conn, comm_process_chan, upsert_chan, embed_chan) diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index ccf00cb2..ca99cdde 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -6,9 +6,6 @@ from glob import glob import httpx -from graphrag import reusable_channel, workers -from pyTigerGraph import TigerGraphConnection - from common.config import ( doc_processing_config, embedding_service, @@ -20,6 +17,8 @@ from common.extractors import GraphExtractor, LLMEntityRelationshipExtractor from common.extractors.BaseExtractor import BaseExtractor from common.logs.logwriter import LogWriter +from graphrag import reusable_channel, workers +from pyTigerGraph import TigerGraphConnection logger = logging.getLogger(__name__) http_timeout = httpx.Timeout(15.0) @@ -182,6 +181,7 @@ def process_id(v_id: str): v_id = has_func[0] if v_id == "''" or v_id == '""': return "" + v_id = v_id.replace("(", "").replace(")", "") return v_id @@ -287,7 +287,6 @@ async def get_commuinty_children(conn, i: int, c: str): else: descrs.append(desc) - print(f"Comm: {c} --> {descrs}", flush=True) return descrs diff --git a/eventual-consistency-service/app/graphrag/workers.py b/eventual-consistency-service/app/graphrag/workers.py index c1b355f2..d696df8b 100644 --- a/eventual-consistency-service/app/graphrag/workers.py +++ b/eventual-consistency-service/app/graphrag/workers.py @@ -7,15 +7,14 @@ import ecc_util import httpx from aiochannel import Channel -from graphrag import community_summarizer, util -from langchain_community.graphs.graph_document import GraphDocument, Node -from pyTigerGraph import TigerGraphConnection - from common.config import milvus_config from common.embeddings.embedding_services import EmbeddingModel from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore from common.extractors.BaseExtractor import BaseExtractor from common.logs.logwriter import LogWriter +from graphrag import community_summarizer, util +from langchain_community.graphs.graph_document import GraphDocument, Node +from 
pyTigerGraph import TigerGraphConnection vertex_field = milvus_config.get("vertex_field", "vertex_id") @@ -378,7 +377,7 @@ async def process_community( summarizer = community_summarizer.CommunitySummarizer(llm) summary = await summarizer.summarize(comm_id, children) - print(f"*******>{comm_id}: {children}, {summary}", flush=True) + logger.debug(f"*******>{comm_id}: {children}, {summary}") await upsert_chan.put( ( util.upsert_vertex, # func to call From a3c6dfbdf8e96c88f2208aaaa861021b1aa77ac6 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 20 Aug 2024 13:28:22 -0400 Subject: [PATCH 70/91] rm loader --- common/gsql/graphRAG/loaders/tmp.gsql | 26 -------------------------- 1 file changed, 26 deletions(-) delete mode 100644 common/gsql/graphRAG/loaders/tmp.gsql diff --git a/common/gsql/graphRAG/loaders/tmp.gsql b/common/gsql/graphRAG/loaders/tmp.gsql deleted file mode 100644 index e8d8d417..00000000 --- a/common/gsql/graphRAG/loaders/tmp.gsql +++ /dev/null @@ -1,26 +0,0 @@ -CREATE LOADING load_entity@uuid@ { - DEFINE FILENAME Content; - LOAD DocumentContent TO VERTEX Document VALUES() USING SEPARATOR="|", HEADER="true", EOL="\n", QUOTE="double"; -} - - -CREATE LOADING load_ResolvedEntity@uuid@ { - DEFINE FILENAME Content; - LOAD DocumentContent TO VERTEX Document VALUES() USING SEPARATOR="|", HEADER="true", EOL="\n", QUOTE="double"; -} -CREATE LOADING load_ asdfasdf @uuid@ { - DEFINE FILENAME Content; - LOAD DocumentContent TO VERTEX Document VALUES() USING SEPARATOR="|", HEADER="true", EOL="\n", QUOTE="double"; -} -CREATE LOADING load_ asdfasdf @uuid@ { - DEFINE FILENAME Content; - LOAD DocumentContent TO VERTEX Document VALUES() USING SEPARATOR="|", HEADER="true", EOL="\n", QUOTE="double"; -} -CREATE LOADING load_ asdfasdf @uuid@ { - DEFINE FILENAME Content; - LOAD DocumentContent TO VERTEX Document VALUES() USING SEPARATOR="|", HEADER="true", EOL="\n", QUOTE="double"; -} -CREATE LOADING load_ asdfasdf @uuid@ { - DEFINE FILENAME Content; - LOAD DocumentContent TO VERTEX Document VALUES() USING SEPARATOR="|", HEADER="true", EOL="\n", QUOTE="double"; -} From 9d286e089ea6d8ad3ab79a93cd31ad1119f4608e Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 20 Aug 2024 13:31:18 -0400 Subject: [PATCH 71/91] final cleanup --- eventual-consistency-service/app/graphrag/graph_rag.py | 6 ------ eventual-consistency-service/app/graphrag/workers.py | 2 +- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py index 4f8ccc61..1d8f6084 100644 --- a/eventual-consistency-service/app/graphrag/graph_rag.py +++ b/eventual-consistency-service/app/graphrag/graph_rag.py @@ -88,12 +88,6 @@ async def chunk_docs( doc_tasks = [] async with asyncio.TaskGroup() as grp: async for content in docs_chan: - # v_id = content["v_id"] - # txt = content["attributes"]["text"] - # send the document to be embedded - logger.info("chunk writes to extract") - # await embed_chan.put((v_id, txt, "Document")) - task = grp.create_task( workers.chunk_doc(conn, content, upsert_chan, embed_chan, extract_chan) ) diff --git a/eventual-consistency-service/app/graphrag/workers.py b/eventual-consistency-service/app/graphrag/workers.py index d696df8b..98b3e69c 100644 --- a/eventual-consistency-service/app/graphrag/workers.py +++ b/eventual-consistency-service/app/graphrag/workers.py @@ -377,7 +377,7 @@ async def 
process_community( summarizer = community_summarizer.CommunitySummarizer(llm) summary = await summarizer.summarize(comm_id, children) - logger.debug(f"*******>{comm_id}: {children}, {summary}") + logger.debug(f"Community {comm_id}: {children}, {summary}") await upsert_chan.put( ( util.upsert_vertex, # func to call From 7a7436522c84e2f917a195402426462c643ec14c Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 20 Aug 2024 16:56:41 -0400 Subject: [PATCH 72/91] reset langchain openai version --- common/requirements.txt | 2 +- copilot/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/common/requirements.txt b/common/requirements.txt index 9912b4a8..86bdc50c 100644 --- a/common/requirements.txt +++ b/common/requirements.txt @@ -84,7 +84,7 @@ langchain-experimental==0.0.64 langchain-groq==0.1.9 langchain-ibm==0.1.12 langchain-milvus==0.1.4 -langchain-openai==0.1.22 +langchain-openai==0.1.21 langchain-text-splitters==0.2.2 langchainhub==0.1.21 langdetect==1.0.9 diff --git a/copilot/requirements.txt b/copilot/requirements.txt index cd9bb7bc..e057eb90 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -84,7 +84,7 @@ langchain-experimental==0.0.64 langchain-groq==0.1.9 langchain-ibm==0.1.12 langchain-milvus==0.1.4 -langchain-openai==0.1.22 +langchain-openai==0.1.21 langchain-text-splitters==0.2.2 langchainhub==0.1.21 langdetect==1.0.9 From 8e1a6330e0fe1de9b6239a7028cd212776c1cd41 Mon Sep 17 00:00:00 2001 From: Bill Shi Date: Tue, 20 Aug 2024 17:06:27 -0700 Subject: [PATCH 73/91] doc: update readme for 0.9 --- README.md | 285 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 156 insertions(+), 129 deletions(-) diff --git a/README.md b/README.md index 293745d6..ae70acc0 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,9 @@ # TigerGraph CoPilot ## Releases -* **4/30/2024: CoPilot is available now in Beta** (v0.5). A whole new function is added to CoPilot: Now you can create chatbots with graph-augmented AI on a your own documents. CoPilot builds a knowledge graph from source material and applies knowledge graph RAG (Retrieval Augmented Generation) to improve the contextual relevance and accuracy of answers to their natural-language questions. We would love to hear your feedback to keep improving it so that it could bring more value to you. It would be helpful if you could fill out this [short survey](https://forms.gle/c9jd4evjEPsVtR5p7) after you have played with CoPilot. Thank you for your interest and support! +* **8/21/2024: CoPilot is available now in v0.9** (v0.9.0). Please see [Release Notes](https://docs.tigergraph.com/tg-copilot/current/release-notes/#_new_in_copilot_0_9) for details. Note: On [TigerGraph Cloud](https://beta.tgcloud.io/) only CoPilot v0.5 is available. + +* **4/30/2024: CoPilot is available now in Beta** (v0.5.0). A whole new function is added to CoPilot: Now you can create chatbots with graph-augmented AI on a your own documents. CoPilot builds a knowledge graph from source material and applies knowledge graph RAG (Retrieval Augmented Generation) to improve the contextual relevance and accuracy of answers to their natural-language questions. We would love to hear your feedback to keep improving it so that it could bring more value to you. It would be helpful if you could fill out this [short survey](https://forms.gle/c9jd4evjEPsVtR5p7) after you have played with CoPilot. Thank you for your interest and support! 
* **3/18/2024: CoPilot is available now in Alpha** (v0.0.1). It uses a Large Language Model (LLM) to convert your question into a function call, which is then executed on the graph in TigerGraph. We would love to hear your feedback to keep improving it so that it could bring more value to you. If you are trying it out, it would be helpful if you could fill out this [sign up form](https://info.tigergraph.com/copilotalpha) so we can keep track of it (no spam, promised). And if you would just like to provide the feedback, please feel free to fill out this [short survey](https://forms.gle/c9jd4evjEPsVtR5p7). Thank you for your interest and support! @@ -14,7 +16,7 @@ TigerGraph CoPilot is an AI assistant that is meticulously designed to combine t * SupportAI as a knowledge Q&A assistant for documents and graphs * QueryAI as a GSQL code generator including query and schema generation, data mapping, and more (Not available in Beta; coming soon) -You can interact with CoPilot through both a chat interface on TigerGraph Cloud and APIs. For beta, your own LLM services (from OpenAI, Azure, GCP and AWS Bedrock) are required to use CoPilot, but in future releases you can use TigerGraph’s LLM or your local LLM as well. +You can interact with CoPilot through a chat interface on TigerGraph Cloud, a built-in chat interface and APIs. For now, your own LLM services (from OpenAI, Azure, GCP, AWS Bedrock, Ollama, Hugging Face and Groq.) are required to use CoPilot, but in future releases you can use TigerGraph’s LLMs. ### InquiryAI ![./docs/img/InquiryAI-Architecture.png](./docs/img/InquiryAI-Architecture.png) @@ -36,20 +38,21 @@ Organizing the data as a knowledge graph allows a chatbot to access accurate, fa ### QueryAI -QueryAI is the third component of TigerGraph CoPilot. It is designed to be used as a developer tool to help generate graph queries in GSQL from an English language description. It can also be used to generate schema, data mapping, and even dashboards. This will enable developers to write GSQL queries more quickly and accurately, and will be especially useful for those who are new to GSQL. QueryAI is available in alpha Q4 2024. +QueryAI is the third component of TigerGraph CoPilot. It is designed to be used as a developer tool to help generate graph queries in GSQL from an English language description. It can also be used to generate schema, data mapping, and even dashboards. This will enable developers to write GSQL queries more quickly and accurately, and will be especially useful for those who are new to GSQL. Currently, experimental openCypher generation is available. ## Getting Started ### TigerGraph Cloud -CoPilot is available as an add-on service to your workspace on TigerGraph Cloud. Please follow the [instructions here](https://docs.tigergraph.com/tg-copilot/current/getstarted/oncloud) to start on TigerGraph Cloud within minutes. +CoPilot is available as an add-on service to your workspace on TigerGraph Cloud. It is disabled by default. Please contact beta-support@tigergraph.com to enable TigerGraph CoPilot as an option in the [Marketplace](https://docs.tigergraph.com/cloudBeta/current/integrations/). ### Self-Managed -TigerGraph CoPilot is open-source and can be deployed to your own infrastructure. This repo only includes the backend service of CoPilot but you can still access all of its functions through the APIs. What is different from CoPilot on TigerGraph Cloud is the absence of the graphical user interface and the extra steps to set it up and maintenance. 
+TigerGraph CoPilot is an open-source project on [GitHub](https://github.com/tigergraph/CoPilot) which can be deployed to your own infrastructure. If you don’t need to extend the source code of CoPilot, the quickest way is to deploy its docker image with the docker compose file in the repo. In order to take this route, you will need the following prerequisites. #### Prerequisites * Docker +* TigerGraph DB 3.9+. (For 3.x, you will need to install a few user defined functions (UDFs). Please see Step 5 below for details.) * API key of your LLM provider. (An LLM provider refers to a company or organization that offers Large Language Models (LLMs) as a service. The API key verifies the identity of the requester, ensuring that the request is coming from a registered and authorized user or application.) Currently, CoPilot supports the following LLM providers: OpenAI, Azure OpenAI, GCP, AWS Bedrock. #### Deploy with Docker Compose @@ -57,22 +60,50 @@ If you don’t need to extend the source code of CoPilot, the quickest way is to - Download the [docker-compose.yml](https://github.com/tigergraph/copilot/blob/main/docker-compose.yml) file directly , or - Clone the repo `git clone https://github.com/tigergraph/CoPilot` - The docker compose file contains all dependencies for CoPilot including a TigerGraph database. If any service is not needed, please feel free to remove it from the file. Besides, CoPilot comes with a Swagger API documentation page when it is deployed. If you wish to disable it, you can set the PRODUCTION environment variable to true for the CoPilot service in the compose file. + The Docker Compose file contains all dependencies for CoPilot including a Milvus database. If you do not need a particular service, you make edit the Compose file to remove it or set its scale to 0 when running the Compose file (details later). Moreover, CoPilot comes with a Swagger API documentation page when it is deployed. If you wish to disable it, you can set the `PRODUCTION` environment variable to true for the CoPilot service in the Compose file. * Step 2: Set up configurations - In the same directory as the docker compose file is in, create and fill in the following configuration files: `touch configs/db_config.json configs/llm_config.json configs/milvus_config.json`. Details for each configure file is available below. - - [LLM config](#llm-provider-configuration) - - [Tigergraph config](#db-configuration) - - [Milvus config](#milvus-configuration) + Next, in the same directory as the Docker Compose file is in, create and fill in the following configuration files: + * [configs/db_config.json](#llm-provider-configuration) + * [configs/llm_config.json](#llm-provider-configuration) + * [configs/milvus_config.json](#milvus-configuration) + * [configs/chat_config.json](#chat-configuration) + * Step 3 (Optional): Configure Logging - `touch configs/log_config.json`. Details for the configure file is available at [Logging config](#logging-configuration). + `touch configs/log_config.json`. Details for the configuration is available [here](https://docs.tigergraph.com/tg-copilot/current/getstarted/self-managed#_3_optional_logging). * Step 4: Start all services - Simply run `docker compose up -d` and wait for all the services to start. + Now, simply run `docker compose up -d` and wait for all the services to start. If you don’t want to use the included Milvus DB, you can set its scale to 0 to not start it: `docker compose up -d --scale milvus-standalone=0 --scale etcd=0 --scale minio=0`. 
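+
+  Once the containers are up, a quick optional sanity check can confirm the stack is healthy before moving on. This is only a sketch: the port (8000) and the Swagger path are assumptions based on the defaults used elsewhere in this repo's notebooks and Compose file, so adjust them if your deployment maps CoPilot differently.
+
+  ```
+  # list the services and confirm they are all running
+  docker compose ps
+
+  # tail the logs if any container fails to come up
+  docker compose logs -f
+
+  # CoPilot's Swagger UI (skip this check if you set PRODUCTION=true, which disables it)
+  curl -i http://localhost:8000/docs
+  ```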
+ +* Step 5: Install UDFs + + This step is not needed for TigerGraph databases version 4.x. For TigerGraph 3.x, we need to install a few user defined functions (UDFs) for CoPilot to work. + + 1. On the machine that hosts the TigerGraph database, switch to the user of TigerGraph: `sudo su - tigergraph`. If TigerGraph is running on a cluster, you can do this on any one of the machines. + 2. Download the two files [ExprFunctions.hpp](https://raw.githubusercontent.com/tigergraph/CoPilot/dev/copilot/udfs/milvus/rest/ExprFunctions.hpp) and [ExprUtil.hpp](https://raw.githubusercontent.com/tigergraph/CoPilot/dev/copilot/udfs/milvus/rest/ExprUtil.hpp). + 3. In a terminal, run the following command to enable UDF installation: + ``` + gadmin config set GSQL.UDF.EnablePutTgExpr true + gadmin config set GSQL.UDF.Policy.Enable false + gadmin config apply + gadmin restart GSQL + ``` + 4. Enter a GSQL shell, and run the following command to install the UDF files. + ``` + PUT tg_ExprFunctions FROM "./tg_ExprFunctions.hpp" + PUT tg_ExprUtil FROM "./tg_ExprUtil.hpp" + ``` + 5. Quit the GSQL shell, and run the following command in the terminal to disable UDF installation for security purpose. + ``` + gadmin config set GSQL.UDF.EnablePutTgExpr false + gadmin config set GSQL.UDF.Policy.Enable true + gadmin config apply + gadmin restart GSQL + ``` #### Configurations @@ -134,6 +165,7 @@ In the `configs/llm_config.json` file, copy JSON config template from below for ``` * Azure + In addition to the `AZURE_OPENAI_ENDPOINT`, `AZURE_OPENAI_API_KEY`, and `azure_deployment`, `llm_model` and `model_name` can be edited to match your specific configuration details. ```json { @@ -192,12 +224,108 @@ In the `configs/llm_config.json` file, copy JSON config template from below for } } ``` + +* Ollama + ```json + { + "model_name": "GPT-4", + "embedding_service": { + "embedding_model_service": "openai", + "authentication_configuration": { + "OPENAI_API_KEY": "" + } + }, + "completion_service": { + "llm_service": "ollama", + "llm_model": "calebfahlgren/natural-functions", + "model_kwargs": { + "temperature": 0.0000001 + }, + "prompt_path": "./app/prompts/openai_gpt4/" + } + } + ``` + +* Hugging Face + + Example configuration for a model on Hugging Face with a dedicated endpoint is shown below. Please specify your configuration details: + ```json + { + "model_name": "llama3-8b", + "embedding_service": { + "embedding_model_service": "openai", + "authentication_configuration": { + "OPENAI_API_KEY": "" + } + }, + "completion_service": { + "llm_service": "huggingface", + "llm_model": "hermes-2-pro-llama-3-8b-lpt", + "endpoint_url": "https:endpoints.huggingface.cloud", + "authentication_configuration": { + "HUGGINGFACEHUB_API_TOKEN": "" + }, + "model_kwargs": { + "temperature": 0.1 + }, + "prompt_path": "./app/prompts/openai_gpt4/" + } + } + ``` + + Example configuration for a model on Hugging Face with a serverless endpoint is shown below. 
Please specify your configuration details: + ```json + { + "model_name": "Llama3-70b", + "embedding_service": { + "embedding_model_service": "openai", + "authentication_configuration": { + "OPENAI_API_KEY": "" + } + }, + "completion_service": { + "llm_service": "huggingface", + "llm_model": "meta-llama/Meta-Llama-3-70B-Instruct", + "authentication_configuration": { + "HUGGINGFACEHUB_API_TOKEN": "" + }, + "model_kwargs": { + "temperature": 0.1 + }, + "prompt_path": "./app/prompts/llama_70b/" + } + } + ``` + +* Groq + ```json + { + "model_name": "mixtral-8x7b-32768", + "embedding_service": { + "embedding_model_service": "openai", + "authentication_configuration": { + "OPENAI_API_KEY": "" + } + }, + "completion_service": { + "llm_service": "groq", + "llm_model": "mixtral-8x7b-32768", + "authentication_configuration": { + "GROQ_API_KEY": "" + }, + "model_kwargs": { + "temperature": 0.1 + }, + "prompt_path": "./app/prompts/openai_gpt4/" + } + } + ``` + ##### DB configuration -Copy the below into `configs/db_config.json` and edit the `hostname` and `getToken` fields to match your database's configuration. Set the timeout, memory threshold, and thread limit parameters as desired to control how much of the database's resources are consumed when answering a question. +Copy the below into `configs/db_config.json` and edit the `hostname` and `getToken` fields to match your database's configuration. If token authentication is enabled in TigerGraph, set `getToken` to `true`. Set the timeout, memory threshold, and thread limit parameters as desired to control how much of the database's resources are consumed when answering a question. -If you are running TigerGraph outside of docker compose, change the hostname to match its address (`http://localhost`, `https://your-TgCloud-hostname`). Once authentication is enabled in TigerGraph, set getToken to `true`. +“ecc” and “chat_history_api” are the addresses of internal components of CoPilot.If you use the Docker Compose file as is, you don’t need to change them. -You can also disable the consistency_checker, which reconciles Milvus and TigerGraph data, within this config. It is true by default ```json { "hostname": "http://tigergraph", @@ -207,12 +335,13 @@ You can also disable the consistency_checker, which reconciles Milvus and TigerG "default_timeout": 300, "default_mem_threshold": 5000, "default_thread_limit": 8, - "enable_consistency_checker": true + "ecc": "http://eventual-consistency-service:8001", + "chat_history_api": "http://chat-history:8002" } ``` ##### Milvus configuration -Copy the below into `configs/milvus_config.json` and edit the `host` and `port` fields to match your Milvus configuration (keeping in mind docker configuration). `username` and `password` can also be configured below if required by your Milvus setup. `enabled` should always be set to "true" for now as Milvus is only the embedding store supported. `process_interval_seconds` is the number of seconds which the eventual-consistency-checker (ECC) service will be scheduled to check for new vertices in TigerGraph in order to create embeddings in Milvus. In the same way `cleanup_interval_seconds` is the number of seconds the ECC service will be scheduled to check for stale Milvus embeddings (e.g. if TigerGraph is restored from backup, or a vertex is deleted). Batch size is the number of vertices that ECC will process in one workload; this is optional and defaults to 10. 
+Copy the below into `configs/milvus_config.json` and edit the `host` and `port` fields to match your Milvus configuration (keeping in mind docker configuration). `username` and `password` can also be configured below if required by your Milvus setup. `enabled` should always be set to "true" for now as Milvus is only the embedding store supported. ```json { "host": "milvus-standalone", @@ -220,132 +349,30 @@ Copy the below into `configs/milvus_config.json` and edit the `host` and `port` "username": "", "password": "", "enabled": "true", - "process_interval_seconds": 1800, - "cleanup_interval_seconds": 2592000, - "batch_size": 10 + "sync_interval_seconds": 60 } ``` -##### Logging configuration - -Copy the below into `configs/log_config.json` and edit the appropriate values to suit your needs. The log rotation is based on size and backups are kept. These configurations are applied in the LogWriter to the standard python logging package. Operational and audit logs are recorded. Outputs include log.ERROR, log.INFO, and log.AUDIT-COPILOT +##### Chat configuration +Copy the below code into `configs/chat_config.json`. You shouldn’t need to change anything unless you change the port of the chat history service in the Docker Compose file. ```json { - "log_file_path": "logs", - "log_max_size": 10485760, - "log_backup_count": 10 + "apiPort":"8002", + "dbPath": "chats.db", + "dbLogPath": "db.log", + "logPath": "requestLogs.jsonl", + ​​"conversationAccessRoles": ["superuser", "globaldesigner"] } ``` -To configure the logging level of the service, edit the CoPilot service's `LOGLEVEL` env variable in the docker-compose file. By default, the logging level is set to `"INFO"`. - -This line can be changed to support different logging levels. The levels are described below: - -* **CRITICAL**: A serious error -* **ERROR**: Failing to perform functions -* **WARNING**: Indication of unexpected problems, e.g. failure to map a user's question to the graph schema -* **INFO**: Confriming that the service is performing as expected. -* **DEBUG**: Detailed information, e.g. the functions retrieved during the GenerateFunction step, etc. -* **DEBUG_PII**: Finer-grained information that could potentially include PII, such as a user's question, the complete function call (with parameters), and the LLM's natural language response. -* **NOTSET**: All messages are processed - ##### Enable openCypher Query Generation in InquiryAI If you would like to enable openCypher query generation in InquiryAI, you can set the `USE_CYPHER` environment variable to `"true"` in the CoPilot service in the docker compose file. By default, this is set to `"false"`. **Note**: openCypher query generation is still in beta and may not work as expected, as well as increases the potential of hallucinated answers due to bad code generation. Use with caution, and only in non-production environments. ## Using TigerGraph CoPilot -### TigerGraph Cloud -A chat interface is available on TigerGraph Cloud, with which you can “talk” to your graph to get more insights and value from your data. Please follow the [instructions here](https://docs.tigergraph.com/tg-copilot/current/using-copilot/how2-use-on-cloud) to access CoPilot on TigerGraph Cloud. - -### Using API -You can also access CoPilot via its API for both self-managed and TigerGraph Cloud-managed services. Two types of API access are provided for now: REST http endpoints and pyTigerGraph interface. Additionally, there is a primitive chatbot interface for testing purpose only. 
And LangChain interface is available for InquiryAI with more integrations coming soon. - -#### Authentication -When accessing its API, ​​there are two options to authenticate with the TigerGraph CoPilot service. - -First way is with a username/password pair generated from the TigerGraph database. - -The second way is a GSQL secret, also obtained from the database. However, when using the GSQL secret, the username field must be specified as __GSQL__secret, with the password field containing the secret. Note: If pyTigerGraph is being used and a connection is created with the gsqlSecret parameter, this will already be done for you. - -#### HTTP Endpoints -For self-managed services, the full list of available HTTP endpoints can be found at the /docs path on your host’s address, e.g., `http://localhost/docs`. It is a Swagger API doc and you can even try out the endpoints on that page. Note: The Swagger API doc page is disabled on TigerGraph Cloud. -![./docs/img/SwaggerDocUX.png](./docs/img/SwaggerDocUX.png) - -#### Using pyTigerGraph -First, update pyTigerGraph to utilize the latest build: -```sh -pip install -U git+https://github.com/tigergraph/pyTigerGraph.git -``` - -Then, the endpoints are availble when configured with a `TigerGraphConnection`: - -```py -from pyTigerGraph import TigerGraphConnection - -# create a connection to the database -conn = TigerGraphConnection(host="DATABASE_HOST_HERE", graphname="GRAPH_NAME_HERE", username="USERNAME_HERE", password="PASSWORD_HERE") - -### ==== CONFIGURE INQUIRYAI HOST ==== -conn.ai.configureInquiryAIHost("INQUIRYAI_HOST_HERE") - -### ==== RETRIEVE TOP-K DOCS FROM LIBRARY ==== -# `top_k` parameter optional -conn.ai.retrieveDocs("How many papers are there?", top_k = 5) - -### ==== RUN A NATURAL LANGUAGE QUERY ==== -print(conn.ai.query("How many papers are there?")) - -# prints: {'natural_language_response': 'There are 736389 papers.', 'answered_question': True, 'query_sources': {'function_call': "getVertexCount('Paper')", 'result': 736389}} - -### ==== REGISTER A CUSTOM QUERY ==== -# Prompt for PageRank query - could be read in as JSON file. -pr_prompt = { - "function_header": "tg_pagerank", - "description": "Determines the importance or influence of each vertex based on its connections to other vertices.", - "docstring": "The PageRank algorithm measures the influence of each vertex on every other vertex. PageRank influence is defined recursively: a vertex’s influence is based on the influence of the vertices which refer to it. A vertex’s influence tends to increase if either of these conditions are met:\n* It has more referring vertices\n* Its referring vertices have higher influence\nTo run this algorithm, use `runInstalledQuery('tg_pagerank', params={'v_type': 'INSERT_V_TYPE_HERE', 'e_type': 'INSERT_E_TYPE_HERE', 'top_k': INSERT_TOP_K_HERE})`, where the parameters are:\n* 'v_type': The vertex type to run the algorithm on.\n* 'e_type': The edge type to run the algorithm on.\n* 'top_k': The number of top scoring vertices to return to the user.", - "param_types": { - "v_type": "str", - "e_type": "str", - "top_k": "int" - } -} - -# Register Query -conn.ai.registerCustomQuery(pr_prompt["function_header"], pr_prompt["description"], pr_prompt["docstring"], pr_prompt["param_types"]) - -# Run Query -print(conn.ai.query("What are the 5 most influential papers by citations?")) - -# prints: {'natural_language_response': 'The top 5 most cited papers are:\n\n1. [Title of paper with Vertex_ID 428523]\n2. [Title of paper with Vertex_ID 384889]\n3. 
[Title of paper with Vertex_ID 377502]\n4. [Title of paper with Vertex_ID 61855]\n5. [Title of paper with Vertex_ID 416200]', 'answered_question': True, 'query_sources': {'function_call': "runInstalledQuery('tg_pagerank', params={'v_type': 'Paper', 'e_type': 'CITES', 'top_k': 5})", 'result': [{'@@top_scores_heap': [{'Vertex_ID': '428523', 'score': 392.8731}, {'Vertex_ID': '384889', 'score': 251.8021}, {'Vertex_ID': '377502', 'score': 149.1018}, {'Vertex_ID': '61855', 'score': 129.7406}, {'Vertex_ID': '416200', 'score': 129.2286}]}]}} -``` - -#### Chat with CoPilot -Navigate to `http://localhost/graphname/chat` when the Docker container is running, where graphname is the name of the graph you want to query. Note: This chat interface is for testing only. Please use CoPilot on TigerGraph Cloud for a proper chat interface. -![./docs/img/CoPilot-UX-Demo.png](./docs/img/CoPilot-UX-Demo.png) +CoPilot is friendly to both technical and non-technical users. There is a graphical chat interface as well as API access to CoPilot. Function-wise, CoPilot can answer your questions by calling existing queries in the database (InquiryAI), build a knowledge graph from your documents (SupportAI), and answer knowledge questions based on your documents (SupportAI). -#### Using LangChain -To use LangChain with InquiryAI, first install the LangChain fork here in your Python environment: -``` -pip install git+https://github.com/langchain-ai/langchain.git -``` -Then, you can get answers from the graph with the below: - -```py -import pyTigerGraph as tg -conn = tg.TigerGraphConnection(host="DATABASE_HOST_HERE", graphname="GRAPH_NAME_HERE", username="USERNAME_HERE", password="PASSWORD_HERE") - -### ==== CONFIGURE INQUIRYAI HOST ==== -conn.ai.configureInquiryAIHost("INQUIRYAI_HOST_HERE") - -from langchain_community.graphs import TigerGraph -graph = TigerGraph(conn) -result = graph.query("How many servers are there?") -print(result) -# {'natural_language_response': 'There are 46148 servers.', -# 'answered_question': True, -# 'query_sources': {'function_call': 'getVertexCount(vertexType="BareMetalNode")', -# 'result': 46148} -``` +Please refer to our [official documentation](https://docs.tigergraph.com/tg-copilot/current/using-copilot/) on how to use CoPilot. ## Customization and Extensibility TigerGraph CoPilot is designed to be easily extensible. The service can be configured to use different LLM providers, different graph schemas, and different LangChain tools. The service can also be extended to use different embedding services, different LLM generation services, and different LangChain tools. For more information on how to extend the service, see the [Developer Guide](./docs/DeveloperGuide.md). 
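For a concrete picture of the API access mentioned in the "Using TigerGraph CoPilot" section, here is a minimal pyTigerGraph sketch based on the `conn.ai` calls shown earlier in this patch. The host, graph name, credentials, and InquiryAI host are placeholders to replace with your own values, and the exact response shape depends on your graph and installed queries.

```py
from pyTigerGraph import TigerGraphConnection

# Placeholders: substitute your own database host, graph name, and credentials.
conn = TigerGraphConnection(
    host="DATABASE_HOST_HERE",
    graphname="GRAPH_NAME_HERE",
    username="USERNAME_HERE",
    password="PASSWORD_HERE",
)

# Point the connection at the CoPilot (InquiryAI) service for this deployment.
conn.ai.configureInquiryAIHost("INQUIRYAI_HOST_HERE")

# Ask a natural-language question; CoPilot maps it to a database call and returns
# the natural-language answer together with the query sources it used.
print(conn.ai.query("How many papers are there?"))
```

The other `conn.ai` calls shown above (for example `retrieveDocs` and `registerCustomQuery`) are driven from the same connection object.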
From ac95ef33e0295232e46f2bc809798bbd06e036ba Mon Sep 17 00:00:00 2001 From: Bill Shi Date: Tue, 20 Aug 2024 17:06:52 -0700 Subject: [PATCH 74/91] refactor: move docs folder to root to be consistent with readme --- {copilot/docs => docs}/Contributing.md | 0 {copilot/docs => docs}/DeveloperGuide.md | 0 {copilot/docs => docs}/img/CoPilot-UX-Demo.png | Bin .../docs => docs}/img/InquiryAI-Architecture.png | Bin .../docs => docs}/img/SupportAI-Architecture.png | Bin {copilot/docs => docs}/img/SupportAISchema.png | Bin {copilot/docs => docs}/img/SwaggerDocUX.png | Bin .../docs => docs}/img/TG-CoPilot-Architecture.png | Bin .../docs => docs}/notebooks/DigitalInfraDemo.ipynb | 0 .../docs => docs}/notebooks/FeedbackAnalysis.ipynb | 0 .../docs => docs}/notebooks/SupportAIDemo.ipynb | 0 .../notebooks/TransactionFraudInvestigation.ipynb | 0 .../notebooks/TransactionFraud_demo.ipynb | 0 .../docs => docs}/notebooks/VisualizeAgent.ipynb | 0 14 files changed, 0 insertions(+), 0 deletions(-) rename {copilot/docs => docs}/Contributing.md (100%) rename {copilot/docs => docs}/DeveloperGuide.md (100%) rename {copilot/docs => docs}/img/CoPilot-UX-Demo.png (100%) rename {copilot/docs => docs}/img/InquiryAI-Architecture.png (100%) rename {copilot/docs => docs}/img/SupportAI-Architecture.png (100%) rename {copilot/docs => docs}/img/SupportAISchema.png (100%) rename {copilot/docs => docs}/img/SwaggerDocUX.png (100%) rename {copilot/docs => docs}/img/TG-CoPilot-Architecture.png (100%) rename {copilot/docs => docs}/notebooks/DigitalInfraDemo.ipynb (100%) rename {copilot/docs => docs}/notebooks/FeedbackAnalysis.ipynb (100%) rename {copilot/docs => docs}/notebooks/SupportAIDemo.ipynb (100%) rename {copilot/docs => docs}/notebooks/TransactionFraudInvestigation.ipynb (100%) rename {copilot/docs => docs}/notebooks/TransactionFraud_demo.ipynb (100%) rename {copilot/docs => docs}/notebooks/VisualizeAgent.ipynb (100%) diff --git a/copilot/docs/Contributing.md b/docs/Contributing.md similarity index 100% rename from copilot/docs/Contributing.md rename to docs/Contributing.md diff --git a/copilot/docs/DeveloperGuide.md b/docs/DeveloperGuide.md similarity index 100% rename from copilot/docs/DeveloperGuide.md rename to docs/DeveloperGuide.md diff --git a/copilot/docs/img/CoPilot-UX-Demo.png b/docs/img/CoPilot-UX-Demo.png similarity index 100% rename from copilot/docs/img/CoPilot-UX-Demo.png rename to docs/img/CoPilot-UX-Demo.png diff --git a/copilot/docs/img/InquiryAI-Architecture.png b/docs/img/InquiryAI-Architecture.png similarity index 100% rename from copilot/docs/img/InquiryAI-Architecture.png rename to docs/img/InquiryAI-Architecture.png diff --git a/copilot/docs/img/SupportAI-Architecture.png b/docs/img/SupportAI-Architecture.png similarity index 100% rename from copilot/docs/img/SupportAI-Architecture.png rename to docs/img/SupportAI-Architecture.png diff --git a/copilot/docs/img/SupportAISchema.png b/docs/img/SupportAISchema.png similarity index 100% rename from copilot/docs/img/SupportAISchema.png rename to docs/img/SupportAISchema.png diff --git a/copilot/docs/img/SwaggerDocUX.png b/docs/img/SwaggerDocUX.png similarity index 100% rename from copilot/docs/img/SwaggerDocUX.png rename to docs/img/SwaggerDocUX.png diff --git a/copilot/docs/img/TG-CoPilot-Architecture.png b/docs/img/TG-CoPilot-Architecture.png similarity index 100% rename from copilot/docs/img/TG-CoPilot-Architecture.png rename to docs/img/TG-CoPilot-Architecture.png diff --git a/copilot/docs/notebooks/DigitalInfraDemo.ipynb 
b/docs/notebooks/DigitalInfraDemo.ipynb similarity index 100% rename from copilot/docs/notebooks/DigitalInfraDemo.ipynb rename to docs/notebooks/DigitalInfraDemo.ipynb diff --git a/copilot/docs/notebooks/FeedbackAnalysis.ipynb b/docs/notebooks/FeedbackAnalysis.ipynb similarity index 100% rename from copilot/docs/notebooks/FeedbackAnalysis.ipynb rename to docs/notebooks/FeedbackAnalysis.ipynb diff --git a/copilot/docs/notebooks/SupportAIDemo.ipynb b/docs/notebooks/SupportAIDemo.ipynb similarity index 100% rename from copilot/docs/notebooks/SupportAIDemo.ipynb rename to docs/notebooks/SupportAIDemo.ipynb diff --git a/copilot/docs/notebooks/TransactionFraudInvestigation.ipynb b/docs/notebooks/TransactionFraudInvestigation.ipynb similarity index 100% rename from copilot/docs/notebooks/TransactionFraudInvestigation.ipynb rename to docs/notebooks/TransactionFraudInvestigation.ipynb diff --git a/copilot/docs/notebooks/TransactionFraud_demo.ipynb b/docs/notebooks/TransactionFraud_demo.ipynb similarity index 100% rename from copilot/docs/notebooks/TransactionFraud_demo.ipynb rename to docs/notebooks/TransactionFraud_demo.ipynb diff --git a/copilot/docs/notebooks/VisualizeAgent.ipynb b/docs/notebooks/VisualizeAgent.ipynb similarity index 100% rename from copilot/docs/notebooks/VisualizeAgent.ipynb rename to docs/notebooks/VisualizeAgent.ipynb From 0f2045a60cc5f6ee8f5b2cbf0ffa4ec4bdeb4f53 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Wed, 21 Aug 2024 11:06:47 -0400 Subject: [PATCH 75/91] remove duplicate tg semaphore --- eventual-consistency-service/app/graphrag/util.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index 5f288fa2..5838ec7e 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -26,8 +26,6 @@ tg_sem = asyncio.Semaphore(20) load_q = reusable_channel.ReuseableChannel() -tg_sem = asyncio.Semaphore(100) - async def install_queries( requried_queries: list[str], conn: TigerGraphConnection, From 52e1e0fdeb1ca974a150d062b55fc5d6b94988e2 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Thu, 22 Aug 2024 12:53:41 -0400 Subject: [PATCH 76/91] tune loader with semaphores and events --- .../app/graphrag/graph_rag.py | 28 +- .../app/graphrag/util.py | 16 +- .../app/graphrag/workers.py | 431 ++++++++++-------- 3 files changed, 268 insertions(+), 207 deletions(-) diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py index e4457a77..a1af8cc4 100644 --- a/eventual-consistency-service/app/graphrag/graph_rag.py +++ b/eventual-consistency-service/app/graphrag/graph_rag.py @@ -14,6 +14,7 @@ http_timeout, init, load_q, + loading_event, make_headers, stream_ids, tg_sem, @@ -124,7 +125,7 @@ async def upsert(upsert_chan: Channel): async def load(conn: TigerGraphConnection): logger.info("Reading from load_q") dd = lambda: defaultdict(dd) # infinite default dict - batch_size = 1000 + batch_size = 500 # while the load q is still open or has contents while not load_q.closed() or not load_q.empty(): if load_q.closed(): @@ -169,7 +170,12 @@ async def load(conn: TigerGraphConnection): logger.info( f"Upserting batch size of {size}. ({n_verts} verts | {n_edges} edges. 
{len(data.encode())/1000:,} kb)" ) + + loading_event.clear() await upsert_batch(conn, data) + print("giving the graph time to catch up",flush=True) + await asyncio.sleep(5) + loading_event.set() else: await asyncio.sleep(1) @@ -435,12 +441,12 @@ async def run(graphname: str, conn: TigerGraphConnection): if doc_process_switch: logger.info("Doc Processing Start") docs_chan = Channel(1) - embed_chan = Channel(100) - upsert_chan = Channel(100) - extract_chan = Channel(100) + embed_chan = Channel() + upsert_chan = Channel() + extract_chan = Channel() async with asyncio.TaskGroup() as grp: # get docs - grp.create_task(stream_docs(conn, docs_chan, 10)) + grp.create_task(stream_docs(conn, docs_chan, 100)) # process docs grp.create_task( chunk_docs(conn, docs_chan, embed_chan, upsert_chan, extract_chan) @@ -462,8 +468,8 @@ async def run(graphname: str, conn: TigerGraphConnection): if entity_resolution_switch: logger.info("Entity Processing Start") - entities_chan = Channel(100) - upsert_chan = Channel(100) + entities_chan = Channel() + upsert_chan = Channel() load_q.reopen() async with asyncio.TaskGroup() as grp: grp.create_task(stream_entities(conn, entities_chan, 50)) @@ -487,10 +493,10 @@ async def run(graphname: str, conn: TigerGraphConnection): community_start = time.perf_counter() if community_detection_switch: logger.info("Community Processing Start") - upsert_chan = Channel(10) - comm_process_chan = Channel(100) - upsert_chan = Channel(100) - embed_chan = Channel(100) + upsert_chan = Channel() + comm_process_chan = Channel() + upsert_chan = Channel() + embed_chan = Channel() load_q.reopen() async with asyncio.TaskGroup() as grp: # run louvain diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index 5f288fa2..69ca7dd6 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -6,6 +6,9 @@ from glob import glob import httpx +from graphrag import reusable_channel, workers +from pyTigerGraph import TigerGraphConnection + from common.config import ( doc_processing_config, embedding_service, @@ -17,8 +20,6 @@ from common.extractors import GraphExtractor, LLMEntityRelationshipExtractor from common.extractors.BaseExtractor import BaseExtractor from common.logs.logwriter import LogWriter -from graphrag import reusable_channel, workers -from pyTigerGraph import TigerGraphConnection logger = logging.getLogger(__name__) http_timeout = httpx.Timeout(15.0) @@ -26,7 +27,10 @@ tg_sem = asyncio.Semaphore(20) load_q = reusable_channel.ReuseableChannel() -tg_sem = asyncio.Semaphore(100) +# will pause workers until the event is false +loading_event = asyncio.Event() +loading_event.set() # set the event to true to allow the workers to run + async def install_queries( requried_queries: list[str], @@ -109,7 +113,7 @@ async def init( vector_field=milvus_config.get("vector_field", "document_vector"), text_field=milvus_config.get("text_field", "document_content"), vertex_field=vertex_field, - drop_old=False, + drop_old=True, ) LogWriter.info(f"Initializing {name}") @@ -209,7 +213,6 @@ async def upsert_batch(conn: TigerGraphConnection, data: str): res.raise_for_status() - async def check_vertex_exists(conn, v_id: str): headers = make_headers(conn) async with httpx.AsyncClient(timeout=http_timeout) as client: @@ -221,7 +224,8 @@ async def check_vertex_exists(conn, v_id: str): ) except Exception as e: - logger.error(f"Check err:\n{e}") + err = traceback.format_exc() + logger.error(f"Check 
err:\n{err}") return {"error": True} try: diff --git a/eventual-consistency-service/app/graphrag/workers.py b/eventual-consistency-service/app/graphrag/workers.py index 98b3e69c..cd94d15f 100644 --- a/eventual-consistency-service/app/graphrag/workers.py +++ b/eventual-consistency-service/app/graphrag/workers.py @@ -1,3 +1,4 @@ +import asyncio import base64 import logging import time @@ -7,14 +8,15 @@ import ecc_util import httpx from aiochannel import Channel +from graphrag import community_summarizer, util +from langchain_community.graphs.graph_document import GraphDocument, Node +from pyTigerGraph import TigerGraphConnection + from common.config import milvus_config from common.embeddings.embedding_services import EmbeddingModel from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore from common.extractors.BaseExtractor import BaseExtractor from common.logs.logwriter import LogWriter -from graphrag import community_summarizer, util -from langchain_community.graphs.graph_document import GraphDocument, Node -from pyTigerGraph import TigerGraphConnection vertex_field = milvus_config.get("vertex_field", "vertex_id") @@ -55,6 +57,9 @@ async def install_query( return {"result": res, "error": False} +chunk_sem = asyncio.Semaphore(20) + + async def chunk_doc( conn: TigerGraphConnection, doc: dict[str, str], @@ -67,23 +72,30 @@ async def chunk_doc( Places the resulting chunks into the upsert channel (to be upserted to TG) and the embed channel (to be embedded and written to the vector store) """ - chunker = ecc_util.get_chunker() - chunks = chunker.chunk(doc["attributes"]["text"]) - v_id = util.process_id(doc["v_id"]) - logger.info(f"Chunking {v_id}") - for i, chunk in enumerate(chunks): - chunk_id = f"{v_id}_chunk_{i}" - # send chunks to be upserted (func, args) - logger.info("chunk writes to upsert_chan") - await upsert_chan.put((upsert_chunk, (conn, v_id, chunk_id, chunk))) - - # send chunks to be embedded - logger.info("chunk writes to embed_chan") - await embed_chan.put((chunk_id, chunk, "DocumentChunk")) - - # send chunks to have entities extracted - logger.info("chunk writes to extract_chan") - await extract_chan.put((chunk, chunk_id)) + + # if loader is running, wait until it's done + if not util.loading_event.is_set(): + logger.info("Chunk worker waiting for loading event to finish") + await util.loading_event.wait() + + async with chunk_sem: + chunker = ecc_util.get_chunker() + chunks = chunker.chunk(doc["attributes"]["text"]) + v_id = util.process_id(doc["v_id"]) + logger.info(f"Chunking {v_id}") + for i, chunk in enumerate(chunks): + chunk_id = f"{v_id}_chunk_{i}" + # send chunks to be upserted (func, args) + logger.info("chunk writes to upsert_chan") + await upsert_chan.put((upsert_chunk, (conn, v_id, chunk_id, chunk))) + + # send chunks to have entities extracted + logger.info("chunk writes to extract_chan") + await extract_chan.put((chunk, chunk_id)) + + # send chunks to be embedded + logger.info("chunk writes to embed_chan") + await embed_chan.put((chunk_id, chunk, "DocumentChunk")) return doc["v_id"] @@ -120,6 +132,9 @@ async def upsert_chunk(conn: TigerGraphConnection, doc_id, chunk_id, chunk): ) +embed_sem = asyncio.Semaphore(20) + + async def embed( embed_svc: EmbeddingModel, embed_store: MilvusEmbeddingStore, @@ -141,10 +156,16 @@ async def embed( index_name: str the vertex index to write to """ - logger.info(f"Embedding {v_id}") + async with embed_sem: + logger.info(f"Embedding {v_id}") + + # if loader is running, wait until it's done + if not 
util.loading_event.is_set(): + logger.info("Embed worker waiting for loading event to finish") + await util.loading_event.wait() - vec = await embed_svc.aembed_query(content) - await embed_store.aadd_embeddings([(content, vec)], [{vertex_field: v_id}]) + vec = await embed_svc.aembed_query(content) + await embed_store.aadd_embeddings([(content, vec)], [{vertex_field: v_id}]) async def get_vert_desc(conn, v_id, node: Node): @@ -158,6 +179,9 @@ async def get_vert_desc(conn, v_id, node: Node): return desc +extract_sem = asyncio.Semaphore(20) + + async def extract( upsert_chan: Channel, embed_chan: Channel, @@ -166,117 +190,129 @@ async def extract( chunk: str, chunk_id: str, ): - logger.info(f"Extracting chunk: {chunk_id}") - extracted: list[GraphDocument] = await extractor.aextract(chunk) - # upsert nodes and edges to the graph - for doc in extracted: - for node in doc.nodes: - logger.info(f"extract writes entity vert to upsert\nNode: {node.id}") - v_id = util.process_id(str(node.id)) - if len(v_id) == 0: - continue - desc = await get_vert_desc(conn, v_id, node) - - # embed the entity - # embed with the v_id if the description is blank - if len(desc[0]) == 0: - await embed_chan.put((v_id, v_id, "Entity")) - else: - # (v_id, content, index_name) - await embed_chan.put((v_id, desc[0], "Entity")) + # if loader is running, wait until it's done + if not util.loading_event.is_set(): + logger.info("Extract worker waiting for loading event to finish") + await util.loading_event.wait() + + async with extract_sem: + extracted: list[GraphDocument] = await extractor.aextract(chunk) + logger.info( + f"Extracting chunk: {chunk_id} ({len(extracted)} graph docs extracted)" + ) - await upsert_chan.put( - ( - util.upsert_vertex, # func to call + # upsert nodes and edges to the graph + for doc in extracted: + for node in doc.nodes: + logger.info(f"extract writes entity vert to upsert\nNode: {node.id}") + v_id = util.process_id(str(node.id)) + if len(v_id) == 0: + continue + desc = await get_vert_desc(conn, v_id, node) + + # embed the entity + # embed with the v_id if the description is blank + if len(desc[0]) == 0: + await embed_chan.put((v_id, v_id, "Entity")) + else: + # (v_id, content, index_name) + await embed_chan.put((v_id, desc[0], "Entity")) + + await upsert_chan.put( ( - conn, - "Entity", # v_type - v_id, # v_id - { # attrs - "description": desc, - "epoch_added": int(time.time()), - }, - ), + util.upsert_vertex, # func to call + ( + conn, + "Entity", # v_type + v_id, # v_id + { # attrs + "description": desc, + "epoch_added": int(time.time()), + }, + ), + ) ) - ) - # link the entity to the chunk it came from - logger.info("extract writes contains edge to upsert") - await upsert_chan.put( - ( - util.upsert_edge, + # link the entity to the chunk it came from + logger.info("extract writes contains edge to upsert") + await upsert_chan.put( ( - conn, - "DocumentChunk", # src_type - chunk_id, # src_id - "CONTAINS_ENTITY", # edge_type - "Entity", # tgt_type - v_id, # tgt_id - None, # attributes - ), + util.upsert_edge, + ( + conn, + "DocumentChunk", # src_type + chunk_id, # src_id + "CONTAINS_ENTITY", # edge_type + "Entity", # tgt_type + v_id, # tgt_id + None, # attributes + ), + ) ) - ) - for edge in doc.relationships: - logger.info( - f"extract writes relates edge to upsert:{edge.source.id} -({edge.type})-> {edge.target.id}" - ) - # upsert verts first to make sure their ID becomes an attr - v_id = util.process_id(edge.source.id) # src_id - if len(v_id) == 0: - continue - desc = await get_vert_desc(conn, 
v_id, edge.source) - await upsert_chan.put( - ( - util.upsert_vertex, # func to call + for edge in doc.relationships: + logger.info( + f"extract writes relates edge to upsert:{edge.source.id} -({edge.type})-> {edge.target.id}" + ) + # upsert verts first to make sure their ID becomes an attr + v_id = util.process_id(edge.source.id) # src_id + if len(v_id) == 0: + continue + desc = await get_vert_desc(conn, v_id, edge.source) + await upsert_chan.put( ( - conn, - "Entity", # v_type - v_id, - { # attrs - "description": desc, - "epoch_added": int(time.time()), - }, - ), + util.upsert_vertex, # func to call + ( + conn, + "Entity", # v_type + v_id, + { # attrs + "description": desc, + "epoch_added": int(time.time()), + }, + ), + ) ) - ) - v_id = util.process_id(edge.target.id) - if len(v_id) == 0: - continue - desc = await get_vert_desc(conn, v_id, edge.target) - await upsert_chan.put( - ( - util.upsert_vertex, # func to call + v_id = util.process_id(edge.target.id) + if len(v_id) == 0: + continue + desc = await get_vert_desc(conn, v_id, edge.target) + await upsert_chan.put( ( - conn, - "Entity", # v_type - v_id, # src_id - { # attrs - "description": desc, - "epoch_added": int(time.time()), - }, - ), + util.upsert_vertex, # func to call + ( + conn, + "Entity", # v_type + v_id, # src_id + { # attrs + "description": desc, + "epoch_added": int(time.time()), + }, + ), + ) ) - ) - # upsert the edge between the two entities - await upsert_chan.put( - ( - util.upsert_edge, + # upsert the edge between the two entities + await upsert_chan.put( ( - conn, - "Entity", # src_type - util.process_id(edge.source.id), # src_id - "RELATIONSHIP", # edgeType - "Entity", # tgt_type - util.process_id(edge.target.id), # tgt_id - {"relation_type": edge.type}, # attributes - ), + util.upsert_edge, + ( + conn, + "Entity", # src_type + util.process_id(edge.source.id), # src_id + "RELATIONSHIP", # edgeType + "Entity", # tgt_type + util.process_id(edge.target.id), # tgt_id + {"relation_type": edge.type}, # attributes + ), + ) ) - ) - # embed "Relationship", - # (v_id, content, index_name) - # right now, we're not embedding relationships in graphrag + # embed "Relationship", + # (v_id, content, index_name) + # right now, we're not embedding relationships in graphrag + + +resolve_sem = asyncio.Semaphore(20) async def resolve_entity( @@ -295,58 +331,68 @@ async def resolve_entity( mark as processed """ - try: - results = await emb_store.aget_k_closest(entity_id) - except Exception: - err = traceback.format_exc() - logger.error(err) - return + # if loader is running, wait until it's done + if not util.loading_event.is_set(): + logger.info("Entity Resolution worker waiting for loading event to finish") + await util.loading_event.wait() - if len(results) == 0: - logger.error( - f"aget_k_closest should, minimally, return the entity itself.\n{results}" - ) - raise Exception() - - # merge all entities into the ResolvedEntity vertex - # use the longest v_id as the resolved entity's v_id - resolved_entity_id = entity_id - for v in results: - if len(v) > len(resolved_entity_id): - resolved_entity_id = v - - # upsert the resolved entity - await upsert_chan.put( - ( - util.upsert_vertex, # func to call - ( - conn, - "ResolvedEntity", # v_type - resolved_entity_id, # v_id - { # attrs - }, - ), - ) - ) + async with resolve_sem: + try: + results = await emb_store.aget_k_closest(entity_id) - # create RESOLVES_TO edges from each entity to the ResolvedEntity - for v in results: + except Exception: + err = traceback.format_exc() + 
logger.error(err) + return + + if len(results) == 0: + logger.error( + f"aget_k_closest should, minimally, return the entity itself.\n{results}" + ) + raise Exception() + + # merge all entities into the ResolvedEntity vertex + # use the longest v_id as the resolved entity's v_id + resolved_entity_id = entity_id + for v in results: + if len(v) > len(resolved_entity_id): + resolved_entity_id = v + + # upsert the resolved entity await upsert_chan.put( ( - util.upsert_edge, + util.upsert_vertex, # func to call ( conn, - "Entity", # src_type - v, # src_id - "RESOLVES_TO", # edge_type - "ResolvedEntity", # tgt_type - resolved_entity_id, # tgt_id - None, # attributes + "ResolvedEntity", # v_type + resolved_entity_id, # v_id + { # attrs + }, ), ) ) + # create RESOLVES_TO edges from each entity to the ResolvedEntity + for v in results: + await upsert_chan.put( + ( + util.upsert_edge, + ( + conn, + "Entity", # src_type + v, # src_id + "RESOLVES_TO", # edge_type + "ResolvedEntity", # tgt_type + resolved_entity_id, # tgt_id + None, # attributes + ), + ) + ) + + +comm_sem = asyncio.Semaphore(20) + async def process_community( conn: TigerGraphConnection, @@ -363,35 +409,40 @@ async def process_community( embed summaries """ - - logger.info(f"Processing Community: {comm_id}") - # get the children of the community - children = await util.get_commuinty_children(conn, i, comm_id) - comm_id = util.process_id(comm_id) - - # if the community only has one child, use its description - if len(children) == 1: - summary = children[0] - else: - llm = ecc_util.get_llm_service() - summarizer = community_summarizer.CommunitySummarizer(llm) - summary = await summarizer.summarize(comm_id, children) - - logger.debug(f"Community {comm_id}: {children}, {summary}") - await upsert_chan.put( - ( - util.upsert_vertex, # func to call + # if loader is running, wait until it's done + if not util.loading_event.is_set(): + logger.info("Process Community worker waiting for loading event to finish") + await util.loading_event.wait() + + async with comm_sem: + logger.info(f"Processing Community: {comm_id}") + # get the children of the community + children = await util.get_commuinty_children(conn, i, comm_id) + comm_id = util.process_id(comm_id) + + # if the community only has one child, use its description + if len(children) == 1: + summary = children[0] + else: + llm = ecc_util.get_llm_service() + summarizer = community_summarizer.CommunitySummarizer(llm) + summary = await summarizer.summarize(comm_id, children) + + logger.debug(f"Community {comm_id}: {children}, {summary}") + await upsert_chan.put( ( - conn, - "Community", # v_type - comm_id, # v_id - { # attrs - "description": summary, - "iteration": i, - }, - ), + util.upsert_vertex, # func to call + ( + conn, + "Community", # v_type + comm_id, # v_id + { # attrs + "description": summary, + "iteration": i, + }, + ), + ) ) - ) - # (v_id, content, index_name) - await embed_chan.put((comm_id, summary, "Community")) + # (v_id, content, index_name) + await embed_chan.put((comm_id, summary, "Community")) From 8c96f26f713d634eac1e5b22eb5d337df46f6edb Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Thu, 22 Aug 2024 13:10:24 -0400 Subject: [PATCH 77/91] rm print --- eventual-consistency-service/app/graphrag/graph_rag.py | 1 - 1 file changed, 1 deletion(-) diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py index a1af8cc4..71e2f0f0 100644 --- 
a/eventual-consistency-service/app/graphrag/graph_rag.py +++ b/eventual-consistency-service/app/graphrag/graph_rag.py @@ -173,7 +173,6 @@ async def load(conn: TigerGraphConnection): loading_event.clear() await upsert_batch(conn, data) - print("giving the graph time to catch up",flush=True) await asyncio.sleep(5) loading_event.set() else: From 0604ecfa2b1a830afaa64824ef59251a971771b7 Mon Sep 17 00:00:00 2001 From: Parker Erickson Date: Thu, 29 Aug 2024 08:46:01 -0500 Subject: [PATCH 78/91] Stability improvements --- common/config.py | 2 +- .../GraphRAG_Community_Retriever.gsql | 2 +- copilot/app/supportai/retrievers/GraphRAG.py | 4 ++-- .../app/graphrag/util.py | 22 +++++++++++------ .../app/graphrag/workers.py | 24 +++++++++++++------ 5 files changed, 36 insertions(+), 18 deletions(-) diff --git a/common/config.py b/common/config.py index 9f1d3cab..4548aa3a 100644 --- a/common/config.py +++ b/common/config.py @@ -187,7 +187,7 @@ def get_llm_service(llm_config) -> LLM_Model: ): doc_processing_config = { "chunker": "semantic", - "chunker_config": {"method": "percentile", "threshold": 0.95}, + "chunker_config": {"method": "percentile", "threshold": 0.90}, "extractor": "graphrag", "extractor_config": {}, } diff --git a/common/gsql/supportai/retrievers/GraphRAG_Community_Retriever.gsql b/common/gsql/supportai/retrievers/GraphRAG_Community_Retriever.gsql index 2d6ef9b0..97e44d10 100644 --- a/common/gsql/supportai/retrievers/GraphRAG_Community_Retriever.gsql +++ b/common/gsql/supportai/retrievers/GraphRAG_Community_Retriever.gsql @@ -1,4 +1,4 @@ -CREATE DISTRIBUTED QUERY GraphRAG_CommunityRetriever(INT community_level=2) FOR GRAPH pyTigerGraphRAG { +CREATE DISTRIBUTED QUERY GraphRAG_Community_Retriever(INT community_level=2) { comms = {Community.*}; selected_comms = SELECT c FROM comms:c WHERE c.iteration == community_level; diff --git a/copilot/app/supportai/retrievers/GraphRAG.py b/copilot/app/supportai/retrievers/GraphRAG.py index 442f8fcb..4a973dc8 100644 --- a/copilot/app/supportai/retrievers/GraphRAG.py +++ b/copilot/app/supportai/retrievers/GraphRAG.py @@ -40,10 +40,10 @@ def __init__( connection: TigerGraphConnectionProxy, ): super().__init__(embedding_service, embedding_store, llm_service, connection) - self._check_query_install("GraphRAG_CommunityRetriever") + self._check_query_install("GraphRAG_Community_Retriever") def search(self, question, community_level: int): - res = self.conn.runInstalledQuery("GraphRAG_CommunityRetriever", {"community_level": community_level}, usePost=True) + res = self.conn.runInstalledQuery("GraphRAG_Community_Retriever", {"community_level": community_level}, usePost=True) return res async def _generate_candidate(self, question, context): diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index 69ca7dd6..af0142e6 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -24,7 +24,7 @@ logger = logging.getLogger(__name__) http_timeout = httpx.Timeout(15.0) -tg_sem = asyncio.Semaphore(20) +tg_sem = asyncio.Semaphore(2) load_q = reusable_channel.ReuseableChannel() # will pause workers until the event is false @@ -270,17 +270,25 @@ async def get_commuinty_children(conn, i: int, c: str): headers = make_headers(conn) async with httpx.AsyncClient(timeout=None) as client: async with tg_sem: - resp = await client.get( - f"{conn.restppUrl}/query/{conn.graphname}/get_community_children", - params={"comm": c, "iter": i}, - headers=headers, - ) + 
try: + resp = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/get_community_children", + params={"comm": c, "iter": i}, + headers=headers, + ) + except: + logger.error(f"Get Children err:\n{traceback.format_exc()}") try: resp.raise_for_status() except Exception as e: logger.error(f"Get Children err:\n{e}") descrs = [] - for d in resp.json()["results"][0]["children"]: + try: + res = resp.json()["results"][0]["children"] + except Exception as e: + logger.error(f"Get Children err:\n{e}") + res = [] + for d in res: desc = d["attributes"]["description"] # if it's the entity iteration if i == 1: diff --git a/eventual-consistency-service/app/graphrag/workers.py b/eventual-consistency-service/app/graphrag/workers.py index cd94d15f..c00b9187 100644 --- a/eventual-consistency-service/app/graphrag/workers.py +++ b/eventual-consistency-service/app/graphrag/workers.py @@ -163,9 +163,15 @@ async def embed( if not util.loading_event.is_set(): logger.info("Embed worker waiting for loading event to finish") await util.loading_event.wait() - - vec = await embed_svc.aembed_query(content) - await embed_store.aadd_embeddings([(content, vec)], [{vertex_field: v_id}]) + try: + vec = await embed_svc.aembed_query(content) + except Exception as e: + logger.error(f"Failed to embed {v_id}: {e}") + return + try: + await embed_store.aadd_embeddings([(content, vec)], [{vertex_field: v_id}]) + except Exception as e: + logger.error(f"Failed to add embeddings for {v_id}: {e}") async def get_vert_desc(conn, v_id, node: Node): @@ -196,10 +202,14 @@ async def extract( await util.loading_event.wait() async with extract_sem: - extracted: list[GraphDocument] = await extractor.aextract(chunk) - logger.info( - f"Extracting chunk: {chunk_id} ({len(extracted)} graph docs extracted)" - ) + try: + extracted: list[GraphDocument] = await extractor.aextract(chunk) + logger.info( + f"Extracting chunk: {chunk_id} ({len(extracted)} graph docs extracted)" + ) + except Exception as e: + logger.error(f"Failed to extract chunk {chunk_id}: {e}") + extracted = [] # upsert nodes and edges to the graph for doc in extracted: From d1197bc000a336a055f93b2fd737b874482d808f Mon Sep 17 00:00:00 2001 From: Parker Erickson Date: Thu, 29 Aug 2024 09:14:21 -0500 Subject: [PATCH 79/91] remove drop_old=True --- eventual-consistency-service/app/graphrag/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index d645c161..35f5bcdf 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -112,7 +112,7 @@ async def init( vector_field=milvus_config.get("vector_field", "document_vector"), text_field=milvus_config.get("text_field", "document_content"), vertex_field=vertex_field, - drop_old=True, + drop_old=False, ) LogWriter.info(f"Initializing {name}") From 6f2f8d6eb7b74dd28463f669e9460dbb2b10af17 Mon Sep 17 00:00:00 2001 From: Parker Erickson Date: Thu, 29 Aug 2024 15:06:49 -0500 Subject: [PATCH 80/91] feat(graphrag): add descriptions to all upserts, cooccurence edges --- common/config.py | 2 +- .../LLMEntityRelationshipExtractor.py | 35 +++++++++++++++---- .../app/graphrag/util.py | 2 +- .../app/graphrag/workers.py | 18 +++++++++- 4 files changed, 47 insertions(+), 10 deletions(-) diff --git a/common/config.py b/common/config.py index 4548aa3a..ee7a850b 100644 --- a/common/config.py +++ b/common/config.py @@ -188,7 +188,7 @@ def get_llm_service(llm_config) -> 
LLM_Model: doc_processing_config = { "chunker": "semantic", "chunker_config": {"method": "percentile", "threshold": 0.90}, - "extractor": "graphrag", + "extractor": "llm", "extractor_config": {}, } elif DOC_PROCESSING_CONFIG.endswith(".json"): diff --git a/common/extractors/LLMEntityRelationshipExtractor.py b/common/extractors/LLMEntityRelationshipExtractor.py index 415c3235..959ce644 100644 --- a/common/extractors/LLMEntityRelationshipExtractor.py +++ b/common/extractors/LLMEntityRelationshipExtractor.py @@ -4,7 +4,8 @@ from common.extractors.BaseExtractor import BaseExtractor from common.llm_services import LLM_Model from common.py_schemas import KnowledgeGraph - +from langchain_community.graphs.graph_document import Node, Relationship, GraphDocument +from langchain_core.documents import Document class LLMEntityRelationshipExtractor(BaseExtractor): def __init__( @@ -19,7 +20,7 @@ def __init__( self.allowed_edge_types = allowed_relationship_types self.strict_mode = strict_mode - def _extract_kg_from_doc(self, doc, chain, parser): + async def _extract_kg_from_doc(self, doc, chain, parser) -> list[GraphDocument]: """ returns: { @@ -49,7 +50,7 @@ def _extract_kg_from_doc(self, doc, chain, parser): """ try: - out = chain.invoke( + out = await chain.ainvoke( {"input": doc, "format_instructions": parser.get_format_instructions()} ) except Exception as e: @@ -133,15 +134,30 @@ def _extract_kg_from_doc(self, doc, chain, parser): for rel in formatted_rels if rel["type"] in self.allowed_edge_types ] - return {"nodes": formatted_nodes, "rels": formatted_rels} + + nodes = [] + for node in formatted_nodes: + nodes.append(Node(id=node["id"], + type=node["type"], + properties={"description": node["definition"]})) + relationships = [] + for rel in formatted_rels: + relationships.append(Relationship(source=Node(id=rel["source"], type=rel["source"], + properties={"description": rel["definition"]}), + target=Node(id=rel["target"], type=rel["target"], + properties={"description": rel["definition"]}), type=rel["type"])) + + return [GraphDocument(nodes=nodes, relationships=relationships, source=Document(page_content=doc))] + except: print("Error Processing: ", out) - return {"nodes": [], "rels": []} + return [GraphDocument(nodes=[], relationships=[], source=Document(page_content=doc))] - def document_er_extraction(self, document): + async def document_er_extraction(self, document): from langchain.prompts import ChatPromptTemplate from langchain.output_parsers import PydanticOutputParser + parser = PydanticOutputParser(pydantic_object=KnowledgeGraph) prompt = [ ("system", self.llm_service.entity_relationship_extraction_prompt), @@ -171,8 +187,13 @@ def document_er_extraction(self, document): prompt.append(("human", f"Allowed Edge Types: {self.allowed_edge_types}")) prompt = ChatPromptTemplate.from_messages(prompt) chain = prompt | self.llm_service.model # | parser - er = self._extract_kg_from_doc(document, chain, parser) + er = await self._extract_kg_from_doc(document, chain, parser) return er def extract(self, text): return self.document_er_extraction(text) + + async def aextract(self, text) -> list[GraphDocument]: + return await self.document_er_extraction(text) + + diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index 35f5bcdf..3911fd56 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -178,7 +178,7 @@ def map_attrs(attributes: dict): def process_id(v_id: str): - v_id = 
v_id.replace(" ", "_").replace("/", "") + v_id = v_id.replace(" ", "_").replace("/", "").replace("%", "percent") has_func = re.compile(r"(.*)\(").findall(v_id) if len(has_func) > 0: diff --git a/eventual-consistency-service/app/graphrag/workers.py b/eventual-consistency-service/app/graphrag/workers.py index c00b9187..8967d120 100644 --- a/eventual-consistency-service/app/graphrag/workers.py +++ b/eventual-consistency-service/app/graphrag/workers.py @@ -213,7 +213,7 @@ async def extract( # upsert nodes and edges to the graph for doc in extracted: - for node in doc.nodes: + for i, node in enumerate(doc.nodes): logger.info(f"extract writes entity vert to upsert\nNode: {node.id}") v_id = util.process_id(str(node.id)) if len(v_id) == 0: @@ -259,6 +259,22 @@ async def extract( ), ) ) + for node2 in doc.nodes[i + 1:]: + v_id2 = util.process_id(str(node2.id)) + await upsert_chan.put( + ( + util.upsert_edge, + ( + conn, + "Entity", # src_type + v_id, # src_id + "RELATIONSHIP", # edgeType + "Entity", # tgt_type + v_id2, # tgt_id + {"relation_type": "DOC_CHUNK_COOCCURRENCE"}, # attributes + ), + ) + ) for edge in doc.relationships: logger.info( From e76fbd3f873c54d0ff74bb9e7b98c907f4e7b6c4 Mon Sep 17 00:00:00 2001 From: Parker Erickson Date: Thu, 29 Aug 2024 15:44:19 -0500 Subject: [PATCH 81/91] update async/sync llm relationship extraction --- .../LLMEntityRelationshipExtractor.py | 181 ++++++++++++++---- 1 file changed, 146 insertions(+), 35 deletions(-) diff --git a/common/extractors/LLMEntityRelationshipExtractor.py b/common/extractors/LLMEntityRelationshipExtractor.py index 959ce644..6ee999e9 100644 --- a/common/extractors/LLMEntityRelationshipExtractor.py +++ b/common/extractors/LLMEntityRelationshipExtractor.py @@ -20,42 +20,116 @@ def __init__( self.allowed_edge_types = allowed_relationship_types self.strict_mode = strict_mode - async def _extract_kg_from_doc(self, doc, chain, parser) -> list[GraphDocument]: - """ - returns: - { - "nodes": [ - { - "id": "str", - "type": "string", - "definition": "string" - } - ], - "rels": [ - { - "source":{ - "id": "str", - "type": "string", - "definition": "string" - } - "target":{ - "id": "str", - "type": "string", - "definition": "string" + async def _aextract_kg_from_doc(self, doc, chain, parser) -> list[GraphDocument]: + try: + out = await chain.ainvoke( + {"input": doc, "format_instructions": parser.get_format_instructions()} + ) + except Exception as e: + return [GraphDocument(nodes=[], relationships=[], source=Document(page_content=doc))] + try: + if "```json" not in out.content: + json_out = json.loads(out.content.strip("content=")) + else: + json_out = json.loads( + out.content.split("```")[1].strip("```").strip("json").strip() + ) + + formatted_rels = [] + for rels in json_out["rels"]: + if isinstance(rels["source"], str) and isinstance(rels["target"], str): + formatted_rels.append( + { + "source": rels["source"], + "target": rels["target"], + "type": rels["relation_type"].replace(" ", "_").upper(), + "definition": rels["definition"], + } + ) + elif isinstance(rels["source"], dict) and isinstance( + rels["target"], str + ): + formatted_rels.append( + { + "source": rels["source"]["id"], + "target": rels["target"], + "type": rels["relation_type"].replace(" ", "_").upper(), + "definition": rels["definition"], + } + ) + elif isinstance(rels["source"], str) and isinstance( + rels["target"], dict + ): + formatted_rels.append( + { + "source": rels["source"], + "target": rels["target"]["id"], + "type": rels["relation_type"].replace(" ", "_").upper(), + 
"definition": rels["definition"], + } + ) + elif isinstance(rels["source"], dict) and isinstance( + rels["target"], dict + ): + formatted_rels.append( + { + "source": rels["source"]["id"], + "target": rels["target"]["id"], + "type": rels["relation_type"].replace(" ", "_").upper(), + "definition": rels["definition"], + } + ) + else: + raise Exception("Relationship parsing error") + formatted_nodes = [] + for node in json_out["nodes"]: + formatted_nodes.append( + { + "id": node["id"], + "type": node["node_type"].replace(" ", "_").capitalize(), + "definition": node["definition"], } - "definition" - } - ] - } - """ + ) + # filter relationships and nodes based on allowed types + if self.strict_mode: + if self.allowed_vertex_types: + formatted_nodes = [ + node + for node in formatted_nodes + if node["type"] in self.allowed_vertex_types + ] + if self.allowed_edge_types: + formatted_rels = [ + rel + for rel in formatted_rels + if rel["type"] in self.allowed_edge_types + ] + + nodes = [] + for node in formatted_nodes: + nodes.append(Node(id=node["id"], + type=node["type"], + properties={"description": node["definition"]})) + relationships = [] + for rel in formatted_rels: + relationships.append(Relationship(source=Node(id=rel["source"], type=rel["source"], + properties={"description": rel["definition"]}), + target=Node(id=rel["target"], type=rel["target"], + properties={"description": rel["definition"]}), type=rel["type"])) + + return [GraphDocument(nodes=nodes, relationships=relationships, source=Document(page_content=doc))] + + except: + return [GraphDocument(nodes=[], relationships=[], source=Document(page_content=doc))] + + def _extract_kg_from_doc(self, doc, chain, parser) -> list[GraphDocument]: try: - out = await chain.ainvoke( + out = chain.invoke( {"input": doc, "format_instructions": parser.get_format_instructions()} ) except Exception as e: - print("Error: ", e) - return {"nodes": [], "rels": []} + return [GraphDocument(nodes=[], relationships=[], source=Document(page_content=doc))] try: if "```json" not in out.content: json_out = json.loads(out.content.strip("content=")) @@ -150,10 +224,47 @@ async def _extract_kg_from_doc(self, doc, chain, parser) -> list[GraphDocument]: return [GraphDocument(nodes=nodes, relationships=relationships, source=Document(page_content=doc))] except: - print("Error Processing: ", out) - return [GraphDocument(nodes=[], relationships=[], source=Document(page_content=doc))] + return [GraphDocument(nodes=[], relationships=[], source=Document(page_content=doc))] + + async def adocument_er_extraction(self, document): + from langchain.prompts import ChatPromptTemplate + from langchain.output_parsers import PydanticOutputParser + + + parser = PydanticOutputParser(pydantic_object=KnowledgeGraph) + prompt = [ + ("system", self.llm_service.entity_relationship_extraction_prompt), + ( + "human", + "Tip: Make sure to answer in the correct format and do " + "not include any explanations. " + "Use the given format to extract information from the " + "following input: {input}", + ), + ( + "human", + "Mandatory: Make sure to answer in the correct format, specified here: {format_instructions}", + ), + ] + if self.allowed_vertex_types or self.allowed_edge_types: + prompt.append( + ( + "human", + "Tip: Make sure to use the following types if they are applicable. 
" + "If the input does not contain any of the types, you may create your own.", + ) + ) + if self.allowed_vertex_types: + prompt.append(("human", f"Allowed Node Types: {self.allowed_vertex_types}")) + if self.allowed_edge_types: + prompt.append(("human", f"Allowed Edge Types: {self.allowed_edge_types}")) + prompt = ChatPromptTemplate.from_messages(prompt) + chain = prompt | self.llm_service.model # | parser + er = await self._aextract_kg_from_doc(document, chain, parser) + return er + - async def document_er_extraction(self, document): + def document_er_extraction(self, document): from langchain.prompts import ChatPromptTemplate from langchain.output_parsers import PydanticOutputParser @@ -187,13 +298,13 @@ async def document_er_extraction(self, document): prompt.append(("human", f"Allowed Edge Types: {self.allowed_edge_types}")) prompt = ChatPromptTemplate.from_messages(prompt) chain = prompt | self.llm_service.model # | parser - er = await self._extract_kg_from_doc(document, chain, parser) + er = self._extract_kg_from_doc(document, chain, parser) return er def extract(self, text): return self.document_er_extraction(text) async def aextract(self, text) -> list[GraphDocument]: - return await self.document_er_extraction(text) + return await self.adocument_er_extraction(text) From 689e7d520b58e2e5532c77e061e31f2eacaf4081 Mon Sep 17 00:00:00 2001 From: Parker Erickson Date: Fri, 30 Aug 2024 14:49:28 -0500 Subject: [PATCH 82/91] feat(graphrag): add type information upsert --- common/gsql/supportai/SupportAI_Schema.gsql | 3 ++ .../create_entity_type_relationships.gsql | 20 +++++++++++ .../app/graphrag/graph_rag.py | 8 +++++ .../app/graphrag/util.py | 29 +++++++++++++-- .../app/graphrag/workers.py | 35 ++++++++++++++++++- 5 files changed, 91 insertions(+), 4 deletions(-) create mode 100644 common/gsql/supportai/create_entity_type_relationships.gsql diff --git a/common/gsql/supportai/SupportAI_Schema.gsql b/common/gsql/supportai/SupportAI_Schema.gsql index 718ab1a7..05c9b306 100644 --- a/common/gsql/supportai/SupportAI_Schema.gsql +++ b/common/gsql/supportai/SupportAI_Schema.gsql @@ -6,6 +6,7 @@ CREATE SCHEMA_CHANGE JOB add_supportai_schema { ADD VERTEX Relationship(PRIMARY_ID id STRING, definition STRING, short_name STRING, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX DocumentCollection(PRIMARY_ID id STRING, epoch_added UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX Content(PRIMARY_ID id STRING, text STRING, epoch_added UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; + ADD VERTEX EntityType(PRIMARY_ID id STRING, description STRING, epoch_added UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD DIRECTED EDGE HAS_CONTENT(FROM Document, TO Content|FROM DocumentChunk, TO Content) WITH REVERSE_EDGE="reverse_HAS_CONTENT"; ADD DIRECTED EDGE IS_CHILD_OF(FROM Concept, TO Concept) WITH REVERSE_EDGE="reverse_IS_CHILD_OF"; ADD DIRECTED EDGE IS_HEAD_OF(FROM Entity, TO Relationship) WITH REVERSE_EDGE="reverse_IS_HEAD_OF"; @@ -18,6 +19,8 @@ CREATE SCHEMA_CHANGE JOB add_supportai_schema { ADD DIRECTED EDGE HAS_CHILD(FROM Document, TO DocumentChunk) WITH REVERSE_EDGE="reverse_HAS_CHILD"; ADD DIRECTED EDGE HAS_RELATIONSHIP(FROM Concept, TO Concept, relation_type STRING) WITH REVERSE_EDGE="reverse_HAS_RELATIONSHIP"; ADD DIRECTED EDGE CONTAINS_DOCUMENT(FROM DocumentCollection, TO Document) WITH 
REVERSE_EDGE="reverse_CONTAINS_DOCUMENT"; + ADD DIRECTED EDGE ENTITY_HAS_TYPE(FROM Entity, TO EntityType) WITH REVERSE_EDGE="reverse_ENTITY_HAS_TYPE"; + ADD DIRECTED EDGE RELATIONSHIP_TYPE(FROM EntityType, TO EntityType, DISCRIMINATOR(relation_type STRING), frequency INT) WITH REVERSE_EDGE="reverse_RELATIONSHIP_TYPE"; // GraphRAG ADD VERTEX Community (PRIMARY_ID id STRING, iteration UINT, description STRING) WITH PRIMARY_ID_AS_ATTRIBUTE="true"; diff --git a/common/gsql/supportai/create_entity_type_relationships.gsql b/common/gsql/supportai/create_entity_type_relationships.gsql new file mode 100644 index 00000000..a00626d2 --- /dev/null +++ b/common/gsql/supportai/create_entity_type_relationships.gsql @@ -0,0 +1,20 @@ +CREATE DISTRIBUTED QUERY create_entity_type_relationships(/* Parameters here */) SYNTAX v2{ + MapAccum>> @rel_type_count; // entity type, relationship type for entity type, frequency + SumAccum @@rels_inserted; + ents = {Entity.*}; + accum_types = SELECT et FROM ents:e -(RELATIONSHIP>:r)- Entity:e2 -(ENTITY_HAS_TYPE>:eht)- EntityType:et + WHERE r.relation_type != "DOC_CHUNK_COOCCURRENCE" + ACCUM + e.@rel_type_count += (et.id -> (r.relation_type -> 1)); + + ets = SELECT et FROM ents:e -(ENTITY_HAS_TYPE>:eht)- EntityType:et + ACCUM + FOREACH (entity_type, rel_type_freq) IN e.@rel_type_count DO + FOREACH (rel_type, freq) IN e.@rel_type_count.get(entity_type) DO + INSERT INTO RELATIONSHIP_TYPE VALUES (et.id, entity_type, rel_type, freq), + @@rels_inserted += 1 + END + END; + + PRINT @@rels_inserted as relationships_inserted; +} \ No newline at end of file diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py index 71e2f0f0..be022b3d 100644 --- a/eventual-consistency-service/app/graphrag/graph_rag.py +++ b/eventual-consistency-service/app/graphrag/graph_rag.py @@ -19,6 +19,7 @@ stream_ids, tg_sem, upsert_batch, + add_rels_between_types ) from pyTigerGraph import TigerGraphConnection @@ -462,6 +463,12 @@ async def run(graphname: str, conn: TigerGraphConnection): init_end = time.perf_counter() logger.info("Doc Processing End") + # Type Resolution + type_start = time.perf_counter() + logger.info("Type Processing Start") + await add_rels_between_types(conn) + logger.info("Type Processing End") + type_end = time.perf_counter() # Entity Resolution entity_start = time.perf_counter() @@ -516,6 +523,7 @@ async def run(graphname: str, conn: TigerGraphConnection): end = time.perf_counter() logger.info(f"DONE. graphrag system initializer dT: {init_end-init_start}") logger.info(f"DONE. graphrag entity resolution dT: {entity_end-entity_start}") + logger.info(f"DONE. graphrag type creation dT: {type_end-type_start}") logger.info( f"DONE. 
graphrag community initializer dT: {community_end-community_start}" ) diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index 3911fd56..da706ea7 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -73,6 +73,7 @@ async def init( "common/gsql/graphRAG/louvain/graphrag_louvain_communities", "common/gsql/graphRAG/louvain/modularity", "common/gsql/graphRAG/louvain/stream_community", + "common/gsql/supportai/create_entity_type_relationships" ] # add louvain to queries q = [x.split(".gsql")[0] for x in glob("common/gsql/graphRAG/louvain/*")] @@ -206,9 +207,14 @@ async def upsert_batch(conn: TigerGraphConnection, data: str): headers = make_headers(conn) async with httpx.AsyncClient(timeout=http_timeout) as client: async with tg_sem: - res = await client.post( - f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers - ) + try: + res = await client.post( + f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers + ) + except Exception as e: + err = traceback.format_exc() + logger.error(f"Upsert err:\n{err}") + return {"error": True} res.raise_for_status() @@ -321,6 +327,23 @@ async def check_all_ents_resolved(conn): return res +async def add_rels_between_types(conn): + headers = make_headers(conn) + async with httpx.AsyncClient(timeout=None) as client: + async with tg_sem: + resp = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/create_entity_type_relationships", + headers=headers, + ) + try: + resp.raise_for_status() + except Exception as e: + logger.error(f"Check Vert EntityType err:\n{e}") + + res = resp.json()["results"][0]["relationships_inserted"] + logger.info(resp.json()["results"]) + + return res async def check_vertex_has_desc(conn, i: int): headers = make_headers(conn) diff --git a/eventual-consistency-service/app/graphrag/workers.py b/eventual-consistency-service/app/graphrag/workers.py index 8967d120..fa1b7476 100644 --- a/eventual-consistency-service/app/graphrag/workers.py +++ b/eventual-consistency-service/app/graphrag/workers.py @@ -15,7 +15,7 @@ from common.config import milvus_config from common.embeddings.embedding_services import EmbeddingModel from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore -from common.extractors.BaseExtractor import BaseExtractor +from common.extractors import BaseExtractor, LLMEntityRelationshipExtractor from common.logs.logwriter import LogWriter vertex_field = milvus_config.get("vertex_field", "vertex_id") @@ -242,6 +242,39 @@ async def extract( ), ) ) + if isinstance(extractor, LLMEntityRelationshipExtractor): + logger.info("extract writes type vert to upsert") + type_id = util.process_id(node.type) + if len(type_id) == 0: + continue + await upsert_chan.put( + ( + util.upsert_vertex, # func to call + ( + conn, + "EntityType", # v_type + type_id, # v_id + { # attrs + "epoch_added": int(time.time()), + }, + ) + ) + ) + logger.info("extract writes entity_has_type edge to upsert") + await upsert_chan.put( + ( + util.upsert_edge, + ( + conn, + "Entity", # src_type + v_id, # src_id + "ENTITY_HAS_TYPE", # edgeType + "EntityType", # tgt_type + type_id, # tgt_id + None, # attributes + ), + ) + ) # link the entity to the chunk it came from logger.info("extract writes contains edge to upsert") From 4e39863b71bb3f471641720343dd732ab58468b5 Mon Sep 17 00:00:00 2001 From: Parker Erickson Date: Tue, 3 Sep 2024 12:11:19 -0500 Subject: [PATCH 83/91] fix(community 
summmarizer): add error catch --- .../app/graphrag/community_summarizer.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/eventual-consistency-service/app/graphrag/community_summarizer.py b/eventual-consistency-service/app/graphrag/community_summarizer.py index 2bef4095..c23b87cf 100644 --- a/eventual-consistency-service/app/graphrag/community_summarizer.py +++ b/eventual-consistency-service/app/graphrag/community_summarizer.py @@ -35,10 +35,13 @@ async def summarize(self, name: str, text: list[str]) -> CommunitySummary: # remove iteration tags from name name = id_pat.sub("", name) - summary = await chain.ainvoke( - { - "entity_name": name, - "description_list": text, - } - ) + try: + summary = await chain.ainvoke( + { + "entity_name": name, + "description_list": text, + } + ) + except Exception as e: + return "error generating summary: {}".format(e) return summary.summary From edfeeb968082a191a668054b63d24da7fc4d7948 Mon Sep 17 00:00:00 2001 From: Parker Erickson Date: Tue, 3 Sep 2024 15:17:34 -0500 Subject: [PATCH 84/91] fixes on pr --- .../app/graphrag/community_summarizer.py | 2 +- eventual-consistency-service/app/graphrag/util.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/eventual-consistency-service/app/graphrag/community_summarizer.py b/eventual-consistency-service/app/graphrag/community_summarizer.py index c23b87cf..54b4d0db 100644 --- a/eventual-consistency-service/app/graphrag/community_summarizer.py +++ b/eventual-consistency-service/app/graphrag/community_summarizer.py @@ -43,5 +43,5 @@ async def summarize(self, name: str, text: list[str]) -> CommunitySummary: } ) except Exception as e: - return "error generating summary: {}".format(e) + return {"summary": f"Error: {e}"} return summary.summary diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index da706ea7..91e69b12 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -340,8 +340,12 @@ async def add_rels_between_types(conn): except Exception as e: logger.error(f"Check Vert EntityType err:\n{e}") - res = resp.json()["results"][0]["relationships_inserted"] - logger.info(resp.json()["results"]) + if resp.status_code != 200: + logger.error(f"Check Vert EntityType err:\n{resp.text}") + res = {"error": True} + else: + res = resp.json()["results"][0]["relationships_inserted"] + logger.info(resp.json()["results"]) return res From 33861b1478b9ddcf2e04ae479dd8f38759b3a4bd Mon Sep 17 00:00:00 2001 From: Parker Erickson Date: Thu, 5 Sep 2024 09:08:05 -0500 Subject: [PATCH 85/91] update error return --- .../app/graphrag/community_summarizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eventual-consistency-service/app/graphrag/community_summarizer.py b/eventual-consistency-service/app/graphrag/community_summarizer.py index 54b4d0db..f3d5f869 100644 --- a/eventual-consistency-service/app/graphrag/community_summarizer.py +++ b/eventual-consistency-service/app/graphrag/community_summarizer.py @@ -43,5 +43,5 @@ async def summarize(self, name: str, text: list[str]) -> CommunitySummary: } ) except Exception as e: - return {"summary": f"Error: {e}"} + return {"error": True, "summary": ""} return summary.summary From 768217b42abc11d331cd05c111b08677a78fd722 Mon Sep 17 00:00:00 2001 From: Parker Erickson Date: Thu, 5 Sep 2024 09:20:13 -0500 Subject: [PATCH 86/91] remove error returns, only log --- 
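Note: the hunks in [PATCH 83/91] through [PATCH 87/91] converge on a dict-based status contract for CommunitySummarizer.summarize(): always return {"error": bool, "summary": str} and let the caller branch on the flag instead of catching exceptions. The following is a minimal, self-contained sketch of that contract only; summarize() here is a stand-in that skips the LLM chain, and all names are illustrative rather than the real class.

import asyncio
import logging

logger = logging.getLogger(__name__)


async def summarize(name: str, descriptions: list[str]) -> dict:
    """Stand-in for CommunitySummarizer.summarize(): never raise, always
    return {"error": bool, "summary": str} so callers can branch on it."""
    try:
        # The real implementation awaits an LLM chain here; any failure is
        # caught and reported through the flag rather than re-raised.
        summary = "; ".join(descriptions) or name
    except Exception:
        return {"error": True, "summary": ""}
    return {"error": False, "summary": summary}


async def process_community(comm_id: str, children: list[str]) -> None:
    result = await summarize(comm_id, children)
    if result["error"]:
        logger.error(f"Failed to summarize community {comm_id}")
        return
    # Only successful summaries go on to be upserted and embedded.
    print(comm_id, "->", result["summary"])


if __name__ == "__main__":
    asyncio.run(process_community("comm_1", ["entity A", "entity B"]))
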
eventual-consistency-service/app/graphrag/util.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index 91e69b12..75b20edf 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -214,7 +214,6 @@ async def upsert_batch(conn: TigerGraphConnection, data: str): except Exception as e: err = traceback.format_exc() logger.error(f"Upsert err:\n{err}") - return {"error": True} res.raise_for_status() @@ -231,14 +230,11 @@ async def check_vertex_exists(conn, v_id: str): except Exception as e: err = traceback.format_exc() logger.error(f"Check err:\n{err}") - return {"error": True} - try: res.raise_for_status() return res.json() except Exception as e: logger.error(f"Check err:\n{e}\n{res.text}") - return {"error": True} async def upsert_edge( @@ -342,12 +338,10 @@ async def add_rels_between_types(conn): if resp.status_code != 200: logger.error(f"Check Vert EntityType err:\n{resp.text}") - res = {"error": True} else: res = resp.json()["results"][0]["relationships_inserted"] logger.info(resp.json()["results"]) - - return res + return res async def check_vertex_has_desc(conn, i: int): headers = make_headers(conn) From 98cb62f9c133aeb1d69dbf403b4c11265e9541f5 Mon Sep 17 00:00:00 2001 From: Parker Erickson Date: Thu, 5 Sep 2024 09:39:11 -0500 Subject: [PATCH 87/91] updates --- .../app/graphrag/community_summarizer.py | 2 +- eventual-consistency-service/app/graphrag/graph_rag.py | 6 +++++- eventual-consistency-service/app/graphrag/util.py | 2 ++ eventual-consistency-service/app/graphrag/workers.py | 4 ++++ 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/eventual-consistency-service/app/graphrag/community_summarizer.py b/eventual-consistency-service/app/graphrag/community_summarizer.py index f3d5f869..eacdeca7 100644 --- a/eventual-consistency-service/app/graphrag/community_summarizer.py +++ b/eventual-consistency-service/app/graphrag/community_summarizer.py @@ -44,4 +44,4 @@ async def summarize(self, name: str, text: list[str]) -> CommunitySummary: ) except Exception as e: return {"error": True, "summary": ""} - return summary.summary + return {"error": False, "summary": summary.summary} diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py index be022b3d..70a966bc 100644 --- a/eventual-consistency-service/app/graphrag/graph_rag.py +++ b/eventual-consistency-service/app/graphrag/graph_rag.py @@ -466,7 +466,11 @@ async def run(graphname: str, conn: TigerGraphConnection): # Type Resolution type_start = time.perf_counter() logger.info("Type Processing Start") - await add_rels_between_types(conn) + res = await add_rels_between_types(conn) + if res["error"]: + logger.error(f"Error adding relationships between types: {res}") + else: + logger.info(f"Added relationships between types: {res}") logger.info("Type Processing End") type_end = time.perf_counter() # Entity Resolution diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index 75b20edf..016a275e 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -235,6 +235,7 @@ async def check_vertex_exists(conn, v_id: str): return res.json() except Exception as e: logger.error(f"Check err:\n{e}\n{res.text}") + return {"error": True, "message": res.text} async def 
upsert_edge( @@ -338,6 +339,7 @@ async def add_rels_between_types(conn): if resp.status_code != 200: logger.error(f"Check Vert EntityType err:\n{resp.text}") + return {"error": True, "message": resp.text} else: res = resp.json()["results"][0]["relationships_inserted"] logger.info(resp.json()["results"]) diff --git a/eventual-consistency-service/app/graphrag/workers.py b/eventual-consistency-service/app/graphrag/workers.py index fa1b7476..fe856c60 100644 --- a/eventual-consistency-service/app/graphrag/workers.py +++ b/eventual-consistency-service/app/graphrag/workers.py @@ -486,6 +486,10 @@ async def process_community( llm = ecc_util.get_llm_service() summarizer = community_summarizer.CommunitySummarizer(llm) summary = await summarizer.summarize(comm_id, children) + if summary["error"]: + logger.error(f"Failed to summarize community {comm_id}") + else: + summary = summary["summary"] logger.debug(f"Community {comm_id}: {children}, {summary}") await upsert_chan.put( From e6065f611f9db9808a6a7d6d26c3cd4b1a78f853 Mon Sep 17 00:00:00 2001 From: Parker Erickson Date: Thu, 5 Sep 2024 10:19:31 -0500 Subject: [PATCH 88/91] updates --- .../app/graphrag/workers.py | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/eventual-consistency-service/app/graphrag/workers.py b/eventual-consistency-service/app/graphrag/workers.py index fe856c60..48ebf0f9 100644 --- a/eventual-consistency-service/app/graphrag/workers.py +++ b/eventual-consistency-service/app/graphrag/workers.py @@ -178,7 +178,7 @@ async def get_vert_desc(conn, v_id, node: Node): desc = [node.properties.get("description", "")] exists = await util.check_vertex_exists(conn, v_id) # if vertex exists, get description content and append this description to it - if not exists["error"]: + if not exists.get("error", False): # deduplicate descriptions desc.extend(exists["results"][0]["attributes"]["description"]) desc = list(set(desc)) @@ -478,6 +478,7 @@ async def process_community( # get the children of the community children = await util.get_commuinty_children(conn, i, comm_id) comm_id = util.process_id(comm_id) + err = False # if the community only has one child, use its description if len(children) == 1: @@ -488,24 +489,26 @@ async def process_community( summary = await summarizer.summarize(comm_id, children) if summary["error"]: logger.error(f"Failed to summarize community {comm_id}") + err = True else: summary = summary["summary"] - logger.debug(f"Community {comm_id}: {children}, {summary}") - await upsert_chan.put( - ( - util.upsert_vertex, # func to call + if not err: + logger.debug(f"Community {comm_id}: {children}, {summary}") + await upsert_chan.put( ( - conn, - "Community", # v_type - comm_id, # v_id - { # attrs - "description": summary, - "iteration": i, - }, - ), + util.upsert_vertex, # func to call + ( + conn, + "Community", # v_type + comm_id, # v_id + { # attrs + "description": summary, + "iteration": i, + }, + ), + ) ) - ) - # (v_id, content, index_name) - await embed_chan.put((comm_id, summary, "Community")) + # (v_id, content, index_name) + await embed_chan.put((comm_id, summary, "Community")) From 388aae1ac502abfdc8bee435b318739c2100d11b Mon Sep 17 00:00:00 2001 From: Parker Erickson Date: Thu, 5 Sep 2024 12:18:43 -0500 Subject: [PATCH 89/91] updates to hnsw overlap --- .../supportai/retrievers/HNSW_Overlap_Search.gsql | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/common/gsql/supportai/retrievers/HNSW_Overlap_Search.gsql 
b/common/gsql/supportai/retrievers/HNSW_Overlap_Search.gsql index d53b724c..5fbb0180 100644 --- a/common/gsql/supportai/retrievers/HNSW_Overlap_Search.gsql +++ b/common/gsql/supportai/retrievers/HNSW_Overlap_Search.gsql @@ -30,7 +30,12 @@ CREATE OR REPLACE DISTRIBUTED QUERY HNSW_Overlap_Search(Set v_types, STR start = SELECT s FROM start:s POST-ACCUM s.@num_times_seen += 1; FOREACH i IN RANGE[0, num_hops-1] DO - start = SELECT t FROM start:s -(:e)- :t + start = SELECT t FROM start:s -((RELATIONSHIP| + CONTAINS_ENTITY| + IS_AFTER| + reverse_CONTAINS_ENTITY| + IS_HEAD_OF| + HAS_TAIL):e)- :t ACCUM @@edges += e POST-ACCUM t.@num_times_seen += 1; END; @@ -40,7 +45,11 @@ CREATE OR REPLACE DISTRIBUTED QUERY HNSW_Overlap_Search(Set v_types, STR IF s.type == "Relationship" THEN @@relationship_info += (s.id -> s.definition) ELSE IF s.type == "Entity" THEN - @@entity_info += (s.id -> s.definition) + STRING tmp_dsc = s.definition, + FOREACH dsc IN s.description DO + tmp_dsc = tmp_dsc + dsc +";" + END, + @@entity_info += (s.id -> tmp_dsc) ELSE IF s.type == "DocumentChunk" THEN @@to_retrieve_content += s END; From 2efc5d1ee676340e6b80b098af0b4ce98062fcaa Mon Sep 17 00:00:00 2001 From: Parker Erickson Date: Fri, 6 Sep 2024 10:11:10 -0500 Subject: [PATCH 90/91] fix on error --- eventual-consistency-service/app/graphrag/util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index 016a275e..50d02428 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -234,8 +234,8 @@ async def check_vertex_exists(conn, v_id: str): res.raise_for_status() return res.json() except Exception as e: - logger.error(f"Check err:\n{e}\n{res.text}") - return {"error": True, "message": res.text} + logger.error(f"Check err:\n{e}\n{e}") + return {"error": True, "message": e} async def upsert_edge( From 1bbb8b64a3e4c0f882d4d9c353794e99f178749d Mon Sep 17 00:00:00 2001 From: Parker Erickson Date: Fri, 6 Sep 2024 11:57:45 -0500 Subject: [PATCH 91/91] fix --- eventual-consistency-service/app/graphrag/util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index 50d02428..873d856a 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -341,8 +341,8 @@ async def add_rels_between_types(conn): logger.error(f"Check Vert EntityType err:\n{resp.text}") return {"error": True, "message": resp.text} else: - res = resp.json()["results"][0]["relationships_inserted"] - logger.info(resp.json()["results"]) + res = resp.json() + logger.info(resp.json()) return res async def check_vertex_has_desc(conn, i: int):
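Note: taken together, the schema change in [PATCH 82/91] and the add_rels_between_types() helper mean that, after ingestion, the graph carries an EntityType layer whose RELATIONSHIP_TYPE edges record how often each relation type connects two entity types. Below is a small pyTigerGraph sketch for inspecting that layer; the host, graph name, and credentials are placeholders, and it assumes the create_entity_type_relationships query has already been installed by util.init().

from pyTigerGraph import TigerGraphConnection

# Placeholder connection details -- substitute your own host, graph, and credentials.
conn = TigerGraphConnection(
    host="https://your-tg-host",
    graphname="YourGraph",
    username="tigergraph",
    password="tigergraph",
)
# conn.getToken(conn.createSecret())  # only needed if REST++ authentication is enabled

# Re-run the type aggregation on demand (graph_rag.run() invokes it once per run
# through add_rels_between_types()).
print(conn.runInstalledQuery("create_entity_type_relationships"))

# List the inferred entity types, then walk each one's RELATIONSHIP_TYPE edges to
# see which relation types connect it to other types and with what frequency.
for t in conn.getVertices("EntityType", limit=10):
    for e in conn.getEdges("EntityType", t["v_id"], edgeType="RELATIONSHIP_TYPE"):
        print(
            t["v_id"],
            "-[", e["attributes"].get("relation_type"), "]->",
            e["to_id"],
            "freq:", e["attributes"].get("frequency"),
        )
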