From 7a439432b79fb6c3c2646a6d268b99760bca0f97 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Mon, 22 Jul 2024 16:42:23 -0400 Subject: [PATCH 01/53] init --- common/config.py | 9 +- common/extractors/GraphExtractor.py | 21 ++ common/extractors/__init__.py | 1 + .../louvain/louvain_2_other_passes.gsql | 217 +++++++++++++++ .../louvain/louvain_3_final_community.gsql | 44 +++ .../louvain_4_modularity_1_for_pass.gsql | 39 +++ .../louvain/louvain_4_modularity_2_final.gsql | 52 ++++ .../graphRAG/louvain/louvain_5_reset.gsql | 13 + common/gsql/supportai/Scan_For_Updates.gsql | 10 +- common/gsql/supportai/SupportAI_Schema.gsql | 18 +- common/llm_services/openai_service.py | 2 +- common/py_schemas/schemas.py | 17 +- copilot/app/routers/supportai.py | 223 +++------------ copilot/app/supportai/supportai.py | 185 +++++++++++++ copilot/docs/notebooks/graphrag.ipynb | 261 ++++++++++++++++++ .../app/eventual_consistency_checker.py | 3 +- .../app/graphrag/__init__.py | 1 + .../app/graphrag/graph_rag.py | 138 +++++++++ .../app/graphrag/util.py | 36 +++ .../app/graphrag/worker.py | 27 ++ eventual-consistency-service/app/main.py | 142 +++++++--- 21 files changed, 1226 insertions(+), 233 deletions(-) create mode 100644 common/extractors/GraphExtractor.py create mode 100644 common/gsql/graphRAG/louvain/louvain_2_other_passes.gsql create mode 100644 common/gsql/graphRAG/louvain/louvain_3_final_community.gsql create mode 100644 common/gsql/graphRAG/louvain/louvain_4_modularity_1_for_pass.gsql create mode 100644 common/gsql/graphRAG/louvain/louvain_4_modularity_2_final.gsql create mode 100644 common/gsql/graphRAG/louvain/louvain_5_reset.gsql create mode 100644 copilot/app/supportai/supportai.py create mode 100644 copilot/docs/notebooks/graphrag.ipynb create mode 100644 eventual-consistency-service/app/graphrag/__init__.py create mode 100644 eventual-consistency-service/app/graphrag/graph_rag.py create mode 100644 eventual-consistency-service/app/graphrag/util.py create mode 100644 eventual-consistency-service/app/graphrag/worker.py diff --git a/common/config.py b/common/config.py index 8eb9432a..2546e38a 100644 --- a/common/config.py +++ b/common/config.py @@ -15,14 +15,15 @@ AWSBedrock, AzureOpenAI, GoogleVertexAI, - OpenAI, Groq, + HuggingFaceEndpoint, + LLM_Model, Ollama, - HuggingFaceEndpoint + OpenAI, ) +from common.logs.logwriter import LogWriter from common.session import SessionHandler from common.status import StatusManager -from common.logs.logwriter import LogWriter security = HTTPBasic() session_handler = SessionHandler() @@ -102,7 +103,7 @@ raise Exception("Embedding service not implemented") -def get_llm_service(llm_config): +def get_llm_service(llm_config) -> LLM_Model: if llm_config["completion_service"]["llm_service"].lower() == "openai": return OpenAI(llm_config["completion_service"]) elif llm_config["completion_service"]["llm_service"].lower() == "azure": diff --git a/common/extractors/GraphExtractor.py b/common/extractors/GraphExtractor.py new file mode 100644 index 00000000..c8f24355 --- /dev/null +++ b/common/extractors/GraphExtractor.py @@ -0,0 +1,21 @@ +from langchain_community.graphs.graph_document import GraphDocument +from langchain_core.documents import Document +from langchain_experimental.graph_transformers import LLMGraphTransformer + +from common.config import get_llm_service, llm_config +from common.extractors.BaseExtractor import BaseExtractor + + +class GraphExtractor(BaseExtractor): + def __init__(self): + llm = 
get_llm_service(llm_config).llm + self.transformer = LLMGraphTransformer( + llm=llm, + node_properties=["description"], + relationship_properties=["description"], + ) + + def extract(self, text) -> list[GraphDocument]: + doc = Document(page_content=text) + graph_docs = self.transformer.convert_to_graph_documents([doc]) + return graph_docs diff --git a/common/extractors/__init__.py b/common/extractors/__init__.py index ced539e4..e2f0bcdf 100644 --- a/common/extractors/__init__.py +++ b/common/extractors/__init__.py @@ -1,3 +1,4 @@ +from common.extractors.GraphExtractor import GraphExtractor from common.extractors.LLMEntityRelationshipExtractor import ( LLMEntityRelationshipExtractor, ) diff --git a/common/gsql/graphRAG/louvain/louvain_2_other_passes.gsql b/common/gsql/graphRAG/louvain/louvain_2_other_passes.gsql new file mode 100644 index 00000000..231631d6 --- /dev/null +++ b/common/gsql/graphRAG/louvain/louvain_2_other_passes.gsql @@ -0,0 +1,217 @@ +USE GRAPH {graph_name} +DROP QUERY {query_name} +CREATE OR REPLACE DISTRIBUTED QUERY graphRAG_louvain_2( + UINT layer = 0, + UINT max_hop = 10, + UINT batch_num = 1 +) FOR GRAPH {graph_name} SYNTAX v1 {{ + TYPEDEF TUPLE community, STRING ext_vid> MyTuple; + SumAccum @@m; // the sum of the weights of all the links in the network + MinAccum> @{community_id_attribute_name}; // the community ID of the node + MinAccum @community_vid; // the community ID of the node + SumAccum @k; // the sum of the weights of the links incident to the node + SumAccum @k_in; // the sum of the weights of the links inside the previous community of the node + SumAccum @k_self_loop; // the weight of the self-loop link + MapAccum, SumAccum> @community_k_in_map; // the community of the neighbors of the nodes -> the sum of the weights of the links inside the community + MapAccum, SumAccum> @@community_sum_total_map; // community ID C -> the sum of the weights of the links incident to nodes in C + SumAccum @community_sum_total; // the sum of the weights of the links incident to nodes in the community of the node + MapAccum, SumAccum> @@community_sum_in_map; // community ID -> the sum of the weights of the links inside the community + MapAccum, MapAccum, SumAccum>> @@source_target_k_in_map; // source community ID -> (target community ID -> the sum of the weights of the links from the source community to the target community) + SumAccum @delta_Q_remove; // delta Q to remove the node from the previous community + MaxAccum @best_move; // best move of the node with the highest delta Q to move the isolated node into the new community + MaxAccum @@min_double; // used to reset the @best_move + SumAccum @@move_cnt; + OrAccum @to_change_community; + SumAccum @batch_id; + SumAccum @vid; + SumAccum @@links_to_check; + + // Initialization + LOG(TRUE, "Query started!"); + All_Nodes = {{{entity_vertex_name}.*}}; + _tmp = + SELECT s + FROM All_Nodes:s -({links_to_edge_name}:e)- :t + ACCUM + @@links_to_check += 1; + + All_Nodes = + SELECT s + FROM All_Nodes:s -({links_to_edge_name}:e)- :t + WHERE e.layer_weight_map.containsKey(layer) + ACCUM DOUBLE weight = e.layer_weight_map.get(layer), + @@m += weight / 2, + s.@k += weight, + IF s == t THEN // self-loop link + s.@k_self_loop += weight + END + POST-ACCUM + s.@{community_id_attribute_name} = s, + s.@community_vid = to_string(s.id), + s.@vid = getvid(s), + s.@batch_id = s.@vid % batch_num + ; + LOG(TRUE, All_Nodes.size()); + IF @@m < 0.00000000001 THEN + PRINT "Warning: the sum of the weights in the edges should be greater than zero!"; + 
RETURN; + END; + + // Local moving + INT hop = 0; + Candidates = All_Nodes; + WHILE Candidates.size() > 0 AND hop < max_hop DO + hop = hop + 1; + LOG(TRUE, hop); + IF hop == 1 THEN // first iteration + ChangedNodes = + SELECT s + FROM Candidates:s -({links_to_edge_name}:e)- :t + WHERE e.layer_weight_map.containsKey(layer) + AND s.@{community_id_attribute_name} != t.@{community_id_attribute_name} + ACCUM s.@best_move += MyTuple(1 - s.@k * t.@k / (2 * @@m), t.@{community_id_attribute_name}, t.@community_vid) + POST-ACCUM + IF s.@best_move.delta_Q_add > 0 THEN // the gain (delta Q) is positive + s.@to_change_community = TRUE + END + HAVING s.@to_change_community == TRUE + ; + ELSE // remaining iterations + // Calculate sum_total + Tmp = + SELECT s + FROM All_Nodes:s + POST-ACCUM + @@community_sum_total_map += (s.@{community_id_attribute_name} -> s.@k) + ; + Tmp = + SELECT s + FROM All_Nodes:s + POST-ACCUM + s.@community_sum_total = @@community_sum_total_map.get(s.@{community_id_attribute_name}) + ; + LOG(TRUE, @@community_sum_total_map.size()); + @@community_sum_total_map.clear(); + // Find the best move + ChangedNodes = {{}}; + FOREACH batch_id IN RANGE[0, batch_num-1] DO + LOG(TRUE, batch_id); + // Calculate the delta Q to remove the node from the previous community + Nodes = + SELECT s + FROM Candidates:s -({links_to_edge_name}:e)- :t + WHERE e.layer_weight_map.containsKey(layer) + AND s.@batch_id == batch_id + ACCUM DOUBLE weight = e.layer_weight_map.get(layer), + IF s.@{community_id_attribute_name} == t.@{community_id_attribute_name} THEN + s.@k_in += weight + ELSE + s.@community_k_in_map += (t.@{community_id_attribute_name} -> weight) + END + POST-ACCUM + s.@delta_Q_remove = 2 * s.@k_self_loop - 2 * s.@k_in + s.@k * (s.@community_sum_total - s.@k) / @@m, + s.@k_in = 0, + s.@best_move = MyTuple(@@min_double, s, to_string(s.id)) // reset the delta_Q_add + ; + // Find the best move + Nodes = + SELECT s + FROM Nodes:s -({links_to_edge_name}:e)- :t + WHERE e.layer_weight_map.containsKey(layer) + AND s.@{community_id_attribute_name} != t.@{community_id_attribute_name} + ACCUM DOUBLE delta_Q_add = 2 * s.@community_k_in_map.get(t.@{community_id_attribute_name}) - s.@k * t.@community_sum_total / @@m, + s.@best_move += MyTuple(delta_Q_add, t.@{community_id_attribute_name}, t.@community_vid) + POST-ACCUM + IF s.@delta_Q_remove + s.@best_move.delta_Q_add > 0 THEN // the gain (delta Q) is positive + s.@to_change_community = TRUE + END, + s.@community_k_in_map.clear() + HAVING s.@to_change_community == TRUE + ; + ChangedNodes = ChangedNodes UNION Nodes; + END; + END; + // If two nodes swap, only change the community of one of them + SwapNodes = + SELECT s + FROM ChangedNodes:s -({links_to_edge_name}:e)- :t + WHERE e.layer_weight_map.containsKey(layer) + AND s.@best_move.community == t.@{community_id_attribute_name} + AND t.@to_change_community == TRUE + AND t.@best_move.community == s.@{community_id_attribute_name} + // only change the one with larger delta Q or the one with smaller @vid if delta Q are the same + AND (s.@delta_Q_remove + s.@best_move.delta_Q_add < t.@delta_Q_remove + t.@best_move.delta_Q_add + OR (abs((s.@delta_Q_remove + s.@best_move.delta_Q_add) - (t.@delta_Q_remove + t.@best_move.delta_Q_add)) < 0.00000000001 + AND s.@vid > t.@vid)) + POST-ACCUM + s.@to_change_community = FALSE + ; + LOG(TRUE, SwapNodes.size()); + ChangedNodes = ChangedNodes MINUS SwapNodes; + LOG(TRUE, ChangedNodes.size()); + // Place each node of ChangedNodes in the community in which the gain is maximum + 
ChangedNodes = + SELECT s + FROM ChangedNodes:s + POST-ACCUM + s.@{community_id_attribute_name} = s.@best_move.community, + s.@community_vid = s.@best_move.ext_vid, + s.@to_change_community = FALSE + ; + + @@move_cnt += ChangedNodes.size(); + // Get all neighbours of the changed node that do not belong to the node’s new community + Candidates = + SELECT t + FROM ChangedNodes:s -({links_to_edge_name}:e)- :t + WHERE e.layer_weight_map.containsKey(layer) + AND t.@{community_id_attribute_name} != s.@{community_id_attribute_name} + ; + LOG(TRUE, Candidates.size()); + END; + + PRINT @@move_cnt AS Delta; + + // Coarsening + LOG(TRUE, "Coarsening"); + UINT new_layer = layer + 1; + @@community_sum_total_map.clear(); + Tmp = + SELECT s + FROM All_Nodes:s -({links_to_edge_name}:e)- :t + WHERE e.layer_weight_map.containsKey(layer) + ACCUM IF s.@{community_id_attribute_name} == t.@{community_id_attribute_name} THEN + DOUBLE weight = e.layer_weight_map.get(layer), + @@community_sum_in_map += (s.@{community_id_attribute_name} -> weight) + END + POST-ACCUM + //f_belongs_to.println(s.id, s.@{community_id_attribute_name}, new_layer), + INSERT INTO {belongs_to_edge_name} VALUES (s, str_to_int(s.@community_vid), new_layer), + IF @@community_sum_in_map.containsKey(s) THEN + //f_links_to.println(s.id, s.id, @@community_sum_in_map.get(s), new_layer) + INSERT INTO {links_to_edge_name} VALUES (s,s, (new_layer -> @@community_sum_in_map.get(s))) + END + ; + LOG(TRUE, @@community_sum_in_map.size()); + @@community_sum_in_map.clear(); + Tmp = + SELECT s + FROM All_Nodes:s -({links_to_edge_name}:e)- :t + WHERE e.layer_weight_map.containsKey(layer) + ACCUM DOUBLE weight = e.layer_weight_map.get(layer), + IF s.@{community_id_attribute_name} != t.@{community_id_attribute_name} THEN + @@source_target_k_in_map += (s.@{community_id_attribute_name} -> (t.@{community_id_attribute_name} -> weight)) + END + POST-ACCUM + IF @@source_target_k_in_map.containsKey(s) THEN + FOREACH (target_community, k_in) IN @@source_target_k_in_map.get(s) DO + //f_links_to.println(s.uniq_id, target_community, k_in, new_layer) + INSERT INTO {links_to_edge_name} VALUES (s,target_community, (new_layer -> k_in)) + END + END + ; + LOG(TRUE, @@source_target_k_in_map.size()); + @@source_target_k_in_map.clear(); + PRINT @@links_to_check; + LOG(TRUE, "Query finished!"); +}} diff --git a/common/gsql/graphRAG/louvain/louvain_3_final_community.gsql b/common/gsql/graphRAG/louvain/louvain_3_final_community.gsql new file mode 100644 index 00000000..75cbad7e --- /dev/null +++ b/common/gsql/graphRAG/louvain/louvain_3_final_community.gsql @@ -0,0 +1,44 @@ +USE GRAPH {graph_name} +DROP QUERY {query_name} +CREATE OR REPLACE DISTRIBUTED QUERY graphRAG_louvain_3( + UINT top_layer = 2 +) FOR GRAPH {graph_name} SYNTAX v1 {{ + MinAccum @{community_id_attribute_name}; // the community ID of the node + INT layer = top_layer; + + // Initialization + LOG(TRUE, "Query started!"); + All_Nodes = {{{entity_vertex_name}.*}}; + + // Top layer + Nodes = + SELECT t + FROM All_Nodes:s -(reverse_{belongs_to_edge_name}:e)- :t + WHERE layer IN e.layer_set + ACCUM t.@{community_id_attribute_name} = to_string(s.id) + ; + LOG(TRUE, layer, Nodes.size()); + + // Other layers + WHILE Nodes.size() > 0 AND layer > 0 DO + layer = layer - 1; + Nodes = + SELECT t + FROM Nodes:s -(reverse_{belongs_to_edge_name}:e)- :t + WHERE layer IN e.layer_set + ACCUM t.@{community_id_attribute_name} = s.@{community_id_attribute_name} + ; + LOG(TRUE, layer, Nodes.size()); + END; + + // Write to the file + Nodes = + 
SELECT s + FROM Nodes:s + POST-ACCUM + //f.println(s.uniq_id, s.@{community_id_attribute_name}) + s.{community_id_attribute_name} = s.@{community_id_attribute_name} + + ; + LOG(TRUE, "Query finished!"); +}} diff --git a/common/gsql/graphRAG/louvain/louvain_4_modularity_1_for_pass.gsql b/common/gsql/graphRAG/louvain/louvain_4_modularity_1_for_pass.gsql new file mode 100644 index 00000000..0058d0ee --- /dev/null +++ b/common/gsql/graphRAG/louvain/louvain_4_modularity_1_for_pass.gsql @@ -0,0 +1,39 @@ +USE GRAPH {graph_name} +DROP QUERY {query_name} +CREATE OR REPLACE DISTRIBUTED QUERY graphRAG_louvain_4a( + UINT layer=0 +) FOR GRAPH {graph_name} SYNTAX v1 {{ + SumAccum @@sum_weight; // the sum of the weights of all the links in the network + MapAccum, SumAccum> @@community_total_weight_map; // community ID C -> the sum of the weights of the links incident to nodes in C + MapAccum, SumAccum> @@community_in_weight_map; // community ID -> the sum of the weights of the links inside the community + SumAccum @@modularity; + + All_Nodes = {{{entity_vertex_name}.*}}; + All_Nodes = + SELECT s + FROM All_Nodes:s -({links_to_edge_name}:e)- :t + WHERE e.layer_weight_map.containsKey(layer) + ACCUM DOUBLE weight = e.layer_weight_map.get(layer), + IF s == t THEN + @@community_in_weight_map += (s -> weight) + END, + @@community_total_weight_map += (s -> weight), + @@sum_weight += weight + ; + LOG(TRUE, All_Nodes.size()); + @@modularity = 0; + FOREACH (community, total_weight) IN @@community_total_weight_map DO + DOUBLE in_weight = 0; + IF @@community_in_weight_map.containsKey(community) THEN + in_weight = @@community_in_weight_map.get(community); + END; + @@modularity += in_weight / @@sum_weight - pow(total_weight / @@sum_weight, 2); + END; + // PRINT @@modularity, @@community_in_weight_map, @@community_total_weight_map, @@sum_weight; + PRINT layer; + PRINT @@modularity AS modularity; + PRINT @@community_total_weight_map.size() AS community_number; + PRINT All_Nodes.size(); + @@community_in_weight_map.clear(); + @@community_total_weight_map.clear(); +}} diff --git a/common/gsql/graphRAG/louvain/louvain_4_modularity_2_final.gsql b/common/gsql/graphRAG/louvain/louvain_4_modularity_2_final.gsql new file mode 100644 index 00000000..31ba4d0b --- /dev/null +++ b/common/gsql/graphRAG/louvain/louvain_4_modularity_2_final.gsql @@ -0,0 +1,52 @@ +USE GRAPH {graph_name} +DROP QUERY {query_name} +CREATE OR REPLACE DISTRIBUTED QUERY graphRAG_louvain_4b( +) FOR GRAPH {graph_name} SYNTAX v1 {{ + SumAccum @@sum_weight; // the sum of the weights of all the links in the network + MapAccum> @@community_total_weight_map; // community ID C -> the sum of the weights of the links incident to nodes in C + MapAccum> @@community_in_weight_map; // community ID -> the sum of the weights of the links inside the community + SumAccum @@modularity; + MapAccum> @@Community_sizes; + MapAccum> @@count_of_sizes; + AvgAccum @@avg_community_size; + + DOUBLE wt = 1.0; + All_Nodes = {{{entity_vertex_name}.*}}; + Nodes = + SELECT s + FROM All_Nodes:s -({relation_edge_name}:e)- :t + ACCUM IF s.{community_id_attribute_name} == t.{community_id_attribute_name} THEN + @@community_in_weight_map += (s.{community_id_attribute_name} -> wt) + END, + @@community_total_weight_map += (s.{community_id_attribute_name} -> wt), + @@sum_weight += wt + ; + @@modularity = 0; + FOREACH (community, total_weight) IN @@community_total_weight_map DO + DOUBLE in_weight = 0; + IF @@community_in_weight_map.containsKey(community) THEN + in_weight = 
@@community_in_weight_map.get(community); + END; + @@modularity += in_weight / @@sum_weight - pow(total_weight / @@sum_weight, 2); + END; + + _tmp = + SELECT s + FROM All_Nodes:s + POST-ACCUM + @@Community_sizes += (s.{community_id_attribute_name} -> 1); + + FOREACH (comm, cnt) IN @@Community_sizes DO + @@count_of_sizes += (cnt -> 1); + @@avg_community_size += cnt; + END; + + // PRINT @@modularity, @@community_in_weight_map, @@community_total_weight_map, @@sum_weight; + PRINT @@modularity AS modularity; + PRINT @@community_total_weight_map.size() AS community_number; + PRINT @@count_of_sizes AS num_communities_by_size; + PRINT @@avg_community_size AS avg_community_size; + + @@community_in_weight_map.clear(); + @@community_total_weight_map.clear(); +}} diff --git a/common/gsql/graphRAG/louvain/louvain_5_reset.gsql b/common/gsql/graphRAG/louvain/louvain_5_reset.gsql new file mode 100644 index 00000000..7590935a --- /dev/null +++ b/common/gsql/graphRAG/louvain/louvain_5_reset.gsql @@ -0,0 +1,13 @@ +USE GRAPH {graph_name} +DROP QUERY {query_name} +CREATE OR REPLACE DISTRIBUTED QUERY graphRAG_louvain_5_reset( +) FOR GRAPH {graph_name} SYNTAX v1 {{ + + // Initialization + Nodes = {{{entity_vertex_name}.*}}; + + // Top layer + DELETE e + FROM Nodes:s -(({belongs_to_edge_name}|{links_to_edge_name}):e)- :t + ; +}} diff --git a/common/gsql/supportai/Scan_For_Updates.gsql b/common/gsql/supportai/Scan_For_Updates.gsql index 03ced2ec..7d9d1b83 100644 --- a/common/gsql/supportai/Scan_For_Updates.gsql +++ b/common/gsql/supportai/Scan_For_Updates.gsql @@ -24,10 +24,10 @@ CREATE DISTRIBUTED QUERY Scan_For_Updates(STRING v_type = "Document", res = SELECT s FROM start:s -(HAS_CONTENT)-> Content:c ACCUM @@v_and_text += (s.id -> c.text) POST-ACCUM s.epoch_processing = datetime_to_epoch(now()); - ELSE IF v_type == "Concept" THEN - res = SELECT s FROM start:s - POST-ACCUM @@v_and_text += (s.id -> s.description), - s.epoch_processing = datetime_to_epoch(now()); + // ELSE IF v_type == "Concept" THEN + // res = SELECT s FROM start:s + // POST-ACCUM @@v_and_text += (s.id -> s.description), + // s.epoch_processing = datetime_to_epoch(now()); ELSE IF v_type == "Entity" THEN res = SELECT s FROM start:s POST-ACCUM @@v_and_text += (s.id -> s.definition), @@ -42,4 +42,4 @@ CREATE DISTRIBUTED QUERY Scan_For_Updates(STRING v_type = "Document", POST-ACCUM s.epoch_processing = datetime_to_epoch(now()); END; PRINT @@v_and_text; -} \ No newline at end of file +} diff --git a/common/gsql/supportai/SupportAI_Schema.gsql b/common/gsql/supportai/SupportAI_Schema.gsql index 061993bb..0998affe 100644 --- a/common/gsql/supportai/SupportAI_Schema.gsql +++ b/common/gsql/supportai/SupportAI_Schema.gsql @@ -2,7 +2,7 @@ CREATE SCHEMA_CHANGE JOB add_supportai_schema { ADD VERTEX DocumentChunk(PRIMARY_ID id STRING, idx INT, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX Document(PRIMARY_ID id STRING, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX Concept(PRIMARY_ID id STRING, description STRING, concept_type STRING, human_curated BOOL, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; - ADD VERTEX Entity(PRIMARY_ID id STRING, definition STRING, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; + 
ADD VERTEX Entity(PRIMARY_ID id STRING, definition STRING, description STRING, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX Relationship(PRIMARY_ID id STRING, definition STRING, short_name STRING, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX DocumentCollection(PRIMARY_ID id STRING, epoch_added UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX Content(PRIMARY_ID id STRING, text STRING, epoch_added UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; @@ -18,4 +18,18 @@ CREATE SCHEMA_CHANGE JOB add_supportai_schema { ADD DIRECTED EDGE HAS_CHILD(FROM Document, TO DocumentChunk) WITH REVERSE_EDGE="reverse_HAS_CHILD"; ADD DIRECTED EDGE HAS_RELATIONSHIP(FROM Concept, TO Concept, relation_type STRING) WITH REVERSE_EDGE="reverse_HAS_RELATIONSHIP"; ADD DIRECTED EDGE CONTAINS_DOCUMENT(FROM DocumentCollection, TO Document) WITH REVERSE_EDGE="reverse_CONTAINS_DOCUMENT"; -} \ No newline at end of file + + // GraphRAG + ADD VERTEX Community(PRIMARY_ID id STRING, description INT) WITH PRIMARY_ID_AS_ATTRIBUTE="true"; + ADD VERTEX ResolvedEntity(PRIMARY_ID id STRING, description STRING) WITH PRIMARY_ID_AS_ATTRIBUTE="true"; + + ADD DIRECTED EDGE KNN(FROM Entity, TO Entity); // TODO: check where knn algo writes results + ADD DIRECTED EDGE RESOLVES_TO(FROM Entity, TO ResolvedEntity); // Connect ResolvedEntities with their children entities + ADD DIRECTED EDGE RESOLVED_RELATIONSHIP(FROM ResolvedEntity, TO ResolvedEntity, weight UINT); // store edges between entities after they're resolved + ADD DIRECTED EDGE IN_COMMUNITY(FROM ResolvedEntity, TO Community); + + // TODO: louvain will be run on resolved entities, but stored in community then on communities until louvain runs out + // Hierarchical communities (Louvain/Leiden) + // ADD UNDIRECTED EDGE LINKS_TO(FROM Community, TO Community); + // ADD DIRECTED EDGE BELONGS_TO(FROM Community, TO Community); +} diff --git a/common/llm_services/openai_service.py b/common/llm_services/openai_service.py index 914f6364..c7274720 100644 --- a/common/llm_services/openai_service.py +++ b/common/llm_services/openai_service.py @@ -1,5 +1,6 @@ import logging import os +from langchain_openai import ChatOpenAI from common.llm_services import LLM_Model from common.logs.log import req_id_cv @@ -16,7 +17,6 @@ def __init__(self, config): auth_detail ] - from langchain.chat_models import ChatOpenAI model_name = config["llm_model"] self.llm = ChatOpenAI( diff --git a/common/py_schemas/schemas.py b/common/py_schemas/schemas.py index e5dd1faf..07a2113f 100644 --- a/common/py_schemas/schemas.py +++ b/common/py_schemas/schemas.py @@ -15,11 +15,13 @@ class SupportAIQuestion(BaseModel): method_params: dict = {} -class SupportAIInitConfig(BaseModel): - chunker: str - chunker_params: dict - extractor: str - extractor_params: dict +class SupportAIMethod(enum.StrEnum): + SUPPORTAI = enum.auto() + GRAPHRAG = enum.auto() + + +class EccConfig(BaseModel): + method: SupportAIMethod = SupportAIMethod.SUPPORTAI class GSQLQueryInfo(BaseModel): @@ -126,15 +128,18 @@ class QueryUpsertRequest(BaseModel): id: Optional[str] query_info: Optional[GSQLQueryInfo] + class MessageContext(BaseModel): # TODO: fix this to contain proper message context user: str content: str + class ReportQuestions(BaseModel): question: str reasoning: str + class ReportSection(BaseModel): 
section_name: str description: str @@ -142,6 +147,7 @@ class ReportSection(BaseModel): copilot_fortify: bool = True actions: Optional[List[str]] = None + class ReportCreationRequest(BaseModel): topic: str sections: Union[List[ReportSection], str] = None @@ -150,6 +156,7 @@ class ReportCreationRequest(BaseModel): conversation_id: Optional[str] = None message_context: Optional[List[MessageContext]] = None + class Role(enum.StrEnum): SYSTEM = enum.auto() USER = enum.auto() diff --git a/copilot/app/routers/supportai.py b/copilot/app/routers/supportai.py index a3c94951..a829d3a4 100644 --- a/copilot/app/routers/supportai.py +++ b/copilot/app/routers/supportai.py @@ -1,22 +1,38 @@ import json import logging -import uuid from typing import Annotated -from fastapi import APIRouter, BackgroundTasks, Depends, Request +from fastapi import APIRouter, BackgroundTasks, Depends, Request, Response, status from fastapi.security.http import HTTPBase +from supportai import supportai from supportai.concept_management.create_concepts import ( - CommunityConceptCreator, EntityConceptCreator, HigherLevelConceptCreator, - RelationshipConceptCreator) -from supportai.retrievers import (EntityRelationshipRetriever, - HNSWOverlapRetriever, HNSWRetriever, - HNSWSiblingRetriever) - -from common.config import (db_config, embedding_service, embedding_store, - get_llm_service, llm_config) + CommunityConceptCreator, + EntityConceptCreator, + HigherLevelConceptCreator, + RelationshipConceptCreator, +) +from supportai.retrievers import ( + EntityRelationshipRetriever, + HNSWOverlapRetriever, + HNSWRetriever, + HNSWSiblingRetriever, +) + +from common.config import ( + db_config, + embedding_service, + embedding_store, + get_llm_service, + llm_config, +) from common.logs.logwriter import LogWriter -from common.py_schemas.schemas import (CoPilotResponse, CreateIngestConfig, - LoadingInfo, SupportAIQuestion) +from common.py_schemas.schemas import ( # SupportAIInitConfig,; SupportAIMethod, + CoPilotResponse, + CreateIngestConfig, + LoadingInfo, + SupportAIMethod, + SupportAIQuestion, +) logger = logging.getLogger(__name__) router = APIRouter(tags=["SupportAI"]) @@ -26,50 +42,14 @@ @router.post("/{graphname}/supportai/initialize") def initialize( - graphname, conn: Request, credentials: Annotated[HTTPBase, Depends(security)] + graphname, + conn: Request, + credentials: Annotated[HTTPBase, Depends(security)], ): conn = conn.state.conn - # need to open the file using the absolute path - file_path = "common/gsql/supportai/SupportAI_Schema.gsql" - with open(file_path, "r") as f: - schema = f.read() - schema_res = conn.gsql( - """USE GRAPH {}\n{}\nRUN SCHEMA_CHANGE JOB add_supportai_schema""".format( - graphname, schema - ) - ) - - file_path = "common/gsql/supportai/SupportAI_IndexCreation.gsql" - with open(file_path) as f: - index = f.read() - index_res = conn.gsql( - """USE GRAPH {}\n{}\nRUN SCHEMA_CHANGE JOB add_supportai_indexes""".format( - graphname, index - ) - ) - - file_path = "common/gsql/supportai/Scan_For_Updates.gsql" - with open(file_path) as f: - scan_for_updates = f.read() - res = conn.gsql( - "USE GRAPH " - + conn.graphname - + "\n" - + scan_for_updates - + "\n INSTALL QUERY Scan_For_Updates" - ) - - file_path = "common/gsql/supportai/Update_Vertices_Processing_Status.gsql" - with open(file_path) as f: - update_vertices = f.read() - res = conn.gsql( - "USE GRAPH " - + conn.graphname - + "\n" - + update_vertices - + "\n INSTALL QUERY Update_Vertices_Processing_Status" - ) + resp = supportai.init_supportai(conn, 
graphname) + schema_res, index_res = resp[0], resp[1] return { "host_name": conn._tg_connection.host, # include host_name for debugging from client. Their pyTG conn might not have the same host as what's configured in copilot "schema_creation_status": json.dumps(schema_res), @@ -80,132 +60,13 @@ def initialize( @router.post("/{graphname}/supportai/create_ingest") def create_ingest( graphname, - ingest_config: CreateIngestConfig, + cfg: CreateIngestConfig, conn: Request, credentials: Annotated[HTTPBase, Depends(security)], ): conn = conn.state.conn - if ingest_config.file_format.lower() == "json": - file_path = "common/gsql/supportai/SupportAI_InitialLoadJSON.gsql" - - with open(file_path) as f: - ingest_template = f.read() - ingest_template = ingest_template.replace("@uuid@", str(uuid.uuid4().hex)) - doc_id = ingest_config.loader_config.get("doc_id_field", "doc_id") - doc_text = ingest_config.loader_config.get("content_field", "content") - ingest_template = ingest_template.replace('"doc_id"', '"{}"'.format(doc_id)) - ingest_template = ingest_template.replace('"content"', '"{}"'.format(doc_text)) - - if ingest_config.file_format.lower() == "csv": - file_path = "common/gsql/supportai/SupportAI_InitialLoadCSV.gsql" - - with open(file_path) as f: - ingest_template = f.read() - ingest_template = ingest_template.replace("@uuid@", str(uuid.uuid4().hex)) - separator = ingest_config.get("separator", "|") - header = ingest_config.get("header", "true") - eol = ingest_config.get("eol", "\n") - quote = ingest_config.get("quote", "double") - ingest_template = ingest_template.replace('"|"', '"{}"'.format(separator)) - ingest_template = ingest_template.replace('"true"', '"{}"'.format(header)) - ingest_template = ingest_template.replace('"\\n"', '"{}"'.format(eol)) - ingest_template = ingest_template.replace('"double"', '"{}"'.format(quote)) - - file_path = "common/gsql/supportai/SupportAI_DataSourceCreation.gsql" - - with open(file_path) as f: - data_stream_conn = f.read() - - # assign unique identifier to the data stream connection - - data_stream_conn = data_stream_conn.replace( - "@source_name@", "SupportAI_" + graphname + "_" + str(uuid.uuid4().hex) - ) - - # check the data source and create the appropriate connection - if ingest_config.data_source.lower() == "s3": - data_conn = ingest_config.data_source_config - if ( - data_conn.get("aws_access_key") is None - or data_conn.get("aws_secret_key") is None - ): - raise Exception("AWS credentials not provided") - connector = { - "type": "s3", - "access.key": data_conn["aws_access_key"], - "secret.key": data_conn["aws_secret_key"], - } - - data_stream_conn = data_stream_conn.replace( - "@source_config@", json.dumps(connector) - ) - - elif ingest_config.data_source.lower() == "azure": - if ingest_config.data_source_config.get("account_key") is not None: - connector = { - "type": "abs", - "account.key": ingest_config.data_source_config["account_key"], - } - elif ingest_config.data_source_config.get("client_id") is not None: - # verify that the client secret is also provided - if ingest_config.data_source_config.get("client_secret") is None: - raise Exception("Client secret not provided") - # verify that the tenant id is also provided - if ingest_config.data_source_config.get("tenant_id") is None: - raise Exception("Tenant id not provided") - connector = { - "type": "abs", - "client.id": ingest_config.data_source_config["client_id"], - "client.secret": ingest_config.data_source_config["client_secret"], - "tenant.id": 
ingest_config.data_source_config["tenant_id"], - } - else: - raise Exception("Azure credentials not provided") - data_stream_conn = data_stream_conn.replace( - "@source_config@", json.dumps(connector) - ) - elif ingest_config.data_source.lower() == "gcs": - # verify that the correct fields are provided - if ingest_config.data_source_config.get("project_id") is None: - raise Exception("Project id not provided") - if ingest_config.data_source_config.get("private_key_id") is None: - raise Exception("Private key id not provided") - if ingest_config.data_source_config.get("private_key") is None: - raise Exception("Private key not provided") - if ingest_config.data_source_config.get("client_email") is None: - raise Exception("Client email not provided") - connector = { - "type": "gcs", - "project_id": ingest_config.data_source_config["project_id"], - "private_key_id": ingest_config.data_source_config["private_key_id"], - "private_key": ingest_config.data_source_config["private_key"], - "client_email": ingest_config.data_source_config["client_email"], - } - data_stream_conn = data_stream_conn.replace( - "@source_config@", json.dumps(connector) - ) - else: - raise Exception("Data source not implemented") - - load_job_created = conn.gsql("USE GRAPH {}\n".format(graphname) + ingest_template) - - data_source_created = conn.gsql( - "USE GRAPH {}\n".format(graphname) + data_stream_conn - ) - - return { - "load_job_id": load_job_created.split(":")[1] - .strip(" [") - .strip(" ") - .strip(".") - .strip("]"), - "data_source_id": data_source_created.split(":")[1] - .strip(" [") - .strip(" ") - .strip(".") - .strip("]"), - } + return supportai.create_ingest(graphname, cfg, conn) @router.post("/{graphname}/supportai/ingest") @@ -387,18 +248,24 @@ def build_concepts( return {"status": "success"} -@router.get("/{graphname}/supportai/forceupdate") -def ecc( - graphname, +@router.get("/{graphname}/{method}/forceupdate") +def supportai_update( + graphname: str, + method: str, conn: Request, credentials: Annotated[HTTPBase, Depends(security)], bg_tasks: BackgroundTasks, + response: Response, ): + if method != SupportAIMethod.SUPPORTAI and method != SupportAIMethod.GRAPHRAG: + response.status_code = status.HTTP_404_NOT_FOUND + return f"{method} is not a valid method. 
{SupportAIMethod.SUPPORTAI} or {SupportAIMethod.GRAPHRAG}" + from httpx import get as http_get ecc = ( db_config.get("ecc", "http://localhost:8001") - + f"/{graphname}/consistency_status" + + f"/{graphname}/consistency_status/{method}" ) LogWriter.info(f"Sending ECC request to: {ecc}") bg_tasks.add_task( diff --git a/copilot/app/supportai/supportai.py b/copilot/app/supportai/supportai.py new file mode 100644 index 00000000..e96663a3 --- /dev/null +++ b/copilot/app/supportai/supportai.py @@ -0,0 +1,185 @@ +import json +import uuid + +from pyTigerGraph import TigerGraphConnection + +from common.py_schemas.schemas import ( + # CoPilotResponse, + CreateIngestConfig, + # LoadingInfo, + # SupportAIInitConfig, + # SupportAIMethod, + # SupportAIQuestion, +) + + +def init_supportai(conn: TigerGraphConnection, graphname: str) -> tuple[dict, dict]: + # need to open the file using the absolute path + file_path = "common/gsql/supportai/SupportAI_Schema.gsql" + with open(file_path, "r") as f: + schema = f.read() + schema_res = conn.gsql( + """USE GRAPH {}\n{}\nRUN SCHEMA_CHANGE JOB add_supportai_schema""".format( + graphname, schema + ) + ) + + file_path = "common/gsql/supportai/SupportAI_IndexCreation.gsql" + with open(file_path) as f: + index = f.read() + index_res = conn.gsql( + """USE GRAPH {}\n{}\nRUN SCHEMA_CHANGE JOB add_supportai_indexes""".format( + graphname, index + ) + ) + + file_path = "common/gsql/supportai/Scan_For_Updates.gsql" + with open(file_path) as f: + scan_for_updates = f.read() + res = conn.gsql( + "USE GRAPH " + + conn.graphname + + "\n" + + scan_for_updates + + "\n INSTALL QUERY Scan_For_Updates" + ) + + file_path = "common/gsql/supportai/Update_Vertices_Processing_Status.gsql" + with open(file_path) as f: + update_vertices = f.read() + res = conn.gsql( + "USE GRAPH " + + conn.graphname + + "\n" + + update_vertices + + "\n INSTALL QUERY Update_Vertices_Processing_Status" + ) + + return schema_res, index_res + + +def create_ingest( + graphname: str, + ingest_config: CreateIngestConfig, + conn: TigerGraphConnection, +): + if ingest_config.file_format.lower() == "json": + file_path = "common/gsql/supportai/SupportAI_InitialLoadJSON.gsql" + + with open(file_path) as f: + ingest_template = f.read() + ingest_template = ingest_template.replace("@uuid@", str(uuid.uuid4().hex)) + doc_id = ingest_config.loader_config.get("doc_id_field", "doc_id") + doc_text = ingest_config.loader_config.get("content_field", "content") + ingest_template = ingest_template.replace('"doc_id"', '"{}"'.format(doc_id)) + ingest_template = ingest_template.replace('"content"', '"{}"'.format(doc_text)) + + if ingest_config.file_format.lower() == "csv": + file_path = "common/gsql/supportai/SupportAI_InitialLoadCSV.gsql" + + with open(file_path) as f: + ingest_template = f.read() + ingest_template = ingest_template.replace("@uuid@", str(uuid.uuid4().hex)) + separator = ingest_config.get("separator", "|") + header = ingest_config.get("header", "true") + eol = ingest_config.get("eol", "\n") + quote = ingest_config.get("quote", "double") + ingest_template = ingest_template.replace('"|"', '"{}"'.format(separator)) + ingest_template = ingest_template.replace('"true"', '"{}"'.format(header)) + ingest_template = ingest_template.replace('"\\n"', '"{}"'.format(eol)) + ingest_template = ingest_template.replace('"double"', '"{}"'.format(quote)) + + file_path = "common/gsql/supportai/SupportAI_DataSourceCreation.gsql" + + with open(file_path) as f: + data_stream_conn = f.read() + + # assign unique identifier to the data stream 
connection + + data_stream_conn = data_stream_conn.replace( + "@source_name@", "SupportAI_" + graphname + "_" + str(uuid.uuid4().hex) + ) + + # check the data source and create the appropriate connection + if ingest_config.data_source.lower() == "s3": + data_conn = ingest_config.data_source_config + if ( + data_conn.get("aws_access_key") is None + or data_conn.get("aws_secret_key") is None + ): + raise Exception("AWS credentials not provided") + connector = { + "type": "s3", + "access.key": data_conn["aws_access_key"], + "secret.key": data_conn["aws_secret_key"], + } + + data_stream_conn = data_stream_conn.replace( + "@source_config@", json.dumps(connector) + ) + + elif ingest_config.data_source.lower() == "azure": + if ingest_config.data_source_config.get("account_key") is not None: + connector = { + "type": "abs", + "account.key": ingest_config.data_source_config["account_key"], + } + elif ingest_config.data_source_config.get("client_id") is not None: + # verify that the client secret is also provided + if ingest_config.data_source_config.get("client_secret") is None: + raise Exception("Client secret not provided") + # verify that the tenant id is also provided + if ingest_config.data_source_config.get("tenant_id") is None: + raise Exception("Tenant id not provided") + connector = { + "type": "abs", + "client.id": ingest_config.data_source_config["client_id"], + "client.secret": ingest_config.data_source_config["client_secret"], + "tenant.id": ingest_config.data_source_config["tenant_id"], + } + else: + raise Exception("Azure credentials not provided") + data_stream_conn = data_stream_conn.replace( + "@source_config@", json.dumps(connector) + ) + elif ingest_config.data_source.lower() == "gcs": + # verify that the correct fields are provided + if ingest_config.data_source_config.get("project_id") is None: + raise Exception("Project id not provided") + if ingest_config.data_source_config.get("private_key_id") is None: + raise Exception("Private key id not provided") + if ingest_config.data_source_config.get("private_key") is None: + raise Exception("Private key not provided") + if ingest_config.data_source_config.get("client_email") is None: + raise Exception("Client email not provided") + connector = { + "type": "gcs", + "project_id": ingest_config.data_source_config["project_id"], + "private_key_id": ingest_config.data_source_config["private_key_id"], + "private_key": ingest_config.data_source_config["private_key"], + "client_email": ingest_config.data_source_config["client_email"], + } + data_stream_conn = data_stream_conn.replace( + "@source_config@", json.dumps(connector) + ) + else: + raise Exception("Data source not implemented") + + load_job_created = conn.gsql("USE GRAPH {}\n".format(graphname) + ingest_template) + + data_source_created = conn.gsql( + "USE GRAPH {}\n".format(graphname) + data_stream_conn + ) + + return { + "load_job_id": load_job_created.split(":")[1] + .strip(" [") + .strip(" ") + .strip(".") + .strip("]"), + "data_source_id": data_source_created.split(":")[1] + .strip(" [") + .strip(" ") + .strip(".") + .strip("]"), + } diff --git a/copilot/docs/notebooks/graphrag.ipynb b/copilot/docs/notebooks/graphrag.ipynb new file mode 100644 index 00000000..3b1200af --- /dev/null +++ b/copilot/docs/notebooks/graphrag.ipynb @@ -0,0 +1,261 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from pyTigerGraph import TigerGraphConnection\n", + "from dotenv import load_dotenv\n", + "\n", + 
"load_dotenv()\n", + "# We first create a connection to the database\n", + "host = os.environ[\"HOST\"]\n", + "username = os.getenv(\"USERNAME\", \"tigergraph\")\n", + "password = os.getenv(\"PASS\", \"tigergraph\")\n", + "conn = TigerGraphConnection(\n", + " host=host, username=username, password=password, graphname=\"GraphRAG_pytgdocs\"\n", + ")\n", + "\n", + "conn.getToken()\n", + "\n", + "# And then add CoPilot's address to the connection. This address\n", + "# is the host's address where the CoPilot container is running.\n", + "conn.ai.configureCoPilotHost(\"http://localhost:8000\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "conn.graphname = \"GraphRAG_pytgdocs\"\n", + "# conn.gsql(\"\"\"CREATE GRAPH pyTigerGraphRAG()\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'host_name': 'https://algotesting.i.tgcloud.io',\n", + " 'schema_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_schema].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_schema\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 0)\\\\nTrying to add local vertex \\'DocumentChunk\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Document\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Concept\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Entity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Relationship\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'DocumentCollection\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Content\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Community\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'ResolvedEntity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CONTENT\\' and its reverse edge \\'reverse_HAS_CONTENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_CHILD_OF\\' and its reverse edge \\'reverse_IS_CHILD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_HEAD_OF\\' and its reverse edge \\'reverse_IS_HEAD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_TAIL\\' and its reverse edge \\'reverse_HAS_TAIL\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_RELATIONSHIP\\' and its reverse edge \\'reverse_DESCRIBES_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_ENTITY\\' and its reverse edge \\'reverse_DESCRIBES_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_ENTITY\\' and its reverse edge \\'reverse_CONTAINS_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'MENTIONS_RELATIONSHIP\\' and its reverse edge \\'reverse_MENTIONS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_AFTER\\' and its reverse edge 
\\'reverse_IS_AFTER\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CHILD\\' and its reverse edge \\'reverse_HAS_CHILD\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_RELATIONSHIP\\' and its reverse edge \\'reverse_HAS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_DOCUMENT\\' and its reverse edge \\'reverse_CONTAINS_DOCUMENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'KNN\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVES_TO\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVED_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IN_COMMUNITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 1\\\\nThe job add_supportai_schema completes in 2.829 seconds!\\\\nLocal schema change succeeded.\"',\n", + " 'index_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_indexes].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_indexes\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 1)\\\\nTrying to add index \\'doc_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_indexepoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 2\\\\nThe job add_supportai_indexes completes in 2.002 seconds!\\\\nLocal schema change succeeded.\"'}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.ai.initializeSupportAI()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "access = os.environ[\"AWS_ACCESS_KEY_ID\"]\n", 
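+    "# Both AWS keys are read from the environment (for example via the .env file loaded earlier); a missing variable raises KeyError here.\n",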
+ "sec = os.environ[\"AWS_SECRET_ACCESS_KEY\"]\n", + "res = conn.ai.createDocumentIngest(\n", + " data_source=\"s3\",\n", + " data_source_config={\"aws_access_key\": access, \"aws_secret_key\": sec},\n", + " loader_config={\"doc_id_field\": \"url\", \"content_field\": \"content\"},\n", + " file_format=\"json\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'job_name': 'load_documents_content_json_75b43aab4f714888b2be3f30441e745a',\n", + " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_75b43aab4f714888b2be3f30441e745a.stream.SupportAI_GraphRAG_pytgdocs_f0e175af264a4a18b1aa3bf8f4063d0e.1721674044503',\n", + " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_75b43aab4f714888b2be3f30441e745a.stream.SupportAI_GraphRAG_pytgdocs_f0e175af264a4a18b1aa3bf8f4063d0e.1721674044503'}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.ai.runDocumentIngest(\n", + " res[\"load_job_id\"],\n", + " res[\"data_source_id\"],\n", + " \"s3://tg-documentation/pytg_current/pytg_current.jsonl\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import httpx\n", + "import base64\n", + "\n", + "# conn.ai.forceConsistencyUpdate()\n", + "# url = self.nlqs_host+\"/\"+self.conn.graphname+\"/supportai/forceupdate\"\n", + "# return self.conn._req(\"GET\", url, authMode=\"pwd\", resKey=None)\n", + "httpx.get(f\"http://localhost:8000/{conn.graphname}/supportai/forceupdate\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "content='Hello! How can I assist you today?' 
response_metadata={'token_usage': {'completion_tokens': 9, 'prompt_tokens': 8, 'total_tokens': 17}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None} id='run-2a50fab6-62fc-433c-98b4-221346ca41c6-0' usage_metadata={'input_tokens': 8, 'output_tokens': 9, 'total_tokens': 17}\n" + ] + }, + { + "data": { + "text/plain": [ + "Joke(setup='Why was the cat sitting on the computer?', punchline='To keep an eye on the mouse!')" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain_core.pydantic_v1 import BaseModel, Field\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "\n", + "class Joke(BaseModel):\n", + " setup: str = Field(description=\"The setup of the joke\")\n", + " punchline: str = Field(description=\"The punchline to the joke\")\n", + "\n", + "\n", + "model = ChatOpenAI(model=\"gpt-3.5-turbo-0125\", temperature=0)\n", + "print(model.invoke('hi'))\n", + "structured_llm = model.with_structured_output(Joke)\n", + "structured_llm.invoke(\"Tell me a joke about cats\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_core.documents import Document\n", + "from langchain_experimental.graph_transformers import LLMGraphTransformer\n", + "from langchain_openai import ChatOpenAI\n", + "import os\n", + "# from langchain_core.pydantic_v1 import BaseModel\n", + "from pydantic import BaseModel\n", + "\n", + "\n", + "class AnswerWithJustification(BaseModel):\n", + " \"\"\"An answer to the user question along with justification for the answer.\"\"\"\n", + " answer: str\n", + " justification: str\n", + "\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = \"\"\n", + "model_name = \"gpt-4o-mini\"\n", + "llm = ChatOpenAI(model=model_name, temperature=0)\n", + "# sllm = llm.with_structured_output(AnswerWithJustification)\n", + "# print(sllm.invoke(\"What weighs more a pound of bricks or a pound of feathers\"))\n", + "\n", + "class GraphExtractor:\n", + " def __init__(self):\n", + " self.transformer = LLMGraphTransformer(\n", + " llm=llm,\n", + " node_properties=[\"description\"],\n", + " relationship_properties=[\"description\"],\n", + " )\n", + "\n", + " def extract(self, text):\n", + " doc = Document(page_content=text)\n", + " graph_docs = self.transformer.convert_to_graph_documents([doc])\n", + " return graph_docs" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id='Marie Curie' type='Person' properties={'description': 'A Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity.'}\n", + "id='Pierre Curie' type='Person' properties={'description': 'Husband of Marie Curie and co-winner of her first Nobel Prize.'}\n", + "id='University Of Paris' type='Institution' properties={'description': 'The institution where Marie Curie became the first woman professor in 1906.'}\n", + "id='Nobel Prize' type='Award' properties={'description': 'An award won by Marie Curie, first woman to win it and first person to win it twice.'}\n", + "source=Node(id='Marie Curie', type='Person') target=Node(id='Pierre Curie', type='Person') type='HUSBAND' properties={'description': \"Marie Curie's husband and co-winner of her first Nobel Prize.\"}\n", + "source=Node(id='Marie Curie', type='Person') target=Node(id='Nobel Prize', type='Award') type='WINNER' 
properties={'description': 'First woman to win a Nobel Prize.'}\n", + "source=Node(id='Marie Curie', type='Person') target=Node(id='Nobel Prize', type='Award') type='WINNER' properties={'description': 'First person to win a Nobel Prize twice.'}\n", + "source=Node(id='Marie Curie', type='Person') target=Node(id='Nobel Prize', type='Award') type='WINNER' properties={'description': 'Only person to win a Nobel Prize in two scientific fields.'}\n", + "source=Node(id='Marie Curie', type='Person') target=Node(id='University Of Paris', type='Institution') type='PROFESSOR' properties={'description': 'First woman to become a professor at the University of Paris in 1906.'}\n" + ] + } + ], + "source": [ + "text = \"\"\"\n", + "Marie Curie, born in 1867, was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity.\n", + "She was the first woman to win a Nobel Prize, the first person to win a Nobel Prize twice, and the only person to win a Nobel Prize in two scientific fields.\n", + "Her husband, Pierre Curie, was a co-winner of her first Nobel Prize, making them the first-ever married couple to win the Nobel Prize and launching the Curie family legacy of five Nobel Prizes.\n", + "She was, in 1906, the first woman to become a professor at the University of Paris.\n", + "\"\"\"\n", + "ge = GraphExtractor()\n", + "\n", + "docs = ge.extract(text)\n", + "for d in docs:\n", + " for n in d.nodes:\n", + " print(n)\n", + " for r in d.relationships:\n", + " print(r)\n", + "# print(f\"Nodes:{docs[0].nodes}\")\n", + "# print(f\"Relationships:{docs[0].relationships}\")\n", + "# docs" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ml", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/eventual-consistency-service/app/eventual_consistency_checker.py b/eventual-consistency-service/app/eventual_consistency_checker.py index 007330bd..fa16694e 100644 --- a/eventual-consistency-service/app/eventual_consistency_checker.py +++ b/eventual-consistency-service/app/eventual_consistency_checker.py @@ -1,4 +1,3 @@ -import json import logging import time from typing import Dict, List @@ -367,4 +366,4 @@ def get_status(self): )[0] LogWriter.info(f"ECC_Status for graphname {self.graphname}: {status}") statuses[v_type] = status - return statuses \ No newline at end of file + return statuses diff --git a/eventual-consistency-service/app/graphrag/__init__.py b/eventual-consistency-service/app/graphrag/__init__.py new file mode 100644 index 00000000..953b2a0b --- /dev/null +++ b/eventual-consistency-service/app/graphrag/__init__.py @@ -0,0 +1 @@ +from .graph_rag import * diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py new file mode 100644 index 00000000..637546d6 --- /dev/null +++ b/eventual-consistency-service/app/graphrag/graph_rag.py @@ -0,0 +1,138 @@ +import asyncio +import logging + +from graphrag.util import install_query +from graphrag.worker import worker +from pyTigerGraph import TigerGraphConnection + +from common.chunkers import character_chunker, regex_chunker, semantic_chunker +from common.chunkers.base_chunker import BaseChunker +from common.config import 
(doc_processing_config, embedding_service, + get_llm_service, llm_config, milvus_config) +from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore +from common.extractors import GraphExtractor, LLMEntityRelationshipExtractor +from common.extractors.BaseExtractor import BaseExtractor + +logger = logging.getLogger(__name__) +consistency_checkers = {} + + +def get_chunker(): + if doc_processing_config.get("chunker") == "semantic": + chunker = semantic_chunker.SemanticChunker( + embedding_service, + doc_processing_config["chunker_config"].get("method", "percentile"), + doc_processing_config["chunker_config"].get("threshold", 0.95), + ) + elif doc_processing_config.get("chunker") == "regex": + chunker = regex_chunker.RegexChunker( + pattern=doc_processing_config["chunker_config"].get("pattern", "\\r?\\n") + ) + elif doc_processing_config.get("chunker") == "character": + chunker = character_chunker.CharacterChunker( + chunk_size=doc_processing_config["chunker_config"].get("chunk_size", 1024), + overlap_size=doc_processing_config["chunker_config"].get("overlap_size", 0), + ) + else: + raise ValueError("Invalid chunker type") + + return chunker + + +async def install_queries( + requried_queries: list[str], conn: TigerGraphConnection, n_workers=8 +): + loop = asyncio.get_event_loop() + tasks: list[asyncio.Task] = [] + + # queries that are currently installed + installed_queries = [q.split("/")[-1] for q in conn.getEndpoints(dynamic=True)] + + # add queries to be installed into the queue + tq = asyncio.Queue() + for q in requried_queries: + if q not in installed_queries: + tq.put_nowait((install_query, (conn, q))) + # break + + print("starting workers") + # start workers + for n in range(min(tq.qsize(), n_workers)): + task = loop.create_task(worker(n, tq)) + tasks.append(task) + + # wait for workers to finish jobs + await tq.join() + for t in tasks: + print(t.result()) + return "", "", "" + + +async def init( + graphname: str, conn: TigerGraphConnection +) -> tuple[BaseChunker, dict[str, MilvusEmbeddingStore], BaseExtractor]: + # install requried queries + requried_queries = [ + "Scan_For_Updates", + "Update_Vertices_Processing_Status", + "ECC_Status", + "Check_Nonexistent_Vertices", + ] + await install_queries(requried_queries, conn) + + # init processing tools + chunker = get_chunker() + vector_indices = {} + vertex_field = milvus_config.get("vertex_field", "vertex_id") + index_names = milvus_config.get( + "indexes", + ["Document", "DocumentChunk", "Entity", "Relationship"], + ) + for index_name in index_names: + vector_indices[graphname + "_" + index_name] = MilvusEmbeddingStore( + embedding_service, + host=milvus_config["host"], + port=milvus_config["port"], + support_ai_instance=True, + collection_name=graphname + "_" + index_name, + username=milvus_config.get("username", ""), + password=milvus_config.get("password", ""), + vector_field=milvus_config.get("vector_field", "document_vector"), + text_field=milvus_config.get("text_field", "document_content"), + vertex_field=vertex_field, + ) + + if doc_processing_config.get("extractor") == "llm": + extractor = GraphExtractor() + elif doc_processing_config.get("extractor") == "llm": + extractor = LLMEntityRelationshipExtractor(get_llm_service(llm_config)) + else: + raise ValueError("Invalid extractor type") + + if vertex_field is None: + raise ValueError( + "vertex_field is not defined. Ensure Milvus is enabled in the configuration." 
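        # Editor's note on the extractor dispatch above: both the `if` and the
        # `elif` branch compare doc_processing_config.get("extractor") to "llm",
        # so GraphExtractor is always selected and the
        # LLMEntityRelationshipExtractor branch is unreachable. A minimal sketch
        # of a disambiguated dispatch (the "graphrag" key is a hypothetical
        # config value, not taken from the source):
        #
        #   extractor_type = doc_processing_config.get("extractor")
        #   if extractor_type == "graphrag":      # hypothetical value
        #       extractor = GraphExtractor()
        #   elif extractor_type == "llm":
        #       extractor = LLMEntityRelationshipExtractor(get_llm_service(llm_config))
        #   else:
        #       raise ValueError("Invalid extractor type")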
+ ) + + return chunker, vector_indices, extractor + + +async def run(graphname: str, conn: TigerGraphConnection): + """ + ecc flow + + initialize_eventual_consistency_checker + instantiates ecc object + writes checker to checker dict + runs ecc_obj.initialize() + + ECC.initialize + loops and calls fetch and process + + """ + + chunker, vector_indices, extractor = await init(graphname, conn) + + # process docs + + return f"hi from graph rag ecc: {conn.graphname} ({graphname})" diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py new file mode 100644 index 00000000..ae6fbcf7 --- /dev/null +++ b/eventual-consistency-service/app/graphrag/util.py @@ -0,0 +1,36 @@ +import base64 +from urllib.parse import quote_plus + +import httpx +from pyTigerGraph import TigerGraphConnection + +from common.logs.logwriter import LogWriter + + +async def install_query( + conn: TigerGraphConnection, query_name: str +) -> dict[str, httpx.Response | str | None]: + print("install --", query_name) + LogWriter.info(f"Installing query {query_name}") + with open(f"common/gsql/supportai/{query_name}.gsql", "r") as f: + query = f.read() + + query = f"""\ +USE GRAPH {conn.graphname} +{query} +INSTALL QUERY {query_name}""" + tkn = base64.b64encode(f"{conn.username}:{conn.password}".encode()).decode() + headers = {"Authorization": f"Basic {tkn}"} + + async with httpx.AsyncClient(timeout=None) as client: + res = await client.post( + conn.gsUrl + "/gsqlserver/gsql/file", + data=quote_plus(query.encode("utf-8")), + headers=headers, + ) + + if "error" in res.text.lower(): + LogWriter.error(res.text) + return {"result": None, "error": f"Failed to install query {query_name}"} + + return {"result": res, "error": False} diff --git a/eventual-consistency-service/app/graphrag/worker.py b/eventual-consistency-service/app/graphrag/worker.py new file mode 100644 index 00000000..4edd561a --- /dev/null +++ b/eventual-consistency-service/app/graphrag/worker.py @@ -0,0 +1,27 @@ +import asyncio + + +async def worker( + n: int, + task_queue: asyncio.Queue, +): + worker_name = f"worker-{n+1}" + worker_name += " " if n + 1 < 10 else "" + responses = [] + i = 0 + + while not task_queue.empty(): + # get the next task + func, args = await task_queue.get() + response = await func(*args) + + responses.append(response) + i += 1 + task_queue.task_done() + + # collate results + results = [] + for r in responses: + results.append(r) + + return results diff --git a/eventual-consistency-service/app/main.py b/eventual-consistency-service/app/main.py index 4ca26c2c..4c486bc0 100644 --- a/eventual-consistency-service/app/main.py +++ b/eventual-consistency-service/app/main.py @@ -1,54 +1,79 @@ +import asyncio +import json import logging -from typing import Annotated +from contextlib import asynccontextmanager +from threading import Thread +from typing import Annotated, Callable -from fastapi import Depends, FastAPI, BackgroundTasks +import graphrag +from eventual_consistency_checker import EventualConsistencyChecker +from fastapi import BackgroundTasks, Depends, FastAPI, Response, status from fastapi.security.http import HTTPBase from common.config import ( db_config, + doc_processing_config, embedding_service, get_llm_service, llm_config, milvus_config, security, - doc_processing_config, ) +from common.db.connections import elevate_db_connection_to_token from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore from common.logs.logwriter import LogWriter from common.metrics.tg_proxy 
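# Editor's sketch (illustrative, not part of the patch): how the (func, args)
# task queue consumed by worker() above is meant to be used -- callers enqueue
# coroutine functions with their arguments and one or more consumers drain the
# queue until it is empty. Fully self-contained; names below are placeholders.
import asyncio

async def greet(name: str) -> str:
    return f"hello {name}"

async def drain(q: asyncio.Queue) -> list:
    out = []
    while not q.empty():
        func, args = await q.get()
        out.append(await func(*args))
        q.task_done()
    return out

async def demo():
    q = asyncio.Queue()
    for name in ("Scan_For_Updates", "ECC_Status"):
        q.put_nowait((greet, (name,)))
    print(await drain(q))  # several drain() tasks could share the same queue

asyncio.run(demo())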
import TigerGraphConnectionProxy -from common.db.connections import elevate_db_connection_to_token -from eventual_consistency_checker import EventualConsistencyChecker -import json -from threading import Thread +from common.py_schemas.schemas import SupportAIMethod logger = logging.getLogger(__name__) consistency_checkers = {} -app = FastAPI() -@app.on_event("startup") -def startup_event(): - if not db_config.get("enable_consistency_checker", True): - LogWriter.info("Eventual consistency checker disabled") - return +@asynccontextmanager +async def lifespan(_: FastAPI): + if not db_config.get("enable_consistency_checker", False): + LogWriter.info("Eventual Consistency Checker not run on startup") + + else: + startup_checkers = db_config.get("graph_names", []) + for graphname in startup_checkers: + conn = elevate_db_connection_to_token( + db_config["hostname"], + db_config["username"], + db_config["password"], + graphname, + ) + start_ecc_in_thread(graphname, conn) + yield + LogWriter.info("ECC Shutdown") + + +app = FastAPI(lifespan=lifespan) - startup_checkers = db_config.get("graph_names", []) - for graphname in startup_checkers: - conn = elevate_db_connection_to_token(db_config["hostname"], db_config["username"], db_config["password"], graphname) - start_ecc_in_thread(graphname, conn) def start_ecc_in_thread(graphname: str, conn: TigerGraphConnectionProxy): - thread = Thread(target=initialize_eventual_consistency_checker, args=(graphname, conn), daemon=True) + thread = Thread( + target=initialize_eventual_consistency_checker, + args=(graphname, conn), + daemon=True, + ) thread.start() LogWriter.info(f"Eventual consistency checker started for graph {graphname}") -def initialize_eventual_consistency_checker(graphname: str, conn: TigerGraphConnectionProxy): + +def initialize_eventual_consistency_checker( + graphname: str, conn: TigerGraphConnectionProxy +): if graphname in consistency_checkers: return consistency_checkers[graphname] try: - process_interval_seconds = milvus_config.get("process_interval_seconds", 1800) # default 30 minutes - cleanup_interval_seconds = milvus_config.get("cleanup_interval_seconds", 86400) # default 30 days, + process_interval_seconds = milvus_config.get( + "process_interval_seconds", 1800 + ) # default 30 minutes + cleanup_interval_seconds = milvus_config.get( + "cleanup_interval_seconds", 86400 + ) # default 30 days, batch_size = milvus_config.get("batch_size", 10) vector_indices = {} vertex_field = None @@ -70,7 +95,7 @@ def initialize_eventual_consistency_checker(graphname: str, conn: TigerGraphConn password=milvus_config.get("password", ""), vector_field=milvus_config.get("vector_field", "document_vector"), text_field=milvus_config.get("text_field", "document_content"), - vertex_field=vertex_field + vertex_field=vertex_field, ) if doc_processing_config.get("chunker") == "semantic": @@ -111,7 +136,9 @@ def initialize_eventual_consistency_checker(graphname: str, conn: TigerGraphConn raise ValueError("Invalid extractor type") if vertex_field is None: - raise ValueError("vertex_field is not defined. Ensure Milvus is enabled in the configuration.") + raise ValueError( + "vertex_field is not defined. Ensure Milvus is enabled in the configuration." 
+ ) checker = EventualConsistencyChecker( process_interval_seconds, @@ -124,7 +151,7 @@ def initialize_eventual_consistency_checker(graphname: str, conn: TigerGraphConn conn, chunker, extractor, - batch_size + batch_size, ) consistency_checkers[graphname] = checker @@ -138,22 +165,65 @@ def initialize_eventual_consistency_checker(graphname: str, conn: TigerGraphConn return checker except Exception as e: - LogWriter.error(f"Failed to start eventual consistency checker for graph {graphname}: {e}") + LogWriter.error( + f"Failed to start eventual consistency checker for graph {graphname}: {e}" + ) + + +def start_func_in_thread(f: Callable, *args, **kwargs): + thread = Thread( + target=f, + args=args, + kwargs=kwargs, + daemon=True, + ) + thread.start() + LogWriter.info(f'Thread started for function: "{f.__name__}"') + + +# def start_async_func(f: Callable, *args, **kwargs): +# asyncio.run(f(args, kwargs)) +# LogWriter.info(f'Thread started for function: "{f.__name__}"') + @app.get("/") def root(): LogWriter.info(f"Healthcheck") return {"status": "ok"} -@app.get("/{graphname}/consistency_status") -def consistency_status(graphname: str, credentials: Annotated[HTTPBase, Depends(security)]): - if graphname in consistency_checkers: - ecc = consistency_checkers[graphname] - status = json.dumps(ecc.get_status()) - else: - conn = elevate_db_connection_to_token(db_config["hostname"], credentials.username, credentials.password, graphname) - start_ecc_in_thread(graphname, conn) - status = f"Eventual consistency checker started for graph {graphname}" - LogWriter.info(f"Returning consistency status for {graphname}: {status}") - return status +@app.get("/{graphname}/consistency_status/{ecc_method}") +def consistency_status( + graphname: str, + ecc_method: str, + background: BackgroundTasks, + credentials: Annotated[HTTPBase, Depends(security)], + response: Response, +): + conn = elevate_db_connection_to_token( + db_config["hostname"], + credentials.username, + credentials.password, + graphname, + ) + match ecc_method: + case SupportAIMethod.SUPPORTAI: + if graphname in consistency_checkers: + ecc = consistency_checkers[graphname] + ecc_status = json.dumps(ecc.get_status()) + else: + start_ecc_in_thread(graphname, conn) + ecc_status = ( + f"Eventual consistency checker started for graph {graphname}" + ) + + LogWriter.info(f"Returning consistency status for {graphname}: {status}") + case SupportAIMethod.GRAPHRAG: + background.add_task(graphrag.run, graphname, conn) + # asyncio.run(graphrag.run(graphname, conn)) + ecc_status = f"hi from graph rag ecc: {conn.graphname} ({graphname})" + case _: + response.status_code = status.HTTP_404_NOT_FOUND + return f"Method unsupported, must be {SupportAIMethod.SUPPORTAI}, {SupportAIMethod.GRAPHRAG}" + + return ecc_status From 8e0ed554c8041c5cc9b070f51636a8c636275b2f Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 23 Jul 2024 17:30:08 -0400 Subject: [PATCH 02/53] save: docs handled concurrently -- writing upsert_edge --- docker-compose.yml | 154 +++++++++--------- .../app/graphrag/graph_rag.py | 95 +++++++---- .../app/graphrag/util.py | 151 ++++++++++++++++- .../app/graphrag/worker.py | 33 +++- eventual-consistency-service/app/main.py | 37 +---- 5 files changed, 309 insertions(+), 161 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 4da38a25..f0a80154 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,30 +1,30 @@ services: - copilot: - image: tigergraphml/copilot:latest - 
container_name: copilot - build: - context: . - dockerfile: copilot/Dockerfile - ports: - - 8000:8000 - depends_on: - - eventual-consistency-service - - chat-history - environment: - LLM_CONFIG: "/code/configs/llm_config.json" - DB_CONFIG: "/code/configs/db_config.json" - MILVUS_CONFIG: "/code/configs/milvus_config.json" - LOGLEVEL: "INFO" - USE_CYPHER: "true" - volumes: - - ./configs/:/code/configs - - ./common:/code/common - networks: - - copilot_local - +# copilot: +# image: tigergraphml/copilot:latest +# container_name: copilot +# build: +# context: . +# dockerfile: copilot/Dockerfile +# ports: +# - 8000:8000 +# depends_on: +# - eventual-consistency-service +# - chat-history +# environment: +# LLM_CONFIG: "/code/configs/llm_config.json" +# DB_CONFIG: "/code/configs/db_config.json" +# MILVUS_CONFIG: "/code/configs/milvus_config.json" +# LOGLEVEL: "INFO" +# USE_CYPHER: "true" +# volumes: +# - ./configs/:/code/configs +# - ./common:/code/common +# networks: +# - copilot_local +# eventual-consistency-service: image: tigergraphml/ecc:latest - container_name: eventual-consistency-service + # container_name: eventual-consistency-service build: context: . dockerfile: eventual-consistency-service/Dockerfile @@ -40,64 +40,64 @@ services: - ./common:/code/common networks: - copilot_local - - chat-history: - image: tigergraphml/chat-history:latest - container_name: chat-history - build: - context: chat-history/ - dockerfile: Dockerfile - ports: - - 8002:8002 - environment: - CONFIG: "/configs/config.json" - LOGLEVEL: "INFO" - volumes: - - ./chat-history/:/configs - networks: - - copilot_local - # report-service: - # image: tigergraphml/report-service:latest - # container_name: report-service +# + # chat-history: + # image: tigergraphml/chat-history:latest + # container_name: chat-history # build: - # context: . - # dockerfile: report-service/Dockerfile + # context: chat-history/ + # dockerfile: Dockerfile # ports: # - 8002:8002 # environment: - # LLM_CONFIG: "/code/configs/llm_config.json" - # DB_CONFIG: "/code/configs/db_config.json" - # MILVUS_CONFIG: "/code/configs/milvus_config.json" + # CONFIG: "/configs/config.json" # LOGLEVEL: "INFO" # volumes: - # - ./configs/:/code/configs - # - ./common:/code/common - # - ui: - image: tigergraphml/copilot-ui:latest - container_name: ui - build: - context: copilot-ui - dockerfile: Dockerfile - ports: - - 3000:3000 - depends_on: - - copilot - networks: - - copilot_local - - nginx: - container_name: nginx - image: nginx - volumes: - - ./nginx/nginx.conf:/etc/nginx/conf.d/default.conf - ports: - - "80:80" - depends_on: - - ui - - copilot - networks: - - copilot_local - + # - ./configs/:/configs + # networks: + # - copilot_local +# # report-service: +# # image: tigergraphml/report-service:latest +# # container_name: report-service +# # build: +# # context: . 
+# # dockerfile: report-service/Dockerfile +# # ports: +# # - 8002:8002 +# # environment: +# # LLM_CONFIG: "/code/configs/llm_config.json" +# # DB_CONFIG: "/code/configs/db_config.json" +# # MILVUS_CONFIG: "/code/configs/milvus_config.json" +# # LOGLEVEL: "INFO" +# # volumes: +# # - ./configs/:/code/configs +# # - ./common:/code/common +# # +# ui: +# image: tigergraphml/copilot-ui:latest +# container_name: ui +# build: +# context: copilot-ui +# dockerfile: Dockerfile +# ports: +# - 3000:3000 +# depends_on: +# - copilot +# networks: +# - copilot_local +# +# nginx: +# container_name: nginx +# image: nginx +# volumes: +# - ./nginx/nginx.conf:/etc/nginx/conf.d/default.conf +# ports: +# - "80:80" +# depends_on: +# - ui +# - copilot +# networks: +# - copilot_local +# networks: copilot_local: diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py index 637546d6..1477d9e0 100644 --- a/eventual-consistency-service/app/graphrag/graph_rag.py +++ b/eventual-consistency-service/app/graphrag/graph_rag.py @@ -1,14 +1,19 @@ import asyncio import logging -from graphrag.util import install_query +import ecc_util +from graphrag.util import install_query, stream_docs, upsert_chunk from graphrag.worker import worker from pyTigerGraph import TigerGraphConnection -from common.chunkers import character_chunker, regex_chunker, semantic_chunker from common.chunkers.base_chunker import BaseChunker -from common.config import (doc_processing_config, embedding_service, - get_llm_service, llm_config, milvus_config) +from common.config import ( + doc_processing_config, + embedding_service, + get_llm_service, + llm_config, + milvus_config, +) from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore from common.extractors import GraphExtractor, LLMEntityRelationshipExtractor from common.extractors.BaseExtractor import BaseExtractor @@ -17,28 +22,6 @@ consistency_checkers = {} -def get_chunker(): - if doc_processing_config.get("chunker") == "semantic": - chunker = semantic_chunker.SemanticChunker( - embedding_service, - doc_processing_config["chunker_config"].get("method", "percentile"), - doc_processing_config["chunker_config"].get("threshold", 0.95), - ) - elif doc_processing_config.get("chunker") == "regex": - chunker = regex_chunker.RegexChunker( - pattern=doc_processing_config["chunker_config"].get("pattern", "\\r?\\n") - ) - elif doc_processing_config.get("chunker") == "character": - chunker = character_chunker.CharacterChunker( - chunk_size=doc_processing_config["chunker_config"].get("chunk_size", 1024), - overlap_size=doc_processing_config["chunker_config"].get("overlap_size", 0), - ) - else: - raise ValueError("Invalid chunker type") - - return chunker - - async def install_queries( requried_queries: list[str], conn: TigerGraphConnection, n_workers=8 ): @@ -51,11 +34,10 @@ async def install_queries( # add queries to be installed into the queue tq = asyncio.Queue() for q in requried_queries: - if q not in installed_queries: + q_name = q.split("/")[-1] + if q_name not in installed_queries: tq.put_nowait((install_query, (conn, q))) - # break - print("starting workers") # start workers for n in range(min(tq.qsize(), n_workers)): task = loop.create_task(worker(n, tq)) @@ -65,23 +47,48 @@ async def install_queries( await tq.join() for t in tasks: print(t.result()) + # TODO: Check if anything had an error return "", "", "" +async def process_doc( + conn: TigerGraphConnection, doc: dict[str, str], sem: asyncio.Semaphore +): + # TODO: 
Embed document and chunks + chunker = ecc_util.get_chunker() + try: + print(">>>>>", doc["v_id"], len(doc["attributes"]["text"])) + # await asyncio.sleep(5) + chunks = chunker.chunk(doc["attributes"]["text"]) + v_id = doc["v_id"] + # TODO: n chunks at a time + for i, chunk in enumerate(chunks): + await upsert_chunk(conn, v_id, f"{v_id}_chunk_{i}", chunk) + # break # single chunk FIXME: delete + finally: + sem.release() + + return doc["v_id"] + + async def init( graphname: str, conn: TigerGraphConnection ) -> tuple[BaseChunker, dict[str, MilvusEmbeddingStore], BaseExtractor]: # install requried queries requried_queries = [ - "Scan_For_Updates", - "Update_Vertices_Processing_Status", - "ECC_Status", - "Check_Nonexistent_Vertices", + # "common/gsql/supportai/Scan_For_Updates", + # "common/gsql/supportai/Update_Vertices_Processing_Status", + # "common/gsql/supportai/ECC_Status", + # "common/gsql/supportai/Check_Nonexistent_Vertices", + "common/gsql/graphRAG/StreamDocIds", + "common/gsql/graphRAG/StreamDocContent", ] - await install_queries(requried_queries, conn) + # await install_queries(requried_queries, conn) + return await install_queries(requried_queries, conn) # init processing tools - chunker = get_chunker() + chunker = ecc_util.get_chunker() + vector_indices = {} vertex_field = milvus_config.get("vertex_field", "vertex_id") index_names = milvus_config.get( @@ -131,8 +138,26 @@ async def run(graphname: str, conn: TigerGraphConnection): """ + # init configurable objects chunker, vector_indices, extractor = await init(graphname, conn) # process docs + doc_workers = 48 # TODO: make configurable + doc_tasks = [] + doc_sem = asyncio.Semaphore(doc_workers) + + async with asyncio.TaskGroup() as tg: + async for content in stream_docs(conn): + # only n workers at a time -- held up by semaphore + print(">>>>>>>>>>>>>>>>>>>>>>>>\n", len(doc_tasks), "<<<<<<<<<") + await doc_sem.acquire() + task = tg.create_task(process_doc(conn, content, doc_sem)) + doc_tasks.append(task) + break + + # do something with doc_tasks + for t in doc_tasks: + print(t.result()) + print("DONE") return f"hi from graph rag ecc: {conn.graphname} ({graphname})" diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index ae6fbcf7..ce2efe52 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -1,4 +1,7 @@ import base64 +import json +import time +import traceback from urllib.parse import quote_plus import httpx @@ -7,14 +10,24 @@ from common.logs.logwriter import LogWriter +def make_headers(conn: TigerGraphConnection): + if conn.apiToken is None or conn.apiToken == "": + tkn = base64.b64encode(f"{conn.username}:{conn.password}".encode()).decode() + headers = {"Authorization": f"Basic {tkn}"} + else: + headers = {"Authorization": f"Bearer {conn.apiToken}"} + + return headers + + async def install_query( - conn: TigerGraphConnection, query_name: str + conn: TigerGraphConnection, query_path: str ) -> dict[str, httpx.Response | str | None]: - print("install --", query_name) - LogWriter.info(f"Installing query {query_name}") - with open(f"common/gsql/supportai/{query_name}.gsql", "r") as f: + LogWriter.info(f"Installing query {query_path}") + with open(f"{query_path}.gsql", "r") as f: query = f.read() + query_name = query_path.split("/")[-1] query = f"""\ USE GRAPH {conn.graphname} {query} @@ -31,6 +44,134 @@ async def install_query( if "error" in res.text.lower(): LogWriter.error(res.text) - return 
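# Editor's sketch (illustrative, not part of the patch): the string that
# install_query composes before URL-encoding it with quote_plus and POSTing it
# to <gsUrl>/gsqlserver/gsql/file with basic-auth headers. The graph name,
# query name, and query body below are placeholders.
graphname = "GraphRAG_pytgdocs"   # placeholder graph name
query_name = "StreamDocIds"       # placeholder query name
gsql_body = "CREATE QUERY StreamDocIds(INT current_batch, INT ttl_batches) { PRINT 1; }"
payload = f"USE GRAPH {graphname}\n{gsql_body}\nINSTALL QUERY {query_name}"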
{"result": None, "error": f"Failed to install query {query_name}"} + return { + "result": None, + "error": True, + "message": f"Failed to install query {query_name}", + } return {"result": res, "error": False} + + +async def stream_doc_ids( + conn: TigerGraphConnection, current_batch: int, ttl_batches: int +) -> dict[str, str | list[str]]: + headers = make_headers(conn) + + try: + async with httpx.AsyncClient(timeout=None) as client: + res = await client.post( + f"{conn.restppUrl}/query/{conn.graphname}/StreamDocIds", + params={ + "current_batch": current_batch, + "ttl_batches": ttl_batches, + }, + headers=headers, + ) + ids = res.json()["results"][0]["@@doc_ids"] + return {"error": False, "ids": ids} + + except Exception as e: + exc = traceback.format_exc() + LogWriter.error( + f"/{conn.graphname}/query/StreamDocIds\nException Trace:\n{exc}" + ) + + return {"error": True, "message": str(e)} + + +async def stream_docs(conn: TigerGraphConnection, ttl_batches: int = 10): + headers = make_headers(conn) + for i in range(ttl_batches): + doc_ids = await stream_doc_ids(conn, i, ttl_batches) + if doc_ids["error"]: + print(doc_ids) + break # TODO: handle error + + print("*******") + print(doc_ids) + print("*******") + for d in doc_ids["ids"]: + async with httpx.AsyncClient(timeout=None) as client: + res = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/StreamDocContent/", + params={"doc": d}, + headers=headers, + ) + + # TODO: check for errors + yield res.json()["results"][0]["DocContent"][0] + return # single doc test FIXME: delete + # return # single batch test FIXME: delete + + +def map_attrs(attributes: dict): + # map attrs + attrs = {} + for k, v in attributes.items(): + if isinstance(v, tuple): + attrs[k] = {"value": v[0], "op": v[1]} + elif isinstance(v, dict): + attrs[k] = { + "value": {"keylist": list(v.keys()), "valuelist": list(v.values())} + } + else: + attrs[k] = {"value": v} + return attrs + + +async def upsert_vertex( + conn: TigerGraphConnection, + vertex_type: str, + vertex_id: str, + attributes: dict = None, +): + attrs = map_attrs(attributes) + data = json.dumps({"vertices": {vertex_type: {vertex_id: attrs}}}) + headers = make_headers(conn) + async with httpx.AsyncClient(timeout=None) as client: + res = await client.post( + f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers + ) + print(res) + +async def upsert_edge( + conn: TigerGraphConnection, + vertex_type: str, + vertex_id: str, + attributes: dict = None, +): + TODO + attrs = map_attrs(attributes) + data = json.dumps({"vertices": {vertex_type: {vertex_id: attrs}}}) + headers = make_headers(conn) + async with httpx.AsyncClient(timeout=None) as client: + res = await client.post( + f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers + ) + print(res) + +async def upsert_chunk(conn: TigerGraphConnection, doc_id, chunk_id, chunk): + date_added = int(time.time()) + await upsert_vertex( + conn, + "DocumentChunk", + chunk_id, + attributes={"epoch_added": date_added, "idx": int(chunk_id.split("_")[-1])}, + ) + await upsert_vertex( + conn, + "Content", + chunk_id, + attributes={"text": chunk, "epoch_added": date_added}, + ) + conn.upsertEdge("DocumentChunk", chunk_id, "HAS_CONTENT", "Content", chunk_id) + # self.conn.upsertEdge("Document", doc_id, "HAS_CHILD", "DocumentChunk", chunk_id) + # if int(chunk_id.split("_")[-1]) > 0: + # self.conn.upsertEdge( + # "DocumentChunk", + # chunk_id, + # "IS_AFTER", + # "DocumentChunk", + # doc_id + "_chunk_" + str(int(chunk_id.split("_")[-1]) - 
1), + # ) diff --git a/eventual-consistency-service/app/graphrag/worker.py b/eventual-consistency-service/app/graphrag/worker.py index 4edd561a..a2c7bbb6 100644 --- a/eventual-consistency-service/app/graphrag/worker.py +++ b/eventual-consistency-service/app/graphrag/worker.py @@ -1,27 +1,42 @@ import asyncio +# class Channel(asyncio.Queue): +# def __init__(self, maxsize=0): +# self.is_open = True +# super().__init__(maxsize) +# +# def close(self): +# self.is_open = False + + async def worker( n: int, task_queue: asyncio.Queue, ): + # init worker logging/reporting (TODO) worker_name = f"worker-{n+1}" worker_name += " " if n + 1 < 10 else "" - responses = [] - i = 0 + while task_queue.empty(): + print(f"{worker_name} waiting") + await asyncio.sleep(1) + + # consume task queue + print(f"{worker_name} started") + responses = [] while not task_queue.empty(): # get the next task func, args = await task_queue.get() + + # execute the task response = await func(*args) + # append task results to worker results/response responses.append(response) - i += 1 - task_queue.task_done() - # collate results - results = [] - for r in responses: - results.append(r) + # mark task as done + task_queue.task_done() - return results + print(f"{worker_name} done") + return responses diff --git a/eventual-consistency-service/app/main.py b/eventual-consistency-service/app/main.py index 4c486bc0..0277a272 100644 --- a/eventual-consistency-service/app/main.py +++ b/eventual-consistency-service/app/main.py @@ -1,10 +1,10 @@ -import asyncio import json import logging from contextlib import asynccontextmanager from threading import Thread from typing import Annotated, Callable +import ecc_util import graphrag from eventual_consistency_checker import EventualConsistencyChecker from fastapi import BackgroundTasks, Depends, FastAPI, Response, status @@ -98,35 +98,7 @@ def initialize_eventual_consistency_checker( vertex_field=vertex_field, ) - if doc_processing_config.get("chunker") == "semantic": - from common.chunkers.semantic_chunker import SemanticChunker - - chunker = SemanticChunker( - embedding_service, - doc_processing_config["chunker_config"].get("method", "percentile"), - doc_processing_config["chunker_config"].get("threshold", 0.95), - ) - elif doc_processing_config.get("chunker") == "regex": - from common.chunkers.regex_chunker import RegexChunker - - chunker = RegexChunker( - pattern=doc_processing_config["chunker_config"].get( - "pattern", "\\r?\\n" - ) - ) - elif doc_processing_config.get("chunker") == "character": - from common.chunkers.character_chunker import CharacterChunker - - chunker = CharacterChunker( - chunk_size=doc_processing_config["chunker_config"].get( - "chunk_size", 1024 - ), - overlap_size=doc_processing_config["chunker_config"].get( - "overlap_size", 0 - ), - ) - else: - raise ValueError("Invalid chunker type") + chunker = ecc_util.get_chunker() if doc_processing_config.get("extractor") == "llm": from common.extractors import LLMEntityRelationshipExtractor @@ -181,11 +153,6 @@ def start_func_in_thread(f: Callable, *args, **kwargs): LogWriter.info(f'Thread started for function: "{f.__name__}"') -# def start_async_func(f: Callable, *args, **kwargs): -# asyncio.run(f(args, kwargs)) -# LogWriter.info(f'Thread started for function: "{f.__name__}"') - - @app.get("/") def root(): LogWriter.info(f"Healthcheck") From ec299a27f84121f9e8e85666847dc517f80e2291 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 23 Jul 2024 17:39:04 -0400 
Subject: [PATCH 03/53] save: docs handled concurrently -- writing upsert_edge --- common/gsql/graphRAG/StreamDocContent.gsql | 5 + common/gsql/graphRAG/StreamDocIds.gsql | 10 + .../louvain/louvain_1_first_pass.gsql | 176 ++++++++++++++++++ eventual-consistency-service/app/ecc_util.py | 24 +++ 4 files changed, 215 insertions(+) create mode 100644 common/gsql/graphRAG/StreamDocContent.gsql create mode 100644 common/gsql/graphRAG/StreamDocIds.gsql create mode 100644 common/gsql/graphRAG/louvain/louvain_1_first_pass.gsql create mode 100644 eventual-consistency-service/app/ecc_util.py diff --git a/common/gsql/graphRAG/StreamDocContent.gsql b/common/gsql/graphRAG/StreamDocContent.gsql new file mode 100644 index 00000000..fb7338b7 --- /dev/null +++ b/common/gsql/graphRAG/StreamDocContent.gsql @@ -0,0 +1,5 @@ +CREATE QUERY StreamDocContent(Vertex doc) { + Doc = {doc}; + DocContent = SELECT c FROM Doc:d -(HAS_CONTENT)-> Content:c; + PRINT DocContent; +} diff --git a/common/gsql/graphRAG/StreamDocIds.gsql b/common/gsql/graphRAG/StreamDocIds.gsql new file mode 100644 index 00000000..fb373490 --- /dev/null +++ b/common/gsql/graphRAG/StreamDocIds.gsql @@ -0,0 +1,10 @@ +CREATE QUERY StreamDocIds(INT current_batch, INT ttl_batches) { + ListAccum @@doc_ids; + Docs = {Document.*}; + + Docs = SELECT d FROM Docs:d + WHERE vertex_to_int(d) % ttl_batches == current_batch + ACCUM @@doc_ids += d.id; + + PRINT @@doc_ids; +} diff --git a/common/gsql/graphRAG/louvain/louvain_1_first_pass.gsql b/common/gsql/graphRAG/louvain/louvain_1_first_pass.gsql new file mode 100644 index 00000000..4ca06029 --- /dev/null +++ b/common/gsql/graphRAG/louvain/louvain_1_first_pass.gsql @@ -0,0 +1,176 @@ +CREATE OR REPLACE DISTRIBUTED QUERY graphRAG_louvain_1( + UINT max_hop = 10, + UINT batch_num = 12, + UINT sample_edge_num = 100 +) FOR GRAPH {graph_name} SYNTAX v1 { + + TYPEDEF TUPLE community, STRING ext_vid> MyTuple; --> this should be Community, I think + SumAccum @@m; // the sum of the weights of all the links in the network + MinAccum> @{community_id_attribute_name}; // the community ID of the node + MinAccum @community_vid; // the community ID of the node + SumAccum @k; // the sum of the weights of the links incident to the node + SumAccum @k_in; // the sum of the weights of the links inside the previous community of the node + SumAccum @k_self_loop; // the weight of the self-loop link + MapAccum, SumAccum> @community_k_in_map; // the community of the neighbors of the nodes -> the sum of the weights of the links inside the community + MapAccum, SumAccum> @@community_sum_total_map; // community ID C -> the sum of the weights of the links incident to nodes in C + SumAccum @community_sum_total; // the sum of the weights of the links incident to nodes in the community of the node + MapAccum, SumAccum> @@community_sum_in_map; // community ID -> the sum of the weights of the links inside the community + MapAccum, MapAccum, SumAccum>> @@source_target_k_in_map; // source community ID -> (target community ID -> the sum of the weights of the links from the source community to the target community) + SumAccum @delta_Q_remove; // delta Q to remove the node from the previous community + MaxAccum @best_move; // best move of the node with the highest delta Q to move the isolated node into the new community + MaxAccum @@min_double; // used to reset the @best_move + SumAccum @@move_cnt; + OrAccum @to_change_community; + SumAccum @batch_id; + SumAccum @vid; + + DOUBLE wt = 1.0; + + // Initialization + All_Nodes = {{{entity_vertex_name}.*}}; + 
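    // Editor's note (descriptive only, not part of the patch): the pass below
    // implements the standard Louvain local-moving step. With @@m the total
    // edge weight, s.@k the weighted degree of s, and s.@k_in the weight of
    // links from s into a community, moves are scored as
    //   delta_Q_remove = 2*@k_self_loop - 2*@k_in + @k*(@community_sum_total - @k)/@@m
    //   delta_Q_add    = 2*@community_k_in_map.get(C') - @k*C'.sum_total/@@m
    // and a node changes community only when delta_Q_remove + delta_Q_add > 0.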
All_Nodes = SELECT s FROM All_Nodes:s -({relation_edge_name}:e)- :t + ACCUM @@m += wt / 2, + s.@k += wt, + IF s == t THEN // self-loop link + js.@k_self_loop += wt + END + POST-ACCUM + s.@{community_id_attribute_name} = s, + s.@community_vid = to_string(s.id), + s.@vid = getvid(s), + s.@batch_id = s.@vid % batch_num; + + IF @@m < 0.00000000001 THEN + PRINT "Warning: the sum of the weights in the edges should be greater than zero!"; + RETURN; + END; + + // Local moving + INT hop = 0; + Candidates = All_Nodes; + WHILE Candidates.size() > 0 AND hop < max_hop DO + hop = hop + 1; + LOG(TRUE, hop); + IF hop == 1 THEN // first iteration + ChangedNodes = SELECT s FROM Candidates:s -({relation_edge_name}:e)- :t + WHERE s.@{community_id_attribute_name} != t.@{community_id_attribute_name} + ACCUM s.@best_move += MyTuple(1 - s.@k * t.@k / (2 * @@m), t.@{community_id_attribute_name}, t.@community_vid) + POST-ACCUM + IF s.@best_move.delta_Q_add > 0 THEN // the gain (delta Q) is positive + s.@to_change_community = TRUE + END + HAVING s.@to_change_community == TRUE; + + ELSE // remaining iterations + // Calculate sum_total + Tmp = SELECT s FROM All_Nodes:s + POST-ACCUM + @@community_sum_total_map += (s.@{community_id_attribute_name} -> s.@k); + Tmp = SELECT s FROM All_Nodes:s + POST-ACCUM + s.@community_sum_total = @@community_sum_total_map.get(s.@{community_id_attribute_name}); + + @@community_sum_total_map.clear(); + // Find the best move + ChangedNodes = {{}}; + FOREACH batch_id IN RANGE[0, batch_num-1] DO + LOG(TRUE, batch_id); + // Calculate the delta Q to remove the node from the previous community + Nodes = SELECT s FROM Candidates:s -({relation_edge_name}:e)- :t + WHERE s.@batch_id == batch_id + ACCUM + IF s.@{community_id_attribute_name} == t.@{community_id_attribute_name} THEN + s.@k_in += wt + ELSE + s.@community_k_in_map += (t.@{community_id_attribute_name} -> wt) + END + POST-ACCUM + s.@delta_Q_remove = 2 * s.@k_self_loop - 2 * s.@k_in + s.@k * (s.@community_sum_total - s.@k) / @@m, + s.@k_in = 0, + s.@best_move = MyTuple(@@min_double, s, to_string(s.id)) // reset the delta_Q_add + ; + + // Find the best move + Nodes = SELECT s FROM Nodes:s -({relation_edge_name}:e)- :t + //SAMPLE sample_edge_num EDGE WHEN s.outdegree("{relation_edge_name}") > sample_edge_num + WHERE s.@{community_id_attribute_name} != t.@{community_id_attribute_name} + ACCUM DOUBLE delta_Q_add = 2 * s.@community_k_in_map.get(t.@{community_id_attribute_name}) - s.@k * t.@community_sum_total / @@m, + s.@best_move += MyTuple(delta_Q_add, t.@{community_id_attribute_name}, t.@community_vid) + POST-ACCUM + IF s.@delta_Q_remove + s.@best_move.delta_Q_add > 0 THEN // the gain (delta Q) is positive + s.@to_change_community = TRUE + END, + s.@community_k_in_map.clear() + HAVING s.@to_change_community == TRUE; + + ChangedNodes = ChangedNodes UNION Nodes; + END; + END; + // If two nodes swap, only change the community of one of them + SwapNodes = SELECT s FROM ChangedNodes:s -({relation_edge_name}:e)- :t + WHERE s.@best_move.community == t.@{community_id_attribute_name} + AND t.@to_change_community == TRUE + AND t.@best_move.community == s.@{community_id_attribute_name} + // only change the one with larger delta Q or the one with smaller @vid if delta Q are the same + AND ( + s.@delta_Q_remove + s.@best_move.delta_Q_add < t.@delta_Q_remove + t.@best_move.delta_Q_add + OR ( + abs((s.@delta_Q_remove + s.@best_move.delta_Q_add) - (t.@delta_Q_remove + t.@best_move.delta_Q_add)) < 0.00000000001 + AND s.@vid > t.@vid + ) + ) + POST-ACCUM + 
s.@to_change_community = FALSE; + + ChangedNodes = ChangedNodes MINUS SwapNodes; + + // Place each node of ChangedNodes in the community in which the gain is maximum + ChangedNodes = SELECT s FROM ChangedNodes:s + POST-ACCUM + s.@{community_id_attribute_name} = s.@best_move.community, + s.@community_vid = s.@best_move.ext_vid, + s.@to_change_community = FALSE; + + @@move_cnt += ChangedNodes.size(); + + // Get all neighbours of the changed node that do not belong to the node’s new community + Candidates = SELECT t FROM ChangedNodes:s -({relation_edge_name}:e)- :t + WHERE t.@{community_id_attribute_name} != s.@{community_id_attribute_name}; + END; + + PRINT @@move_cnt AS Delta; + + // Coarsening + UINT new_layer = 0; + @@community_sum_total_map.clear(); + Tmp = + SELECT s FROM All_Nodes:s -({relation_edge_name}:e)- :t + ACCUM + IF s.@{community_id_attribute_name} == t.@{community_id_attribute_name} THEN + @@community_sum_in_map += (s.@{community_id_attribute_name} -> wt) + END + POST-ACCUM + //f_belongs_to.println(s.id, s.@{community_id_attribute_name}, new_layer), + INSERT INTO {belongs_to_edge_name} VALUES (s, str_to_int(s.@community_vid), new_layer), + IF @@community_sum_in_map.containsKey(s) THEN + //f_links_to.println(s.id, s.id, @@community_sum_in_map.get(s), new_layer) + INSERT INTO {links_to_edge_name} VALUES (s,s, (new_layer -> @@community_sum_in_map.get(s))) + END; + + @@community_sum_in_map.clear(); + + Tmp = SELECT s FROM All_Nodes:s -({relation_edge_name}:e)- :t + ACCUM + IF s.@{community_id_attribute_name} != t.@{community_id_attribute_name} THEN + @@source_target_k_in_map += (s.@{community_id_attribute_name} -> (t.@{community_id_attribute_name} -> wt)) + END + POST-ACCUM + IF @@source_target_k_in_map.containsKey(s) THEN + FOREACH (target_community, k_in) IN @@source_target_k_in_map.get(s) DO + //f_links_to.println(s.id, target_community, k_in, new_layer) + INSERT INTO {links_to_edge_name} VALUES (s,target_community, (new_layer -> k_in)) + END + END; + + @@source_target_k_in_map.clear(); +} diff --git a/eventual-consistency-service/app/ecc_util.py b/eventual-consistency-service/app/ecc_util.py new file mode 100644 index 00000000..5656e219 --- /dev/null +++ b/eventual-consistency-service/app/ecc_util.py @@ -0,0 +1,24 @@ +from common.chunkers import character_chunker, regex_chunker, semantic_chunker +from common.config import doc_processing_config, embedding_service + + +def get_chunker(): + if doc_processing_config.get("chunker") == "semantic": + chunker = semantic_chunker.SemanticChunker( + embedding_service, + doc_processing_config["chunker_config"].get("method", "percentile"), + doc_processing_config["chunker_config"].get("threshold", 0.95), + ) + elif doc_processing_config.get("chunker") == "regex": + chunker = regex_chunker.RegexChunker( + pattern=doc_processing_config["chunker_config"].get("pattern", "\\r?\\n") + ) + elif doc_processing_config.get("chunker") == "character": + chunker = character_chunker.CharacterChunker( + chunk_size=doc_processing_config["chunker_config"].get("chunk_size", 1024), + overlap_size=doc_processing_config["chunker_config"].get("overlap_size", 0), + ) + else: + raise ValueError("Invalid chunker type") + + return chunker From fce72c43c73aa425d859b8120bf5ccb94e6c995f Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Wed, 24 Jul 2024 15:24:47 -0400 Subject: [PATCH 04/53] changing queues for channels --- .../app/graphrag/graph_rag.py | 154 +++++++++++------- .../app/graphrag/util.py | 99 
++++++++--- .../app/graphrag/worker.py | 11 +- eventual-consistency-service/requirements.txt | 1 + 4 files changed, 173 insertions(+), 92 deletions(-) diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py index 1477d9e0..0b5265b1 100644 --- a/eventual-consistency-service/app/graphrag/graph_rag.py +++ b/eventual-consistency-service/app/graphrag/graph_rag.py @@ -1,9 +1,10 @@ import asyncio import logging +import time import ecc_util -from graphrag.util import install_query, stream_docs, upsert_chunk -from graphrag.worker import worker +from aiochannel import Channel +from graphrag.util import chunk_doc, install_query, stream_docs from pyTigerGraph import TigerGraphConnection from common.chunkers.base_chunker import BaseChunker @@ -25,52 +26,25 @@ async def install_queries( requried_queries: list[str], conn: TigerGraphConnection, n_workers=8 ): - loop = asyncio.get_event_loop() - tasks: list[asyncio.Task] = [] - # queries that are currently installed installed_queries = [q.split("/")[-1] for q in conn.getEndpoints(dynamic=True)] - # add queries to be installed into the queue - tq = asyncio.Queue() - for q in requried_queries: - q_name = q.split("/")[-1] - if q_name not in installed_queries: - tq.put_nowait((install_query, (conn, q))) - - # start workers - for n in range(min(tq.qsize(), n_workers)): - task = loop.create_task(worker(n, tq)) - tasks.append(task) + tasks = [] + async with asyncio.TaskGroup() as grp: + for q in requried_queries: + async with asyncio.Semaphore(n_workers): + q_name = q.split("/")[-1] + # if the query is not installed, install it + if q_name not in installed_queries: + task = grp.create_task(install_query(conn, q)) + tasks.append(task) - # wait for workers to finish jobs - await tq.join() for t in tasks: print(t.result()) # TODO: Check if anything had an error return "", "", "" -async def process_doc( - conn: TigerGraphConnection, doc: dict[str, str], sem: asyncio.Semaphore -): - # TODO: Embed document and chunks - chunker = ecc_util.get_chunker() - try: - print(">>>>>", doc["v_id"], len(doc["attributes"]["text"])) - # await asyncio.sleep(5) - chunks = chunker.chunk(doc["attributes"]["text"]) - v_id = doc["v_id"] - # TODO: n chunks at a time - for i, chunk in enumerate(chunks): - await upsert_chunk(conn, v_id, f"{v_id}_chunk_{i}", chunk) - # break # single chunk FIXME: delete - finally: - sem.release() - - return doc["v_id"] - - async def init( graphname: str, conn: TigerGraphConnection ) -> tuple[BaseChunker, dict[str, MilvusEmbeddingStore], BaseExtractor]: @@ -124,6 +98,62 @@ async def init( return chunker, vector_indices, extractor +async def process_docs( + conn: TigerGraphConnection, + docs_chan: Channel, + embed_q: Channel, + chunk_q: Channel, +): + doc_tasks = [] + async with asyncio.TaskGroup() as grp: + async for content in stream_docs(conn): + # only n workers at a time -- held up by semaphore size + async with asyncio.Semaphore(doc_workers): + task = grp.create_task(chunk_doc(conn, content, chunk_q, embed_q)) + doc_tasks.append(task) + break # single doc FIXME: delete + + # do something with doc_tasks? 
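            # Editor's note: `async with asyncio.Semaphore(doc_workers)` creates a
            # fresh semaphore on every loop iteration, so it never actually limits
            # how many chunk_doc tasks are in flight (install_queries above has the
            # same pattern). A minimal sketch of the usual fix -- one shared
            # semaphore, released when each task completes (doc_workers is assumed
            # to be defined in this scope):
            #
            #   sem = asyncio.Semaphore(doc_workers)
            #   async with asyncio.TaskGroup() as grp:
            #       async for content in stream_docs(conn):
            #           await sem.acquire()
            #           t = grp.create_task(chunk_doc(conn, content, chunk_q, embed_q))
            #           t.add_done_callback(lambda _: sem.release())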
+ for t in doc_tasks: + print(t.result()) + + +async def embed(embed_q: Channel): + pass + + +async def upsert(upsert_q: Channel): + """ + queue expects: + (func, args) <- q.get() + """ + while upsert_q.empty(): + await asyncio.sleep(1) + + # consume task queue + print("upsert started") + responses = [] + while not upsert_q.empty(): + # get the next task + func, args = await upsert_q.get() + + # execute the task + response = await func(*args) + + # append task results to worker results/response + responses.append(response) + + # mark task as done + upsert_q.task_done() + + print(f"upsert done") + return responses + + +async def extract(extract_q: Channel): + pass + + async def run(graphname: str, conn: TigerGraphConnection): """ ecc flow @@ -139,25 +169,33 @@ async def run(graphname: str, conn: TigerGraphConnection): """ # init configurable objects - chunker, vector_indices, extractor = await init(graphname, conn) - - # process docs - doc_workers = 48 # TODO: make configurable - doc_tasks = [] - doc_sem = asyncio.Semaphore(doc_workers) - - async with asyncio.TaskGroup() as tg: - async for content in stream_docs(conn): - # only n workers at a time -- held up by semaphore - print(">>>>>>>>>>>>>>>>>>>>>>>>\n", len(doc_tasks), "<<<<<<<<<") - await doc_sem.acquire() - task = tg.create_task(process_doc(conn, content, doc_sem)) - doc_tasks.append(task) - break - - # do something with doc_tasks - for t in doc_tasks: - print(t.result()) + await init(graphname, conn) + # return + start = time.perf_counter() + + # TODO: make configurable + tasks = [] + docs_chan = Channel(48) # process n chunks at a time max + chunk_chan = Channel(100) # process 100 chunks at a time max + embed_chan = Channel(100) + upsert_chan = Channel(100) + async with asyncio.TaskGroup() as grp: + # get docs + t = grp.create_task(stream_docs(conn, docs_chan,10)) + tasks.append(t) + # process docs + t = grp.create_task(process_docs(conn, docs_chan, embed_chan, chunk_chan)) + tasks.append(t) + # embed + t = grp.create_task(embed(conn, doc_workers, embed_chan, chunk_chan)) + tasks.append(t) + # upsert chunks + t = grp.create_task(upsert(conn, doc_workers, embed_chan, chunk_chan)) + tasks.append(t) + # extract entities + t = grp.create_task(extract(conn, doc_workers, embed_chan, chunk_chan)) + tasks.append(t) + end = time.perf_counter() print("DONE") - return f"hi from graph rag ecc: {conn.graphname} ({graphname})" + print(end - start) diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index ce2efe52..c18ec86a 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -5,6 +5,8 @@ from urllib.parse import quote_plus import httpx +from aiochannel import Channel +from app import ecc_util from pyTigerGraph import TigerGraphConnection from common.logs.logwriter import LogWriter @@ -80,7 +82,11 @@ async def stream_doc_ids( return {"error": True, "message": str(e)} -async def stream_docs(conn: TigerGraphConnection, ttl_batches: int = 10): +async def stream_docs( + conn: TigerGraphConnection, + docs_chan: Channel, + ttl_batches: int = 10, +): headers = make_headers(conn) for i in range(ttl_batches): doc_ids = await stream_doc_ids(conn, i, ttl_batches) @@ -88,9 +94,6 @@ async def stream_docs(conn: TigerGraphConnection, ttl_batches: int = 10): print(doc_ids) break # TODO: handle error - print("*******") - print(doc_ids) - print("*******") for d in doc_ids["ids"]: async with httpx.AsyncClient(timeout=None) as client: 
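            # Editor's note: stream_docs is the producer end of docs_chan. It pulls
            # ids in ttl_batches batches via StreamDocIds, then issues one
            # StreamDocContent request per document id and puts each result on the
            # channel; Channel.put blocks whenever the channel is already full.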
res = await client.get( @@ -98,13 +101,35 @@ async def stream_docs(conn: TigerGraphConnection, ttl_batches: int = 10): params={"doc": d}, headers=headers, ) - # TODO: check for errors - yield res.json()["results"][0]["DocContent"][0] - return # single doc test FIXME: delete + # this will block and wait if the channel is full + await docs_chan.put(res.json()["results"][0]["DocContent"][0]) + # return # single doc test FIXME: delete # return # single batch test FIXME: delete +async def chunk_doc( + conn: TigerGraphConnection, + doc: dict[str, str], + chunk_chan: Channel, + embed_chan: Channel, +): + # TODO: Embed document and chunks + chunker = ecc_util.get_chunker() + chunks = chunker.chunk(doc["attributes"]["text"]) + v_id = doc["v_id"] + # TODO: n chunks at a time + for i, chunk in enumerate(chunks): + # send chunks to be upserted (func, args) + await chunk_chan.put((upsert_chunk, (conn, v_id, f"{v_id}_chunk_{i}", chunk))) + + # send chunks to be embedded + + # break # single chunk FIXME: delete + + return doc["v_id"] + + def map_attrs(attributes: dict): # map attrs attrs = {} @@ -124,7 +149,7 @@ async def upsert_vertex( conn: TigerGraphConnection, vertex_type: str, vertex_id: str, - attributes: dict = None, + attributes: dict, ): attrs = map_attrs(attributes) data = json.dumps({"vertices": {vertex_type: {vertex_id: attrs}}}) @@ -133,23 +158,44 @@ async def upsert_vertex( res = await client.post( f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers ) - print(res) + print(res.json()) + async def upsert_edge( conn: TigerGraphConnection, - vertex_type: str, - vertex_id: str, + src_v_type: str, + src_v_id: str, + edge_type: str, + tgt_v_type: str, + tgt_v_id: str, attributes: dict = None, ): - TODO - attrs = map_attrs(attributes) - data = json.dumps({"vertices": {vertex_type: {vertex_id: attrs}}}) + if attributes is None: + attrs = {} + else: + attrs = map_attrs(attributes) + data = json.dumps( + { + "edges": { + src_v_type: { + src_v_id: { + edge_type: { + tgt_v_type: { + tgt_v_id: attrs, + } + } + }, + } + } + } + ) headers = make_headers(conn) async with httpx.AsyncClient(timeout=None) as client: res = await client.post( f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers ) - print(res) + print(res.json()) + async def upsert_chunk(conn: TigerGraphConnection, doc_id, chunk_id, chunk): date_added = int(time.time()) @@ -165,13 +211,16 @@ async def upsert_chunk(conn: TigerGraphConnection, doc_id, chunk_id, chunk): chunk_id, attributes={"text": chunk, "epoch_added": date_added}, ) - conn.upsertEdge("DocumentChunk", chunk_id, "HAS_CONTENT", "Content", chunk_id) - # self.conn.upsertEdge("Document", doc_id, "HAS_CHILD", "DocumentChunk", chunk_id) - # if int(chunk_id.split("_")[-1]) > 0: - # self.conn.upsertEdge( - # "DocumentChunk", - # chunk_id, - # "IS_AFTER", - # "DocumentChunk", - # doc_id + "_chunk_" + str(int(chunk_id.split("_")[-1]) - 1), - # ) + await upsert_edge( + conn, "DocumentChunk", chunk_id, "HAS_CONTENT", "Content", chunk_id + ) + await upsert_edge(conn, "Document", doc_id, "HAS_CHILD", "DocumentChunk", chunk_id) + if int(chunk_id.split("_")[-1]) > 0: + await upsert_edge( + conn, + "DocumentChunk", + chunk_id, + "IS_AFTER", + "DocumentChunk", + doc_id + "_chunk_" + str(int(chunk_id.split("_")[-1]) - 1), + ) diff --git a/eventual-consistency-service/app/graphrag/worker.py b/eventual-consistency-service/app/graphrag/worker.py index a2c7bbb6..40720deb 100644 --- a/eventual-consistency-service/app/graphrag/worker.py +++ 
b/eventual-consistency-service/app/graphrag/worker.py @@ -1,18 +1,11 @@ import asyncio - -# class Channel(asyncio.Queue): -# def __init__(self, maxsize=0): -# self.is_open = True -# super().__init__(maxsize) -# -# def close(self): -# self.is_open = False +from aiochannel import Channel async def worker( n: int, - task_queue: asyncio.Queue, + task_queue: Channel, ): # init worker logging/reporting (TODO) worker_name = f"worker-{n+1}" diff --git a/eventual-consistency-service/requirements.txt b/eventual-consistency-service/requirements.txt index 90cc7f2c..3bc0dae0 100644 --- a/eventual-consistency-service/requirements.txt +++ b/eventual-consistency-service/requirements.txt @@ -1,3 +1,4 @@ +aiochannel==1.2.1 aiohttp==3.9.3 aiosignal==1.3.1 annotated-types==0.5.0 From 46d73dc039ef005c4680c525c2e417225f1d2951 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Wed, 24 Jul 2024 16:45:18 -0400 Subject: [PATCH 05/53] graphrag etl with channels --- copilot/docs/notebooks/graphrag.ipynb | 154 +++++++++++------- .../app/graphrag/graph_rag.py | 125 ++++++++++---- .../app/graphrag/util.py | 35 +++- 3 files changed, 207 insertions(+), 107 deletions(-) diff --git a/copilot/docs/notebooks/graphrag.ipynb b/copilot/docs/notebooks/graphrag.ipynb index 3b1200af..57ea4b48 100644 --- a/copilot/docs/notebooks/graphrag.ipynb +++ b/copilot/docs/notebooks/graphrag.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 8, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -16,51 +16,70 @@ "username = os.getenv(\"USERNAME\", \"tigergraph\")\n", "password = os.getenv(\"PASS\", \"tigergraph\")\n", "conn = TigerGraphConnection(\n", - " host=host, username=username, password=password, graphname=\"GraphRAG_pytgdocs\"\n", - ")\n", - "\n", - "conn.getToken()\n", - "\n", - "# And then add CoPilot's address to the connection. This address\n", - "# is the host's address where the CoPilot container is running.\n", - "conn.ai.configureCoPilotHost(\"http://localhost:8000\")" + " host=host,\n", + " username=username,\n", + " password=password,\n", + ")" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'The graph GraphRAG_pytgdocs is created.'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "conn.graphname = \"GraphRAG_pytgdocs\"\n", - "# conn.gsql(\"\"\"CREATE GRAPH pyTigerGraphRAG()\"\"\")" + "conn.gsql(\"\"\"CREATE GRAPH GraphRAG_pytgdocs()\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "_ = conn.getToken()" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'host_name': 'https://algotesting.i.tgcloud.io',\n", - " 'schema_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_schema].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. 
To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_schema\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 0)\\\\nTrying to add local vertex \\'DocumentChunk\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Document\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Concept\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Entity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Relationship\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'DocumentCollection\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Content\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Community\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'ResolvedEntity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CONTENT\\' and its reverse edge \\'reverse_HAS_CONTENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_CHILD_OF\\' and its reverse edge \\'reverse_IS_CHILD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_HEAD_OF\\' and its reverse edge \\'reverse_IS_HEAD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_TAIL\\' and its reverse edge \\'reverse_HAS_TAIL\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_RELATIONSHIP\\' and its reverse edge \\'reverse_DESCRIBES_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_ENTITY\\' and its reverse edge \\'reverse_DESCRIBES_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_ENTITY\\' and its reverse edge \\'reverse_CONTAINS_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'MENTIONS_RELATIONSHIP\\' and its reverse edge \\'reverse_MENTIONS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_AFTER\\' and its reverse edge \\'reverse_IS_AFTER\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CHILD\\' and its reverse edge \\'reverse_HAS_CHILD\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_RELATIONSHIP\\' and its reverse edge \\'reverse_HAS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_DOCUMENT\\' and its reverse edge \\'reverse_CONTAINS_DOCUMENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'KNN\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVES_TO\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVED_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IN_COMMUNITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 1\\\\nThe job add_supportai_schema completes in 2.829 seconds!\\\\nLocal schema change succeeded.\"',\n", - " 'index_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_indexes].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. 
To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_indexes\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 1)\\\\nTrying to add index \\'doc_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_indexepoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 2\\\\nThe job add_supportai_indexes completes in 2.002 seconds!\\\\nLocal schema change succeeded.\"'}" + " 'schema_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_schema].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. 
To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_schema\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 0)\\\\nTrying to add local vertex \\'DocumentChunk\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Document\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Concept\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Entity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Relationship\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'DocumentCollection\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Content\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Community\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'ResolvedEntity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CONTENT\\' and its reverse edge \\'reverse_HAS_CONTENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_CHILD_OF\\' and its reverse edge \\'reverse_IS_CHILD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_HEAD_OF\\' and its reverse edge \\'reverse_IS_HEAD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_TAIL\\' and its reverse edge \\'reverse_HAS_TAIL\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_RELATIONSHIP\\' and its reverse edge \\'reverse_DESCRIBES_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_ENTITY\\' and its reverse edge \\'reverse_DESCRIBES_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_ENTITY\\' and its reverse edge \\'reverse_CONTAINS_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'MENTIONS_RELATIONSHIP\\' and its reverse edge \\'reverse_MENTIONS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_AFTER\\' and its reverse edge \\'reverse_IS_AFTER\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CHILD\\' and its reverse edge \\'reverse_HAS_CHILD\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_RELATIONSHIP\\' and its reverse edge \\'reverse_HAS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_DOCUMENT\\' and its reverse edge \\'reverse_CONTAINS_DOCUMENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'KNN\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVES_TO\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVED_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IN_COMMUNITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 1\\\\nThe job add_supportai_schema completes in 2.434 seconds!\\\\nLocal schema change succeeded.\"',\n", + " 'index_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_indexes].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. 
To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_indexes\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 1)\\\\nTrying to add index \\'doc_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_indexepoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 2\\\\nThe job add_supportai_indexes completes in 1.932 seconds!\\\\nLocal schema change succeeded.\"'}" ] }, - "execution_count": 10, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "# And then add CoPilot's address to the connection. 
This address\n", + "# is the host's address where the CoPilot container is running.\n", + "conn.ai.configureCoPilotHost(\"http://localhost:8000\")\n", "conn.ai.initializeSupportAI()" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -76,18 +95,18 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'job_name': 'load_documents_content_json_75b43aab4f714888b2be3f30441e745a',\n", - " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_75b43aab4f714888b2be3f30441e745a.stream.SupportAI_GraphRAG_pytgdocs_f0e175af264a4a18b1aa3bf8f4063d0e.1721674044503',\n", - " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_75b43aab4f714888b2be3f30441e745a.stream.SupportAI_GraphRAG_pytgdocs_f0e175af264a4a18b1aa3bf8f4063d0e.1721674044503'}" + "{'job_name': 'load_documents_content_json_203b064024e3499ea41b876cc67a85cf',\n", + " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_203b064024e3499ea41b876cc67a85cf.stream.SupportAI_GraphRAG_pytgdocs_5b098715edbd4c878f7425918eb553c0.1721853566538',\n", + " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_203b064024e3499ea41b876cc67a85cf.stream.SupportAI_GraphRAG_pytgdocs_5b098715edbd4c878f7425918eb553c0.1721853566538'}" ] }, - "execution_count": 12, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -102,42 +121,67 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "import httpx\n", - "import base64\n", - "\n", - "# conn.ai.forceConsistencyUpdate()\n", - "# url = self.nlqs_host+\"/\"+self.conn.graphname+\"/supportai/forceupdate\"\n", - "# return self.conn._req(\"GET\", url, authMode=\"pwd\", resKey=None)\n", - "httpx.get(f\"http://localhost:8000/{conn.graphname}/supportai/forceupdate\")" + "asdf" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 16, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "content='Hello! How can I assist you today?' 
response_metadata={'token_usage': {'completion_tokens': 9, 'prompt_tokens': 8, 'total_tokens': 17}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None} id='run-2a50fab6-62fc-433c-98b4-221346ca41c6-0' usage_metadata={'input_tokens': 8, 'output_tokens': 9, 'total_tokens': 17}\n" - ] - }, { "data": { "text/plain": [ - "Joke(setup='Why was the cat sitting on the computer?', punchline='To keep an eye on the mouse!')" + "{'job_name': 'load_documents_content_json_203b064024e3499ea41b876cc67a85cf',\n", + " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_203b064024e3499ea41b876cc67a85cf.stream.SupportAI_GraphRAG_pytgdocs_5b098715edbd4c878f7425918eb553c0.1721853623658',\n", + " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_203b064024e3499ea41b876cc67a85cf.stream.SupportAI_GraphRAG_pytgdocs_5b098715edbd4c878f7425918eb553c0.1721853623658'}" ] }, - "execution_count": 13, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], + "source": [ + "for v in [\"Document\", \"Content\", \"DocumentChunk\"]:\n", + " try:\n", + " conn.delVertices(v)\n", + " except:\n", + " pass\n", + "\n", + "import time\n", + "time.sleep(3)\n", + "conn.ai.runDocumentIngest(\n", + " res[\"load_job_id\"],\n", + " res[\"data_source_id\"],\n", + " \"s3://tg-documentation/pytg_current/pytg_current.jsonl\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import httpx\n", + "import base64\n", + "\n", + "# conn.ai.forceConsistencyUpdate()\n", + "# url = self.nlqs_host+\"/\"+self.conn.graphname+\"/supportai/forceupdate\"\n", + "# return self.conn._req(\"GET\", url, authMode=\"pwd\", resKey=None)\n", + "httpx.get(f\"http://localhost:8000/{conn.graphname}/supportai/forceupdate\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from langchain_core.pydantic_v1 import BaseModel, Field\n", "from langchain_openai import ChatOpenAI\n", @@ -149,15 +193,14 @@ "\n", "\n", "model = ChatOpenAI(model=\"gpt-3.5-turbo-0125\", temperature=0)\n", - "print(model.invoke('hi'))\n", + "print(model.invoke(\"hi\"))\n", "structured_llm = model.with_structured_output(Joke)\n", - "structured_llm.invoke(\"Tell me a joke about cats\")\n", - "\n" + "structured_llm.invoke(\"Tell me a joke about cats\")" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -165,12 +208,14 @@ "from langchain_experimental.graph_transformers import LLMGraphTransformer\n", "from langchain_openai import ChatOpenAI\n", "import os\n", + "\n", "# from langchain_core.pydantic_v1 import BaseModel\n", "from pydantic import BaseModel\n", "\n", "\n", "class AnswerWithJustification(BaseModel):\n", " \"\"\"An answer to the user question along with justification for the answer.\"\"\"\n", + "\n", " answer: str\n", " justification: str\n", "\n", @@ -181,6 +226,7 @@ "# sllm = llm.with_structured_output(AnswerWithJustification)\n", "# print(sllm.invoke(\"What weighs more a pound of bricks or a pound of feathers\"))\n", "\n", + "\n", "class GraphExtractor:\n", " def __init__(self):\n", " self.transformer = LLMGraphTransformer(\n", @@ -197,25 +243,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "id='Marie Curie' type='Person' 
properties={'description': 'A Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity.'}\n", - "id='Pierre Curie' type='Person' properties={'description': 'Husband of Marie Curie and co-winner of her first Nobel Prize.'}\n", - "id='University Of Paris' type='Institution' properties={'description': 'The institution where Marie Curie became the first woman professor in 1906.'}\n", - "id='Nobel Prize' type='Award' properties={'description': 'An award won by Marie Curie, first woman to win it and first person to win it twice.'}\n", - "source=Node(id='Marie Curie', type='Person') target=Node(id='Pierre Curie', type='Person') type='HUSBAND' properties={'description': \"Marie Curie's husband and co-winner of her first Nobel Prize.\"}\n", - "source=Node(id='Marie Curie', type='Person') target=Node(id='Nobel Prize', type='Award') type='WINNER' properties={'description': 'First woman to win a Nobel Prize.'}\n", - "source=Node(id='Marie Curie', type='Person') target=Node(id='Nobel Prize', type='Award') type='WINNER' properties={'description': 'First person to win a Nobel Prize twice.'}\n", - "source=Node(id='Marie Curie', type='Person') target=Node(id='Nobel Prize', type='Award') type='WINNER' properties={'description': 'Only person to win a Nobel Prize in two scientific fields.'}\n", - "source=Node(id='Marie Curie', type='Person') target=Node(id='University Of Paris', type='Institution') type='PROFESSOR' properties={'description': 'First woman to become a professor at the University of Paris in 1906.'}\n" - ] - } - ], + "outputs": [], "source": [ "text = \"\"\"\n", "Marie Curie, born in 1867, was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity.\n", diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py index 0b5265b1..96a591bc 100644 --- a/eventual-consistency-service/app/graphrag/graph_rag.py +++ b/eventual-consistency-service/app/graphrag/graph_rag.py @@ -98,60 +98,111 @@ async def init( return chunker, vector_indices, extractor -async def process_docs( +async def chunk_docs( conn: TigerGraphConnection, docs_chan: Channel, - embed_q: Channel, - chunk_q: Channel, + embed_chan: Channel, + upsert_chan: Channel, + extract_chan: Channel, ): + """ + Creates and starts one worker for each document + in the docs channel. + """ doc_tasks = [] async with asyncio.TaskGroup() as grp: - async for content in stream_docs(conn): - # only n workers at a time -- held up by semaphore size - async with asyncio.Semaphore(doc_workers): - task = grp.create_task(chunk_doc(conn, content, chunk_q, embed_q)) - doc_tasks.append(task) - break # single doc FIXME: delete + async for content in docs_chan: + await embed_chan.put(content) # send the document to be embedded + task = grp.create_task( + chunk_doc(conn, content, upsert_chan, embed_chan, extract_chan) + ) + doc_tasks.append(task) + # break # single doc FIXME: delete # do something with doc_tasks? 
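# A minimal, self-contained sketch of the channel fan-out pattern this hunk
# introduces: a producer fills an aiochannel Channel, a consumer iterates it
# with `async for` and spawns one TaskGroup task per item, and the sole sender
# closes the channel so the consumer loop can terminate. Assumes Python 3.11+
# (asyncio.TaskGroup) and the aiochannel package; all names below are
# illustrative only, not part of this patch.
import asyncio

from aiochannel import Channel

async def produce(chan: Channel):
    for i in range(3):
        await chan.put(f"doc-{i}")  # blocks while the channel is full
    chan.close()  # only the sender closes the channel

async def handle(item: str):
    await asyncio.sleep(0)  # stand-in for chunk/embed/extract work
    print("handled", item)

async def consume(chan: Channel):
    async with asyncio.TaskGroup() as grp:
        async for item in chan:  # ends once the channel is closed and drained
            grp.create_task(handle(item))

async def main():
    chan = Channel(10)
    async with asyncio.TaskGroup() as grp:
        grp.create_task(produce(chan))
        grp.create_task(consume(chan))

# asyncio.run(main())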
for t in doc_tasks: print(t.result()) + # FIXME: don't close these there, other functions will send to them + upsert_chan.close() + embed_chan.close() -async def embed(embed_q: Channel): - pass + # close the extract chan -- chunk_doc is the only sender + # and chunk_doc calls are kicked off from here (this is technically the sender) + extract_chan.close() -async def upsert(upsert_q: Channel): +async def upsert(upsert_chan: Channel): """ + Creates and starts one worker for each upsert job queue expects: (func, args) <- q.get() """ - while upsert_q.empty(): - await asyncio.sleep(1) # consume task queue - print("upsert started") + upsert_tasks = [] + async with asyncio.TaskGroup() as grp: + async for func, args in upsert_chan: + # print("func name >>>>>", func.__name__, args) + # grp.create_task(todo()) + # continue + + # execute the task + t = grp.create_task(func(*args)) + upsert_tasks.append(t) + + print(f"upsert done") + # do something with doc_tasks? + for t in upsert_tasks: + print(t.result()) + + +async def embed(embed_chan: Channel): + """ + Creates and starts one worker for each embed job + """ + + # consume task queue responses = [] - while not upsert_q.empty(): - # get the next task - func, args = await upsert_q.get() + async with asyncio.TaskGroup() as grp: + async for item in embed_chan: + print("embed item>>>>>", type(item)) + grp.create_task(todo()) + continue + # execute the task + # response = await func(*args) - # execute the task - response = await func(*args) + # append task results to worker results/response + # responses.append(response) - # append task results to worker results/response - responses.append(response) + print(f"embed done") + return responses - # mark task as done - upsert_q.task_done() - print(f"upsert done") +async def extract(extract_chan: Channel): + """ + Creates and starts one worker for each extract job + """ + + # consume task queue + responses = [] + async with asyncio.TaskGroup() as grp: + async for item in extract_chan: + print("extract item>>>>>", type(item)) + grp.create_task(todo()) + continue + # execute the task + # response = await func(*args) + + # append task results to worker results/response + # responses.append(response) + + print(f"embed done") return responses -async def extract(extract_q: Channel): - pass +async def todo(): + await asyncio.sleep(1) async def run(graphname: str, conn: TigerGraphConnection): @@ -175,25 +226,27 @@ async def run(graphname: str, conn: TigerGraphConnection): # TODO: make configurable tasks = [] - docs_chan = Channel(48) # process n chunks at a time max - chunk_chan = Channel(100) # process 100 chunks at a time max + docs_chan = Channel(15) # process n chunks at a time max embed_chan = Channel(100) upsert_chan = Channel(100) + extract_chan = Channel(100) async with asyncio.TaskGroup() as grp: # get docs - t = grp.create_task(stream_docs(conn, docs_chan,10)) + t = grp.create_task(stream_docs(conn, docs_chan, 10)) tasks.append(t) # process docs - t = grp.create_task(process_docs(conn, docs_chan, embed_chan, chunk_chan)) - tasks.append(t) - # embed - t = grp.create_task(embed(conn, doc_workers, embed_chan, chunk_chan)) + t = grp.create_task( + chunk_docs(conn, docs_chan, embed_chan, upsert_chan, extract_chan) + ) tasks.append(t) # upsert chunks - t = grp.create_task(upsert(conn, doc_workers, embed_chan, chunk_chan)) + t = grp.create_task(upsert(upsert_chan)) + tasks.append(t) + # # embed + t = grp.create_task(embed(embed_chan)) tasks.append(t) # extract entities - t = grp.create_task(extract(conn, 
doc_workers, embed_chan, chunk_chan)) + t = grp.create_task(extract(extract_chan)) tasks.append(t) end = time.perf_counter() diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index c18ec86a..cfb84e5a 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -4,9 +4,9 @@ import traceback from urllib.parse import quote_plus +import ecc_util import httpx from aiochannel import Channel -from app import ecc_util from pyTigerGraph import TigerGraphConnection from common.logs.logwriter import LogWriter @@ -87,13 +87,18 @@ async def stream_docs( docs_chan: Channel, ttl_batches: int = 10, ): + """ + Streams the document contents into the docs_chan + """ headers = make_headers(conn) for i in range(ttl_batches): doc_ids = await stream_doc_ids(conn, i, ttl_batches) if doc_ids["error"]: - print(doc_ids) break # TODO: handle error + print("********") + print(doc_ids) + print("********") for d in doc_ids["ids"]: async with httpx.AsyncClient(timeout=None) as client: res = await client.get( @@ -104,26 +109,38 @@ async def stream_docs( # TODO: check for errors # this will block and wait if the channel is full await docs_chan.put(res.json()["results"][0]["DocContent"][0]) - # return # single doc test FIXME: delete - # return # single batch test FIXME: delete + # break # single doc test FIXME: delete + # break # single batch test FIXME: delete + + # close the docs chan -- this function is the only sender + docs_chan.close() async def chunk_doc( conn: TigerGraphConnection, doc: dict[str, str], - chunk_chan: Channel, + upsert_chan: Channel, embed_chan: Channel, + extract_chan: Channel, ): - # TODO: Embed document and chunks + """ + Chunks a document. 
+ Places the resulting chunks into the upsert channel (to be upserted to TG) + and the embed channel (to be embedded and written to the vector store) + """ chunker = ecc_util.get_chunker() chunks = chunker.chunk(doc["attributes"]["text"]) v_id = doc["v_id"] # TODO: n chunks at a time for i, chunk in enumerate(chunks): # send chunks to be upserted (func, args) - await chunk_chan.put((upsert_chunk, (conn, v_id, f"{v_id}_chunk_{i}", chunk))) + await upsert_chan.put((upsert_chunk, (conn, v_id, f"{v_id}_chunk_{i}", chunk))) # send chunks to be embedded + await embed_chan.put(chunk) + + # send chunks to have entities extracted + await extract_chan.put(chunk) # break # single chunk FIXME: delete @@ -158,7 +175,7 @@ async def upsert_vertex( res = await client.post( f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers ) - print(res.json()) + print("upsert vertex>>>", res.json()) async def upsert_edge( @@ -194,7 +211,7 @@ async def upsert_edge( res = await client.post( f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers ) - print(res.json()) + print("upsert edge >>>", res.json()) async def upsert_chunk(conn: TigerGraphConnection, doc_id, chunk_id, chunk): From 7501a37b400eff0334c11aa8adc264ded66d66ca Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Mon, 29 Jul 2024 16:31:24 -0400 Subject: [PATCH 06/53] pytg in 175 seconds --- common/config.py | 2 +- common/embeddings/embedding_services.py | 37 ++- common/embeddings/milvus_embedding_store.py | 87 +++++- common/extractors/BaseExtractor.py | 13 +- common/extractors/GraphExtractor.py | 50 ++++ .../LLMEntityRelationshipExtractor.py | 35 ++- common/gsql/graphRAG/StreamDocContent.gsql | 5 +- common/gsql/graphRAG/StreamDocIds.gsql | 7 +- common/gsql/supportai/SupportAI_Schema.gsql | 12 +- common/logs/logwriter.py | 2 +- common/py_schemas/tool_io_schemas.py | 2 +- .../app/graphrag/graph_rag.py | 252 ++++++++---------- .../app/graphrag/util.py | 240 ++++++++--------- .../app/graphrag/worker.py | 35 --- .../app/graphrag/workers.py | 226 ++++++++++++++++ eventual-consistency-service/app/main.py | 3 +- 16 files changed, 668 insertions(+), 340 deletions(-) delete mode 100644 eventual-consistency-service/app/graphrag/worker.py create mode 100644 eventual-consistency-service/app/graphrag/workers.py diff --git a/common/config.py b/common/config.py index 2546e38a..ec72455d 100644 --- a/common/config.py +++ b/common/config.py @@ -167,7 +167,7 @@ def get_llm_service(llm_config) -> LLM_Model: doc_processing_config = { "chunker": "semantic", "chunker_config": {"method": "percentile", "threshold": 0.95}, - "extractor": "llm", + "extractor": "graphrag", "extractor_config": {}, } elif DOC_PROCESSING_CONFIG.endswith(".json"): diff --git a/common/embeddings/embedding_services.py b/common/embeddings/embedding_services.py index c76bf46d..dd506670 100644 --- a/common/embeddings/embedding_services.py +++ b/common/embeddings/embedding_services.py @@ -1,11 +1,13 @@ +import logging import os +import time from typing import List + from langchain.schema.embeddings import Embeddings -import logging -import time + from common.logs.log import req_id_cv -from common.metrics.prometheus_metrics import metrics from common.logs.logwriter import LogWriter +from common.metrics.prometheus_metrics import metrics logger = logging.getLogger(__name__) @@ -87,6 +89,33 @@ def embed_query(self, question: str) -> List[float]: duration ) + async def aembed_query(self, question: str) -> List[float]: + """Embed 
Query Async. + Embed a string. + + Args: + question (str): + A string to embed. + """ + # start_time = time.time() + # metrics.llm_inprogress_requests.labels(self.model_name).inc() + + # try: + logger.debug_pii(f"aembed_query() embedding question={question}") + query_embedding = await self.embeddings.aembed_query(question) + # metrics.llm_success_response_total.labels(self.model_name).inc() + return query_embedding + # except Exception as e: + # # metrics.llm_query_error_total.labels(self.model_name).inc() + # raise e + # finally: + # metrics.llm_request_total.labels(self.model_name).inc() + # metrics.llm_inprogress_requests.labels(self.model_name).dec() + # duration = time.time() - start_time + # metrics.llm_request_duration_seconds.labels(self.model_name).observe( + # duration + # ) + class AzureOpenAI_Ada002(EmbeddingModel): """Azure OpenAI Ada-002 Embedding Model""" @@ -124,8 +153,8 @@ class AWS_Bedrock_Embedding(EmbeddingModel): """AWS Bedrock Embedding Model""" def __init__(self, config): - from langchain_community.embeddings import BedrockEmbeddings import boto3 + from langchain_community.embeddings import BedrockEmbeddings super().__init__(config=config, model_name=config["embedding_model"]) diff --git a/common/embeddings/milvus_embedding_store.py b/common/embeddings/milvus_embedding_store.py index 8a52d05f..ac9c5389 100644 --- a/common/embeddings/milvus_embedding_store.py +++ b/common/embeddings/milvus_embedding_store.py @@ -1,18 +1,17 @@ import logging +import traceback from time import sleep, time from typing import Iterable, List, Optional, Tuple from langchain_community.vectorstores import Milvus from langchain_core.documents.base import Document -from pymilvus import connections, utility -from pymilvus.exceptions import MilvusException +from pymilvus import MilvusException, connections, utility from common.embeddings.base_embedding_store import EmbeddingStore from common.embeddings.embedding_services import EmbeddingModel from common.logs.log import req_id_cv -from common.metrics.prometheus_metrics import metrics from common.logs.logwriter import LogWriter -from pymilvus import MilvusException +from common.metrics.prometheus_metrics import metrics logger = logging.getLogger(__name__) @@ -77,7 +76,7 @@ def connect_to_milvus(self): while retry_attempt < self.max_retry_attempts: try: connections.connect(**self.milvus_connection) - metrics.milvus_active_connections.labels(self.collection_name).inc + # metrics.milvus_active_connections.labels(self.collection_name).inc LogWriter.info( f"""Initializing Milvus with host={self.milvus_connection.get("host", self.milvus_connection.get("uri", "unknown host"))}, port={self.milvus_connection.get('port', 'unknown')}, username={self.milvus_connection.get('user', 'unknown')}, collection={self.collection_name}""" @@ -213,6 +212,76 @@ def add_embeddings( error_message = f"An error occurred while registering document: {str(e)}" LogWriter.error(error_message) + async def aadd_embeddings( + self, + embeddings: Iterable[Tuple[str, List[float]]], + metadatas: List[dict] = None, + ): + """Async Add Embeddings. + Add embeddings to the Embedding store. + Args: + embeddings (Iterable[Tuple[str, List[float]]]): + Iterable of content and embedding of the document. + metadatas (List[Dict]): + List of dictionaries containing the metadata for each document. + The embeddings and metadatas list need to have identical indexing. 
+ """ + try: + if metadatas is None: + metadatas = [] + + # add fields required by Milvus if they do not exist + if self.support_ai_instance: + for metadata in metadatas: + if self.vertex_field not in metadata: + metadata[self.vertex_field] = "" + else: + for metadata in metadatas: + if "seq_num" not in metadata: + metadata["seq_num"] = 1 + if "source" not in metadata: + metadata["source"] = "" + + LogWriter.info( + f"request_id={req_id_cv.get()} Milvus ENTRY aadd_embeddings()" + ) + texts = [text for text, _ in embeddings] + + # operation_type = "add_texts" + # metrics.milvus_query_total.labels( + # self.collection_name, operation_type + # ).inc() + # start_time = time() + + added = await self.milvus.aadd_texts(texts=texts, metadatas=metadatas) + + # duration = time() - start_time + # metrics.milvus_query_duration_seconds.labels( + # self.collection_name, operation_type + # ).observe(duration) + + LogWriter.info( + f"request_id={req_id_cv.get()} Milvus EXIT aadd_embeddings()" + ) + + # Check if registration was successful + if added: + success_message = f"Document registered with id: {added[0]}" + LogWriter.info(success_message) + return success_message + else: + error_message = f"Failed to register document {added}" + LogWriter.error(error_message) + raise Exception(error_message) + + except Exception as e: + error_message = f"An error occurred while registering document:{metadatas} ({len(texts)},{len(metadatas)})\nErr: {str(e)}" + LogWriter.error(error_message) + exc = traceback.format_exc() + LogWriter.error(exc) + LogWriter.error(f"{texts}") + raise e + def get_pks( self, expr: str, @@ -506,11 +575,11 @@ def query(self, expr: str, output_fields: List[str]): return None try: - query_result = self.milvus.col.query( - expr=expr, output_fields=output_fields - ) + query_result = self.milvus.col.query(expr=expr, output_fields=output_fields) except MilvusException as exc: - LogWriter.error(f"Failed to get outputs: {self.milvus.collection_name} error: {exc}") + LogWriter.error( + f"Failed to get outputs: {self.milvus.collection_name} error: {exc}" + ) raise exc return query_result diff --git a/common/extractors/BaseExtractor.py b/common/extractors/BaseExtractor.py index 3f1ec92b..e8638665 100644 --- a/common/extractors/BaseExtractor.py +++ b/common/extractors/BaseExtractor.py @@ -1,6 +1,13 @@ -class BaseExtractor: - def __init__(): +from abc import ABC, abstractmethod + +from langchain_community.graphs.graph_document import GraphDocument + + +class BaseExtractor(ABC): + @abstractmethod + def extract(self, text:str): pass - def extract(self, text): + @abstractmethod + async def aextract(self, text:str) -> list[GraphDocument]: pass diff --git a/common/extractors/GraphExtractor.py b/common/extractors/GraphExtractor.py index c8f24355..282729a4 100644 --- a/common/extractors/GraphExtractor.py +++ b/common/extractors/GraphExtractor.py @@ -16,6 +16,56 @@ def __init__(self): ) def extract(self, text) -> list[GraphDocument]: + """ + returns a list of GraphDocument: + Each doc is: + nodes=[ + Node( + id='Marie Curie', + type='Person', + properties={ + 'description': 'A Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity.' + } + ), + ... + ], + relationships=[ + Relationship( + source=Node(id='Marie Curie', type='Person'), + target=Node(id='Pierre Curie', type='Person'), + type='SPOUSE' + ), + ... 
+ ] + """ doc = Document(page_content=text) graph_docs = self.transformer.convert_to_graph_documents([doc]) + translated_docs = self.translate(graph_docs) + return translated_docs + + async def aextract(self, text:str) -> list[GraphDocument]: + """ + returns a list of GraphDocument: + Each doc is: + nodes=[ + Node( + id='Marie Curie', + type='Person', + properties={ + 'description': 'A Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity.' + } + ), + ... + ], + relationships=[ + Relationship( + source=Node(id='Marie Curie', type='Person'), + target=Node(id='Pierre Curie', type='Person'), + type='SPOUSE' + ), + ... + ] + """ + doc = Document(page_content=text) + graph_docs = await self.transformer.aconvert_to_graph_documents([doc]) return graph_docs diff --git a/common/extractors/LLMEntityRelationshipExtractor.py b/common/extractors/LLMEntityRelationshipExtractor.py index d5a0a970..415c3235 100644 --- a/common/extractors/LLMEntityRelationshipExtractor.py +++ b/common/extractors/LLMEntityRelationshipExtractor.py @@ -1,8 +1,9 @@ -from common.llm_services import LLM_Model +import json +from typing import List + from common.extractors.BaseExtractor import BaseExtractor +from common.llm_services import LLM_Model from common.py_schemas import KnowledgeGraph -from typing import List -import json class LLMEntityRelationshipExtractor(BaseExtractor): @@ -19,6 +20,34 @@ def __init__( self.strict_mode = strict_mode def _extract_kg_from_doc(self, doc, chain, parser): + """ + returns: + { + "nodes": [ + { + "id": "str", + "type": "string", + "definition": "string" + } + ], + "rels": [ + { + "source":{ + "id": "str", + "type": "string", + "definition": "string" + } + "target":{ + "id": "str", + "type": "string", + "definition": "string" + } + "definition" + } + ] + } + """ + try: out = chain.invoke( {"input": doc, "format_instructions": parser.get_format_instructions()} diff --git a/common/gsql/graphRAG/StreamDocContent.gsql b/common/gsql/graphRAG/StreamDocContent.gsql index fb7338b7..87f12566 100644 --- a/common/gsql/graphRAG/StreamDocContent.gsql +++ b/common/gsql/graphRAG/StreamDocContent.gsql @@ -1,5 +1,6 @@ -CREATE QUERY StreamDocContent(Vertex doc) { +CREATE DISTRIBUTED QUERY StreamDocContent(Vertex doc) { Doc = {doc}; - DocContent = SELECT c FROM Doc:d -(HAS_CONTENT)-> Content:c; + DocContent = SELECT c FROM Doc:d -(HAS_CONTENT)-> Content:c + POST-ACCUM d.epoch_processed = datetime_to_epoch(now()); PRINT DocContent; } diff --git a/common/gsql/graphRAG/StreamDocIds.gsql b/common/gsql/graphRAG/StreamDocIds.gsql index fb373490..d5ec982e 100644 --- a/common/gsql/graphRAG/StreamDocIds.gsql +++ b/common/gsql/graphRAG/StreamDocIds.gsql @@ -1,10 +1,13 @@ -CREATE QUERY StreamDocIds(INT current_batch, INT ttl_batches) { +CREATE DISTRIBUTED QUERY StreamDocIds(INT current_batch, INT ttl_batches) { ListAccum @@doc_ids; Docs = {Document.*}; Docs = SELECT d FROM Docs:d WHERE vertex_to_int(d) % ttl_batches == current_batch - ACCUM @@doc_ids += d.id; + AND d.epoch_processed == 0 + AND d.epoch_processing == 0 + ACCUM @@doc_ids += d.id + POST-ACCUM d.epoch_processing = datetime_to_epoch(now()); PRINT @@doc_ids; } diff --git a/common/gsql/supportai/SupportAI_Schema.gsql b/common/gsql/supportai/SupportAI_Schema.gsql index 0998affe..0e3cf6c3 100644 --- a/common/gsql/supportai/SupportAI_Schema.gsql +++ b/common/gsql/supportai/SupportAI_Schema.gsql @@ -2,7 +2,7 @@ CREATE SCHEMA_CHANGE JOB add_supportai_schema { ADD VERTEX DocumentChunk(PRIMARY_ID id STRING, idx INT, 
epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX Document(PRIMARY_ID id STRING, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX Concept(PRIMARY_ID id STRING, description STRING, concept_type STRING, human_curated BOOL, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; - ADD VERTEX Entity(PRIMARY_ID id STRING, definition STRING, description STRING, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; + ADD VERTEX Entity(PRIMARY_ID id STRING, definition STRING, description STRING, entity_type STRING, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX Relationship(PRIMARY_ID id STRING, definition STRING, short_name STRING, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX DocumentCollection(PRIMARY_ID id STRING, epoch_added UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX Content(PRIMARY_ID id STRING, text STRING, epoch_added UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; @@ -21,12 +21,12 @@ CREATE SCHEMA_CHANGE JOB add_supportai_schema { // GraphRAG ADD VERTEX Community(PRIMARY_ID id STRING, description INT) WITH PRIMARY_ID_AS_ATTRIBUTE="true"; - ADD VERTEX ResolvedEntity(PRIMARY_ID id STRING, description STRING) WITH PRIMARY_ID_AS_ATTRIBUTE="true"; + ADD VERTEX ResolvedEntity(PRIMARY_ID id STRING, description STRING, entity_type STRING) WITH PRIMARY_ID_AS_ATTRIBUTE="true"; - ADD DIRECTED EDGE KNN(FROM Entity, TO Entity); // TODO: check where knn algo writes results - ADD DIRECTED EDGE RESOLVES_TO(FROM Entity, TO ResolvedEntity); // Connect ResolvedEntities with their children entities - ADD DIRECTED EDGE RESOLVED_RELATIONSHIP(FROM ResolvedEntity, TO ResolvedEntity, weight UINT); // store edges between entities after they're resolved - ADD DIRECTED EDGE IN_COMMUNITY(FROM ResolvedEntity, TO Community); + ADD DIRECTED EDGE RELATIONSHIP(FROM Entity, TO Entity, relation_type STRING) WITH REVERSE_EDGE="reverse_RELATIONSHIP"; // TODO: check where knn algo writes results + ADD DIRECTED EDGE RESOLVES_TO(FROM Entity, TO ResolvedEntity, relation_type STRING) WITH REVERSE_EDGE="reverse_RESOLVES_TO"; // Connect ResolvedEntities with their children entities + ADD DIRECTED EDGE RESOLVED_RELATIONSHIP(FROM ResolvedEntity, TO ResolvedEntity) WITH REVERSE_EDGE="reverse_RESOLVED_RELATIONSHIP"; // store edges between entities after they're resolved + ADD DIRECTED EDGE IN_COMMUNITY(FROM ResolvedEntity, TO Community) WITH REVERSE_EDGE="reverse_IN_COMMUNITY"; // TODO: louvain will be run on resolved entities, but stored in community then on communities until louvain runs out // Hierarchical communities (Louvain/Leiden) diff --git a/common/logs/logwriter.py b/common/logs/logwriter.py index ff13feed..f75be00c 100644 --- a/common/logs/logwriter.py +++ b/common/logs/logwriter.py @@ -142,7 +142,7 @@ def log(level, message, mask_pii=True, **kwargs): LogWriter.general_logger.info(message) @staticmethod - def info(message, mask_pii=True, **kwargs): + def info(message, mask_pii=False, **kwargs): LogWriter.log("info", message, mask_pii, 
**kwargs) @staticmethod diff --git a/common/py_schemas/tool_io_schemas.py b/common/py_schemas/tool_io_schemas.py index 1fe16de4..1ea6ed3e 100644 --- a/common/py_schemas/tool_io_schemas.py +++ b/common/py_schemas/tool_io_schemas.py @@ -91,4 +91,4 @@ class ReportSection(BaseModel): questions: List[ReportQuestion] = Field("List of questions and reasoning for the section") class ReportSections(BaseModel): - sections: List[ReportSection] = Field("List of sections for the report") \ No newline at end of file + sections: List[ReportSection] = Field("List of sections for the report") diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py index 96a591bc..e248510c 100644 --- a/eventual-consistency-service/app/graphrag/graph_rag.py +++ b/eventual-consistency-service/app/graphrag/graph_rag.py @@ -2,100 +2,59 @@ import logging import time -import ecc_util +import httpx from aiochannel import Channel -from graphrag.util import chunk_doc, install_query, stream_docs +from graphrag import workers +from graphrag.util import init, make_headers, stream_doc_ids,http_timeout from pyTigerGraph import TigerGraphConnection -from common.chunkers.base_chunker import BaseChunker -from common.config import ( - doc_processing_config, - embedding_service, - get_llm_service, - llm_config, - milvus_config, -) +from common.config import embedding_service from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore -from common.extractors import GraphExtractor, LLMEntityRelationshipExtractor from common.extractors.BaseExtractor import BaseExtractor +http_logs = logging.getLogger("httpx") +http_logs.setLevel(logging.WARNING) logger = logging.getLogger(__name__) + consistency_checkers = {} -async def install_queries( - requried_queries: list[str], conn: TigerGraphConnection, n_workers=8 +async def stream_docs( + conn: TigerGraphConnection, + docs_chan: Channel, + ttl_batches: int = 10, ): - # queries that are currently installed - installed_queries = [q.split("/")[-1] for q in conn.getEndpoints(dynamic=True)] - - tasks = [] - async with asyncio.TaskGroup() as grp: - for q in requried_queries: - async with asyncio.Semaphore(n_workers): - q_name = q.split("/")[-1] - # if the query is not installed, install it - if q_name not in installed_queries: - task = grp.create_task(install_query(conn, q)) - tasks.append(task) - - for t in tasks: - print(t.result()) - # TODO: Check if anything had an error - return "", "", "" - - -async def init( - graphname: str, conn: TigerGraphConnection -) -> tuple[BaseChunker, dict[str, MilvusEmbeddingStore], BaseExtractor]: - # install requried queries - requried_queries = [ - # "common/gsql/supportai/Scan_For_Updates", - # "common/gsql/supportai/Update_Vertices_Processing_Status", - # "common/gsql/supportai/ECC_Status", - # "common/gsql/supportai/Check_Nonexistent_Vertices", - "common/gsql/graphRAG/StreamDocIds", - "common/gsql/graphRAG/StreamDocContent", - ] - # await install_queries(requried_queries, conn) - return await install_queries(requried_queries, conn) - - # init processing tools - chunker = ecc_util.get_chunker() - - vector_indices = {} - vertex_field = milvus_config.get("vertex_field", "vertex_id") - index_names = milvus_config.get( - "indexes", - ["Document", "DocumentChunk", "Entity", "Relationship"], - ) - for index_name in index_names: - vector_indices[graphname + "_" + index_name] = MilvusEmbeddingStore( - embedding_service, - host=milvus_config["host"], - port=milvus_config["port"], - 
support_ai_instance=True, - collection_name=graphname + "_" + index_name, - username=milvus_config.get("username", ""), - password=milvus_config.get("password", ""), - vector_field=milvus_config.get("vector_field", "document_vector"), - text_field=milvus_config.get("text_field", "document_content"), - vertex_field=vertex_field, - ) - - if doc_processing_config.get("extractor") == "llm": - extractor = GraphExtractor() - elif doc_processing_config.get("extractor") == "llm": - extractor = LLMEntityRelationshipExtractor(get_llm_service(llm_config)) - else: - raise ValueError("Invalid extractor type") - - if vertex_field is None: - raise ValueError( - "vertex_field is not defined. Ensure Milvus is enabled in the configuration." - ) - - return chunker, vector_indices, extractor + """ + Streams the document contents into the docs_chan + """ + logger.info("streaming docs") + headers = make_headers(conn) + for i in range(ttl_batches): + doc_ids = await stream_doc_ids(conn, i, ttl_batches) + if doc_ids["error"]: + continue # TODO: handle error + + logger.info("********doc_ids") + logger.info(doc_ids) + logger.info("********") + for d in doc_ids["ids"]: + async with httpx.AsyncClient(timeout=http_timeout) as client: + res = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/StreamDocContent/", + params={"doc": d}, + headers=headers, + ) + # TODO: check for errors + # this will block and wait if the channel is full + logger.info("steam_docs writes to docs") + await docs_chan.put(res.json()["results"][0]["DocContent"][0]) + # break # single doc test FIXME: delete + # break # single batch test FIXME: delete + + logger.info("stream_docs done") + # close the docs chan -- this function is the only sender + logger.info("****** closing docs chan") + docs_chan.close() async def chunk_docs( @@ -109,100 +68,120 @@ async def chunk_docs( Creates and starts one worker for each document in the docs channel. """ + logger.info("Reading from docs channel") doc_tasks = [] async with asyncio.TaskGroup() as grp: async for content in docs_chan: - await embed_chan.put(content) # send the document to be embedded + logger.info("*********reading from docs chan") + # continue + v_id = content["v_id"] + txt = content["attributes"]["text"] + # send the document to be embedded + logger.info("chunk writes to extract") + await embed_chan.put((v_id, txt, "Document")) + task = grp.create_task( - chunk_doc(conn, content, upsert_chan, embed_chan, extract_chan) + workers.chunk_doc(conn, content, upsert_chan, embed_chan, extract_chan) ) doc_tasks.append(task) # break # single doc FIXME: delete + logger.info("*********done reading from docs chan") + logger.info("chunk_docs done") # do something with doc_tasks? 
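# A minimal sketch of the (func, args) dispatch convention used by the upsert
# channel: senders enqueue a coroutine function together with its positional
# arguments, and the upsert worker simply awaits whatever callable it receives.
# Assumes Python 3.11+ and the aiochannel package; the names below are
# hypothetical stand-ins, not part of this patch.
import asyncio

from aiochannel import Channel

async def upsert_vertex_stub(v_type: str, v_id: str):
    print("upserting", v_type, v_id)  # stand-in for the RESTPP upsert call

async def producer(chan: Channel):
    # enqueue the callable plus its args, mirroring upsert_chan.put((func, args))
    await chan.put((upsert_vertex_stub, ("DocumentChunk", "doc1_chunk_0")))
    chan.close()  # sole sender closes the channel

async def upsert_worker(chan: Channel):
    async with asyncio.TaskGroup() as grp:
        async for func, args in chan:  # unpack (func, args) and run it
            grp.create_task(func(*args))

async def main():
    chan = Channel(100)
    async with asyncio.TaskGroup() as grp:
        grp.create_task(producer(chan))
        grp.create_task(upsert_worker(chan))

# asyncio.run(main())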
- for t in doc_tasks: - print(t.result()) - - # FIXME: don't close these there, other functions will send to them - upsert_chan.close() - embed_chan.close() + # for t in doc_tasks: + # logger.info(t.result()) # close the extract chan -- chunk_doc is the only sender - # and chunk_doc calls are kicked off from here (this is technically the sender) + # and chunk_doc calls are kicked off from here + logger.info("********closing extract chan") extract_chan.close() async def upsert(upsert_chan: Channel): """ Creates and starts one worker for each upsert job - queue expects: + chan expects: (func, args) <- q.get() """ + logger.info("Reading from upsert channel") # consume task queue upsert_tasks = [] async with asyncio.TaskGroup() as grp: async for func, args in upsert_chan: - # print("func name >>>>>", func.__name__, args) - # grp.create_task(todo()) + logger.info("*********reading from upsert chan") + logger.info(f"{func.__name__}, {args[1]}") # continue - # execute the task t = grp.create_task(func(*args)) upsert_tasks.append(t) + logger.info("*********done reading from upsert chan") - print(f"upsert done") + logger.info(f"upsert done") # do something with doc_tasks? - for t in upsert_tasks: - print(t.result()) + # for t in upsert_tasks: + # logger.info(t.result()) -async def embed(embed_chan: Channel): +async def embed( + embed_chan: Channel, index_stores: dict[str, MilvusEmbeddingStore], graphname: str +): """ Creates and starts one worker for each embed job + chan expects: + (v_id, content, index_name) <- q.get() """ - - # consume task queue - responses = [] + logger.info("Reading from embed channel") async with asyncio.TaskGroup() as grp: - async for item in embed_chan: - print("embed item>>>>>", type(item)) - grp.create_task(todo()) - continue - # execute the task - # response = await func(*args) - - # append task results to worker results/response - # responses.append(response) + # consume task queue + async for v_id, content, index_name in embed_chan: + logger.info("*********reading from embed chan") + # continue + embedding_store = index_stores[f"{graphname}_{index_name}"] + logger.info(f"Embed to {graphname}_{index_name}: {v_id}") + grp.create_task( + workers.embed( + embedding_service, + embedding_store, + v_id, + content, + ) + ) + logger.info("*********done reading from embed chan") - print(f"embed done") - return responses + logger.info(f"embed done") -async def extract(extract_chan: Channel): +async def extract( + extract_chan: Channel, + upsert_chan: Channel, + embed_chan: Channel, + extractor: BaseExtractor, + conn: TigerGraphConnection, +): """ Creates and starts one worker for each extract job + chan expects: + (chunk , chunk_id) <- q.get() """ - + logger.info("Reading from extract channel") # consume task queue - responses = [] async with asyncio.TaskGroup() as grp: async for item in extract_chan: - print("extract item>>>>>", type(item)) - grp.create_task(todo()) - continue - # execute the task - # response = await func(*args) - + logger.info("*********reading from extract chan") + logger.info("*********done reading from extract chan") + grp.create_task( + workers.extract(upsert_chan, embed_chan, extractor, conn, *item) + ) # append task results to worker results/response - # responses.append(response) + logger.info("*********done reading from extract chan") - print(f"embed done") - return responses + logger.info(f"extract done") - -async def todo(): - await asyncio.sleep(1) + logger.info("****closing upsert and embed chan") + upsert_chan.close() + embed_chan.close() async 
def run(graphname: str, conn: TigerGraphConnection): @@ -219,14 +198,13 @@ async def run(graphname: str, conn: TigerGraphConnection): """ - # init configurable objects - await init(graphname, conn) + extractor, index_stores = await init(conn) # return start = time.perf_counter() # TODO: make configurable tasks = [] - docs_chan = Channel(15) # process n chunks at a time max + docs_chan = Channel(1) # process n chunks at a time max embed_chan = Channel(100) upsert_chan = Channel(100) extract_chan = Channel(100) @@ -243,12 +221,14 @@ async def run(graphname: str, conn: TigerGraphConnection): t = grp.create_task(upsert(upsert_chan)) tasks.append(t) # # embed - t = grp.create_task(embed(embed_chan)) + t = grp.create_task(embed(embed_chan, index_stores, graphname)) tasks.append(t) # extract entities - t = grp.create_task(extract(extract_chan)) + t = grp.create_task( + extract(extract_chan, upsert_chan, embed_chan, extractor, conn) + ) tasks.append(t) end = time.perf_counter() - print("DONE") - print(end - start) + logger.info("DONE") + logger.info(end - start) diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index cfb84e5a..3fb8f916 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -1,58 +1,117 @@ +import asyncio import base64 import json -import time +import logging import traceback -from urllib.parse import quote_plus -import ecc_util import httpx -from aiochannel import Channel +from graphrag import workers from pyTigerGraph import TigerGraphConnection +from common.config import ( + doc_processing_config, + embedding_service, + get_llm_service, + llm_config, + milvus_config, +) +from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore +from common.extractors import GraphExtractor, LLMEntityRelationshipExtractor +from common.extractors.BaseExtractor import BaseExtractor from common.logs.logwriter import LogWriter +logger = logging.getLogger(__name__) +http_timeout = httpx.Timeout(15.0) -def make_headers(conn: TigerGraphConnection): - if conn.apiToken is None or conn.apiToken == "": - tkn = base64.b64encode(f"{conn.username}:{conn.password}".encode()).decode() - headers = {"Authorization": f"Basic {tkn}"} - else: - headers = {"Authorization": f"Bearer {conn.apiToken}"} - return headers +async def install_queries( + requried_queries: list[str], conn: TigerGraphConnection, n_workers=8 +): + # queries that are currently installed + installed_queries = [q.split("/")[-1] for q in conn.getEndpoints(dynamic=True)] + tasks = [] + async with asyncio.TaskGroup() as grp: + for q in requried_queries: + async with asyncio.Semaphore(n_workers): + q_name = q.split("/")[-1] + # if the query is not installed, install it + if q_name not in installed_queries: + task = grp.create_task(workers.install_query(conn, q)) + tasks.append(task) -async def install_query( - conn: TigerGraphConnection, query_path: str -) -> dict[str, httpx.Response | str | None]: - LogWriter.info(f"Installing query {query_path}") - with open(f"{query_path}.gsql", "r") as f: - query = f.read() + for t in tasks: + logger.info(t.result()) + # TODO: Check if anything had an error - query_name = query_path.split("/")[-1] - query = f"""\ -USE GRAPH {conn.graphname} -{query} -INSTALL QUERY {query_name}""" - tkn = base64.b64encode(f"{conn.username}:{conn.password}".encode()).decode() - headers = {"Authorization": f"Basic {tkn}"} - async with httpx.AsyncClient(timeout=None) as client: - res = await 
client.post( - conn.gsUrl + "/gsqlserver/gsql/file", - data=quote_plus(query.encode("utf-8")), - headers=headers, +async def init( + conn: TigerGraphConnection, +) -> tuple[BaseExtractor, dict[str, MilvusEmbeddingStore]]: + # install requried queries + requried_queries = [ + # "common/gsql/supportai/Scan_For_Updates", + # "common/gsql/supportai/Update_Vertices_Processing_Status", + # "common/gsql/supportai/ECC_Status", + # "common/gsql/supportai/Check_Nonexistent_Vertices", + "common/gsql/graphRAG/StreamDocIds", + "common/gsql/graphRAG/StreamDocContent", + ] + await install_queries(requried_queries, conn) + + # extractor + if doc_processing_config.get("extractor") == "graphrag": + extractor = GraphExtractor() + elif doc_processing_config.get("extractor") == "llm": + extractor = LLMEntityRelationshipExtractor(get_llm_service(llm_config)) + else: + raise ValueError("Invalid extractor type") + vertex_field = milvus_config.get("vertex_field", "vertex_id") + index_names = milvus_config.get( + "indexes", + [ + "Document", + "DocumentChunk", + "Entity", + "Relationship", + # "Concept", + ], + ) + index_stores = {} + content = "init" + # TODO:do concurrently + for index_name in index_names: + name = conn.graphname + "_" + index_name + s = MilvusEmbeddingStore( + embedding_service, + host=milvus_config["host"], + port=milvus_config["port"], + support_ai_instance=True, + collection_name=name, + username=milvus_config.get("username", ""), + password=milvus_config.get("password", ""), + vector_field=milvus_config.get("vector_field", "document_vector"), + text_field=milvus_config.get("text_field", "document_content"), + vertex_field=vertex_field, ) + # TODO: only do this if collection doesn't exist + vec = embedding_service.embed_query(content) + LogWriter.info(f"Initializing {name}") + s.add_embeddings([(content, vec)], [{vertex_field: content}]) + s.remove_embeddings(expr=f"{vertex_field} in ['{content}']") + index_stores[name] = s - if "error" in res.text.lower(): - LogWriter.error(res.text) - return { - "result": None, - "error": True, - "message": f"Failed to install query {query_name}", - } + return extractor, index_stores - return {"result": res, "error": False} + +def make_headers(conn: TigerGraphConnection): + if conn.apiToken is None or conn.apiToken == "": + tkn = base64.b64encode(f"{conn.username}:{conn.password}".encode()).decode() + headers = {"Authorization": f"Basic {tkn}"} + else: + headers = {"Authorization": f"Bearer {conn.apiToken}"} + + return headers async def stream_doc_ids( @@ -61,7 +120,7 @@ async def stream_doc_ids( headers = make_headers(conn) try: - async with httpx.AsyncClient(timeout=None) as client: + async with httpx.AsyncClient(timeout=http_timeout) as client: res = await client.post( f"{conn.restppUrl}/query/{conn.graphname}/StreamDocIds", params={ @@ -82,71 +141,6 @@ async def stream_doc_ids( return {"error": True, "message": str(e)} -async def stream_docs( - conn: TigerGraphConnection, - docs_chan: Channel, - ttl_batches: int = 10, -): - """ - Streams the document contents into the docs_chan - """ - headers = make_headers(conn) - for i in range(ttl_batches): - doc_ids = await stream_doc_ids(conn, i, ttl_batches) - if doc_ids["error"]: - break # TODO: handle error - - print("********") - print(doc_ids) - print("********") - for d in doc_ids["ids"]: - async with httpx.AsyncClient(timeout=None) as client: - res = await client.get( - f"{conn.restppUrl}/query/{conn.graphname}/StreamDocContent/", - params={"doc": d}, - headers=headers, - ) - # TODO: check for errors - # 
this will block and wait if the channel is full - await docs_chan.put(res.json()["results"][0]["DocContent"][0]) - # break # single doc test FIXME: delete - # break # single batch test FIXME: delete - - # close the docs chan -- this function is the only sender - docs_chan.close() - - -async def chunk_doc( - conn: TigerGraphConnection, - doc: dict[str, str], - upsert_chan: Channel, - embed_chan: Channel, - extract_chan: Channel, -): - """ - Chunks a document. - Places the resulting chunks into the upsert channel (to be upserted to TG) - and the embed channel (to be embedded and written to the vector store) - """ - chunker = ecc_util.get_chunker() - chunks = chunker.chunk(doc["attributes"]["text"]) - v_id = doc["v_id"] - # TODO: n chunks at a time - for i, chunk in enumerate(chunks): - # send chunks to be upserted (func, args) - await upsert_chan.put((upsert_chunk, (conn, v_id, f"{v_id}_chunk_{i}", chunk))) - - # send chunks to be embedded - await embed_chan.put(chunk) - - # send chunks to have entities extracted - await extract_chan.put(chunk) - - # break # single chunk FIXME: delete - - return doc["v_id"] - - def map_attrs(attributes: dict): # map attrs attrs = {} @@ -171,11 +165,13 @@ async def upsert_vertex( attrs = map_attrs(attributes) data = json.dumps({"vertices": {vertex_type: {vertex_id: attrs}}}) headers = make_headers(conn) - async with httpx.AsyncClient(timeout=None) as client: + # print("upsert vertex>>>", vertex_id) + async with httpx.AsyncClient(timeout=http_timeout) as client: res = await client.post( f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers ) - print("upsert vertex>>>", res.json()) + + res.raise_for_status() async def upsert_edge( @@ -207,37 +203,9 @@ async def upsert_edge( } ) headers = make_headers(conn) - async with httpx.AsyncClient(timeout=None) as client: + # print("upsert edge >>>", src_v_id, tgt_v_id) + async with httpx.AsyncClient(timeout=http_timeout) as client: res = await client.post( f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers ) - print("upsert edge >>>", res.json()) - - -async def upsert_chunk(conn: TigerGraphConnection, doc_id, chunk_id, chunk): - date_added = int(time.time()) - await upsert_vertex( - conn, - "DocumentChunk", - chunk_id, - attributes={"epoch_added": date_added, "idx": int(chunk_id.split("_")[-1])}, - ) - await upsert_vertex( - conn, - "Content", - chunk_id, - attributes={"text": chunk, "epoch_added": date_added}, - ) - await upsert_edge( - conn, "DocumentChunk", chunk_id, "HAS_CONTENT", "Content", chunk_id - ) - await upsert_edge(conn, "Document", doc_id, "HAS_CHILD", "DocumentChunk", chunk_id) - if int(chunk_id.split("_")[-1]) > 0: - await upsert_edge( - conn, - "DocumentChunk", - chunk_id, - "IS_AFTER", - "DocumentChunk", - doc_id + "_chunk_" + str(int(chunk_id.split("_")[-1]) - 1), - ) + res.raise_for_status() diff --git a/eventual-consistency-service/app/graphrag/worker.py b/eventual-consistency-service/app/graphrag/worker.py deleted file mode 100644 index 40720deb..00000000 --- a/eventual-consistency-service/app/graphrag/worker.py +++ /dev/null @@ -1,35 +0,0 @@ -import asyncio - -from aiochannel import Channel - - -async def worker( - n: int, - task_queue: Channel, -): - # init worker logging/reporting (TODO) - worker_name = f"worker-{n+1}" - worker_name += " " if n + 1 < 10 else "" - - while task_queue.empty(): - print(f"{worker_name} waiting") - await asyncio.sleep(1) - - # consume task queue - print(f"{worker_name} started") - responses = [] - while not task_queue.empty(): - # 
get the next task - func, args = await task_queue.get() - - # execute the task - response = await func(*args) - - # append task results to worker results/response - responses.append(response) - - # mark task as done - task_queue.task_done() - - print(f"{worker_name} done") - return responses diff --git a/eventual-consistency-service/app/graphrag/workers.py b/eventual-consistency-service/app/graphrag/workers.py new file mode 100644 index 00000000..3eb0d0dd --- /dev/null +++ b/eventual-consistency-service/app/graphrag/workers.py @@ -0,0 +1,226 @@ +import base64 +import logging +import time +from urllib.parse import quote_plus + +import ecc_util +import httpx +from aiochannel import Channel +from graphrag import util # import upsert_edge, upsert_vertex +from langchain_community.graphs.graph_document import GraphDocument +from pyTigerGraph import TigerGraphConnection + +from common.config import milvus_config +from common.embeddings.embedding_services import EmbeddingModel +from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore +from common.extractors.BaseExtractor import BaseExtractor +from common.logs.logwriter import LogWriter + +vertex_field = milvus_config.get("vertex_field", "vertex_id") + +logger = logging.getLogger(__name__) + + +async def install_query( + conn: TigerGraphConnection, query_path: str +) -> dict[str, httpx.Response | str | None]: + LogWriter.info(f"Installing query {query_path}") + with open(f"{query_path}.gsql", "r") as f: + query = f.read() + + query_name = query_path.split("/")[-1] + query = f"""\ +USE GRAPH {conn.graphname} +{query} +INSTALL QUERY {query_name}""" + tkn = base64.b64encode(f"{conn.username}:{conn.password}".encode()).decode() + headers = {"Authorization": f"Basic {tkn}"} + + async with httpx.AsyncClient(timeout=util.http_timeout) as client: + res = await client.post( + conn.gsUrl + "/gsqlserver/gsql/file", + data=quote_plus(query.encode("utf-8")), + headers=headers, + ) + + if "error" in res.text.lower(): + LogWriter.error(res.text) + return { + "result": None, + "error": True, + "message": f"Failed to install query {query_name}", + } + + return {"result": res, "error": False} + + +async def chunk_doc( + conn: TigerGraphConnection, + doc: dict[str, str], + upsert_chan: Channel, + embed_chan: Channel, + extract_chan: Channel, +): + """ + Chunks a document. 
+ Places the resulting chunks into the upsert channel (to be upserted to TG) + and the embed channel (to be embedded and written to the vector store) + """ + chunker = ecc_util.get_chunker() + chunks = chunker.chunk(doc["attributes"]["text"]) + v_id = doc["v_id"] + logger.info(f"Chunking {v_id}") + # TODO: n chunks at a time + for i, chunk in enumerate(chunks): + chunk_id = f"{v_id}_chunk_{i}" + # send chunks to be upserted (func, args) + logger.info("chunk writes to upsert") + await upsert_chan.put((upsert_chunk, (conn, v_id, chunk_id, chunk))) + + # send chunks to be embedded + logger.info("chunk writes to embed") + await embed_chan.put((v_id, chunk, "DocumentChunk")) + + # send chunks to have entities extracted + logger.info("chunk writes to extract") + await extract_chan.put((chunk, chunk_id)) + + return doc["v_id"] + + +async def upsert_chunk(conn: TigerGraphConnection, doc_id, chunk_id, chunk): + logger.info(f"Upserting chunk {chunk_id}") + logger.info(f"Upserting chunk {chunk_id}") + date_added = int(time.time()) + await util.upsert_vertex( + conn, + "DocumentChunk", + chunk_id, + attributes={"epoch_added": date_added, "idx": int(chunk_id.split("_")[-1])}, + ) + await util.upsert_vertex( + conn, + "Content", + chunk_id, + attributes={"text": chunk, "epoch_added": date_added}, + ) + await util.upsert_edge( + conn, "DocumentChunk", chunk_id, "HAS_CONTENT", "Content", chunk_id + ) + await util.upsert_edge( + conn, "Document", doc_id, "HAS_CHILD", "DocumentChunk", chunk_id + ) + if int(chunk_id.split("_")[-1]) > 0: + await util.upsert_edge( + conn, + "DocumentChunk", + chunk_id, + "IS_AFTER", + "DocumentChunk", + doc_id + "_chunk_" + str(int(chunk_id.split("_")[-1]) - 1), + ) + + +async def embed( + embed_svc: EmbeddingModel, + embed_store: MilvusEmbeddingStore, + v_id: str, + content: str, +): + """ + Args: + graphname: str + the name of the graph the documents are in + embed_svc: EmbeddingModel + The class used to vectorize text + embed_store: + The class used to store the vectore to a vector DB + v_id: str + the vertex id that will be embedded + content: str + the content of the document/chunk + index_name: str + the vertex index to write to + """ + logger.info(f"Embedding {v_id}, {content}") + + vec = await embed_svc.aembed_query(content) + await embed_store.aadd_embeddings([(content, vec)], [{vertex_field: v_id}]) + + +async def extract( + upsert_chan: Channel, + embed_chan: Channel, + extractor: BaseExtractor, + conn: TigerGraphConnection, + chunk: str, + chunk_id: str, +): + logger.info(f"Extracting chunk: {chunk_id}") + extracted: list[GraphDocument] = await extractor.aextract(chunk) + # upsert nodes and edges to the graph + for doc in extracted: + for node in doc.nodes: + logger.info("extract writes entity vert to upsert") + logger.info(f"Node: {node.id}| props: {node.properties}") + v_id = str(node.id) + desc = node.properties.get("description", "") + await upsert_chan.put( + ( + util.upsert_vertex, # func to call + # conn, v_id, chunk_id, chunk + ( + conn, + "Entity", # v_type + v_id, # v_id + { # attrs + "description": desc, + "epoch_added": int(time.time()), + }, + ), + ) + ) + + # link the entity to the chunk it came from + logger.info("extract writes contains edge to upsert") + await upsert_chan.put( + ( + util.upsert_edge, + ( + conn, + "DocumentChunk", # src_type + chunk_id, # src_id + "CONTAINS_ENTITY", # edge_type + "Entity", # tgt_type + str(node.id), # tgt_id + None, # attributes + ), + ) + ) + + # embed the entity + # (v_id, content, index_name) + await 
embed_chan.put((v_id, desc, "Entity")) + + for edge in doc.relationships: + logger.info("extract writes relates edge to upsert") + logger.info(f"{edge}") + await upsert_chan.put( + ( + util.upsert_edge, + ( + conn, + "Entity", # src_type + edge.source.id, # src_id + "RELATIONSHIP", # edgeType + "Entity", # tgt_type + edge.target.id, # tgt_id + {"relation_type": edge.type}, # attributes + ), + ) + ) + # embed "Relationship", + # (v_id, content, index_name) + + # TODO: + # embed the extracted entities diff --git a/eventual-consistency-service/app/main.py b/eventual-consistency-service/app/main.py index 0277a272..85a1f8ae 100644 --- a/eventual-consistency-service/app/main.py +++ b/eventual-consistency-service/app/main.py @@ -188,7 +188,8 @@ def consistency_status( case SupportAIMethod.GRAPHRAG: background.add_task(graphrag.run, graphname, conn) # asyncio.run(graphrag.run(graphname, conn)) - ecc_status = f"hi from graph rag ecc: {conn.graphname} ({graphname})" + import time + ecc_status = f"hi from graph rag ecc: {conn.graphname} ({graphname}) {time.ctime()}" case _: response.status_code = status.HTTP_404_NOT_FOUND return f"Method unsupported, must be {SupportAIMethod.SUPPORTAI}, {SupportAIMethod.GRAPHRAG}" From bb37198f74e1012880868d44f8d6cbfa09acbfb0 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 30 Jul 2024 12:36:22 -0400 Subject: [PATCH 07/53] docs processing done -- start community passes --- common/embeddings/embedding_services.py | 2 +- common/gsql/graphRAG/StreamDocContent.gsql | 2 + common/gsql/graphRAG/StreamDocIds.gsql | 7 +- copilot/docs/notebooks/graphrag.ipynb | 159 ++++-------------- .../app/graphrag/graph_rag.py | 69 +++----- .../app/graphrag/util.py | 63 ++++--- .../app/graphrag/workers.py | 30 ++-- 7 files changed, 118 insertions(+), 214 deletions(-) diff --git a/common/embeddings/embedding_services.py b/common/embeddings/embedding_services.py index dd506670..7ce17478 100644 --- a/common/embeddings/embedding_services.py +++ b/common/embeddings/embedding_services.py @@ -136,7 +136,7 @@ def __init__(self, config): ) from langchain.embeddings import OpenAIEmbeddings - self.embeddings = OpenAIEmbeddings() + self.embeddings = OpenAIEmbeddings().aembed_query class VertexAI_PaLM_Embedding(EmbeddingModel): diff --git a/common/gsql/graphRAG/StreamDocContent.gsql b/common/gsql/graphRAG/StreamDocContent.gsql index 87f12566..a2845148 100644 --- a/common/gsql/graphRAG/StreamDocContent.gsql +++ b/common/gsql/graphRAG/StreamDocContent.gsql @@ -1,5 +1,7 @@ CREATE DISTRIBUTED QUERY StreamDocContent(Vertex doc) { Doc = {doc}; + + // Get the document's content and mark it as processed DocContent = SELECT c FROM Doc:d -(HAS_CONTENT)-> Content:c POST-ACCUM d.epoch_processed = datetime_to_epoch(now()); PRINT DocContent; diff --git a/common/gsql/graphRAG/StreamDocIds.gsql b/common/gsql/graphRAG/StreamDocIds.gsql index d5ec982e..2fb4a9c4 100644 --- a/common/gsql/graphRAG/StreamDocIds.gsql +++ b/common/gsql/graphRAG/StreamDocIds.gsql @@ -1,13 +1,16 @@ CREATE DISTRIBUTED QUERY StreamDocIds(INT current_batch, INT ttl_batches) { + /* + * Get the IDs of documents that have not already been processed (one + * batch at a time) + */ ListAccum @@doc_ids; Docs = {Document.*}; Docs = SELECT d FROM Docs:d WHERE vertex_to_int(d) % ttl_batches == current_batch AND d.epoch_processed == 0 - AND d.epoch_processing == 0 ACCUM @@doc_ids += d.id - POST-ACCUM d.epoch_processing = datetime_to_epoch(now()); + POST-ACCUM d.epoch_processing = 
datetime_to_epoch(now()); // set the processing time PRINT @@doc_ids; } diff --git a/copilot/docs/notebooks/graphrag.ipynb b/copilot/docs/notebooks/graphrag.ipynb index 57ea4b48..38b4939b 100644 --- a/copilot/docs/notebooks/graphrag.ipynb +++ b/copilot/docs/notebooks/graphrag.ipynb @@ -45,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -54,18 +54,18 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'host_name': 'https://algotesting.i.tgcloud.io',\n", - " 'schema_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_schema].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_schema\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 0)\\\\nTrying to add local vertex \\'DocumentChunk\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Document\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Concept\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Entity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Relationship\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'DocumentCollection\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Content\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Community\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'ResolvedEntity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CONTENT\\' and its reverse edge \\'reverse_HAS_CONTENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_CHILD_OF\\' and its reverse edge \\'reverse_IS_CHILD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_HEAD_OF\\' and its reverse edge \\'reverse_IS_HEAD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_TAIL\\' and its reverse edge \\'reverse_HAS_TAIL\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_RELATIONSHIP\\' and its reverse edge \\'reverse_DESCRIBES_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_ENTITY\\' and its reverse edge \\'reverse_DESCRIBES_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_ENTITY\\' and its reverse edge \\'reverse_CONTAINS_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'MENTIONS_RELATIONSHIP\\' and its reverse edge \\'reverse_MENTIONS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_AFTER\\' and its reverse edge \\'reverse_IS_AFTER\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CHILD\\' and its reverse edge \\'reverse_HAS_CHILD\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_RELATIONSHIP\\' and its reverse edge \\'reverse_HAS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_DOCUMENT\\' and 
its reverse edge \\'reverse_CONTAINS_DOCUMENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'KNN\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVES_TO\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVED_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IN_COMMUNITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 1\\\\nThe job add_supportai_schema completes in 2.434 seconds!\\\\nLocal schema change succeeded.\"',\n", - " 'index_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_indexes].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_indexes\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 1)\\\\nTrying to add index \\'doc_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_indexepoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 2\\\\nThe job add_supportai_indexes completes in 1.932 seconds!\\\\nLocal schema change succeeded.\"'}" + " 'schema_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_schema].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. 
To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_schema\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 0)\\\\nTrying to add local vertex \\'DocumentChunk\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Document\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Concept\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Entity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Relationship\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'DocumentCollection\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Content\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Community\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'ResolvedEntity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CONTENT\\' and its reverse edge \\'reverse_HAS_CONTENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_CHILD_OF\\' and its reverse edge \\'reverse_IS_CHILD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_HEAD_OF\\' and its reverse edge \\'reverse_IS_HEAD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_TAIL\\' and its reverse edge \\'reverse_HAS_TAIL\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_RELATIONSHIP\\' and its reverse edge \\'reverse_DESCRIBES_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_ENTITY\\' and its reverse edge \\'reverse_DESCRIBES_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_ENTITY\\' and its reverse edge \\'reverse_CONTAINS_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'MENTIONS_RELATIONSHIP\\' and its reverse edge \\'reverse_MENTIONS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_AFTER\\' and its reverse edge \\'reverse_IS_AFTER\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CHILD\\' and its reverse edge \\'reverse_HAS_CHILD\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_RELATIONSHIP\\' and its reverse edge \\'reverse_HAS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_DOCUMENT\\' and its reverse edge \\'reverse_CONTAINS_DOCUMENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RELATIONSHIP\\' and its reverse edge \\'reverse_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVES_TO\\' and its reverse edge \\'reverse_RESOLVES_TO\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVED_RELATIONSHIP\\' and its reverse edge \\'reverse_RESOLVED_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IN_COMMUNITY\\' and its reverse edge \\'reverse_IN_COMMUNITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 1\\\\nThe job add_supportai_schema completes in 2.335 seconds!\\\\nLocal schema change succeeded.\"',\n", + " 'index_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: 
[add_supportai_indexes].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_indexes\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 1)\\\\nTrying to add index \\'doc_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_indexepoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 2\\\\nThe job add_supportai_indexes completes in 2.059 seconds!\\\\nLocal schema change succeeded.\"'}" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -79,7 +79,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -95,18 +95,18 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'job_name': 'load_documents_content_json_203b064024e3499ea41b876cc67a85cf',\n", - " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_203b064024e3499ea41b876cc67a85cf.stream.SupportAI_GraphRAG_pytgdocs_5b098715edbd4c878f7425918eb553c0.1721853566538',\n", - " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_203b064024e3499ea41b876cc67a85cf.stream.SupportAI_GraphRAG_pytgdocs_5b098715edbd4c878f7425918eb553c0.1721853566538'}" + "{'job_name': 'load_documents_content_json_a245f14bb5f443acaa051125e4d9a497',\n", + " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_a245f14bb5f443acaa051125e4d9a497.stream.SupportAI_GraphRAG_pytgdocs_025b08b3cf60477dbbcfd22b4254d268.1722356202522',\n", + " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_a245f14bb5f443acaa051125e4d9a497.stream.SupportAI_GraphRAG_pytgdocs_025b08b3cf60477dbbcfd22b4254d268.1722356202522'}" ] }, - 
"execution_count": 15, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -121,39 +121,39 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'asdf' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[7], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43masdf\u001b[49m\n", + "\u001b[0;31mNameError\u001b[0m: name 'asdf' is not defined" + ] + } + ], "source": [ "asdf" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'job_name': 'load_documents_content_json_203b064024e3499ea41b876cc67a85cf',\n", - " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_203b064024e3499ea41b876cc67a85cf.stream.SupportAI_GraphRAG_pytgdocs_5b098715edbd4c878f7425918eb553c0.1721853623658',\n", - " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_203b064024e3499ea41b876cc67a85cf.stream.SupportAI_GraphRAG_pytgdocs_5b098715edbd4c878f7425918eb553c0.1721853623658'}" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "for v in [\"Document\", \"Content\", \"DocumentChunk\"]:\n", + "for v in [\"Document\", \"Content\", \"DocumentChunk\",\"Entity\"]:\n", " try:\n", " conn.delVertices(v)\n", " except:\n", " pass\n", "\n", "import time\n", + "\n", "time.sleep(3)\n", "conn.ai.runDocumentIngest(\n", " res[\"load_job_id\"],\n", @@ -168,102 +168,11 @@ "metadata": {}, "outputs": [], "source": [ - "import httpx\n", - "import base64\n", - "\n", - "# conn.ai.forceConsistencyUpdate()\n", - "# url = self.nlqs_host+\"/\"+self.conn.graphname+\"/supportai/forceupdate\"\n", - "# return self.conn._req(\"GET\", url, authMode=\"pwd\", resKey=None)\n", - "httpx.get(f\"http://localhost:8000/{conn.graphname}/supportai/forceupdate\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_core.pydantic_v1 import BaseModel, Field\n", - "from langchain_openai import ChatOpenAI\n", - "\n", - "\n", - "class Joke(BaseModel):\n", - " setup: str = Field(description=\"The setup of the joke\")\n", - " punchline: str = Field(description=\"The punchline to the joke\")\n", - "\n", - "\n", - "model = ChatOpenAI(model=\"gpt-3.5-turbo-0125\", temperature=0)\n", - "print(model.invoke(\"hi\"))\n", - "structured_llm = model.with_structured_output(Joke)\n", - "structured_llm.invoke(\"Tell me a joke about cats\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_core.documents import Document\n", - "from langchain_experimental.graph_transformers import LLMGraphTransformer\n", - "from langchain_openai import ChatOpenAI\n", - "import os\n", - "\n", - "# from langchain_core.pydantic_v1 import BaseModel\n", - "from pydantic import BaseModel\n", - "\n", - "\n", - "class AnswerWithJustification(BaseModel):\n", - " \"\"\"An answer to the user question along with justification for the answer.\"\"\"\n", - "\n", - " answer: str\n", - " justification: str\n", - "\n", - "\n", - "os.environ[\"OPENAI_API_KEY\"] = \"\"\n", - "model_name = 
\"gpt-4o-mini\"\n", - "llm = ChatOpenAI(model=model_name, temperature=0)\n", - "# sllm = llm.with_structured_output(AnswerWithJustification)\n", - "# print(sllm.invoke(\"What weighs more a pound of bricks or a pound of feathers\"))\n", - "\n", - "\n", - "class GraphExtractor:\n", - " def __init__(self):\n", - " self.transformer = LLMGraphTransformer(\n", - " llm=llm,\n", - " node_properties=[\"description\"],\n", - " relationship_properties=[\"description\"],\n", - " )\n", - "\n", - " def extract(self, text):\n", - " doc = Document(page_content=text)\n", - " graph_docs = self.transformer.convert_to_graph_documents([doc])\n", - " return graph_docs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "text = \"\"\"\n", - "Marie Curie, born in 1867, was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity.\n", - "She was the first woman to win a Nobel Prize, the first person to win a Nobel Prize twice, and the only person to win a Nobel Prize in two scientific fields.\n", - "Her husband, Pierre Curie, was a co-winner of her first Nobel Prize, making them the first-ever married couple to win the Nobel Prize and launching the Curie family legacy of five Nobel Prizes.\n", - "She was, in 1906, the first woman to become a professor at the University of Paris.\n", - "\"\"\"\n", - "ge = GraphExtractor()\n", - "\n", - "docs = ge.extract(text)\n", - "for d in docs:\n", - " for n in d.nodes:\n", - " print(n)\n", - " for r in d.relationships:\n", - " print(r)\n", - "# print(f\"Nodes:{docs[0].nodes}\")\n", - "# print(f\"Relationships:{docs[0].relationships}\")\n", - "# docs" + "conn.gsql(f\"\"\"\n", + "USE GRAPH {conn.graphname}\n", + "DROP QUERY StreamDocIds\n", + "DROP QUERY StreamDocContent\n", + "\"\"\")" ] } ], diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py index e248510c..7e67b342 100644 --- a/eventual-consistency-service/app/graphrag/graph_rag.py +++ b/eventual-consistency-service/app/graphrag/graph_rag.py @@ -1,16 +1,16 @@ import asyncio import logging import time +import traceback import httpx from aiochannel import Channel -from graphrag import workers -from graphrag.util import init, make_headers, stream_doc_ids,http_timeout -from pyTigerGraph import TigerGraphConnection - from common.config import embedding_service from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore from common.extractors.BaseExtractor import BaseExtractor +from graphrag import workers +from graphrag.util import http_timeout, init, make_headers, stream_doc_ids +from pyTigerGraph import TigerGraphConnection http_logs = logging.getLogger("httpx") http_logs.setLevel(logging.WARNING) @@ -32,28 +32,32 @@ async def stream_docs( for i in range(ttl_batches): doc_ids = await stream_doc_ids(conn, i, ttl_batches) if doc_ids["error"]: - continue # TODO: handle error + # continue to the next batch. + # These docs will not be marked as processed, so the ecc will process it eventually. 
+ continue - logger.info("********doc_ids") - logger.info(doc_ids) - logger.info("********") for d in doc_ids["ids"]: async with httpx.AsyncClient(timeout=http_timeout) as client: - res = await client.get( - f"{conn.restppUrl}/query/{conn.graphname}/StreamDocContent/", - params={"doc": d}, - headers=headers, - ) - # TODO: check for errors - # this will block and wait if the channel is full - logger.info("steam_docs writes to docs") - await docs_chan.put(res.json()["results"][0]["DocContent"][0]) - # break # single doc test FIXME: delete - # break # single batch test FIXME: delete + try: + res = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/StreamDocContent/", + params={"doc": d}, + headers=headers, + ) + if res.status_code != 200: + # continue to the next doc. + # This doc will not be marked as processed, so the ecc will process it eventually. + continue + logger.info("steam_docs writes to docs") + await docs_chan.put(res.json()["results"][0]["DocContent"][0]) + except Exception as e: + exc = traceback.format_exc() + logger.error(f"Error retrieving doc: {d} --> {e}\n{exc}") + continue # try retrieving the next doc logger.info("stream_docs done") # close the docs chan -- this function is the only sender - logger.info("****** closing docs chan") + logger.info("closing docs chan") docs_chan.close() @@ -72,8 +76,6 @@ async def chunk_docs( doc_tasks = [] async with asyncio.TaskGroup() as grp: async for content in docs_chan: - logger.info("*********reading from docs chan") - # continue v_id = content["v_id"] txt = content["attributes"]["text"] # send the document to be embedded @@ -84,17 +86,12 @@ async def chunk_docs( workers.chunk_doc(conn, content, upsert_chan, embed_chan, extract_chan) ) doc_tasks.append(task) - # break # single doc FIXME: delete - logger.info("*********done reading from docs chan") logger.info("chunk_docs done") - # do something with doc_tasks? - # for t in doc_tasks: - # logger.info(t.result()) # close the extract chan -- chunk_doc is the only sender # and chunk_doc calls are kicked off from here - logger.info("********closing extract chan") + logger.info("closing extract_chan") extract_chan.close() @@ -110,13 +107,11 @@ async def upsert(upsert_chan: Channel): upsert_tasks = [] async with asyncio.TaskGroup() as grp: async for func, args in upsert_chan: - logger.info("*********reading from upsert chan") logger.info(f"{func.__name__}, {args[1]}") # continue # execute the task t = grp.create_task(func(*args)) upsert_tasks.append(t) - logger.info("*********done reading from upsert chan") logger.info(f"upsert done") # do something with doc_tasks? 
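For orientation between these hunks: graph_rag.run wires the pipeline as a fan-out of async producers and consumers joined by bounded aiochannel Channels -- stream_docs fills docs_chan, chunk_docs fans each document out to the upsert, embed, and extract channels, and every consumer drains its channel inside an asyncio.TaskGroup until the sender closes it. Below is a minimal, self-contained sketch of that channel pattern only; the produce/consume/handle names and the toy payloads are illustrative placeholders, not the real workers.

import asyncio

from aiochannel import Channel


async def produce(chan: Channel):
    # Producer: put() blocks when the channel buffer is full (back-pressure),
    # and close() lets the consumer's `async for` loop finish cleanly.
    for i in range(5):
        await chan.put(f"doc-{i}")
    chan.close()


async def handle(item: str):
    # Stand-in for the real upsert/embed/extract work on one item.
    await asyncio.sleep(0.01)
    print("processed", item)


async def consume(chan: Channel):
    # Consumer: iteration ends once the channel is closed and drained.
    async with asyncio.TaskGroup() as grp:
        async for item in chan:
            grp.create_task(handle(item))


async def main():
    chan = Channel(1)  # small buffer, like docs_chan in run()
    async with asyncio.TaskGroup() as grp:
        grp.create_task(produce(chan))
        grp.create_task(consume(chan))


if __name__ == "__main__":
    asyncio.run(main())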
@@ -136,7 +131,6 @@ async def embed( async with asyncio.TaskGroup() as grp: # consume task queue async for v_id, content, index_name in embed_chan: - logger.info("*********reading from embed chan") # continue embedding_store = index_stores[f"{graphname}_{index_name}"] logger.info(f"Embed to {graphname}_{index_name}: {v_id}") @@ -148,7 +142,6 @@ async def embed( content, ) ) - logger.info("*********done reading from embed chan") logger.info(f"embed done") @@ -169,17 +162,13 @@ async def extract( # consume task queue async with asyncio.TaskGroup() as grp: async for item in extract_chan: - logger.info("*********reading from extract chan") - logger.info("*********done reading from extract chan") grp.create_task( workers.extract(upsert_chan, embed_chan, extractor, conn, *item) ) - # append task results to worker results/response - logger.info("*********done reading from extract chan") logger.info(f"extract done") - logger.info("****closing upsert and embed chan") + logger.info("closing upsert and embed chan") upsert_chan.close() embed_chan.close() @@ -202,9 +191,8 @@ async def run(graphname: str, conn: TigerGraphConnection): # return start = time.perf_counter() - # TODO: make configurable tasks = [] - docs_chan = Channel(1) # process n chunks at a time max + docs_chan = Channel(1) embed_chan = Channel(100) upsert_chan = Channel(100) extract_chan = Channel(100) @@ -230,5 +218,4 @@ async def run(graphname: str, conn: TigerGraphConnection): tasks.append(t) end = time.perf_counter() - logger.info("DONE") - logger.info(end - start) + logger.info(f"DONE. graphrag.run elapsed: {end-start}") diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index 3fb8f916..8f2c2141 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -5,9 +5,6 @@ import traceback import httpx -from graphrag import workers -from pyTigerGraph import TigerGraphConnection - from common.config import ( doc_processing_config, embedding_service, @@ -19,6 +16,8 @@ from common.extractors import GraphExtractor, LLMEntityRelationshipExtractor from common.extractors.BaseExtractor import BaseExtractor from common.logs.logwriter import LogWriter +from graphrag import workers +from pyTigerGraph import TigerGraphConnection logger = logging.getLogger(__name__) http_timeout = httpx.Timeout(15.0) @@ -33,6 +32,7 @@ async def install_queries( tasks = [] async with asyncio.TaskGroup() as grp: for q in requried_queries: + # only install n queries at a time (n=n_workers) async with asyncio.Semaphore(n_workers): q_name = q.split("/")[-1] # if the query is not installed, install it @@ -41,8 +41,17 @@ async def install_queries( tasks.append(task) for t in tasks: - logger.info(t.result()) - # TODO: Check if anything had an error + res = t.result() + # stop system if a required query doesn't install + if res["error"]: + raise Exception(res["message"]) + + +async def init_embedding_index(s: MilvusEmbeddingStore, vertex_field: str): + content = "init" + vec = embedding_service.embed_query(content) + await s.aadd_embeddings([(content, vec)], [{vertex_field: content}]) + s.remove_embeddings(expr=f"{vertex_field} in ['{content}']") async def init( @@ -78,28 +87,28 @@ async def init( ], ) index_stores = {} - content = "init" - # TODO:do concurrently - for index_name in index_names: - name = conn.graphname + "_" + index_name - s = MilvusEmbeddingStore( - embedding_service, - host=milvus_config["host"], - port=milvus_config["port"], - 
support_ai_instance=True, - collection_name=name, - username=milvus_config.get("username", ""), - password=milvus_config.get("password", ""), - vector_field=milvus_config.get("vector_field", "document_vector"), - text_field=milvus_config.get("text_field", "document_content"), - vertex_field=vertex_field, - ) - # TODO: only do this if collection doesn't exist - vec = embedding_service.embed_query(content) - LogWriter.info(f"Initializing {name}") - s.add_embeddings([(content, vec)], [{vertex_field: content}]) - s.remove_embeddings(expr=f"{vertex_field} in ['{content}']") - index_stores[name] = s + async with asyncio.TaskGroup() as tg: + for index_name in index_names: + name = conn.graphname + "_" + index_name + s = MilvusEmbeddingStore( + embedding_service, + host=milvus_config["host"], + port=milvus_config["port"], + support_ai_instance=True, + collection_name=name, + username=milvus_config.get("username", ""), + password=milvus_config.get("password", ""), + vector_field=milvus_config.get("vector_field", "document_vector"), + text_field=milvus_config.get("text_field", "document_content"), + vertex_field=vertex_field, + ) + + LogWriter.info(f"Initializing {name}") + # init collection if it doesn't exist + if not s.check_collection_exists(): + tg.create_task(init_embedding_index(s, vertex_field)) + + index_stores[name] = s return extractor, index_stores diff --git a/eventual-consistency-service/app/graphrag/workers.py b/eventual-consistency-service/app/graphrag/workers.py index 3eb0d0dd..b7267b60 100644 --- a/eventual-consistency-service/app/graphrag/workers.py +++ b/eventual-consistency-service/app/graphrag/workers.py @@ -6,15 +6,14 @@ import ecc_util import httpx from aiochannel import Channel -from graphrag import util # import upsert_edge, upsert_vertex -from langchain_community.graphs.graph_document import GraphDocument -from pyTigerGraph import TigerGraphConnection - from common.config import milvus_config from common.embeddings.embedding_services import EmbeddingModel from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore from common.extractors.BaseExtractor import BaseExtractor from common.logs.logwriter import LogWriter +from graphrag import util # import upsert_edge, upsert_vertex +from langchain_community.graphs.graph_document import GraphDocument +from pyTigerGraph import TigerGraphConnection vertex_field = milvus_config.get("vertex_field", "vertex_id") @@ -36,7 +35,7 @@ async def install_query( tkn = base64.b64encode(f"{conn.username}:{conn.password}".encode()).decode() headers = {"Authorization": f"Basic {tkn}"} - async with httpx.AsyncClient(timeout=util.http_timeout) as client: + async with httpx.AsyncClient(timeout=None) as client: res = await client.post( conn.gsUrl + "/gsqlserver/gsql/file", data=quote_plus(query.encode("utf-8")), @@ -70,26 +69,24 @@ async def chunk_doc( chunks = chunker.chunk(doc["attributes"]["text"]) v_id = doc["v_id"] logger.info(f"Chunking {v_id}") - # TODO: n chunks at a time for i, chunk in enumerate(chunks): chunk_id = f"{v_id}_chunk_{i}" # send chunks to be upserted (func, args) - logger.info("chunk writes to upsert") + logger.info("chunk writes to upsert_chan") await upsert_chan.put((upsert_chunk, (conn, v_id, chunk_id, chunk))) # send chunks to be embedded - logger.info("chunk writes to embed") + logger.info("chunk writes to embed_chan") await embed_chan.put((v_id, chunk, "DocumentChunk")) # send chunks to have entities extracted - logger.info("chunk writes to extract") + logger.info("chunk writes to extract_chan") await 
extract_chan.put((chunk, chunk_id)) return doc["v_id"] async def upsert_chunk(conn: TigerGraphConnection, doc_id, chunk_id, chunk): - logger.info(f"Upserting chunk {chunk_id}") logger.info(f"Upserting chunk {chunk_id}") date_added = int(time.time()) await util.upsert_vertex( @@ -142,7 +139,7 @@ async def embed( index_name: str the vertex index to write to """ - logger.info(f"Embedding {v_id}, {content}") + logger.info(f"Embedding {v_id}") vec = await embed_svc.aembed_query(content) await embed_store.aadd_embeddings([(content, vec)], [{vertex_field: v_id}]) @@ -161,8 +158,7 @@ async def extract( # upsert nodes and edges to the graph for doc in extracted: for node in doc.nodes: - logger.info("extract writes entity vert to upsert") - logger.info(f"Node: {node.id}| props: {node.properties}") + logger.info(f"extract writes entity vert to upsert\nNode: {node.id}") v_id = str(node.id) desc = node.properties.get("description", "") await upsert_chan.put( @@ -203,8 +199,9 @@ async def extract( await embed_chan.put((v_id, desc, "Entity")) for edge in doc.relationships: - logger.info("extract writes relates edge to upsert") - logger.info(f"{edge}") + logger.info( + f"extract writes relates edge to upsert\n{edge.source.id} -({edge.type})-> {edge.target.id}" + ) await upsert_chan.put( ( util.upsert_edge, @@ -221,6 +218,3 @@ async def extract( ) # embed "Relationship", # (v_id, content, index_name) - - # TODO: - # embed the extracted entities From e9f178e34e39404774e76dd599f3917ba5856ac6 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Thu, 1 Aug 2024 18:09:26 -0400 Subject: [PATCH 08/53] save --- common/embeddings/embedding_services.py | 2 +- common/embeddings/milvus_embedding_store.py | 73 ++++- common/gsql/graphRAG/.clang-format | 269 ++++++++++++++++++ common/gsql/graphRAG/.clangd | 2 + .../gsql/graphRAG/ResolveRelationships.gsql | 26 ++ common/gsql/graphRAG/SetEpochProcessing.gsql | 7 + common/gsql/graphRAG/StreamIds.gsql | 16 ++ common/gsql/graphRAG/leven.cpp | 59 ++++ common/gsql/graphRAG/louvain/louvain1.gsql | 17 ++ .../louvain_1_first_pass.gsql | 16 +- .../louvain_2_other_passes.gsql | 0 .../louvain_3_final_community.gsql | 0 .../louvain_4_modularity_1_for_pass.gsql | 0 .../louvain_4_modularity_2_final.gsql | 0 .../louvain_5_reset.gsql | 0 common/gsql/supportai/SupportAI_Schema.gsql | 4 +- copilot/docs/notebooks/graphrag.ipynb | 227 +++++++++++++-- .../app/graphrag/graph_rag.py | 188 ++++++++---- .../app/graphrag/util.py | 50 +++- .../app/graphrag/workers.py | 148 +++++++++- 20 files changed, 989 insertions(+), 115 deletions(-) create mode 100644 common/gsql/graphRAG/.clang-format create mode 100644 common/gsql/graphRAG/.clangd create mode 100644 common/gsql/graphRAG/ResolveRelationships.gsql create mode 100644 common/gsql/graphRAG/SetEpochProcessing.gsql create mode 100644 common/gsql/graphRAG/StreamIds.gsql create mode 100644 common/gsql/graphRAG/leven.cpp create mode 100644 common/gsql/graphRAG/louvain/louvain1.gsql rename common/gsql/graphRAG/{louvain => louvain_old}/louvain_1_first_pass.gsql (88%) rename common/gsql/graphRAG/{louvain => louvain_old}/louvain_2_other_passes.gsql (100%) rename common/gsql/graphRAG/{louvain => louvain_old}/louvain_3_final_community.gsql (100%) rename common/gsql/graphRAG/{louvain => louvain_old}/louvain_4_modularity_1_for_pass.gsql (100%) rename common/gsql/graphRAG/{louvain => louvain_old}/louvain_4_modularity_2_final.gsql (100%) rename common/gsql/graphRAG/{louvain => 
louvain_old}/louvain_5_reset.gsql (100%) diff --git a/common/embeddings/embedding_services.py b/common/embeddings/embedding_services.py index 7ce17478..dd506670 100644 --- a/common/embeddings/embedding_services.py +++ b/common/embeddings/embedding_services.py @@ -136,7 +136,7 @@ def __init__(self, config): ) from langchain.embeddings import OpenAIEmbeddings - self.embeddings = OpenAIEmbeddings().aembed_query + self.embeddings = OpenAIEmbeddings() class VertexAI_PaLM_Embedding(EmbeddingModel): diff --git a/common/embeddings/milvus_embedding_store.py b/common/embeddings/milvus_embedding_store.py index ac9c5389..fd57c783 100644 --- a/common/embeddings/milvus_embedding_store.py +++ b/common/embeddings/milvus_embedding_store.py @@ -3,15 +3,16 @@ from time import sleep, time from typing import Iterable, List, Optional, Tuple -from langchain_community.vectorstores import Milvus -from langchain_core.documents.base import Document -from pymilvus import MilvusException, connections, utility - +import Levenshtein as lev +from asyncer import asyncify from common.embeddings.base_embedding_store import EmbeddingStore from common.embeddings.embedding_services import EmbeddingModel from common.logs.log import req_id_cv from common.logs.logwriter import LogWriter from common.metrics.prometheus_metrics import metrics +from langchain_community.vectorstores import Milvus +from langchain_core.documents.base import Document +from pymilvus import MilvusException, SearchResult, connections, utility logger = logging.getLogger(__name__) @@ -32,6 +33,7 @@ def __init__( alias: str = "alias", retry_interval: int = 2, max_retry_attempts: int = 10, + drop_old=False, ): self.embedding_service = embedding_service self.vector_field = vector_field @@ -42,6 +44,7 @@ def __init__( self.milvus_alias = alias self.retry_interval = retry_interval self.max_retry_attempts = max_retry_attempts + self.drop_old = drop_old if host.startswith("http"): if host.endswith(str(port)): @@ -86,7 +89,7 @@ def connect_to_milvus(self): collection_name=self.collection_name, connection_args=self.milvus_connection, auto_id=True, - drop_old=False, + drop_old=self.drop_old, text_field=self.text_field, vector_field=self.vector_field, ) @@ -118,6 +121,9 @@ def metadata_func(record: dict, metadata: dict) -> dict: return metadata LogWriter.info("Milvus add initial load documents init()") + import os + + logger.info(f"*******{os.path.exists('tg_documents')}") loader = DirectoryLoader( "./tg_documents/", glob="*.json", @@ -584,5 +590,62 @@ def query(self, expr: str, output_fields: List[str]): return query_result + def edit_dist_check(self, a: str, b: str, edit_dist_threshold: float, p=False): + a = a.lower() + b = b.lower() + # if the words are short, they should be the same + if len(a) < 5 and len(b) < 5: + return a == b + + # edit_dist_threshold (as a percent) of word must match + threshold = int(min(len(a), len(b)) * (1 - edit_dist_threshold)) + if p: + print(a, b, threshold, lev.distance(a, b)) + return lev.distance(a, b) < threshold + + async def aget_k_closest( + self, v_id: str, k=15, threshold_similarity=0.90, edit_dist_threshold_pct=0.75 + ) -> list[Document]: + """ + asdf + """ + threshold_dist = 1 - threshold_similarity + + # asyncify necessary funcs + query = asyncify(self.milvus.col.query) + search = asyncify(self.milvus.similarity_search_with_score_by_vector) + + # Get all vectors with this ID + verts = await query( + f'{self.vertex_field} == "{v_id}"', + output_fields=[self.vertex_field, self.vector_field], + ) + result = [] + for v in verts: 
+ # get the k closest verts + sim = await search( + v["document_vector"], + k=k, + ) + # filter verts using similiarity threshold and leven_dist + similar_verts = [ + doc.metadata["vertex_id"] + for doc, dist in sim + # check semantic similarity + if dist < threshold_dist + # check name similarity (won't merge Apple and Google if they're semantically similar) + and self.edit_dist_check( + doc.metadata["vertex_id"], + v_id, + edit_dist_threshold_pct, + # v_id == "Dataframe", + ) + # don't have to merge verts with the same id (they're the same) + and doc.metadata["vertex_id"] != v_id + ] + result.extend(similar_verts) + result.append(v_id) + return set(result) + def __del__(self): metrics.milvus_active_connections.labels(self.collection_name).dec diff --git a/common/gsql/graphRAG/.clang-format b/common/gsql/graphRAG/.clang-format new file mode 100644 index 00000000..f0dcec6c --- /dev/null +++ b/common/gsql/graphRAG/.clang-format @@ -0,0 +1,269 @@ +--- +Language: Cpp +# BasedOnStyle: Google +AccessModifierOffset: -1 +AlignAfterOpenBracket: Align +AlignArrayOfStructures: None +AlignConsecutiveAssignments: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + PadOperators: true +AlignConsecutiveBitFields: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + PadOperators: false +AlignConsecutiveDeclarations: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + PadOperators: false +AlignConsecutiveMacros: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + PadOperators: false +AlignConsecutiveShortCaseStatements: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCaseColons: false +AlignEscapedNewlines: Left +AlignOperands: Align +AlignTrailingComments: + Kind: Always + OverEmptyLines: 0 +AllowAllArgumentsOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortEnumsOnASingleLine: true +AllowShortFunctionsOnASingleLine: All +AllowShortIfStatementsOnASingleLine: WithoutElse +AllowShortLambdasOnASingleLine: All +AllowShortLoopsOnASingleLine: true +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: Yes +AttributeMacros: + - __capability +BinPackArguments: true +BinPackParameters: true +BitFieldColonSpacing: Both +BraceWrapping: + AfterCaseLabel: false + AfterClass: false + AfterControlStatement: Never + AfterEnum: false + AfterExternBlock: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + BeforeCatch: false + BeforeElse: false + BeforeLambdaBody: false + BeforeWhile: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +BreakAfterAttributes: Never +BreakAfterJavaFieldAnnotations: false +BreakArrays: true +BreakBeforeBinaryOperators: None +BreakBeforeConceptDeclarations: Always +BreakBeforeBraces: Attach +BreakBeforeInlineASMColon: OnlyMultiline +BreakBeforeTernaryOperators: true +BreakConstructorInitializers: BeforeColon +BreakInheritanceList: BeforeColon +BreakStringLiterals: true +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: true +DisableFormat: 
false +EmptyLineAfterAccessModifier: Never +EmptyLineBeforeAccessModifier: LogicalBlock +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IfMacros: + - KJ_IF_MAYBE +IncludeBlocks: Regroup +IncludeCategories: + - Regex: '^' + Priority: 2 + SortPriority: 0 + CaseSensitive: false + - Regex: '^<.*\.h>' + Priority: 1 + SortPriority: 0 + CaseSensitive: false + - Regex: '^<.*' + Priority: 2 + SortPriority: 0 + CaseSensitive: false + - Regex: '.*' + Priority: 3 + SortPriority: 0 + CaseSensitive: false +IncludeIsMainRegex: '([-_](test|unittest))?$' +IncludeIsMainSourceRegex: '' +IndentAccessModifiers: false +IndentCaseBlocks: false +IndentCaseLabels: true +IndentExternBlock: AfterExternBlock +IndentGotoLabels: true +IndentPPDirectives: None +IndentRequiresClause: true +IndentWidth: 4 +IndentWrappedFunctionNames: false +InsertBraces: false +InsertNewlineAtEOF: false +InsertTrailingCommas: None +IntegerLiteralSeparator: + Binary: 0 + BinaryMinDigits: 0 + Decimal: 0 + DecimalMinDigits: 0 + Hex: 0 + HexMinDigits: 0 +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +KeepEmptyLinesAtEOF: false +LambdaBodyIndentation: Signature +LineEnding: DeriveLF +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Never +ObjCBlockIndentWidth: 4 +ObjCBreakBeforeNestedBlockParam: true +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PackConstructorInitializers: NextLine +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakOpenParenthesis: 0 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyIndentedWhitespace: 0 +PenaltyReturnTypeOnItsOwnLine: 200 +PointerAlignment: Left +PPIndentWidth: -1 +QualifierAlignment: Leave +RawStringFormats: + - Language: Cpp + Delimiters: + - cc + - CC + - cpp + - Cpp + - CPP + - 'c++' + - 'C++' + CanonicalDelimiter: '' + BasedOnStyle: google + - Language: TextProto + Delimiters: + - pb + - PB + - proto + - PROTO + EnclosingFunctions: + - EqualsProto + - EquivToProto + - PARSE_PARTIAL_TEXT_PROTO + - PARSE_TEST_PROTO + - PARSE_TEXT_PROTO + - ParseTextOrDie + - ParseTextProtoOrDie + - ParseTestProto + - ParsePartialTestProto + CanonicalDelimiter: pb + BasedOnStyle: google +ReferenceAlignment: Pointer +ReflowComments: true +RemoveBracesLLVM: false +RemoveParentheses: Leave +RemoveSemicolon: false +RequiresClausePosition: OwnLine +RequiresExpressionIndentation: OuterScope +SeparateDefinitionBlocks: Leave +ShortNamespaceLines: 1 +SortIncludes: CaseSensitive +SortJavaStaticImport: Before +SortUsingDeclarations: LexicographicNumeric +SpaceAfterCStyleCast: false +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceAroundPointerQualifiers: Default +SpaceBeforeAssignmentOperators: true +SpaceBeforeCaseColon: false +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeJsonColon: false +SpaceBeforeParens: ControlStatements +SpaceBeforeParensOptions: + AfterControlStatements: true + AfterForeachMacros: true + AfterFunctionDefinitionName: false + AfterFunctionDeclarationName: false + AfterIfMacros: true + AfterOverloadedOperator: false + AfterRequiresInClause: false + AfterRequiresInExpression: false + BeforeNonEmptyParentheses: false +SpaceBeforeRangeBasedForLoopColon: true 
+SpaceBeforeSquareBrackets: false +SpaceInEmptyBlock: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: Never +SpacesInContainerLiterals: true +SpacesInLineCommentPrefix: + Minimum: 1 + Maximum: -1 +SpacesInParens: Never +SpacesInParensOptions: + InCStyleCasts: false + InConditionalStatements: false + InEmptyParentheses: false + Other: false +SpacesInSquareBrackets: false +Standard: Auto +StatementAttributeLikeMacros: + - Q_EMIT +StatementMacros: + - Q_UNUSED + - QT_REQUIRE_VERSION +TabWidth: 8 +UseTab: Never +VerilogBreakBetweenInstancePorts: true +WhitespaceSensitiveMacros: + - BOOST_PP_STRINGIZE + - CF_SWIFT_NAME + - NS_SWIFT_NAME + - PP_STRINGIZE + - STRINGIZE +... diff --git a/common/gsql/graphRAG/.clangd b/common/gsql/graphRAG/.clangd new file mode 100644 index 00000000..ec3be0d8 --- /dev/null +++ b/common/gsql/graphRAG/.clangd @@ -0,0 +1,2 @@ +CompileFlags: + Add: [ -std=c++23 ] diff --git a/common/gsql/graphRAG/ResolveRelationships.gsql b/common/gsql/graphRAG/ResolveRelationships.gsql new file mode 100644 index 00000000..d3c69297 --- /dev/null +++ b/common/gsql/graphRAG/ResolveRelationships.gsql @@ -0,0 +1,26 @@ +CREATE DISTRIBUTED QUERY ResolveRelationships(BOOL printResults=FALSE) SYNTAX V2 { + /* + * RE1 <- entity -RELATES-> entity -> RE2 + * to + * RE1 -resolved-> RE + * + * Combines all of a Resolved entity's children's relationships into + * RESOLVED_RELATIONSHIP + */ + REs = {ResolvedEntity.*}; + + + REs = SELECT re1 FROM REs:re1 -(:rel)- Entity:e_tgt -(RESOLVES_TO>:r)- ResolvedEntity:re2 + // Connect the The first RE to the second RE + ACCUM + INSERT INTO RESOLVED_RELATIONSHIP(FROM,TO) VALUES(re1, re2); + + + IF printResults THEN + // show which entities didn't get resolved + Ents = {Entity.*}; + rEnts = SELECT e FROM Ents:e -(RESOLVES_TO>)- _; + ents = Ents minus rEnts; + PRINT ents; + END; +} diff --git a/common/gsql/graphRAG/SetEpochProcessing.gsql b/common/gsql/graphRAG/SetEpochProcessing.gsql new file mode 100644 index 00000000..9a92ecf9 --- /dev/null +++ b/common/gsql/graphRAG/SetEpochProcessing.gsql @@ -0,0 +1,7 @@ +CREATE DISTRIBUTED QUERY SetEpochProcessing(Vertex v_id) { + Verts = {v_id}; + + // mark the vertex as processed + Verts = SELECT v FROM Verts:v + POST-ACCUM v.epoch_processed = datetime_to_epoch(now()); +} diff --git a/common/gsql/graphRAG/StreamIds.gsql b/common/gsql/graphRAG/StreamIds.gsql new file mode 100644 index 00000000..41181007 --- /dev/null +++ b/common/gsql/graphRAG/StreamIds.gsql @@ -0,0 +1,16 @@ +CREATE DISTRIBUTED QUERY StreamIds(INT current_batch, INT ttl_batches, STRING v_type) { + /* + * Get the IDs of entities that have not already been processed + * (one batch at a time) + */ + ListAccum @@ids; + Verts = {v_type}; + + Verts = SELECT v FROM Verts:v + WHERE vertex_to_int(v) % ttl_batches == current_batch + AND v.epoch_processed == 0 + ACCUM @@ids += v.id + POST-ACCUM v.epoch_processing = datetime_to_epoch(now()); // set the processing time + + PRINT @@ids; +} diff --git a/common/gsql/graphRAG/leven.cpp b/common/gsql/graphRAG/leven.cpp new file mode 100644 index 00000000..10c45669 --- /dev/null +++ b/common/gsql/graphRAG/leven.cpp @@ -0,0 +1,59 @@ +#include +#include + +// Returns the Levenshtein distance between word1 and word2. +int levenshteinDist(std::string word1, std::string word2) { + int size1 = word1.size(); + int size2 = word2.size(); + int verif[size1 + 1][size2 + 1]; // Verification matrix i.e. 2D array + // which will store the calculated distance. 
+ + // If one of the words has zero length, the distance is equal to the size of + // the other word. + if (size1 == 0) return size2; + if (size2 == 0) return size1; + + // Sets the first row and the first column of the verification matrix with + // the numerical order from 0 to the length of each word. + for (int i = 0; i <= size1; i++) verif[i][0] = i; + for (int j = 0; j <= size2; j++) verif[0][j] = j; + + // Verification step / matrix filling. + for (int i = 1; i <= size1; i++) { + for (int j = 1; j <= size2; j++) { + // Sets the modification cost. + // 0 means no modification (i.e. equal letters) and 1 means that a + // modification is needed (i.e. unequal letters). + int cost = (word2[j - 1] == word1[i - 1]) ? 0 : 1; + + // Sets the current position of the matrix as the minimum value + // between a (deletion), b (insertion) and c (substitution). a = the + // upper adjacent value plus 1: verif[i - 1][j] + 1 b = the left + // adjacent value plus 1: verif[i][j - 1] + 1 c = the upper left + // adjacent value plus the modification cost: verif[i - 1][j - 1] + + // cost + verif[i][j] = + std::min(std::min(verif[i - 1][j] + 1, verif[i][j - 1] + 1), + verif[i - 1][j - 1] + cost); + } + } + + // The last position of the matrix will contain the Levenshtein distance. + return verif[size1][size2]; +} + +int main() { + std::string word1, word2; + + std::cout << "Please input the first word: " << std::endl; + std::cin >> word1; + std::cout << "Please input the second word: " << std::endl; + std::cin >> word2; + + // cout << "The number of modifications needed in order to make one word " + // "equal to the other is: " + std::cout << "The edit distance is: " << levenshteinDist(word1, word2) + << std::endl; + + return 0; +} diff --git a/common/gsql/graphRAG/louvain/louvain1.gsql b/common/gsql/graphRAG/louvain/louvain1.gsql new file mode 100644 index 00000000..494a3625 --- /dev/null +++ b/common/gsql/graphRAG/louvain/louvain1.gsql @@ -0,0 +1,17 @@ +CREATE DISTRIBUTED QUERY graphRAG_louvain_1() { + + Ents = {ResolvedEntity.*}; + + // Put each node into a distinct community + // Assume each Entity starts in its own community + + // For each node i + // Compute ∆Q (modularity) when putting node i into the community of some neighbor j + // move i to community that yields the largest gain in ∆Q + + Z = SELECT v FROM Ents:v -(_:e)-> ResolvedEntity:r + + + ; +} + diff --git a/common/gsql/graphRAG/louvain/louvain_1_first_pass.gsql b/common/gsql/graphRAG/louvain_old/louvain_1_first_pass.gsql similarity index 88% rename from common/gsql/graphRAG/louvain/louvain_1_first_pass.gsql rename to common/gsql/graphRAG/louvain_old/louvain_1_first_pass.gsql index 4ca06029..0251909f 100644 --- a/common/gsql/graphRAG/louvain/louvain_1_first_pass.gsql +++ b/common/gsql/graphRAG/louvain_old/louvain_1_first_pass.gsql @@ -2,20 +2,20 @@ CREATE OR REPLACE DISTRIBUTED QUERY graphRAG_louvain_1( UINT max_hop = 10, UINT batch_num = 12, UINT sample_edge_num = 100 -) FOR GRAPH {graph_name} SYNTAX v1 { +) { - TYPEDEF TUPLE community, STRING ext_vid> MyTuple; --> this should be Community, I think + TYPEDEF TUPLE community, STRING ext_vid> MyTuple; //--> this should be Community, I think SumAccum @@m; // the sum of the weights of all the links in the network - MinAccum> @{community_id_attribute_name}; // the community ID of the node + MinAccum> @{community_id_attribute_name}; // the community ID of the node MinAccum @community_vid; // the community ID of the node SumAccum @k; // the sum of the weights of the links incident to the node 
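    // For reference, these accumulators map onto the standard Louvain quantities
    // (Blondel et al. 2008); the notation below is illustrative:
    //   m         = @@m                     (total edge weight of the network)
    //   k_i       = s.@k                    (weighted degree of node i)
    //   k_i_in    = s.@k_in                 (weight from i into its community)
    //   Sigma_tot = s.@community_sum_total  (total weight incident to community C)
    // Modularity:  Q = (1/(2m)) * sum_ij [ A_ij - k_i*k_j/(2m) ] * delta(c_i, c_j)
    // Gain of moving an isolated node i into C:
    //   dQ = k_i_in/m - (Sigma_tot * k_i)/(2*m^2)
    // The queries compare candidate moves with a positively scaled version of this
    // value, so the community chosen (the argmax) is unchanged.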
SumAccum @k_in; // the sum of the weights of the links inside the previous community of the node SumAccum @k_self_loop; // the weight of the self-loop link - MapAccum, SumAccum> @community_k_in_map; // the community of the neighbors of the nodes -> the sum of the weights of the links inside the community - MapAccum, SumAccum> @@community_sum_total_map; // community ID C -> the sum of the weights of the links incident to nodes in C + MapAccum, SumAccum> @community_k_in_map; // the community of the neighbors of the nodes -> the sum of the weights of the links inside the community + MapAccum, SumAccum> @@community_sum_total_map; // community ID C -> the sum of the weights of the links incident to nodes in C SumAccum @community_sum_total; // the sum of the weights of the links incident to nodes in the community of the node - MapAccum, SumAccum> @@community_sum_in_map; // community ID -> the sum of the weights of the links inside the community - MapAccum, MapAccum, SumAccum>> @@source_target_k_in_map; // source community ID -> (target community ID -> the sum of the weights of the links from the source community to the target community) + MapAccum, SumAccum> @@community_sum_in_map; // community ID -> the sum of the weights of the links inside the community + MapAccum, MapAccum, SumAccum>> @@source_target_k_in_map; // source community ID -> (target community ID -> the sum of the weights of the links from the source community to the target community) SumAccum @delta_Q_remove; // delta Q to remove the node from the previous community MaxAccum @best_move; // best move of the node with the highest delta Q to move the isolated node into the new community MaxAccum @@min_double; // used to reset the @best_move @@ -27,7 +27,7 @@ CREATE OR REPLACE DISTRIBUTED QUERY graphRAG_louvain_1( DOUBLE wt = 1.0; // Initialization - All_Nodes = {{{entity_vertex_name}.*}}; + All_Nodes = {{ResolvedEntity.*}}; All_Nodes = SELECT s FROM All_Nodes:s -({relation_edge_name}:e)- :t ACCUM @@m += wt / 2, s.@k += wt, diff --git a/common/gsql/graphRAG/louvain/louvain_2_other_passes.gsql b/common/gsql/graphRAG/louvain_old/louvain_2_other_passes.gsql similarity index 100% rename from common/gsql/graphRAG/louvain/louvain_2_other_passes.gsql rename to common/gsql/graphRAG/louvain_old/louvain_2_other_passes.gsql diff --git a/common/gsql/graphRAG/louvain/louvain_3_final_community.gsql b/common/gsql/graphRAG/louvain_old/louvain_3_final_community.gsql similarity index 100% rename from common/gsql/graphRAG/louvain/louvain_3_final_community.gsql rename to common/gsql/graphRAG/louvain_old/louvain_3_final_community.gsql diff --git a/common/gsql/graphRAG/louvain/louvain_4_modularity_1_for_pass.gsql b/common/gsql/graphRAG/louvain_old/louvain_4_modularity_1_for_pass.gsql similarity index 100% rename from common/gsql/graphRAG/louvain/louvain_4_modularity_1_for_pass.gsql rename to common/gsql/graphRAG/louvain_old/louvain_4_modularity_1_for_pass.gsql diff --git a/common/gsql/graphRAG/louvain/louvain_4_modularity_2_final.gsql b/common/gsql/graphRAG/louvain_old/louvain_4_modularity_2_final.gsql similarity index 100% rename from common/gsql/graphRAG/louvain/louvain_4_modularity_2_final.gsql rename to common/gsql/graphRAG/louvain_old/louvain_4_modularity_2_final.gsql diff --git a/common/gsql/graphRAG/louvain/louvain_5_reset.gsql b/common/gsql/graphRAG/louvain_old/louvain_5_reset.gsql similarity index 100% rename from common/gsql/graphRAG/louvain/louvain_5_reset.gsql rename to common/gsql/graphRAG/louvain_old/louvain_5_reset.gsql diff --git 
a/common/gsql/supportai/SupportAI_Schema.gsql b/common/gsql/supportai/SupportAI_Schema.gsql index 0e3cf6c3..1a705eaf 100644 --- a/common/gsql/supportai/SupportAI_Schema.gsql +++ b/common/gsql/supportai/SupportAI_Schema.gsql @@ -2,7 +2,7 @@ CREATE SCHEMA_CHANGE JOB add_supportai_schema { ADD VERTEX DocumentChunk(PRIMARY_ID id STRING, idx INT, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX Document(PRIMARY_ID id STRING, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX Concept(PRIMARY_ID id STRING, description STRING, concept_type STRING, human_curated BOOL, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; - ADD VERTEX Entity(PRIMARY_ID id STRING, definition STRING, description STRING, entity_type STRING, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; + ADD VERTEX Entity(PRIMARY_ID id STRING, definition STRING, description SET, entity_type STRING, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX Relationship(PRIMARY_ID id STRING, definition STRING, short_name STRING, epoch_added UINT, epoch_processing UINT, epoch_processed UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX DocumentCollection(PRIMARY_ID id STRING, epoch_added UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX Content(PRIMARY_ID id STRING, text STRING, epoch_added UINT) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; @@ -21,7 +21,7 @@ CREATE SCHEMA_CHANGE JOB add_supportai_schema { // GraphRAG ADD VERTEX Community(PRIMARY_ID id STRING, description INT) WITH PRIMARY_ID_AS_ATTRIBUTE="true"; - ADD VERTEX ResolvedEntity(PRIMARY_ID id STRING, description STRING, entity_type STRING) WITH PRIMARY_ID_AS_ATTRIBUTE="true"; + ADD VERTEX ResolvedEntity(PRIMARY_ID id STRING, entity_type STRING) WITH PRIMARY_ID_AS_ATTRIBUTE="true"; ADD DIRECTED EDGE RELATIONSHIP(FROM Entity, TO Entity, relation_type STRING) WITH REVERSE_EDGE="reverse_RELATIONSHIP"; // TODO: check where knn algo writes results ADD DIRECTED EDGE RESOLVES_TO(FROM Entity, TO ResolvedEntity, relation_type STRING) WITH REVERSE_EDGE="reverse_RESOLVES_TO"; // Connect ResolvedEntities with their children entities diff --git a/copilot/docs/notebooks/graphrag.ipynb b/copilot/docs/notebooks/graphrag.ipynb index 38b4939b..bde1b78f 100644 --- a/copilot/docs/notebooks/graphrag.ipynb +++ b/copilot/docs/notebooks/graphrag.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -24,7 +24,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -33,7 +33,7 @@ "'The graph GraphRAG_pytgdocs is created.'" ] }, - "execution_count": 2, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -45,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -54,18 +54,18 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'host_name': 
'https://algotesting.i.tgcloud.io',\n", - " 'schema_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_schema].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_schema\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 0)\\\\nTrying to add local vertex \\'DocumentChunk\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Document\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Concept\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Entity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Relationship\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'DocumentCollection\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Content\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Community\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'ResolvedEntity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CONTENT\\' and its reverse edge \\'reverse_HAS_CONTENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_CHILD_OF\\' and its reverse edge \\'reverse_IS_CHILD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_HEAD_OF\\' and its reverse edge \\'reverse_IS_HEAD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_TAIL\\' and its reverse edge \\'reverse_HAS_TAIL\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_RELATIONSHIP\\' and its reverse edge \\'reverse_DESCRIBES_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_ENTITY\\' and its reverse edge \\'reverse_DESCRIBES_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_ENTITY\\' and its reverse edge \\'reverse_CONTAINS_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'MENTIONS_RELATIONSHIP\\' and its reverse edge \\'reverse_MENTIONS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_AFTER\\' and its reverse edge \\'reverse_IS_AFTER\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CHILD\\' and its reverse edge \\'reverse_HAS_CHILD\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_RELATIONSHIP\\' and its reverse edge \\'reverse_HAS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_DOCUMENT\\' and its reverse edge \\'reverse_CONTAINS_DOCUMENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RELATIONSHIP\\' and its reverse edge \\'reverse_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVES_TO\\' and its reverse edge \\'reverse_RESOLVES_TO\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVED_RELATIONSHIP\\' and its reverse edge \\'reverse_RESOLVED_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IN_COMMUNITY\\' and its reverse edge 
\\'reverse_IN_COMMUNITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 1\\\\nThe job add_supportai_schema completes in 2.335 seconds!\\\\nLocal schema change succeeded.\"',\n", - " 'index_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_indexes].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_indexes\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 1)\\\\nTrying to add index \\'doc_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_indexepoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 2\\\\nThe job add_supportai_indexes completes in 2.059 seconds!\\\\nLocal schema change succeeded.\"'}" + " 'schema_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_schema].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. 
To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_schema\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 0)\\\\nTrying to add local vertex \\'DocumentChunk\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Document\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Concept\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Entity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Relationship\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'DocumentCollection\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Content\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Community\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'ResolvedEntity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CONTENT\\' and its reverse edge \\'reverse_HAS_CONTENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_CHILD_OF\\' and its reverse edge \\'reverse_IS_CHILD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_HEAD_OF\\' and its reverse edge \\'reverse_IS_HEAD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_TAIL\\' and its reverse edge \\'reverse_HAS_TAIL\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_RELATIONSHIP\\' and its reverse edge \\'reverse_DESCRIBES_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_ENTITY\\' and its reverse edge \\'reverse_DESCRIBES_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_ENTITY\\' and its reverse edge \\'reverse_CONTAINS_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'MENTIONS_RELATIONSHIP\\' and its reverse edge \\'reverse_MENTIONS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_AFTER\\' and its reverse edge \\'reverse_IS_AFTER\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CHILD\\' and its reverse edge \\'reverse_HAS_CHILD\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_RELATIONSHIP\\' and its reverse edge \\'reverse_HAS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_DOCUMENT\\' and its reverse edge \\'reverse_CONTAINS_DOCUMENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RELATIONSHIP\\' and its reverse edge \\'reverse_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVES_TO\\' and its reverse edge \\'reverse_RESOLVES_TO\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVED_RELATIONSHIP\\' and its reverse edge \\'reverse_RESOLVED_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IN_COMMUNITY\\' and its reverse edge \\'reverse_IN_COMMUNITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 1\\\\nThe job add_supportai_schema completes in 2.208 seconds!\\\\nLocal schema change succeeded.\"',\n", + " 'index_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: 
[add_supportai_indexes].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_indexes\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 1)\\\\nTrying to add index \\'doc_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_indexepoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 2\\\\nThe job add_supportai_indexes completes in 3.025 seconds!\\\\nLocal schema change succeeded.\"'}" ] }, - "execution_count": 4, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -79,7 +79,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -95,18 +95,18 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'job_name': 'load_documents_content_json_a245f14bb5f443acaa051125e4d9a497',\n", - " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_a245f14bb5f443acaa051125e4d9a497.stream.SupportAI_GraphRAG_pytgdocs_025b08b3cf60477dbbcfd22b4254d268.1722356202522',\n", - " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_a245f14bb5f443acaa051125e4d9a497.stream.SupportAI_GraphRAG_pytgdocs_025b08b3cf60477dbbcfd22b4254d268.1722356202522'}" + "{'job_name': 'load_documents_content_json_b89acfebac9e4fb98efd20a49659808e',\n", + " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_b89acfebac9e4fb98efd20a49659808e.stream.SupportAI_GraphRAG_pytgdocs_5698bff74d844534901cba9e1b3d55bf.1722466964295',\n", + " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_b89acfebac9e4fb98efd20a49659808e.stream.SupportAI_GraphRAG_pytgdocs_5698bff74d844534901cba9e1b3d55bf.1722466964295'}" ] }, - 
"execution_count": 6, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -121,7 +121,41 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import httpx\n", + "import base64\n", + "\n", + "\n", + "def make_headers(conn: TigerGraphConnection):\n", + " tkn = base64.b64encode(f\"{conn.username}:{conn.password}\".encode()).decode()\n", + " headers = {\"Authorization\": f\"Basic {tkn}\"}\n", + " return headers\n", + "\n", + "\n", + "httpx.get(\n", + " \"http://localhost:8001/GraphRAG_pytgdocs/consistency_status/graphrag\",\n", + " headers=make_headers(conn),\n", + ")\n", + "# conn.ai.forceConsistencyUpdate()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -131,7 +165,7 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[7], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43masdf\u001b[49m\n", + "Cell \u001b[0;32mIn[16], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43masdf\u001b[49m\n", "\u001b[0;31mNameError\u001b[0m: name 'asdf' is not defined" ] } @@ -142,11 +176,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'job_name': 'load_documents_content_json_b89acfebac9e4fb98efd20a49659808e',\n", + " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_b89acfebac9e4fb98efd20a49659808e.stream.SupportAI_GraphRAG_pytgdocs_5698bff74d844534901cba9e1b3d55bf.1722531204658',\n", + " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_b89acfebac9e4fb98efd20a49659808e.stream.SupportAI_GraphRAG_pytgdocs_5698bff74d844534901cba9e1b3d55bf.1722531204658'}" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "for v in [\"Document\", \"Content\", \"DocumentChunk\",\"Entity\"]:\n", + "for v in [\"Document\", \"Content\", \"DocumentChunk\", \"Entity\",\"ResolvedEntity\"]:\n", + "# for v in [\"ResolvedEntity\"]:\n", " try:\n", " conn.delVertices(v)\n", " except:\n", @@ -170,10 +218,147 @@ "source": [ "conn.gsql(f\"\"\"\n", "USE GRAPH {conn.graphname}\n", - "DROP QUERY StreamDocIds\n", - "DROP QUERY StreamDocContent\n", + "DROP QUERY ResolveRelationships\n", "\"\"\")" ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'deleted_vertices'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[33], line 5\u001b[0m\n\u001b[1;32m 2\u001b[0m conn\u001b[38;5;241m.\u001b[39mgetToken()\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCommunity\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[1;32m 4\u001b[0m \u001b[38;5;66;03m# for v in [\"ResolvedEntity\"]:\u001b[39;00m\n\u001b[0;32m----> 5\u001b[0m 
\u001b[43mconn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdelVertices\u001b[49m\u001b[43m(\u001b[49m\u001b[43mv\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.venv/ml/lib/python3.11/site-packages/pyTigerGraph/pyTigerGraphVertex.py:688\u001b[0m, in \u001b[0;36mpyTigerGraphVertex.delVertices\u001b[0;34m(self, vertexType, where, limit, sort, permanent, timeout)\u001b[0m\n\u001b[1;32m 685\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mand\u001b[39;00m timeout \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 686\u001b[0m url \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m (\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m?\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m isFirst \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m&\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtimeout=\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(timeout)\n\u001b[0;32m--> 688\u001b[0m ret \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_delete\u001b[49m\u001b[43m(\u001b[49m\u001b[43murl\u001b[49m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdeleted_vertices\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[1;32m 690\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m logger\u001b[38;5;241m.\u001b[39mlevel \u001b[38;5;241m==\u001b[39m logging\u001b[38;5;241m.\u001b[39mDEBUG:\n\u001b[1;32m 691\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mreturn: \u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(ret))\n", + "\u001b[0;31mKeyError\u001b[0m: 'deleted_vertices'" + ] + } + ], + "source": [ + "conn.graphname = \"Cora\"\n", + "conn.getToken()\n", + "for v in [\"Community\"]:\n", + " # for v in [\"ResolvedEntity\"]:\n", + " conn.delVertices(v)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import base64\n", + "import json\n", + "import httpx\n", + "import logging\n", + "\n", + "_ = logging.getLogger(__name__)\n", + "\n", + "\n", + "http_timeout = None\n", + "\n", + "\n", + "def make_headers(conn: TigerGraphConnection):\n", + " if conn.apiToken is None or conn.apiToken == \"\":\n", + " tkn = base64.b64encode(f\"{conn.username}:{conn.password}\".encode()).decode()\n", + " headers = {\"Authorization\": f\"Basic {tkn}\"}\n", + " else:\n", + " headers = {\"Authorization\": f\"Bearer {conn.apiToken}\"}\n", + "\n", + " return headers\n", + "\n", + "\n", + "def check_vertex_exists(conn, id):\n", + " headers = make_headers(conn)\n", + " with httpx.Client(timeout=http_timeout) as client:\n", + " res = client.get(\n", + " f\"{conn.restppUrl}/graph/{conn.graphname}/vertices/Entity/{id}\",\n", + " headers=headers,\n", + " )\n", + "\n", + " res.raise_for_status()\n", + " return res.json()\n", + "\n", + "\n", + "# r = check_vertex_exists(conn, \"asdfTigergraphexception\")\n", + "# print(json.dumps(r, indent=2), r[\"error\"])\n", + "r = check_vertex_exists(conn, \"Tigergraphexception\")\n", + "print(json.dumps(r, indent=2), r[\"error\"])\n", + "r[\"results\"][0][\"attributes\"][\"description\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def 
map_attrs(attributes: dict):\n", + " # map attrs\n", + " attrs = {}\n", + " for k, v in attributes.items():\n", + " if isinstance(v, tuple):\n", + " attrs[k] = {\"value\": v[0], \"op\": v[1]}\n", + " elif isinstance(v, dict):\n", + " attrs[k] = {\n", + " \"value\": {\"keylist\": list(v.keys()), \"valuelist\": list(v.values())}\n", + " }\n", + " else:\n", + " attrs[k] = {\"value\": v}\n", + " return attrs\n", + "\n", + "\n", + "def process_id(v_id: str):\n", + " return v_id.replace(\" \", \"_\").replace(\"/\", \"\")\n", + "\n", + "\n", + "def a(vertex_id=\"Post /Requesttoken\"):\n", + " vertex_id = process_id(vertex_id)\n", + " attributes = { # attrs\n", + " \"description\": [\"test\"],\n", + " \"epoch_added\": int(time.time()),\n", + " }\n", + "\n", + " vertex_id = vertex_id.replace(\" \", \"_\")\n", + " attrs = map_attrs(attributes)\n", + " data = json.dumps({\"vertices\": {\"Entity\": {vertex_id: attrs}}})\n", + " headers = make_headers(conn)\n", + " with httpx.Client(timeout=http_timeout) as client:\n", + " res = client.post(\n", + " f\"{conn.restppUrl}/graph/{conn.graphname}\", data=data, headers=headers\n", + " )\n", + "\n", + " res.raise_for_status()\n", + "\n", + " return res.json()\n", + "\n", + "\n", + "a()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from urllib import parse\n", + "\n", + "v_id = \"Post_/Requesttoken\"\n", + "v_id = process_id(v_id)\n", + "print(v_id)\n", + "\n", + "r = check_vertex_exists(conn, v_id)\n", + "print(json.dumps(r, indent=2), r[\"error\"])\n", + "r[\"results\"][0][\"attributes\"][\"description\"]" + ] } ], "metadata": { diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py index 7e67b342..4403756d 100644 --- a/eventual-consistency-service/app/graphrag/graph_rag.py +++ b/eventual-consistency-service/app/graphrag/graph_rag.py @@ -9,7 +9,7 @@ from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore from common.extractors.BaseExtractor import BaseExtractor from graphrag import workers -from graphrag.util import http_timeout, init, make_headers, stream_doc_ids +from graphrag.util import http_timeout, init, make_headers, stream_ids from pyTigerGraph import TigerGraphConnection http_logs = logging.getLogger("httpx") @@ -29,15 +29,15 @@ async def stream_docs( """ logger.info("streaming docs") headers = make_headers(conn) - for i in range(ttl_batches): - doc_ids = await stream_doc_ids(conn, i, ttl_batches) - if doc_ids["error"]: - # continue to the next batch. - # These docs will not be marked as processed, so the ecc will process it eventually. - continue + async with httpx.AsyncClient(timeout=http_timeout) as client: + for i in range(ttl_batches): + doc_ids = await stream_ids(conn, "Document", i, ttl_batches) + if doc_ids["error"]: + # continue to the next batch. + # These docs will not be marked as processed, so the ecc will process it eventually. 
+ continue - for d in doc_ids["ids"]: - async with httpx.AsyncClient(timeout=http_timeout) as client: + for d in doc_ids["ids"]: try: res = await client.get( f"{conn.restppUrl}/query/{conn.graphname}/StreamDocContent/", @@ -104,19 +104,13 @@ async def upsert(upsert_chan: Channel): logger.info("Reading from upsert channel") # consume task queue - upsert_tasks = [] async with asyncio.TaskGroup() as grp: async for func, args in upsert_chan: logger.info(f"{func.__name__}, {args[1]}") - # continue # execute the task - t = grp.create_task(func(*args)) - upsert_tasks.append(t) + grp.create_task(func(*args)) logger.info(f"upsert done") - # do something with doc_tasks? - # for t in upsert_tasks: - # logger.info(t.result()) async def embed( @@ -131,7 +125,6 @@ async def embed( async with asyncio.TaskGroup() as grp: # consume task queue async for v_id, content, index_name in embed_chan: - # continue embedding_store = index_stores[f"{graphname}_{index_name}"] logger.info(f"Embed to {graphname}_{index_name}: {v_id}") grp.create_task( @@ -173,49 +166,136 @@ async def extract( embed_chan.close() -async def run(graphname: str, conn: TigerGraphConnection): +async def stream_entities( + conn: TigerGraphConnection, + entity_chan: Channel, + ttl_batches: int = 50, +): + """ + Streams entity IDs from the grpah """ - ecc flow + logger.info("streaming entities") + for i in range(ttl_batches): + ids = await stream_ids(conn, "Entity", i, ttl_batches) + if ids["error"]: + # continue to the next batch. + # These docs will not be marked as processed, so the ecc will process it eventually. + continue - initialize_eventual_consistency_checker - instantiates ecc object - writes checker to checker dict - runs ecc_obj.initialize() + for i in ids["ids"]: + if len(i) > 0: + await entity_chan.put(i) + # break + # break # one batch + + logger.info("stream_enities done") + # close the docs chan -- this function is the only sender + logger.info("closing entities chan") + entity_chan.close() - ECC.initialize - loops and calls fetch and process +async def resolve_entities( + conn: TigerGraphConnection, + emb_store: MilvusEmbeddingStore, + entity_chan: Channel, + upsert_chan: Channel, +): """ + Merges entities into their ResolvedEntity form + Groups what should be the same entity into a resolved entity (e.g. 
V_type and VType should be merged) - extractor, index_stores = await init(conn) - # return - start = time.perf_counter() - - tasks = [] - docs_chan = Channel(1) - embed_chan = Channel(100) - upsert_chan = Channel(100) - extract_chan = Channel(100) + Copies edges between entities to their respective ResolvedEntities + """ async with asyncio.TaskGroup() as grp: - # get docs - t = grp.create_task(stream_docs(conn, docs_chan, 10)) - tasks.append(t) - # process docs - t = grp.create_task( - chunk_docs(conn, docs_chan, embed_chan, upsert_chan, extract_chan) - ) - tasks.append(t) - # upsert chunks - t = grp.create_task(upsert(upsert_chan)) - tasks.append(t) - # # embed - t = grp.create_task(embed(embed_chan, index_stores, graphname)) - tasks.append(t) - # extract entities - t = grp.create_task( - extract(extract_chan, upsert_chan, embed_chan, extractor, conn) + # for every entity + async for entity_id in entity_chan: + print(f"***Etity ID from chan {entity_id}") + grp.create_task( + workers.resolve_entity(conn, upsert_chan, emb_store, entity_id) + ) + logger.info("closing upsert_chan") + upsert_chan.close() + + # Copy RELATIONSHIP edges to RESOLVED_RELATIONSHIP + headers = make_headers(conn) + async with httpx.AsyncClient(timeout=http_timeout) as client: + res = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/ResolveRelationships/", + headers=headers, ) - tasks.append(t) - end = time.perf_counter() + res.raise_for_status() + + +async def communities(conn: TigerGraphConnection): + pass + # Setup + + +async def run(graphname: str, conn: TigerGraphConnection): + """ + Set up GraphRAG: + - Install necessary queries. + - Process the documents into: + - chunks + - embeddings + - entities/relationships (and their embeddings) + - upsert everything to the graph + """ + + extractor, index_stores = await init(conn) + init_start = time.perf_counter() + + if False: + docs_chan = Channel(1) + embed_chan = Channel(100) + upsert_chan = Channel(100) + extract_chan = Channel(100) + async with asyncio.TaskGroup() as grp: + # get docs + grp.create_task(stream_docs(conn, docs_chan, 10)) + # process docs + grp.create_task( + chunk_docs(conn, docs_chan, embed_chan, upsert_chan, extract_chan) + ) + # upsert chunks + grp.create_task(upsert(upsert_chan)) + # embed + grp.create_task(embed(embed_chan, index_stores, graphname)) + # extract entities + grp.create_task( + extract(extract_chan, upsert_chan, embed_chan, extractor, conn) + ) + init_end = time.perf_counter() + + # Entity Resolution + entity_start = time.perf_counter() + + if False: + entities_chan = Channel(100) + upsert_chan = Channel(100) + async with asyncio.TaskGroup() as grp: + grp.create_task(stream_entities(conn, entities_chan, 50)) + grp.create_task( + resolve_entities( + conn, + index_stores[f"{conn.graphname}_Entity"], + entities_chan, + upsert_chan, + ) + ) + grp.create_task(upsert(upsert_chan)) + entity_end = time.perf_counter() - logger.info(f"DONE. graphrag.run elapsed: {end-start}") + # Community Detection + community_start = time.perf_counter() + if True: + await communities(conn) + + community_end = time.perf_counter() + + # Community Summarization + end = time.perf_counter() + logger.info(f"DONE. graphrag system initializer dT: {init_end-init_start}") + logger.info(f"DONE. graphrag entity resolution dT: {entity_end-entity_start}") + logger.info(f"DONE. graphrag initializer dT: {community_end-community_start}") + logger.info(f"DONE. 
graphrag.run() total time elaplsed: {end-init_start}") diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index 8f2c2141..74dbc56d 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -3,6 +3,7 @@ import json import logging import traceback +from glob import glob import httpx from common.config import ( @@ -42,6 +43,7 @@ async def install_queries( for t in tasks: res = t.result() + print(res) # stop system if a required query doesn't install if res["error"]: raise Exception(res["message"]) @@ -63,9 +65,14 @@ async def init( # "common/gsql/supportai/Update_Vertices_Processing_Status", # "common/gsql/supportai/ECC_Status", # "common/gsql/supportai/Check_Nonexistent_Vertices", - "common/gsql/graphRAG/StreamDocIds", + "common/gsql/graphRAG/StreamIds", "common/gsql/graphRAG/StreamDocContent", + "common/gsql/graphRAG/SetEpochProcessing", + "common/gsql/graphRAG/ResolveRelationships", ] + # add louvain to queries + q = [x.split('.gsql')[0] for x in glob("common/gsql/graphRAG/louvain/*")] + requried_queries.extend(q) await install_queries(requried_queries, conn) # extractor @@ -101,13 +108,14 @@ async def init( vector_field=milvus_config.get("vector_field", "document_vector"), text_field=milvus_config.get("text_field", "document_content"), vertex_field=vertex_field, + drop_old=False, ) LogWriter.info(f"Initializing {name}") # init collection if it doesn't exist if not s.check_collection_exists(): tg.create_task(init_embedding_index(s, vertex_field)) - + index_stores[name] = s return extractor, index_stores @@ -123,29 +131,28 @@ def make_headers(conn: TigerGraphConnection): return headers -async def stream_doc_ids( - conn: TigerGraphConnection, current_batch: int, ttl_batches: int +async def stream_ids( + conn: TigerGraphConnection, v_type: str, current_batch: int, ttl_batches: int ) -> dict[str, str | list[str]]: headers = make_headers(conn) try: async with httpx.AsyncClient(timeout=http_timeout) as client: res = await client.post( - f"{conn.restppUrl}/query/{conn.graphname}/StreamDocIds", + f"{conn.restppUrl}/query/{conn.graphname}/StreamIds", params={ "current_batch": current_batch, "ttl_batches": ttl_batches, + "v_type": v_type, }, headers=headers, ) - ids = res.json()["results"][0]["@@doc_ids"] + ids = res.json()["results"][0]["@@ids"] return {"error": False, "ids": ids} except Exception as e: exc = traceback.format_exc() - LogWriter.error( - f"/{conn.graphname}/query/StreamDocIds\nException Trace:\n{exc}" - ) + LogWriter.error(f"/{conn.graphname}/query/StreamIds\nException Trace:\n{exc}") return {"error": True, "message": str(e)} @@ -165,16 +172,24 @@ def map_attrs(attributes: dict): return attrs +def process_id(v_id: str): + v_id = v_id.replace(" ", "_").replace("/", "") + if v_id == "''" or v_id == '""': + return "" + + return v_id + + async def upsert_vertex( conn: TigerGraphConnection, vertex_type: str, vertex_id: str, attributes: dict, ): + vertex_id = vertex_id.replace(" ", "_") attrs = map_attrs(attributes) data = json.dumps({"vertices": {vertex_type: {vertex_id: attrs}}}) headers = make_headers(conn) - # print("upsert vertex>>>", vertex_id) async with httpx.AsyncClient(timeout=http_timeout) as client: res = await client.post( f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers @@ -183,6 +198,18 @@ async def upsert_vertex( res.raise_for_status() +async def check_vertex_exists(conn, v_id: str): + headers = make_headers(conn) + async with 
httpx.AsyncClient(timeout=http_timeout) as client: + res = await client.get( + f"{conn.restppUrl}/graph/{conn.graphname}/vertices/Entity/{v_id}", + headers=headers, + ) + + res.raise_for_status() + return res.json() + + async def upsert_edge( conn: TigerGraphConnection, src_v_type: str, @@ -196,6 +223,8 @@ async def upsert_edge( attrs = {} else: attrs = map_attrs(attributes) + src_v_id = src_v_id.replace(" ", "_") + tgt_v_id = tgt_v_id.replace(" ", "_") data = json.dumps( { "edges": { @@ -212,7 +241,6 @@ async def upsert_edge( } ) headers = make_headers(conn) - # print("upsert edge >>>", src_v_id, tgt_v_id) async with httpx.AsyncClient(timeout=http_timeout) as client: res = await client.post( f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers diff --git a/eventual-consistency-service/app/graphrag/workers.py b/eventual-consistency-service/app/graphrag/workers.py index b7267b60..4c1174df 100644 --- a/eventual-consistency-service/app/graphrag/workers.py +++ b/eventual-consistency-service/app/graphrag/workers.py @@ -11,8 +11,8 @@ from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore from common.extractors.BaseExtractor import BaseExtractor from common.logs.logwriter import LogWriter -from graphrag import util # import upsert_edge, upsert_vertex -from langchain_community.graphs.graph_document import GraphDocument +from graphrag import util +from langchain_community.graphs.graph_document import GraphDocument, Node from pyTigerGraph import TigerGraphConnection vertex_field = milvus_config.get("vertex_field", "vertex_id") @@ -67,7 +67,7 @@ async def chunk_doc( """ chunker = ecc_util.get_chunker() chunks = chunker.chunk(doc["attributes"]["text"]) - v_id = doc["v_id"] + v_id = util.process_id(doc["v_id"]) logger.info(f"Chunking {v_id}") for i, chunk in enumerate(chunks): chunk_id = f"{v_id}_chunk_{i}" @@ -145,6 +145,17 @@ async def embed( await embed_store.aadd_embeddings([(content, vec)], [{vertex_field: v_id}]) +async def get_vert_desc(conn, v_id, node: Node): + desc = [node.properties.get("description", "")] + exists = await util.check_vertex_exists(conn, v_id) + # if vertex exists, get description content and append this description to it + if not exists["error"]: + # dedup descriptions + desc.extend(exists["results"][0]["attributes"]["description"]) + desc = list(set(desc)) + return desc + + async def extract( upsert_chan: Channel, embed_chan: Channel, @@ -159,12 +170,22 @@ async def extract( for doc in extracted: for node in doc.nodes: logger.info(f"extract writes entity vert to upsert\nNode: {node.id}") - v_id = str(node.id) - desc = node.properties.get("description", "") + v_id = util.process_id(str(node.id)) + if len(v_id) == 0: + continue + desc = await get_vert_desc(conn, v_id, node) + + # embed the entity + # embed with the v_id if the description is blank + if len(desc[0]): + await embed_chan.put((v_id, v_id, "Entity")) + else: + # (v_id, content, index_name) + await embed_chan.put((v_id, desc[0], "Entity")) + await upsert_chan.put( ( util.upsert_vertex, # func to call - # conn, v_id, chunk_id, chunk ( conn, "Entity", # v_type @@ -188,33 +209,134 @@ async def extract( chunk_id, # src_id "CONTAINS_ENTITY", # edge_type "Entity", # tgt_type - str(node.id), # tgt_id + v_id, # tgt_id None, # attributes ), ) ) - # embed the entity - # (v_id, content, index_name) - await embed_chan.put((v_id, desc, "Entity")) - for edge in doc.relationships: logger.info( f"extract writes relates edge to upsert\n{edge.source.id} -({edge.type})-> {edge.target.id}" ) + # 
upsert verts first to make sure their ID becomes an attr + v_id = util.process_id(edge.source.id) # src_id + if len(v_id) == 0: + continue + desc = await get_vert_desc(conn, v_id, edge.source) + await upsert_chan.put( + ( + util.upsert_vertex, # func to call + ( + conn, + "Entity", # v_type + v_id, + { # attrs + "description": desc, + "epoch_added": int(time.time()), + }, + ), + ) + ) + v_id = util.process_id(edge.target.id) + if len(v_id) == 0: + continue + desc = await get_vert_desc(conn, v_id, edge.target) + await upsert_chan.put( + ( + util.upsert_vertex, # func to call + ( + conn, + "Entity", # v_type + v_id, # src_id + { # attrs + "description": desc, + "epoch_added": int(time.time()), + }, + ), + ) + ) + + # upsert the edge between the two entities await upsert_chan.put( ( util.upsert_edge, ( conn, "Entity", # src_type - edge.source.id, # src_id + util.process_id(edge.source.id), # src_id "RELATIONSHIP", # edgeType "Entity", # tgt_type - edge.target.id, # tgt_id + util.process_id(edge.target.id), # tgt_id {"relation_type": edge.type}, # attributes ), ) ) # embed "Relationship", # (v_id, content, index_name) + + +async def resolve_entity( + conn: TigerGraphConnection, + upsert_chan: Channel, + emb_store: MilvusEmbeddingStore, + entity_id: str, +): + """ + get all vectors of E (one name can have multiple discriptions) + get ents close to E + for e in ents: + if e is 95% similar to E and edit_dist(E,e) <=3: + merge + mark e as processed + + mark as processed + """ + results = await emb_store.aget_k_closest(entity_id) + if len(results) == 0: + logger.error( + f"aget_k_closest should, minimally, return the entity itself.\n{results}" + ) + raise Exception() + if entity_id == "Dataframe": + print("result:", entity_id, results) + + # merge all entities into the ResolvedEntity vertex + # use the longest v_id as the resolved entity's v_id + resolved_entity_id = "" + for v in results: + # v_id = v.metadata["vertex_id"] + if len(v) > len(resolved_entity_id): + resolved_entity_id = v + + # upsert the resolved entity + await upsert_chan.put( + ( + util.upsert_vertex, # func to call + ( + conn, + "ResolvedEntity", # v_type + resolved_entity_id, # v_id + { # attrs + "description": [] + }, + ), + ) + ) + + # create RESOLVES_TO edges from each entity to the ResolvedEntity + for v in results: + await upsert_chan.put( + ( + util.upsert_edge, + ( + conn, + "Entity", # src_type + v, # src_id + "RESOLVES_TO", # edge_type + "ResolvedEntity", # tgt_type + resolved_entity_id, # tgt_id + None, # attributes + ), + ) + ) From 8ab8774cc160445a1602c18ddf2b9e7bc1b87a35 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Fri, 9 Aug 2024 18:47:13 -0400 Subject: [PATCH 09/53] starting to upsert community summaries --- common/embeddings/embedding_services.py | 2 +- .../gsql/graphRAG/ResolveRelationships.gsql | 2 +- .../gsql/graphRAG/get_community_children.gsql | 12 ++ common/gsql/graphRAG/leven.cpp | 59 ------ .../louvain/graphrag_louvain_communities.gsql | 199 ++++++++++++++++++ .../louvain/graphrag_louvain_init.gsql | 185 ++++++++++++++++ common/gsql/graphRAG/louvain/louvain1.gsql | 17 -- common/gsql/graphRAG/louvain/modularity.gsql | 49 +++++ .../graphRAG/louvain/stream_community.gsql | 9 + common/gsql/supportai/SupportAI_Schema.gsql | 14 +- common/py_schemas/tool_io_schemas.py | 25 ++- copilot/docs/notebooks/graphrag.ipynb | 127 +++++------ eventual-consistency-service/app/ecc_util.py | 33 ++- .../app/graphrag/community_summarizer.py | 138 ++++++++++++ 
.../app/graphrag/graph_rag.py | 158 ++++++++++++-- .../app/graphrag/util.py | 63 ++++-- .../app/graphrag/workers.py | 63 +++++- eventual-consistency-service/requirements.txt | 34 +-- 18 files changed, 968 insertions(+), 221 deletions(-) create mode 100644 common/gsql/graphRAG/get_community_children.gsql delete mode 100644 common/gsql/graphRAG/leven.cpp create mode 100644 common/gsql/graphRAG/louvain/graphrag_louvain_communities.gsql create mode 100644 common/gsql/graphRAG/louvain/graphrag_louvain_init.gsql delete mode 100644 common/gsql/graphRAG/louvain/louvain1.gsql create mode 100644 common/gsql/graphRAG/louvain/modularity.gsql create mode 100644 common/gsql/graphRAG/louvain/stream_community.gsql create mode 100644 eventual-consistency-service/app/graphrag/community_summarizer.py diff --git a/common/embeddings/embedding_services.py b/common/embeddings/embedding_services.py index dd506670..13c2cfd0 100644 --- a/common/embeddings/embedding_services.py +++ b/common/embeddings/embedding_services.py @@ -134,7 +134,7 @@ def __init__(self, config): super().__init__( config, model_name=config.get("model_name", "OpenAI gpt-4-0613") ) - from langchain.embeddings import OpenAIEmbeddings + from langchain_openai import OpenAIEmbeddings self.embeddings = OpenAIEmbeddings() diff --git a/common/gsql/graphRAG/ResolveRelationships.gsql b/common/gsql/graphRAG/ResolveRelationships.gsql index d3c69297..6a0e515d 100644 --- a/common/gsql/graphRAG/ResolveRelationships.gsql +++ b/common/gsql/graphRAG/ResolveRelationships.gsql @@ -13,7 +13,7 @@ CREATE DISTRIBUTED QUERY ResolveRelationships(BOOL printResults=FALSE) SYNTAX V2 REs = SELECT re1 FROM REs:re1 -(:rel)- Entity:e_tgt -(RESOLVES_TO>:r)- ResolvedEntity:re2 // Connect the The first RE to the second RE ACCUM - INSERT INTO RESOLVED_RELATIONSHIP(FROM,TO) VALUES(re1, re2); + INSERT INTO RESOLVED_RELATIONSHIP(FROM,TO, relation_type) VALUES(re1, re2, rel.relation_type); IF printResults THEN diff --git a/common/gsql/graphRAG/get_community_children.gsql b/common/gsql/graphRAG/get_community_children.gsql new file mode 100644 index 00000000..7913e1b7 --- /dev/null +++ b/common/gsql/graphRAG/get_community_children.gsql @@ -0,0 +1,12 @@ +CREATE DISTRIBUTED QUERY get_community_children(Vertex comm, UINT iter) SYNTAX V2{ + Comms = {comm}; + + IF iter > 1 THEN + Comms = SELECT t FROM Comms:c -()- ResolvedEntity -(_>)- Entity:t; + + PRINT Ents[Ents.description as description] as children; + END; +} diff --git a/common/gsql/graphRAG/leven.cpp b/common/gsql/graphRAG/leven.cpp deleted file mode 100644 index 10c45669..00000000 --- a/common/gsql/graphRAG/leven.cpp +++ /dev/null @@ -1,59 +0,0 @@ -#include -#include - -// Returns the Levenshtein distance between word1 and word2. -int levenshteinDist(std::string word1, std::string word2) { - int size1 = word1.size(); - int size2 = word2.size(); - int verif[size1 + 1][size2 + 1]; // Verification matrix i.e. 2D array - // which will store the calculated distance. - - // If one of the words has zero length, the distance is equal to the size of - // the other word. - if (size1 == 0) return size2; - if (size2 == 0) return size1; - - // Sets the first row and the first column of the verification matrix with - // the numerical order from 0 to the length of each word. - for (int i = 0; i <= size1; i++) verif[i][0] = i; - for (int j = 0; j <= size2; j++) verif[0][j] = j; - - // Verification step / matrix filling. - for (int i = 1; i <= size1; i++) { - for (int j = 1; j <= size2; j++) { - // Sets the modification cost. 
- // 0 means no modification (i.e. equal letters) and 1 means that a - // modification is needed (i.e. unequal letters). - int cost = (word2[j - 1] == word1[i - 1]) ? 0 : 1; - - // Sets the current position of the matrix as the minimum value - // between a (deletion), b (insertion) and c (substitution). a = the - // upper adjacent value plus 1: verif[i - 1][j] + 1 b = the left - // adjacent value plus 1: verif[i][j - 1] + 1 c = the upper left - // adjacent value plus the modification cost: verif[i - 1][j - 1] + - // cost - verif[i][j] = - std::min(std::min(verif[i - 1][j] + 1, verif[i][j - 1] + 1), - verif[i - 1][j - 1] + cost); - } - } - - // The last position of the matrix will contain the Levenshtein distance. - return verif[size1][size2]; -} - -int main() { - std::string word1, word2; - - std::cout << "Please input the first word: " << std::endl; - std::cin >> word1; - std::cout << "Please input the second word: " << std::endl; - std::cin >> word2; - - // cout << "The number of modifications needed in order to make one word " - // "equal to the other is: " - std::cout << "The edit distance is: " << levenshteinDist(word1, word2) - << std::endl; - - return 0; -} diff --git a/common/gsql/graphRAG/louvain/graphrag_louvain_communities.gsql b/common/gsql/graphRAG/louvain/graphrag_louvain_communities.gsql new file mode 100644 index 00000000..366b7ea7 --- /dev/null +++ b/common/gsql/graphRAG/louvain/graphrag_louvain_communities.gsql @@ -0,0 +1,199 @@ +CREATE DISTRIBUTED QUERY graphrag_louvain_communities(UINT iteration=1, UINT max_hop = 10, UINT n_batches = 1) SYNTAX V2{ + /* + * This is the same query as tg_louvain, just that Paper-related schema + * are changed to Community-related schema + * + * For the first call to this query, iteration = 1 + */ + TYPEDEF TUPLE community, STRING ext_vid> Move; + SumAccum @@m; // the sum of the weights of all the links in the network + MinAccum> @community_id; // the community ID of the node + MinAccum @community_vid; // the community ID of the node + SumAccum @k; // the sum of the weights of the links incident to the node + SumAccum @k_in; // the sum of the weights of the links inside the previous community of the node + SumAccum @k_self_loop; // the weight of the self-loop link + MapAccum, SumAccum> @community_k_in_map; // the community of the neighbors of the nodes -> the sum of the weights of the links inside the community + MapAccum, SumAccum> @@community_sum_total_map; // community ID C -> the sum of the weights of the links incident to nodes in C + SumAccum @community_sum_total; // the sum of the weights of the links incident to nodes in the community of the node + MapAccum, SumAccum> @@community_sum_in_map; // community ID -> the sum of the weights of the links inside the community + MapAccum>> @@source_target_k_in_map; // source community ID -> (target community ID -> the sum of the weights of the links from the source community to the target community) + SumAccum @delta_Q_remove; // delta Q to remove the node from the previous community + MaxAccum @best_move; // best move of the node with the highest delta Q to move the isolated node into the new community + MaxAccum @@min_double; // used to reset the @best_move + SumAccum @@move_cnt; + OrAccum @to_change_community, @is_current_iter, @has_parent; + SumAccum @batch_id; + MinAccum @vid; + + AllNodes = {Community.*}; + + // Get communities of the current iteration + AllNodes = SELECT s FROM AllNodes:s + WHERE s.iteration == iteration + ACCUM s.@is_current_iter += TRUE; + + // init + z = SELECT s 
FROM AllNodes:s -(_>:e)- Community:t + WHERE s.@is_current_iter AND t.@is_current_iter + ACCUM s.@k += e.weight, + @@m += e.weight/2, + IF s == t THEN // self loop + s.@k_self_loop += e.weight + END + POST-ACCUM + s.@community_id = s, // assign node to its own community + s.@community_vid = to_string(s.id), // external id + s.@vid = getvid(s), // internal id (used in batching) + s.@batch_id = s.@vid % n_batches; // get batch number + + IF @@m < 0.00000000001 THEN + PRINT "Warning: the sum of the weights in the edges should be greater than zero!"; + RETURN; + END; + + // Local moving + INT hop = 0; + Candidates = AllNodes; + WHILE Candidates.size() > 0 AND hop < max_hop DO + hop += 1; + IF hop == 1 THEN // first iteration + ChangedNodes = SELECT s FROM Candidates:s -(_>:e)- Community:t + WHERE s.@community_id != t.@community_id // can't move within the same community + AND s.@is_current_iter AND t.@is_current_iter // only use Communities in the current iteration + ACCUM + DOUBLE dq = 1 - s.@k * t.@k / (2 * @@m), + s.@best_move += Move(dq, t.@community_id, t.@community_vid) // find the best move + POST-ACCUM + IF s.@best_move.delta_q > 0 THEN // if the move increases dq + s.@to_change_community += TRUE + END + HAVING s.@to_change_community == TRUE; // only select nodes that will move + ELSE // other iterations + // Calculate sum_total of links in each community + Tmp = SELECT s FROM AllNodes:s + POST-ACCUM + @@community_sum_total_map += (s.@community_id -> s.@k); + // store community's total edges in each vert (easier access) + Tmp = SELECT s FROM AllNodes:s + POST-ACCUM + s.@community_sum_total = @@community_sum_total_map.get(s.@community_id); + @@community_sum_total_map.clear(); + + // find the best move + ChangedNodes = {}; + + // process nodes in batch + FOREACH batch_id IN RANGE[0, n_batches-1] DO + Nodes = SELECT s FROM Candidates:s -(_>:e)- Community:t + WHERE s.@batch_id == batch_id + AND s.@is_current_iter AND t.@is_current_iter // only use Communities in the current iteration + ACCUM + IF s.@community_id == t.@community_id THEN + // add edge weights connected to s + s.@k_in += e.weight + ELSE + // add edge weights connecetd to t + s.@community_k_in_map += (t.@community_id -> e.weight) + END + POST-ACCUM + // ∆Q if s is moved out of its current community + s.@delta_Q_remove = 2 * s.@k_self_loop - 2 * s.@k_in + s.@k * (s.@community_sum_total - s.@k) / @@m, + s.@k_in = 0, + s.@best_move = Move(@@min_double, s, to_string(s.id)); // reset best move + + // find the best move + Nodes = SELECT s FROM Nodes:s -(_>:E)- Community:t + WHERE s.@community_id != t.@community_id + AND s.@is_current_iter AND t.@is_current_iter // only use Communities in the current iteration + ACCUM + DOUBLE dq = 2 * s.@community_k_in_map.get(t.@community_id) - s.@k * t.@community_sum_total / @@m, + s.@best_move += Move(dq, t.@community_id, t.@community_vid) // find the best move + POST-ACCUM + IF s.@delta_Q_remove + s.@best_move.delta_q > 0 THEN // if the move increases dq + s.@to_change_community = TRUE// s should move + END, + s.@community_k_in_map.clear() + HAVING s.@to_change_community == TRUE; // only select nodes that will move + + // Add nodes that will move to ChangedNodes + ChangedNodes = ChangedNodes UNION Nodes; + END; + END; + // If two nodes swap, only change the community of one of them + SwapNodes = SELECT s FROM ChangedNodes:s -(_>:e)- Community:t + WHERE s.@best_move.community == t.@community_id + AND s.@is_current_iter AND t.@is_current_iter // only use Communities in the current iteration + AND 
t.@to_change_community + AND t.@best_move.community == s.@community_id + // if delta Q are the same, only change the one with larger delta Q or the one with smaller @vid + AND ( + s.@delta_Q_remove + s.@best_move.delta_q < t.@delta_Q_remove + t.@best_move.delta_q + OR ( + abs( + (s.@delta_Q_remove + s.@best_move.delta_q) + - (t.@delta_Q_remove + t.@best_move.delta_q) + ) < 0.00000000001 + AND s.@vid > t.@vid + ) + ) + POST-ACCUM + s.@to_change_community = FALSE; + + // remove SwapNodes (don't need to be changed) + ChangedNodes = ChangedNodes MINUS SwapNodes; + + // Update node communities (based on max ∆Q) + SwapNodes = SELECT s FROM ChangedNodes:s + POST-ACCUM + s.@community_id = s.@best_move.community, // move the node + s.@community_vid = s.@best_move.ext_vid, // move the node (external v_id update) + s.@to_change_community = FALSE; + @@move_cnt += ChangedNodes.size(); + + // Get all neighbours of the changed node that do not belong to the node’s new community + Candidates = SELECT t FROM ChangedNodes:s -(_>:e)- Community:t + WHERE t.@community_id != s.@community_id + AND s.@is_current_iter AND t.@is_current_iter; // only use Communities in the current iteration + END; + + // Coarsening + @@community_sum_total_map.clear(); + Tmp = SELECT s FROM AllNodes:s -(_>:e)- Community:t + WHERE s.@is_current_iter AND t.@is_current_iter // only use Communities in the current iteration + ACCUM + IF s.@community_id == t.@community_id THEN + // keep track of how many edges are within the community + @@community_sum_in_map += (s.@community_id -> e.weight) + ELSE + // get LINKS_TO edge weights (how many edges are between communities) + // s.@community_k_in_map += (t.@community_id -> 1) + @@source_target_k_in_map += (s.@community_vid -> (t.@community_vid -> e.weight)) + END, + t.@has_parent += TRUE // Used to help find unattached partitions + POST-ACCUM + // Write the results to a new community vertex (iteration + 1) + // ID , iter, edges within the community + INSERT INTO Community VALUES (s.id+"_"+to_string(iteration+1), iteration+1, s.k_in + @@community_sum_in_map.get(s.@community_id), ""), + INSERT INTO HAS_PARENT VALUES (s, s.@community_vid+"_"+to_string(iteration+1)) // link Community's child/parent community + ; + + // Continue community hierarchy for unattached partitions + Tmp = SELECT s FROM AllNodes:s + WHERE s.@is_current_iter + AND NOT s.@has_parent + POST-ACCUM + // if s is a part of an unattached partition, add to its community hierarchy to maintain parity with rest of graph + INSERT INTO Community VALUES (s.id+"_"+to_string(iteration+1), iteration+1, s.k_in + @@community_sum_in_map.get(s.@community_id), ""), + INSERT INTO HAS_PARENT VALUES (s, s.id+"_"+to_string(iteration+1)) // link Community's child/parent community + ; + + // link communities + // "If two communities have an edge between them, their parents should also have an edge bewtween them" + Tmp = SELECT s FROM AllNodes:s -(_>:e)- Community:t + WHERE s.@community_vid != t.@community_vid + AND s.@is_current_iter AND t.@is_current_iter // only use Communities in the current iteration + ACCUM + DOUBLE w = @@source_target_k_in_map.get(s.@community_vid).get(t.@community_vid)/2, + INSERT INTO LINKS_TO VALUES (s.@community_vid+"_"+to_string(iteration+1), t.@community_vid+"_"+to_string(iteration+1), w) + ; +} diff --git a/common/gsql/graphRAG/louvain/graphrag_louvain_init.gsql b/common/gsql/graphRAG/louvain/graphrag_louvain_init.gsql new file mode 100644 index 00000000..2ccbaf2c --- /dev/null +++ 
b/common/gsql/graphRAG/louvain/graphrag_louvain_init.gsql @@ -0,0 +1,185 @@ +CREATE DISTRIBUTED QUERY graphrag_louvain_init(UINT max_hop = 10, UINT n_batches = 1) { + /* + * Initialize GraphRAG's hierarchical communities. + */ + TYPEDEF TUPLE community, STRING ext_vid> Move; + SumAccum @@m; // the sum of the weights of all the links in the network + MinAccum> @community_id; // the community ID of the node + MinAccum @community_vid; // the community ID of the node + SumAccum @k; // the sum of the weights of the links incident to the node + SumAccum @k_in; // the sum of the weights of the links inside the previous community of the node + SumAccum @k_self_loop; // the weight of the self-loop link + MapAccum, SumAccum> @community_k_in_map; // the community of the neighbors of the nodes -> the sum of the weights of the links inside the community + MapAccum, SumAccum> @@community_sum_total_map; // community ID C -> the sum of the weights of the links incident to nodes in C + SumAccum @community_sum_total; // the sum of the weights of the links incident to nodes in the community of the node + MapAccum, SumAccum> @@community_sum_in_map; // community ID -> the sum of the weights of the links inside the community + MapAccum>> @@source_target_k_in_map; // source community ID -> (target community ID -> the sum of the weights of the links from the source community to the target community) + SumAccum @delta_Q_remove; // delta Q to remove the node from the previous community + MaxAccum @best_move; // best move of the node with the highest delta Q to move the isolated node into the new community + MaxAccum @@min_double; // used to reset the @best_move + SumAccum @@move_cnt; + OrAccum @to_change_community; + SumAccum @batch_id; + MinAccum @vid; + + AllNodes = {ResolvedEntity.*}; + DOUBLE wt = 1.0; + + // prevent multiple init runs + // z = SELECT s FROM AllNodes:s -(_)-> Community:t; + // IF z.size() > 0 THEN + // EXCEPTION reinit(400001); + // RAISE reinit("ERROR: the hierarchical communities have already been initialized"); + // END; + + // init + z = SELECT s FROM AllNodes:s + ACCUM + s.@community_id = s, // assign node to its own community + s.@community_vid = s.id, // external id + s.@vid = getvid(s), // internal id (used in batching) + s.@batch_id = s.@vid % n_batches; // get batch number + z = SELECT s FROM AllNodes:s -(_)-> ResolvedEntity:t + ACCUM s.@k += wt, + @@m += 1; + // POST-ACCUM + // s.@community_id = s, // assign node to its own community + // s.@community_vid = s.id, // external id + // s.@vid = getvid(s), // internal id (used in batching) + // s.@batch_id = s.@vid % n_batches; // get batch number + + PRINT z.size(); + PRINT z; + + // Local moving + INT hop = 0; + Candidates = AllNodes; + WHILE Candidates.size() > 0 AND hop < max_hop DO + hop += 1; + IF hop == 1 THEN // first iteration + ChangedNodes = SELECT s FROM Candidates:s -(_:e)-> ResolvedEntity:t + WHERE s.@community_id != t.@community_id // can't move within the same community + ACCUM + DOUBLE dq = 1 - s.@k * t.@k / (2 * @@m), + s.@best_move += Move(dq, t.@community_id, t.@community_vid) // find the best move + POST-ACCUM + IF s.@best_move.delta_q > 0 THEN // if the move increases dq + s.@to_change_community += TRUE + END + HAVING s.@to_change_community == TRUE; // only select nodes that will move + PRINT ChangedNodes.size(); + ELSE // other iterations + // Calculate sum_total of links in each community + Tmp = SELECT s FROM AllNodes:s + POST-ACCUM + @@community_sum_total_map += (s.@community_id -> s.@k); + // store community's 
total edges in each vert (easier access) + Tmp = SELECT s FROM AllNodes:s + POST-ACCUM + s.@community_sum_total = @@community_sum_total_map.get(s.@community_id); + @@community_sum_total_map.clear(); + + // find the best move + ChangedNodes = {}; + + // process nodes in batch + FOREACH batch_id IN RANGE[0, n_batches-1] DO + Nodes = SELECT s FROM Candidates:s -(_:e)-> ResolvedEntity:t + WHERE s.@batch_id == batch_id + ACCUM + IF s.@community_id == t.@community_id THEN + // add edge weights connected to s + s.@k_in += wt + ELSE + // add edge weights connecetd to t + s.@community_k_in_map += (t.@community_id -> wt) + END + POST-ACCUM + // ∆Q if s is moved out of its current community + s.@delta_Q_remove = 2 * s.@k_self_loop - 2 * s.@k_in + s.@k * (s.@community_sum_total - s.@k) / @@m, + s.@k_in = 0, + s.@best_move = Move(@@min_double, s, to_string(s.id)); // reset best move + + // find the best move + Nodes = SELECT s FROM Nodes:s -(_:e)-> ResolvedEntity:t + WHERE s.@community_id != t.@community_id + ACCUM + DOUBLE dq = 2 * s.@community_k_in_map.get(t.@community_id) - s.@k * t.@community_sum_total / @@m, + s.@best_move += Move(dq, t.@community_id, t.@community_vid) // find the best move + POST-ACCUM + IF s.@delta_Q_remove + s.@best_move.delta_q > 0 THEN // if the move increases dq + s.@to_change_community = TRUE// s should move + END, + s.@community_k_in_map.clear() + HAVING s.@to_change_community == TRUE; // only select nodes that will move + + // Add nodes that will move to ChangedNodes + ChangedNodes = ChangedNodes UNION Nodes; + END; + END; + // If two nodes swap, only change the community of one of them + SwapNodes = SELECT s FROM ChangedNodes:s -(_:e)-> ResolvedEntity:t + WHERE s.@best_move.community == t.@community_id + AND t.@to_change_community + AND t.@best_move.community == s.@community_id + // if delta Q are the same, only change the one with larger delta Q or the one with smaller @vid + AND ( + s.@delta_Q_remove + s.@best_move.delta_q < t.@delta_Q_remove + t.@best_move.delta_q + OR ( + abs( + (s.@delta_Q_remove + s.@best_move.delta_q) + - (t.@delta_Q_remove + t.@best_move.delta_q) + ) < 0.00000000001 + AND s.@vid > t.@vid + ) + ) + POST-ACCUM + s.@to_change_community = FALSE; + + // remove SwapNodes (don't need to be changed) + ChangedNodes = ChangedNodes MINUS SwapNodes; + + // Update node communities (based on max ∆Q) + SwapNodes = SELECT s FROM ChangedNodes:s + POST-ACCUM + s.@community_id = s.@best_move.community, // move the node + s.@community_vid = s.@best_move.ext_vid, // move the node (external v_id update) + s.@to_change_community = FALSE; + @@move_cnt += ChangedNodes.size(); + + // Get all neighbours of the changed node that do not belong to the node’s new community + Candidates = SELECT t FROM ChangedNodes:s -(_:e)-> ResolvedEntity:t + WHERE t.@community_id != s.@community_id; + END; + + // Coarsening + UINT new_layer = 0; + @@community_sum_total_map.clear(); + Tmp = SELECT s FROM AllNodes:s -(_:e)-> ResolvedEntity:t + ACCUM + IF s.@community_id == t.@community_id THEN + // keep track of how many edges are within the community + @@community_sum_in_map += (s.@community_id -> wt) + ELSE + // get LINKS_TO edge weights (how many edges are between communities) + @@source_target_k_in_map += (s.@community_vid -> (t.@community_vid -> 1)) + END + POST-ACCUM + // ID , iter, edges within the community + INSERT INTO Community VALUES (s.@community_vid+"_1", 1, @@community_sum_in_map.get(s.@community_id), ""), + INSERT INTO IN_COMMUNITY VALUES (s, s.@community_vid+"_1") // link 
entity to it's first community + ; + + PRINT @@source_target_k_in_map; + + @@community_sum_total_map.clear(); + // link communities + Tmp = SELECT s FROM AllNodes:s -(_:e)-> ResolvedEntity:t + WHERE s.@community_vid != t.@community_vid + ACCUM + DOUBLE w = @@source_target_k_in_map.get(s.@community_vid).get(t.@community_vid), + INSERT INTO LINKS_TO VALUES (s.@community_vid+"_1", t.@community_vid+"_1", w); + + + PRINT @@source_target_k_in_map; +} diff --git a/common/gsql/graphRAG/louvain/louvain1.gsql b/common/gsql/graphRAG/louvain/louvain1.gsql deleted file mode 100644 index 494a3625..00000000 --- a/common/gsql/graphRAG/louvain/louvain1.gsql +++ /dev/null @@ -1,17 +0,0 @@ -CREATE DISTRIBUTED QUERY graphRAG_louvain_1() { - - Ents = {ResolvedEntity.*}; - - // Put each node into a distinct community - // Assume each Entity starts in its own community - - // For each node i - // Compute ∆Q (modularity) when putting node i into the community of some neighbor j - // move i to community that yields the largest gain in ∆Q - - Z = SELECT v FROM Ents:v -(_:e)-> ResolvedEntity:r - - - ; -} - diff --git a/common/gsql/graphRAG/louvain/modularity.gsql b/common/gsql/graphRAG/louvain/modularity.gsql new file mode 100644 index 00000000..3aaad826 --- /dev/null +++ b/common/gsql/graphRAG/louvain/modularity.gsql @@ -0,0 +1,49 @@ +CREATE DISTRIBUTED QUERY modularity(UINT iteration=1) SYNTAX V2 { + SumAccum @@sum_weight; // the sum of the weights of all the links in the network + MinAccum @community_id; // the community ID of the node + MapAccum> @@community_total_weight_map; // community ID C -> the sum of the weights of the links incident to nodes in C + MapAccum> @@community_in_weight_map; // community ID -> the sum of the weights of the links inside the community + SumAccum @@modularity; + MinAccum @parent; + DOUBLE wt = 1.0; + Comms = {Community.*}; + + // Assign Entities to their correct community (given the specified iteration level) + IF iteration > 1 THEN + Comms = SELECT t FROM Comms:c -()- ResolvedEntity:t + ACCUM t.@community_id = c.@parent; + + ELSE + Entities = SELECT t FROM Comms:c -(_>)- ResolvedEntity:t + WHERE c.iteration == iteration + ACCUM t.@community_id = c.id; + END; + + Nodes = SELECT s FROM Entities:s -(_>:e)- ResolvedEntity:t + ACCUM + IF s.@community_id == t.@community_id THEN + @@community_in_weight_map += (s.@community_id -> wt) + END, + @@community_total_weight_map += (s.@community_id -> wt), + @@sum_weight += wt; + + @@modularity = 0; + FOREACH (community, total_weight) IN @@community_total_weight_map DO + DOUBLE in_weight = 0; + IF @@community_in_weight_map.containsKey(community) THEN + in_weight = @@community_in_weight_map.get(community); + END; + @@modularity += in_weight / @@sum_weight - pow(total_weight / @@sum_weight, 2); + END; + + PRINT @@modularity as mod; +} diff --git a/common/gsql/graphRAG/louvain/stream_community.gsql b/common/gsql/graphRAG/louvain/stream_community.gsql new file mode 100644 index 00000000..d01959d2 --- /dev/null +++ b/common/gsql/graphRAG/louvain/stream_community.gsql @@ -0,0 +1,9 @@ +CREATE DISTRIBUTED QUERY stream_community(UINT iter) { + Comms = {Community.*}; + + // Get communities of the current iteration + Comms = SELECT s FROM Comms:s + WHERE s.iteration == iter; + + PRINT Comms; +} diff --git a/common/gsql/supportai/SupportAI_Schema.gsql b/common/gsql/supportai/SupportAI_Schema.gsql index 1a705eaf..3e127d82 100644 --- a/common/gsql/supportai/SupportAI_Schema.gsql +++ b/common/gsql/supportai/SupportAI_Schema.gsql @@ -20,16 +20,14 @@ CREATE 
SCHEMA_CHANGE JOB add_supportai_schema { ADD DIRECTED EDGE CONTAINS_DOCUMENT(FROM DocumentCollection, TO Document) WITH REVERSE_EDGE="reverse_CONTAINS_DOCUMENT"; // GraphRAG - ADD VERTEX Community(PRIMARY_ID id STRING, description INT) WITH PRIMARY_ID_AS_ATTRIBUTE="true"; + ADD VERTEX Community (PRIMARY_ID id STRING, iteration UINT, k_in UINT, description STRING) WITH PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX ResolvedEntity(PRIMARY_ID id STRING, entity_type STRING) WITH PRIMARY_ID_AS_ATTRIBUTE="true"; - ADD DIRECTED EDGE RELATIONSHIP(FROM Entity, TO Entity, relation_type STRING) WITH REVERSE_EDGE="reverse_RELATIONSHIP"; // TODO: check where knn algo writes results + ADD DIRECTED EDGE RELATIONSHIP(FROM Entity, TO Entity, relation_type STRING) WITH REVERSE_EDGE="reverse_RELATIONSHIP"; ADD DIRECTED EDGE RESOLVES_TO(FROM Entity, TO ResolvedEntity, relation_type STRING) WITH REVERSE_EDGE="reverse_RESOLVES_TO"; // Connect ResolvedEntities with their children entities - ADD DIRECTED EDGE RESOLVED_RELATIONSHIP(FROM ResolvedEntity, TO ResolvedEntity) WITH REVERSE_EDGE="reverse_RESOLVED_RELATIONSHIP"; // store edges between entities after they're resolved - ADD DIRECTED EDGE IN_COMMUNITY(FROM ResolvedEntity, TO Community) WITH REVERSE_EDGE="reverse_IN_COMMUNITY"; + ADD DIRECTED EDGE RESOLVED_RELATIONSHIP(FROM ResolvedEntity, TO ResolvedEntity, relation_type STRING) WITH REVERSE_EDGE="reverse_RESOLVED_RELATIONSHIP"; // store edges between entities after they're resolved - // TODO: louvain will be run on resolved entities, but stored in community then on communities until louvain runs out - // Hierarchical communities (Louvain/Leiden) - // ADD UNDIRECTED EDGE LINKS_TO(FROM Community, TO Community); - // ADD DIRECTED EDGE BELONGS_TO(FROM Community, TO Community); + ADD DIRECTED EDGE IN_COMMUNITY(FROM ResolvedEntity, TO Community) WITH REVERSE_EDGE="reverse_IN_COMMUNITY"; + ADD DIRECTED EDGE LINKS_TO (from Community, to Community, weight DOUBLE) WITH REVERSE_EDGE="reverse_LINKS_TO"; + ADD DIRECTED EDGE HAS_PARENT (from Community, to Community) WITH REVERSE_EDGE="reverse_HAS_PARENT"; } diff --git a/common/py_schemas/tool_io_schemas.py b/common/py_schemas/tool_io_schemas.py index 1ea6ed3e..4ca91b3d 100644 --- a/common/py_schemas/tool_io_schemas.py +++ b/common/py_schemas/tool_io_schemas.py @@ -1,10 +1,8 @@ +from typing import Dict, List, Optional + from langchain.pydantic_v1 import BaseModel, Field -from typing import Optional -from langchain_community.graphs.graph_document import ( - Node as BaseNode, - Relationship as BaseRelationship, -) -from typing import List, Dict, Type +from langchain_community.graphs.graph_document import Node as BaseNode +from langchain_community.graphs.graph_document import Relationship as BaseRelationship class MapQuestionToSchemaResponse(BaseModel): @@ -81,14 +79,27 @@ class KnowledgeGraph(BaseModel): ..., description="List of relationships in the knowledge graph" ) + class ReportQuestion(BaseModel): question: str = Field("The question to be asked") reasoning: str = Field("The reasoning behind the question") + class ReportSection(BaseModel): section: str = Field("Name of the section") description: str = Field("Description of the section") - questions: List[ReportQuestion] = Field("List of questions and reasoning for the section") + questions: List[ReportQuestion] = Field( + "List of questions and reasoning for the section" + ) + class ReportSections(BaseModel): sections: List[ReportSection] = Field("List of sections for the report") + + +class CommunitySummary(BaseModel): + 
"""Generate a summary of the documents that are within this community.""" + + summary: str = Field( + ..., description="The community summary derived from the input documents" + ) diff --git a/copilot/docs/notebooks/graphrag.ipynb b/copilot/docs/notebooks/graphrag.ipynb index bde1b78f..e915f392 100644 --- a/copilot/docs/notebooks/graphrag.ipynb +++ b/copilot/docs/notebooks/graphrag.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 9, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -24,7 +24,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -33,7 +33,7 @@ "'The graph GraphRAG_pytgdocs is created.'" ] }, - "execution_count": 10, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -45,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -54,32 +54,32 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'host_name': 'https://algotesting.i.tgcloud.io',\n", - " 'schema_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_schema].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_schema\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 0)\\\\nTrying to add local vertex \\'DocumentChunk\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Document\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Concept\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Entity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Relationship\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'DocumentCollection\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Content\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Community\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'ResolvedEntity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CONTENT\\' and its reverse edge \\'reverse_HAS_CONTENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_CHILD_OF\\' and its reverse edge \\'reverse_IS_CHILD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_HEAD_OF\\' and its reverse edge \\'reverse_IS_HEAD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_TAIL\\' and its reverse edge \\'reverse_HAS_TAIL\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_RELATIONSHIP\\' and its reverse edge \\'reverse_DESCRIBES_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_ENTITY\\' and its reverse edge \\'reverse_DESCRIBES_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_ENTITY\\' and its reverse edge \\'reverse_CONTAINS_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'MENTIONS_RELATIONSHIP\\' and 
its reverse edge \\'reverse_MENTIONS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_AFTER\\' and its reverse edge \\'reverse_IS_AFTER\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CHILD\\' and its reverse edge \\'reverse_HAS_CHILD\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_RELATIONSHIP\\' and its reverse edge \\'reverse_HAS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_DOCUMENT\\' and its reverse edge \\'reverse_CONTAINS_DOCUMENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RELATIONSHIP\\' and its reverse edge \\'reverse_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVES_TO\\' and its reverse edge \\'reverse_RESOLVES_TO\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVED_RELATIONSHIP\\' and its reverse edge \\'reverse_RESOLVED_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IN_COMMUNITY\\' and its reverse edge \\'reverse_IN_COMMUNITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 1\\\\nThe job add_supportai_schema completes in 2.208 seconds!\\\\nLocal schema change succeeded.\"',\n", - " 'index_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_indexes].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_indexes\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 1)\\\\nTrying to add index \\'doc_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_indexepoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 2\\\\nThe job add_supportai_indexes 
completes in 3.025 seconds!\\\\nLocal schema change succeeded.\"'}" + " 'schema_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_schema].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_schema\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 0)\\\\nTrying to add local vertex \\'DocumentChunk\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Document\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Concept\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Entity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Relationship\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'DocumentCollection\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Content\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Community\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'ResolvedEntity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CONTENT\\' and its reverse edge \\'reverse_HAS_CONTENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_CHILD_OF\\' and its reverse edge \\'reverse_IS_CHILD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_HEAD_OF\\' and its reverse edge \\'reverse_IS_HEAD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_TAIL\\' and its reverse edge \\'reverse_HAS_TAIL\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_RELATIONSHIP\\' and its reverse edge \\'reverse_DESCRIBES_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_ENTITY\\' and its reverse edge \\'reverse_DESCRIBES_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_ENTITY\\' and its reverse edge \\'reverse_CONTAINS_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'MENTIONS_RELATIONSHIP\\' and its reverse edge \\'reverse_MENTIONS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_AFTER\\' and its reverse edge \\'reverse_IS_AFTER\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CHILD\\' and its reverse edge \\'reverse_HAS_CHILD\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_RELATIONSHIP\\' and its reverse edge \\'reverse_HAS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_DOCUMENT\\' and its reverse edge \\'reverse_CONTAINS_DOCUMENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RELATIONSHIP\\' and its reverse edge \\'reverse_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVES_TO\\' and its reverse edge \\'reverse_RESOLVES_TO\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVED_RELATIONSHIP\\' and its reverse edge \\'reverse_RESOLVED_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IN_COMMUNITY\\' and 
its reverse edge \\'reverse_IN_COMMUNITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'LINKS_TO\\' and its reverse edge \\'reverse_LINKS_TO\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_PARENT\\' and its reverse edge \\'reverse_HAS_PARENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 1\\\\nThe job add_supportai_schema completes in 1.043 seconds!\\\\nLocal schema change succeeded.\"',\n", + " 'index_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_indexes].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_indexes\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 1)\\\\nTrying to add index \\'doc_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_indexepoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 2\\\\nThe job add_supportai_indexes completes in 1.066 seconds!\\\\nLocal schema change succeeded.\"'}" ] }, - "execution_count": 12, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# And then add CoPilot's address to the connection. This address\n", - "# is the host's address where the CoPilot container is running.\n", + "# # And then add CoPilot's address to the connection. 
This address\n", + "# # is the host's address where the CoPilot container is running.\n", "conn.ai.configureCoPilotHost(\"http://localhost:8000\")\n", "conn.ai.initializeSupportAI()" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -95,18 +95,18 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'job_name': 'load_documents_content_json_b89acfebac9e4fb98efd20a49659808e',\n", - " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_b89acfebac9e4fb98efd20a49659808e.stream.SupportAI_GraphRAG_pytgdocs_5698bff74d844534901cba9e1b3d55bf.1722466964295',\n", - " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_b89acfebac9e4fb98efd20a49659808e.stream.SupportAI_GraphRAG_pytgdocs_5698bff74d844534901cba9e1b3d55bf.1722466964295'}" + "{'job_name': 'load_documents_content_json_852f54bfd00a475fa4efc3ba9319f0ac',\n", + " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_852f54bfd00a475fa4efc3ba9319f0ac.stream.SupportAI_GraphRAG_pytgdocs_6a6331e3e5e248eaae389788c9bab325.1723217024268',\n", + " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_852f54bfd00a475fa4efc3ba9319f0ac.stream.SupportAI_GraphRAG_pytgdocs_6a6331e3e5e248eaae389788c9bab325.1723217024268'}" ] }, - "execution_count": 14, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -121,41 +121,31 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 22, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "import httpx\n", - "import base64\n", + "# import httpx\n", + "# import base64\n", "\n", "\n", - "def make_headers(conn: TigerGraphConnection):\n", - " tkn = base64.b64encode(f\"{conn.username}:{conn.password}\".encode()).decode()\n", - " headers = {\"Authorization\": f\"Basic {tkn}\"}\n", - " return headers\n", + "# def make_headers(conn: TigerGraphConnection):\n", + "# tkn = base64.b64encode(f\"{conn.username}:{conn.password}\".encode()).decode()\n", + "# headers = {\"Authorization\": f\"Basic {tkn}\"}\n", + "# return headers\n", "\n", "\n", - "httpx.get(\n", - " \"http://localhost:8001/GraphRAG_pytgdocs/consistency_status/graphrag\",\n", - " headers=make_headers(conn),\n", - ")\n", - "# conn.ai.forceConsistencyUpdate()" + "# httpx.get(\n", + "# \"http://localhost:8001/GraphRAG_pytgdocs/consistency_status/graphrag\",\n", + "# headers=make_headers(conn),\n", + "# timeout=None,\n", + "# )\n", + "# # conn.ai.forceConsistencyUpdate()" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -165,7 +155,7 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[16], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43masdf\u001b[49m\n", + "Cell \u001b[0;32mIn[23], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43masdf\u001b[49m\n", "\u001b[0;31mNameError\u001b[0m: name 'asdf' is not defined" ] } @@ -176,24 +166,39 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "for v in 
[\"Community\"]:\n", + " try:\n", + " conn.delVertices(v)\n", + " except:\n", + " pass\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'job_name': 'load_documents_content_json_b89acfebac9e4fb98efd20a49659808e',\n", - " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_b89acfebac9e4fb98efd20a49659808e.stream.SupportAI_GraphRAG_pytgdocs_5698bff74d844534901cba9e1b3d55bf.1722531204658',\n", - " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_b89acfebac9e4fb98efd20a49659808e.stream.SupportAI_GraphRAG_pytgdocs_5698bff74d844534901cba9e1b3d55bf.1722531204658'}" + "{'job_name': 'load_documents_content_json_3e62fb87723945ea9a0380956694b7ec',\n", + " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_3e62fb87723945ea9a0380956694b7ec.stream.SupportAI_GraphRAG_pytgdocs_cc751adab29643b28af1b7bf13b6515b.1723213722186',\n", + " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_3e62fb87723945ea9a0380956694b7ec.stream.SupportAI_GraphRAG_pytgdocs_cc751adab29643b28af1b7bf13b6515b.1723213722186'}" ] }, - "execution_count": 30, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "for v in [\"Document\", \"Content\", \"DocumentChunk\", \"Entity\",\"ResolvedEntity\"]:\n", + "# for v in [\"Document\", \"Content\", \"DocumentChunk\", \"Entity\",\"ResolvedEntity\"]:\n", + "# for v in [\"ResolvedEntity\"]:\n", "# for v in [\"ResolvedEntity\"]:\n", " try:\n", " conn.delVertices(v)\n", @@ -222,32 +227,6 @@ "\"\"\")" ] }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "ename": "KeyError", - "evalue": "'deleted_vertices'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[33], line 5\u001b[0m\n\u001b[1;32m 2\u001b[0m conn\u001b[38;5;241m.\u001b[39mgetToken()\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCommunity\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[1;32m 4\u001b[0m \u001b[38;5;66;03m# for v in [\"ResolvedEntity\"]:\u001b[39;00m\n\u001b[0;32m----> 5\u001b[0m \u001b[43mconn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdelVertices\u001b[49m\u001b[43m(\u001b[49m\u001b[43mv\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/.venv/ml/lib/python3.11/site-packages/pyTigerGraph/pyTigerGraphVertex.py:688\u001b[0m, in \u001b[0;36mpyTigerGraphVertex.delVertices\u001b[0;34m(self, vertexType, where, limit, sort, permanent, timeout)\u001b[0m\n\u001b[1;32m 685\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mand\u001b[39;00m timeout \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 686\u001b[0m url \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m (\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m?\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m isFirst \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m&\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtimeout=\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(timeout)\n\u001b[0;32m--> 
688\u001b[0m ret \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_delete\u001b[49m\u001b[43m(\u001b[49m\u001b[43murl\u001b[49m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdeleted_vertices\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[1;32m 690\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m logger\u001b[38;5;241m.\u001b[39mlevel \u001b[38;5;241m==\u001b[39m logging\u001b[38;5;241m.\u001b[39mDEBUG:\n\u001b[1;32m 691\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mreturn: \u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(ret))\n", - "\u001b[0;31mKeyError\u001b[0m: 'deleted_vertices'" - ] - } - ], - "source": [ - "conn.graphname = \"Cora\"\n", - "conn.getToken()\n", - "for v in [\"Community\"]:\n", - " # for v in [\"ResolvedEntity\"]:\n", - " conn.delVertices(v)" - ] - }, { "cell_type": "code", "execution_count": null, diff --git a/eventual-consistency-service/app/ecc_util.py b/eventual-consistency-service/app/ecc_util.py index 5656e219..bccadd77 100644 --- a/eventual-consistency-service/app/ecc_util.py +++ b/eventual-consistency-service/app/ecc_util.py @@ -1,5 +1,15 @@ from common.chunkers import character_chunker, regex_chunker, semantic_chunker -from common.config import doc_processing_config, embedding_service +from common.config import doc_processing_config, embedding_service, llm_config +from common.llm_services import ( + AWS_SageMaker_Endpoint, + AWSBedrock, + AzureOpenAI, + GoogleVertexAI, + Groq, + HuggingFaceEndpoint, + Ollama, + OpenAI, +) def get_chunker(): @@ -22,3 +32,24 @@ def get_chunker(): raise ValueError("Invalid chunker type") return chunker + + +def get_llm_service(): + if llm_config["completion_service"]["llm_service"].lower() == "openai": + llm_provider = OpenAI(llm_config["completion_service"]) + elif llm_config["completion_service"]["llm_service"].lower() == "azure": + llm_provider = AzureOpenAI(llm_config["completion_service"]) + elif llm_config["completion_service"]["llm_service"].lower() == "sagemaker": + llm_provider = AWS_SageMaker_Endpoint(llm_config["completion_service"]) + elif llm_config["completion_service"]["llm_service"].lower() == "vertexai": + llm_provider = GoogleVertexAI(llm_config["completion_service"]) + elif llm_config["completion_service"]["llm_service"].lower() == "bedrock": + llm_provider = AWSBedrock(llm_config["completion_service"]) + elif llm_config["completion_service"]["llm_service"].lower() == "groq": + llm_provider = Groq(llm_config["completion_service"]) + elif llm_config["completion_service"]["llm_service"].lower() == "ollama": + llm_provider = Ollama(llm_config["completion_service"]) + elif llm_config["completion_service"]["llm_service"].lower() == "huggingface": + llm_provider = HuggingFaceEndpoint(llm_config["completion_service"]) + + return llm_provider diff --git a/eventual-consistency-service/app/graphrag/community_summarizer.py b/eventual-consistency-service/app/graphrag/community_summarizer.py new file mode 100644 index 00000000..d250b1f3 --- /dev/null +++ b/eventual-consistency-service/app/graphrag/community_summarizer.py @@ -0,0 +1,138 @@ +import json + +from langchain.output_parsers import PydanticOutputParser +from langchain.prompts import ChatPromptTemplate +from langchain_core.prompts import PromptTemplate + +from common.llm_services import LLM_Model +from common.py_schemas import CommunitySummary + +# 
src: https://github.com/microsoft/graphrag/blob/main/graphrag/index/graph/extractors/summarize/prompts.py +SUMMARIZE_PROMPT = PromptTemplate.from_template(""" +You are a helpful assistant responsible for generating a comprehensive summary of the data provided below. +Given one or two entities, and a list of descriptions, all related to the same entity or group of entities. +Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions. +If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary. +Make sure it is written in third person, and include the entity names so we the have full context. + +####### +-Data- +Entities: {entity_name} +Description List: {description_list} +####### +Output: +""") + + +class CommunitySummarizer: + def __init__( + self, + llm_service: LLM_Model, + ): + self.llm_service = llm_service + + def _extract_kg_from_doc(self, doc, chain, parser): + try: + out = chain.invoke( + {"input": doc, "format_instructions": parser.get_format_instructions()} + ) + except Exception as e: + print("Error: ", e) + return {"nodes": [], "rels": []} + try: + if "```json" not in out.content: + json_out = json.loads(out.content.strip("content=")) + else: + json_out = json.loads( + out.content.split("```")[1].strip("```").strip("json").strip() + ) + + formatted_rels = [] + for rels in json_out["rels"]: + if isinstance(rels["source"], str) and isinstance(rels["target"], str): + formatted_rels.append( + { + "source": rels["source"], + "target": rels["target"], + "type": rels["relation_type"].replace(" ", "_").upper(), + "definition": rels["definition"], + } + ) + elif isinstance(rels["source"], dict) and isinstance( + rels["target"], str + ): + formatted_rels.append( + { + "source": rels["source"]["id"], + "target": rels["target"], + "type": rels["relation_type"].replace(" ", "_").upper(), + "definition": rels["definition"], + } + ) + elif isinstance(rels["source"], str) and isinstance( + rels["target"], dict + ): + formatted_rels.append( + { + "source": rels["source"], + "target": rels["target"]["id"], + "type": rels["relation_type"].replace(" ", "_").upper(), + "definition": rels["definition"], + } + ) + elif isinstance(rels["source"], dict) and isinstance( + rels["target"], dict + ): + formatted_rels.append( + { + "source": rels["source"]["id"], + "target": rels["target"]["id"], + "type": rels["relation_type"].replace(" ", "_").upper(), + "definition": rels["definition"], + } + ) + else: + raise Exception("Relationship parsing error") + formatted_nodes = [] + for node in json_out["nodes"]: + formatted_nodes.append( + { + "id": node["id"], + "type": node["node_type"].replace(" ", "_").capitalize(), + "definition": node["definition"], + } + ) + + # filter relationships and nodes based on allowed types + if self.strict_mode: + if self.allowed_vertex_types: + formatted_nodes = [ + node + for node in formatted_nodes + if node["type"] in self.allowed_vertex_types + ] + if self.allowed_edge_types: + formatted_rels = [ + rel + for rel in formatted_rels + if rel["type"] in self.allowed_edge_types + ] + return {"nodes": formatted_nodes, "rels": formatted_rels} + except: + print("Error Processing: ", out) + return {"nodes": [], "rels": []} + + async def summarize(self, name: str, text: list[str]) -> CommunitySummary: + # parser = PydanticOutputParser(pydantic_object=CommunitySummary) + structured_llm = 
self.llm_service.model.with_structured_output(CommunitySummary) + chain = SUMMARIZE_PROMPT | structured_llm + summary = await chain.ainvoke( + { + "entity_name": name, + "description_list": text, + # "format_instructions": parser.get_format_instructions(), + } + ) + # summary = self._extract_kg_from_doc(text, chain, parser) + # summary = None + return summary.summary diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py index 4403756d..d4e3a7d6 100644 --- a/eventual-consistency-service/app/graphrag/graph_rag.py +++ b/eventual-consistency-service/app/graphrag/graph_rag.py @@ -5,15 +5,16 @@ import httpx from aiochannel import Channel -from common.config import embedding_service -from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore -from common.extractors.BaseExtractor import BaseExtractor from graphrag import workers from graphrag.util import http_timeout, init, make_headers, stream_ids from pyTigerGraph import TigerGraphConnection -http_logs = logging.getLogger("httpx") -http_logs.setLevel(logging.WARNING) +from common.config import embedding_service +from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore +from common.extractors.BaseExtractor import BaseExtractor + +# http_logs = logging.getLogger("httpx") +# http_logs.setLevel(logging.WARNING) logger = logging.getLogger(__name__) consistency_checkers = {} @@ -209,7 +210,7 @@ async def resolve_entities( async with asyncio.TaskGroup() as grp: # for every entity async for entity_id in entity_chan: - print(f"***Etity ID from chan {entity_id}") + print(f"***Entity ID from chan {entity_id}", flush=True) grp.create_task( workers.resolve_entity(conn, upsert_chan, emb_store, entity_id) ) @@ -226,9 +227,115 @@ async def resolve_entities( res.raise_for_status() -async def communities(conn: TigerGraphConnection): - pass - # Setup +async def communities(conn: TigerGraphConnection, community_chan: Channel): + """ + Run louvain + """ + # first pass: Group ResolvedEntities into Communities + logger.info("Initializing Communities (first louvain pass)") + headers = make_headers(conn) + async with httpx.AsyncClient(timeout=None) as client: + res = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/graphrag_louvain_init", + params={"n_batches": 1}, + headers=headers, + ) + res.raise_for_status() + # get the modularity + async with httpx.AsyncClient(timeout=None) as client: + res = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/modularity", + params={"iteration": 1, "batch_num": 1}, + headers=headers, + ) + res.raise_for_status() + mod = res.json()["results"][0]["mod"] + print(f"****mod 1: {mod}", flush=True) + await community_chan.put(1) + + # nth pass: Iterate on Resolved Entities until modularity stops increasing + prev_mod = -10 + i = 0 + # for _ in range(1, 5): + prev_mod = 0 + while abs(prev_mod - mod) > 0.0000001 and prev_mod != 0: + prev_mod = mod + logger.info(f"Running louvain on Communities (iteration: {i})") + i += 1 + # louvain pass + async with httpx.AsyncClient(timeout=None) as client: + res = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/graphrag_louvain_communities", + params={"n_batches": 1}, + headers=headers, + ) + + res.raise_for_status() + + # get the modularity + async with httpx.AsyncClient(timeout=None) as client: + res = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/modularity", + params={"iteration": i + 1, "batch_num": 1}, + headers=headers, + ) + 
res.raise_for_status() + mod = res.json()["results"][0]["mod"] + print(f"*** mod {i+1}: {mod}", flush=True) + print(f"****** mod diff: {abs(prev_mod - mod)}", flush=True) + + # write iter to chan for layer to be processed + await community_chan.put(i + 1) + + # TODO: erase last run since it's ∆q to the run before it will be small + logger.info("closing communities chan") + community_chan.close() + + +async def stream_communities( + conn: TigerGraphConnection, + community_chan: Channel, + comm_process_chan: Channel, +): + """ + Streams Community IDs from the grpah for a given iteration (from the channel) + """ + logger.info("streaming communities") + + headers = make_headers(conn) + # TODO: + # can only do one layer at a time to ensure that every child community has their descriptions + async for i in community_chan: + # get the community from that layer + async with httpx.AsyncClient(timeout=None) as client: + resp = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/stream_community", + params={"iter": i}, + headers=headers, + ) + resp.raise_for_status() + comms = resp.json()["results"][0]["Comms"] + + for c in comms: + await comm_process_chan.put((i, c["v_id"])) + + logger.info("stream_communities done") + logger.info("closing comm_process_chan") + comm_process_chan.close() + + +async def summarize_communities( + conn: TigerGraphConnection, + comm_process_chan: Channel, + upsert_chan: Channel, +): + async with asyncio.TaskGroup() as tg: + async for c in comm_process_chan: + tg.create_task(workers.process_community(conn, upsert_chan, *c)) + break + + logger.info("closing upsert_chan") + upsert_chan.close() async def run(graphname: str, conn: TigerGraphConnection): @@ -245,7 +352,10 @@ async def run(graphname: str, conn: TigerGraphConnection): extractor, index_stores = await init(conn) init_start = time.perf_counter() - if False: + abc = True + abc = False + if abc: + logger.info("Doc Processing Start") docs_chan = Channel(1) embed_chan = Channel(100) upsert_chan = Channel(100) @@ -266,11 +376,13 @@ async def run(graphname: str, conn: TigerGraphConnection): extract(extract_chan, upsert_chan, embed_chan, extractor, conn) ) init_end = time.perf_counter() + logger.info("Doc Processing End") # Entity Resolution entity_start = time.perf_counter() - if False: + if abc: + logger.info("Entity Processing Start") entities_chan = Channel(100) upsert_chan = Channel(100) async with asyncio.TaskGroup() as grp: @@ -285,13 +397,35 @@ async def run(graphname: str, conn: TigerGraphConnection): ) grp.create_task(upsert(upsert_chan)) entity_end = time.perf_counter() + logger.info("Entity Processing End") # Community Detection community_start = time.perf_counter() if True: - await communities(conn) + # FIXME: delete community delete + for v in ["Community"]: + try: + conn.delVertices(v) + except: + pass + logger.info("Community Processing Start") + communities_chan = Channel(1) + upsert_chan = Channel(10) + comm_process_chan = Channel(100) + upsert_chan = Channel(100) + async with asyncio.TaskGroup() as grp: + # run louvain + grp.create_task(communities(conn, communities_chan)) + # get the communities + grp.create_task( + stream_communities(conn, communities_chan, comm_process_chan) + ) + # summarize each community + grp.create_task(summarize_communities(conn, comm_process_chan, upsert_chan)) + grp.create_task(upsert(upsert_chan)) community_end = time.perf_counter() + logger.info("Community Processing End") # Community Summarization end = time.perf_counter() diff --git 
a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index 74dbc56d..6876b5de 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -6,6 +6,9 @@ from glob import glob import httpx +from graphrag import workers +from pyTigerGraph import TigerGraphConnection + from common.config import ( doc_processing_config, embedding_service, @@ -17,36 +20,28 @@ from common.extractors import GraphExtractor, LLMEntityRelationshipExtractor from common.extractors.BaseExtractor import BaseExtractor from common.logs.logwriter import LogWriter -from graphrag import workers -from pyTigerGraph import TigerGraphConnection logger = logging.getLogger(__name__) http_timeout = httpx.Timeout(15.0) async def install_queries( - requried_queries: list[str], conn: TigerGraphConnection, n_workers=8 + requried_queries: list[str], + conn: TigerGraphConnection, ): # queries that are currently installed installed_queries = [q.split("/")[-1] for q in conn.getEndpoints(dynamic=True)] - tasks = [] - async with asyncio.TaskGroup() as grp: - for q in requried_queries: - # only install n queries at a time (n=n_workers) - async with asyncio.Semaphore(n_workers): - q_name = q.split("/")[-1] - # if the query is not installed, install it - if q_name not in installed_queries: - task = grp.create_task(workers.install_query(conn, q)) - tasks.append(task) - - for t in tasks: - res = t.result() - print(res) - # stop system if a required query doesn't install - if res["error"]: - raise Exception(res["message"]) + # doesn't need to be parallel since tg only does it one at a time + for q in requried_queries: + # only install n queries at a time (n=n_workers) + q_name = q.split("/")[-1] + # if the query is not installed, install it + if q_name not in installed_queries: + res = await workers.install_query(conn, q) + # stop system if a required query doesn't install + if res["error"]: + raise Exception(res["message"]) async def init_embedding_index(s: MilvusEmbeddingStore, vertex_field: str): @@ -69,9 +64,14 @@ async def init( "common/gsql/graphRAG/StreamDocContent", "common/gsql/graphRAG/SetEpochProcessing", "common/gsql/graphRAG/ResolveRelationships", + "common/gsql/graphRAG/get_community_children", + "common/gsql/graphRAG/louvain/graphrag_louvain_init", + "common/gsql/graphRAG/louvain/graphrag_louvain_communities", + "common/gsql/graphRAG/louvain/modularity", + "common/gsql/graphRAG/louvain/stream_community", ] # add louvain to queries - q = [x.split('.gsql')[0] for x in glob("common/gsql/graphRAG/louvain/*")] + q = [x.split(".gsql")[0] for x in glob("common/gsql/graphRAG/louvain/*")] requried_queries.extend(q) await install_queries(requried_queries, conn) @@ -246,3 +246,24 @@ async def upsert_edge( f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers ) res.raise_for_status() + + +async def get_commuinty_children(conn, i: int, c: str): + headers = make_headers(conn) + async with httpx.AsyncClient(timeout=None) as client: + resp = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/get_community_children", + params={"comm": c, "iter": i}, + headers=headers, + ) + resp.raise_for_status() + descrs = [] + for d in resp.json()["results"][0]["children"]: + desc = d["attributes"]["description"] + if len(desc) == 0: + desc = d["v_id"] + + descrs.append(desc) + + print(">>>", descrs, flush=True) + return descrs diff --git a/eventual-consistency-service/app/graphrag/workers.py 
b/eventual-consistency-service/app/graphrag/workers.py index 4c1174df..22980d96 100644 --- a/eventual-consistency-service/app/graphrag/workers.py +++ b/eventual-consistency-service/app/graphrag/workers.py @@ -6,14 +6,15 @@ import ecc_util import httpx from aiochannel import Channel +from graphrag import community_summarizer, util +from langchain_community.graphs.graph_document import GraphDocument, Node +from pyTigerGraph import TigerGraphConnection + from common.config import milvus_config from common.embeddings.embedding_services import EmbeddingModel from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore from common.extractors.BaseExtractor import BaseExtractor from common.logs.logwriter import LogWriter -from graphrag import util -from langchain_community.graphs.graph_document import GraphDocument, Node -from pyTigerGraph import TigerGraphConnection vertex_field = milvus_config.get("vertex_field", "vertex_id") @@ -298,14 +299,14 @@ async def resolve_entity( f"aget_k_closest should, minimally, return the entity itself.\n{results}" ) raise Exception() - if entity_id == "Dataframe": - print("result:", entity_id, results) + # FIXME: deleteme + # if entity_id == "Dataframe": + # print("result:", entity_id, results) # merge all entities into the ResolvedEntity vertex # use the longest v_id as the resolved entity's v_id - resolved_entity_id = "" + resolved_entity_id = entity_id for v in results: - # v_id = v.metadata["vertex_id"] if len(v) > len(resolved_entity_id): resolved_entity_id = v @@ -318,7 +319,7 @@ async def resolve_entity( "ResolvedEntity", # v_type resolved_entity_id, # v_id { # attrs - "description": [] + # "id": resolved_entity_id, }, ), ) @@ -340,3 +341,49 @@ async def resolve_entity( ), ) ) + + +async def process_community( + conn: TigerGraphConnection, + upsert_chan: Channel, + i: int, + c: str, +): + """ + https://github.com/microsoft/graphrag/blob/main/graphrag/prompt_tune/template/community_report_summarization.py + + Get children verts (Entity for layer-1 Communities, Community otherwise) + if the commuinty only has one child, use its description -- no need to summarize + + embed summaries + """ + print(i, c, flush=True) + + # get the children of the community + children = await util.get_commuinty_children(conn, i, c) + if i == 1: + tmp = [] + for c in children: + tmp.extend(c) + children = list(filter(lambda x: len(x) > 0, tmp)) + print(">>>", children, flush=True) + llm = ecc_util.get_llm_service() + summarizer = community_summarizer.CommunitySummarizer(llm) + summary = await summarizer.summarize(c, children) + await upsert_chan.put((upsert_summary, (conn,summary))) + + +async def upsert_summary(conn: TigerGraphConnection, summary: str): + print(f"SUMMARY:> {summary}", flush=True) + + # vertex_id = vertex_id.replace(" ", "_") + # attrs = map_attrs(attributes) + # data = json.dumps({"vertices": {vertex_type: {vertex_id: attrs}}}) + # headers = make_headers(conn) + # async with httpx.AsyncClient(timeout=http_timeout) as client: + # res = await client.post( + # f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers + # ) + # + # res.raise_for_status() + # diff --git a/eventual-consistency-service/requirements.txt b/eventual-consistency-service/requirements.txt index 3bc0dae0..5d566dd1 100644 --- a/eventual-consistency-service/requirements.txt +++ b/eventual-consistency-service/requirements.txt @@ -7,6 +7,7 @@ appdirs==1.4.4 argon2-cffi==23.1.0 argon2-cffi-bindings==21.2.0 async-timeout==4.0.3 +asyncer==0.0.7 attrs==23.1.0 
azure-core==1.30.1 azure-storage-blob==12.19.1 @@ -24,12 +25,15 @@ cryptography==42.0.5 dataclasses-json==0.5.14 distro==1.8.0 docker-pycreds==0.4.0 +docstring_parser==0.16 emoji==2.8.0 environs==9.5.0 exceptiongroup==1.1.3 fastapi==0.103.1 +filelock==3.15.4 filetype==1.2.0 frozenlist==1.4.0 +fsspec==2024.6.1 gitdb==4.0.11 GitPython==3.1.40 google-api-core==2.14.0 @@ -51,24 +55,28 @@ h11==0.14.0 httpcore==0.18.0 httptools==0.6.0 httpx==0.25.0 -huggingface_hub==0.23.0 +huggingface-hub==0.23.0 idna==3.4 +iniconfig==2.0.0 isodate==0.6.1 +jiter==0.5.0 jmespath==1.0.1 joblib==1.3.2 jq==1.6.0 jsonpatch==1.33 jsonpointer==2.4 -langchain==0.1.12 -langchain-community==0.0.28 -langchain-core==0.1.49 -langchain-experimental==0.0.54 +langchain==0.2.12 +langchain-community==0.2.11 +langchain-core==0.2.29 +langchain-experimental==0.0.64 langchain-groq==0.1.3 -langchain-text-splitters==0.0.1 +langchain-openai==0.1.20 +langchain-text-splitters==0.2.2 langchainhub==0.1.14 langdetect==1.0.9 langgraph==0.0.40 -langsmith==0.1.24 +langsmith==0.1.98 +Levenshtein==0.25.1 lxml==4.9.3 marshmallow==3.20.1 minio==7.2.5 @@ -76,11 +84,12 @@ multidict==6.0.4 mypy-extensions==1.0.0 nltk==3.8.1 numpy==1.26.4 -openai==1.3.7 +openai==1.40.2 orjson==3.9.15 packaging==23.2 pandas==2.1.1 pathtools==0.1.2 +pluggy==1.5.0 prometheus_client==0.20.0 proto-plus==1.22.3 protobuf==4.24.4 @@ -94,15 +103,16 @@ pydantic==2.3.0 pydantic_core==2.6.3 pygit2==1.13.2 pymilvus==2.3.6 +pytest==8.2.0 python-dateutil==2.8.2 python-dotenv==1.0.0 python-iso639==2023.6.15 python-magic==0.4.27 pyTigerDriver==1.0.15 -pyTigerGraph==1.6.1 +pyTigerGraph==1.6.5 pytz==2023.3.post1 PyYAML==6.0.1 -rapidfuzz==3.4.0 +rapidfuzz==3.9.6 regex==2023.10.3 requests==2.31.0 rsa==4.9 @@ -118,12 +128,12 @@ SQLAlchemy==2.0.20 starlette==0.27.0 tabulate==0.9.0 tenacity==8.2.3 -tiktoken==0.5.1 +tiktoken==0.7.0 tqdm==4.66.1 types-requests==2.31.0.6 types-urllib3==1.26.25.14 typing-inspect==0.9.0 -typing_extensions==4.7.1 +typing_extensions==4.12.2 tzdata==2023.3 ujson==5.9.0 unstructured==0.10.23 From ef842ba278fd8cadd9b5be54dd6800040386cb8b Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Mon, 12 Aug 2024 16:50:33 -0400 Subject: [PATCH 10/53] graphrag pipeline done --- common/embeddings/milvus_embedding_store.py | 3 - .../gsql/graphRAG/communities_have_desc.gsql | 14 ++ .../louvain_old/louvain_1_first_pass.gsql | 176 -------------- .../louvain_old/louvain_2_other_passes.gsql | 217 ------------------ .../louvain_3_final_community.gsql | 44 ---- .../louvain_4_modularity_1_for_pass.gsql | 39 ---- .../louvain_4_modularity_2_final.gsql | 52 ----- .../graphRAG/louvain_old/louvain_5_reset.gsql | 13 -- copilot/docs/notebooks/graphrag.ipynb | 82 +++++-- .../app/graphrag/community_summarizer.py | 110 +-------- .../app/graphrag/graph_rag.py | 97 ++++---- .../app/graphrag/util.py | 30 ++- .../app/graphrag/workers.py | 58 ++--- 13 files changed, 196 insertions(+), 739 deletions(-) create mode 100644 common/gsql/graphRAG/communities_have_desc.gsql delete mode 100644 common/gsql/graphRAG/louvain_old/louvain_1_first_pass.gsql delete mode 100644 common/gsql/graphRAG/louvain_old/louvain_2_other_passes.gsql delete mode 100644 common/gsql/graphRAG/louvain_old/louvain_3_final_community.gsql delete mode 100644 common/gsql/graphRAG/louvain_old/louvain_4_modularity_1_for_pass.gsql delete mode 100644 common/gsql/graphRAG/louvain_old/louvain_4_modularity_2_final.gsql delete mode 100644 common/gsql/graphRAG/louvain_old/louvain_5_reset.gsql 
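Note on the pipeline wired up in this patch: every stage (extraction, entity resolution, community detection, community summarization) is connected the same way. Bounded aiochannel Channels link the stages, each stage runs as a task inside an asyncio.TaskGroup (Python 3.11+), and a producer closes its output channel when it is done so the consumer's async-for loop terminates. The snippet below is a minimal, self-contained sketch of that producer/consumer pattern; the function names and numbers are illustrative only and are not code from this patch.

import asyncio

from aiochannel import Channel


async def produce(out_chan: Channel):
    # producer: put work items on the channel, then close it so consumers stop
    for i in range(5):
        await out_chan.put(i)
    out_chan.close()


async def consume(in_chan: Channel, results: list[int]):
    # consumer: async-for drains the channel until it is closed and empty
    async for item in in_chan:
        results.append(item * 2)


async def main():
    chan = Channel(2)  # a bounded channel provides backpressure between stages
    results: list[int] = []
    async with asyncio.TaskGroup() as grp:
        grp.create_task(produce(chan))
        grp.create_task(consume(chan, results))
    print(results)  # [0, 2, 4, 6, 8]


if __name__ == "__main__":
    asyncio.run(main())

The same shape appears below in graph_rag.py, where communities feed stream_communities, which feeds summarize_communities, which feeds the upsert and embed workers.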
diff --git a/common/embeddings/milvus_embedding_store.py b/common/embeddings/milvus_embedding_store.py index fd57c783..7384e76f 100644 --- a/common/embeddings/milvus_embedding_store.py +++ b/common/embeddings/milvus_embedding_store.py @@ -606,9 +606,6 @@ def edit_dist_check(self, a: str, b: str, edit_dist_threshold: float, p=False): async def aget_k_closest( self, v_id: str, k=15, threshold_similarity=0.90, edit_dist_threshold_pct=0.75 ) -> list[Document]: - """ - asdf - """ threshold_dist = 1 - threshold_similarity # asyncify necessary funcs diff --git a/common/gsql/graphRAG/communities_have_desc.gsql b/common/gsql/graphRAG/communities_have_desc.gsql new file mode 100644 index 00000000..f5cda70e --- /dev/null +++ b/common/gsql/graphRAG/communities_have_desc.gsql @@ -0,0 +1,14 @@ +CREATE DISTRIBUTED QUERY communities_have_desc(UINT iter) SYNTAX V2{ + SumAccum @@descrs; + Comms = {Community.*}; + Comms = SELECT c FROM Comms:c + WHERE c.iteration == iter + ACCUM + IF length(c.description) > 0 THEN + @@descrs += 1 + END; + + + PRINT (@@descrs == Comms.size()) as all_have_desc; + PRINT @@descrs, Comms.size(); +} diff --git a/common/gsql/graphRAG/louvain_old/louvain_1_first_pass.gsql b/common/gsql/graphRAG/louvain_old/louvain_1_first_pass.gsql deleted file mode 100644 index 0251909f..00000000 --- a/common/gsql/graphRAG/louvain_old/louvain_1_first_pass.gsql +++ /dev/null @@ -1,176 +0,0 @@ -CREATE OR REPLACE DISTRIBUTED QUERY graphRAG_louvain_1( - UINT max_hop = 10, - UINT batch_num = 12, - UINT sample_edge_num = 100 -) { - - TYPEDEF TUPLE community, STRING ext_vid> MyTuple; //--> this should be Community, I think - SumAccum @@m; // the sum of the weights of all the links in the network - MinAccum> @{community_id_attribute_name}; // the community ID of the node - MinAccum @community_vid; // the community ID of the node - SumAccum @k; // the sum of the weights of the links incident to the node - SumAccum @k_in; // the sum of the weights of the links inside the previous community of the node - SumAccum @k_self_loop; // the weight of the self-loop link - MapAccum, SumAccum> @community_k_in_map; // the community of the neighbors of the nodes -> the sum of the weights of the links inside the community - MapAccum, SumAccum> @@community_sum_total_map; // community ID C -> the sum of the weights of the links incident to nodes in C - SumAccum @community_sum_total; // the sum of the weights of the links incident to nodes in the community of the node - MapAccum, SumAccum> @@community_sum_in_map; // community ID -> the sum of the weights of the links inside the community - MapAccum, MapAccum, SumAccum>> @@source_target_k_in_map; // source community ID -> (target community ID -> the sum of the weights of the links from the source community to the target community) - SumAccum @delta_Q_remove; // delta Q to remove the node from the previous community - MaxAccum @best_move; // best move of the node with the highest delta Q to move the isolated node into the new community - MaxAccum @@min_double; // used to reset the @best_move - SumAccum @@move_cnt; - OrAccum @to_change_community; - SumAccum @batch_id; - SumAccum @vid; - - DOUBLE wt = 1.0; - - // Initialization - All_Nodes = {{ResolvedEntity.*}}; - All_Nodes = SELECT s FROM All_Nodes:s -({relation_edge_name}:e)- :t - ACCUM @@m += wt / 2, - s.@k += wt, - IF s == t THEN // self-loop link - js.@k_self_loop += wt - END - POST-ACCUM - s.@{community_id_attribute_name} = s, - s.@community_vid = to_string(s.id), - s.@vid = getvid(s), - s.@batch_id = s.@vid % batch_num; 
- - IF @@m < 0.00000000001 THEN - PRINT "Warning: the sum of the weights in the edges should be greater than zero!"; - RETURN; - END; - - // Local moving - INT hop = 0; - Candidates = All_Nodes; - WHILE Candidates.size() > 0 AND hop < max_hop DO - hop = hop + 1; - LOG(TRUE, hop); - IF hop == 1 THEN // first iteration - ChangedNodes = SELECT s FROM Candidates:s -({relation_edge_name}:e)- :t - WHERE s.@{community_id_attribute_name} != t.@{community_id_attribute_name} - ACCUM s.@best_move += MyTuple(1 - s.@k * t.@k / (2 * @@m), t.@{community_id_attribute_name}, t.@community_vid) - POST-ACCUM - IF s.@best_move.delta_Q_add > 0 THEN // the gain (delta Q) is positive - s.@to_change_community = TRUE - END - HAVING s.@to_change_community == TRUE; - - ELSE // remaining iterations - // Calculate sum_total - Tmp = SELECT s FROM All_Nodes:s - POST-ACCUM - @@community_sum_total_map += (s.@{community_id_attribute_name} -> s.@k); - Tmp = SELECT s FROM All_Nodes:s - POST-ACCUM - s.@community_sum_total = @@community_sum_total_map.get(s.@{community_id_attribute_name}); - - @@community_sum_total_map.clear(); - // Find the best move - ChangedNodes = {{}}; - FOREACH batch_id IN RANGE[0, batch_num-1] DO - LOG(TRUE, batch_id); - // Calculate the delta Q to remove the node from the previous community - Nodes = SELECT s FROM Candidates:s -({relation_edge_name}:e)- :t - WHERE s.@batch_id == batch_id - ACCUM - IF s.@{community_id_attribute_name} == t.@{community_id_attribute_name} THEN - s.@k_in += wt - ELSE - s.@community_k_in_map += (t.@{community_id_attribute_name} -> wt) - END - POST-ACCUM - s.@delta_Q_remove = 2 * s.@k_self_loop - 2 * s.@k_in + s.@k * (s.@community_sum_total - s.@k) / @@m, - s.@k_in = 0, - s.@best_move = MyTuple(@@min_double, s, to_string(s.id)) // reset the delta_Q_add - ; - - // Find the best move - Nodes = SELECT s FROM Nodes:s -({relation_edge_name}:e)- :t - //SAMPLE sample_edge_num EDGE WHEN s.outdegree("{relation_edge_name}") > sample_edge_num - WHERE s.@{community_id_attribute_name} != t.@{community_id_attribute_name} - ACCUM DOUBLE delta_Q_add = 2 * s.@community_k_in_map.get(t.@{community_id_attribute_name}) - s.@k * t.@community_sum_total / @@m, - s.@best_move += MyTuple(delta_Q_add, t.@{community_id_attribute_name}, t.@community_vid) - POST-ACCUM - IF s.@delta_Q_remove + s.@best_move.delta_Q_add > 0 THEN // the gain (delta Q) is positive - s.@to_change_community = TRUE - END, - s.@community_k_in_map.clear() - HAVING s.@to_change_community == TRUE; - - ChangedNodes = ChangedNodes UNION Nodes; - END; - END; - // If two nodes swap, only change the community of one of them - SwapNodes = SELECT s FROM ChangedNodes:s -({relation_edge_name}:e)- :t - WHERE s.@best_move.community == t.@{community_id_attribute_name} - AND t.@to_change_community == TRUE - AND t.@best_move.community == s.@{community_id_attribute_name} - // only change the one with larger delta Q or the one with smaller @vid if delta Q are the same - AND ( - s.@delta_Q_remove + s.@best_move.delta_Q_add < t.@delta_Q_remove + t.@best_move.delta_Q_add - OR ( - abs((s.@delta_Q_remove + s.@best_move.delta_Q_add) - (t.@delta_Q_remove + t.@best_move.delta_Q_add)) < 0.00000000001 - AND s.@vid > t.@vid - ) - ) - POST-ACCUM - s.@to_change_community = FALSE; - - ChangedNodes = ChangedNodes MINUS SwapNodes; - - // Place each node of ChangedNodes in the community in which the gain is maximum - ChangedNodes = SELECT s FROM ChangedNodes:s - POST-ACCUM - s.@{community_id_attribute_name} = s.@best_move.community, - s.@community_vid = 
s.@best_move.ext_vid, - s.@to_change_community = FALSE; - - @@move_cnt += ChangedNodes.size(); - - // Get all neighbours of the changed node that do not belong to the node’s new community - Candidates = SELECT t FROM ChangedNodes:s -({relation_edge_name}:e)- :t - WHERE t.@{community_id_attribute_name} != s.@{community_id_attribute_name}; - END; - - PRINT @@move_cnt AS Delta; - - // Coarsening - UINT new_layer = 0; - @@community_sum_total_map.clear(); - Tmp = - SELECT s FROM All_Nodes:s -({relation_edge_name}:e)- :t - ACCUM - IF s.@{community_id_attribute_name} == t.@{community_id_attribute_name} THEN - @@community_sum_in_map += (s.@{community_id_attribute_name} -> wt) - END - POST-ACCUM - //f_belongs_to.println(s.id, s.@{community_id_attribute_name}, new_layer), - INSERT INTO {belongs_to_edge_name} VALUES (s, str_to_int(s.@community_vid), new_layer), - IF @@community_sum_in_map.containsKey(s) THEN - //f_links_to.println(s.id, s.id, @@community_sum_in_map.get(s), new_layer) - INSERT INTO {links_to_edge_name} VALUES (s,s, (new_layer -> @@community_sum_in_map.get(s))) - END; - - @@community_sum_in_map.clear(); - - Tmp = SELECT s FROM All_Nodes:s -({relation_edge_name}:e)- :t - ACCUM - IF s.@{community_id_attribute_name} != t.@{community_id_attribute_name} THEN - @@source_target_k_in_map += (s.@{community_id_attribute_name} -> (t.@{community_id_attribute_name} -> wt)) - END - POST-ACCUM - IF @@source_target_k_in_map.containsKey(s) THEN - FOREACH (target_community, k_in) IN @@source_target_k_in_map.get(s) DO - //f_links_to.println(s.id, target_community, k_in, new_layer) - INSERT INTO {links_to_edge_name} VALUES (s,target_community, (new_layer -> k_in)) - END - END; - - @@source_target_k_in_map.clear(); -} diff --git a/common/gsql/graphRAG/louvain_old/louvain_2_other_passes.gsql b/common/gsql/graphRAG/louvain_old/louvain_2_other_passes.gsql deleted file mode 100644 index 231631d6..00000000 --- a/common/gsql/graphRAG/louvain_old/louvain_2_other_passes.gsql +++ /dev/null @@ -1,217 +0,0 @@ -USE GRAPH {graph_name} -DROP QUERY {query_name} -CREATE OR REPLACE DISTRIBUTED QUERY graphRAG_louvain_2( - UINT layer = 0, - UINT max_hop = 10, - UINT batch_num = 1 -) FOR GRAPH {graph_name} SYNTAX v1 {{ - TYPEDEF TUPLE community, STRING ext_vid> MyTuple; - SumAccum @@m; // the sum of the weights of all the links in the network - MinAccum> @{community_id_attribute_name}; // the community ID of the node - MinAccum @community_vid; // the community ID of the node - SumAccum @k; // the sum of the weights of the links incident to the node - SumAccum @k_in; // the sum of the weights of the links inside the previous community of the node - SumAccum @k_self_loop; // the weight of the self-loop link - MapAccum, SumAccum> @community_k_in_map; // the community of the neighbors of the nodes -> the sum of the weights of the links inside the community - MapAccum, SumAccum> @@community_sum_total_map; // community ID C -> the sum of the weights of the links incident to nodes in C - SumAccum @community_sum_total; // the sum of the weights of the links incident to nodes in the community of the node - MapAccum, SumAccum> @@community_sum_in_map; // community ID -> the sum of the weights of the links inside the community - MapAccum, MapAccum, SumAccum>> @@source_target_k_in_map; // source community ID -> (target community ID -> the sum of the weights of the links from the source community to the target community) - SumAccum @delta_Q_remove; // delta Q to remove the node from the previous community - MaxAccum @best_move; // best 
move of the node with the highest delta Q to move the isolated node into the new community - MaxAccum @@min_double; // used to reset the @best_move - SumAccum @@move_cnt; - OrAccum @to_change_community; - SumAccum @batch_id; - SumAccum @vid; - SumAccum @@links_to_check; - - // Initialization - LOG(TRUE, "Query started!"); - All_Nodes = {{{entity_vertex_name}.*}}; - _tmp = - SELECT s - FROM All_Nodes:s -({links_to_edge_name}:e)- :t - ACCUM - @@links_to_check += 1; - - All_Nodes = - SELECT s - FROM All_Nodes:s -({links_to_edge_name}:e)- :t - WHERE e.layer_weight_map.containsKey(layer) - ACCUM DOUBLE weight = e.layer_weight_map.get(layer), - @@m += weight / 2, - s.@k += weight, - IF s == t THEN // self-loop link - s.@k_self_loop += weight - END - POST-ACCUM - s.@{community_id_attribute_name} = s, - s.@community_vid = to_string(s.id), - s.@vid = getvid(s), - s.@batch_id = s.@vid % batch_num - ; - LOG(TRUE, All_Nodes.size()); - IF @@m < 0.00000000001 THEN - PRINT "Warning: the sum of the weights in the edges should be greater than zero!"; - RETURN; - END; - - // Local moving - INT hop = 0; - Candidates = All_Nodes; - WHILE Candidates.size() > 0 AND hop < max_hop DO - hop = hop + 1; - LOG(TRUE, hop); - IF hop == 1 THEN // first iteration - ChangedNodes = - SELECT s - FROM Candidates:s -({links_to_edge_name}:e)- :t - WHERE e.layer_weight_map.containsKey(layer) - AND s.@{community_id_attribute_name} != t.@{community_id_attribute_name} - ACCUM s.@best_move += MyTuple(1 - s.@k * t.@k / (2 * @@m), t.@{community_id_attribute_name}, t.@community_vid) - POST-ACCUM - IF s.@best_move.delta_Q_add > 0 THEN // the gain (delta Q) is positive - s.@to_change_community = TRUE - END - HAVING s.@to_change_community == TRUE - ; - ELSE // remaining iterations - // Calculate sum_total - Tmp = - SELECT s - FROM All_Nodes:s - POST-ACCUM - @@community_sum_total_map += (s.@{community_id_attribute_name} -> s.@k) - ; - Tmp = - SELECT s - FROM All_Nodes:s - POST-ACCUM - s.@community_sum_total = @@community_sum_total_map.get(s.@{community_id_attribute_name}) - ; - LOG(TRUE, @@community_sum_total_map.size()); - @@community_sum_total_map.clear(); - // Find the best move - ChangedNodes = {{}}; - FOREACH batch_id IN RANGE[0, batch_num-1] DO - LOG(TRUE, batch_id); - // Calculate the delta Q to remove the node from the previous community - Nodes = - SELECT s - FROM Candidates:s -({links_to_edge_name}:e)- :t - WHERE e.layer_weight_map.containsKey(layer) - AND s.@batch_id == batch_id - ACCUM DOUBLE weight = e.layer_weight_map.get(layer), - IF s.@{community_id_attribute_name} == t.@{community_id_attribute_name} THEN - s.@k_in += weight - ELSE - s.@community_k_in_map += (t.@{community_id_attribute_name} -> weight) - END - POST-ACCUM - s.@delta_Q_remove = 2 * s.@k_self_loop - 2 * s.@k_in + s.@k * (s.@community_sum_total - s.@k) / @@m, - s.@k_in = 0, - s.@best_move = MyTuple(@@min_double, s, to_string(s.id)) // reset the delta_Q_add - ; - // Find the best move - Nodes = - SELECT s - FROM Nodes:s -({links_to_edge_name}:e)- :t - WHERE e.layer_weight_map.containsKey(layer) - AND s.@{community_id_attribute_name} != t.@{community_id_attribute_name} - ACCUM DOUBLE delta_Q_add = 2 * s.@community_k_in_map.get(t.@{community_id_attribute_name}) - s.@k * t.@community_sum_total / @@m, - s.@best_move += MyTuple(delta_Q_add, t.@{community_id_attribute_name}, t.@community_vid) - POST-ACCUM - IF s.@delta_Q_remove + s.@best_move.delta_Q_add > 0 THEN // the gain (delta Q) is positive - s.@to_change_community = TRUE - END, - s.@community_k_in_map.clear() 
- HAVING s.@to_change_community == TRUE - ; - ChangedNodes = ChangedNodes UNION Nodes; - END; - END; - // If two nodes swap, only change the community of one of them - SwapNodes = - SELECT s - FROM ChangedNodes:s -({links_to_edge_name}:e)- :t - WHERE e.layer_weight_map.containsKey(layer) - AND s.@best_move.community == t.@{community_id_attribute_name} - AND t.@to_change_community == TRUE - AND t.@best_move.community == s.@{community_id_attribute_name} - // only change the one with larger delta Q or the one with smaller @vid if delta Q are the same - AND (s.@delta_Q_remove + s.@best_move.delta_Q_add < t.@delta_Q_remove + t.@best_move.delta_Q_add - OR (abs((s.@delta_Q_remove + s.@best_move.delta_Q_add) - (t.@delta_Q_remove + t.@best_move.delta_Q_add)) < 0.00000000001 - AND s.@vid > t.@vid)) - POST-ACCUM - s.@to_change_community = FALSE - ; - LOG(TRUE, SwapNodes.size()); - ChangedNodes = ChangedNodes MINUS SwapNodes; - LOG(TRUE, ChangedNodes.size()); - // Place each node of ChangedNodes in the community in which the gain is maximum - ChangedNodes = - SELECT s - FROM ChangedNodes:s - POST-ACCUM - s.@{community_id_attribute_name} = s.@best_move.community, - s.@community_vid = s.@best_move.ext_vid, - s.@to_change_community = FALSE - ; - - @@move_cnt += ChangedNodes.size(); - // Get all neighbours of the changed node that do not belong to the node’s new community - Candidates = - SELECT t - FROM ChangedNodes:s -({links_to_edge_name}:e)- :t - WHERE e.layer_weight_map.containsKey(layer) - AND t.@{community_id_attribute_name} != s.@{community_id_attribute_name} - ; - LOG(TRUE, Candidates.size()); - END; - - PRINT @@move_cnt AS Delta; - - // Coarsening - LOG(TRUE, "Coarsening"); - UINT new_layer = layer + 1; - @@community_sum_total_map.clear(); - Tmp = - SELECT s - FROM All_Nodes:s -({links_to_edge_name}:e)- :t - WHERE e.layer_weight_map.containsKey(layer) - ACCUM IF s.@{community_id_attribute_name} == t.@{community_id_attribute_name} THEN - DOUBLE weight = e.layer_weight_map.get(layer), - @@community_sum_in_map += (s.@{community_id_attribute_name} -> weight) - END - POST-ACCUM - //f_belongs_to.println(s.id, s.@{community_id_attribute_name}, new_layer), - INSERT INTO {belongs_to_edge_name} VALUES (s, str_to_int(s.@community_vid), new_layer), - IF @@community_sum_in_map.containsKey(s) THEN - //f_links_to.println(s.id, s.id, @@community_sum_in_map.get(s), new_layer) - INSERT INTO {links_to_edge_name} VALUES (s,s, (new_layer -> @@community_sum_in_map.get(s))) - END - ; - LOG(TRUE, @@community_sum_in_map.size()); - @@community_sum_in_map.clear(); - Tmp = - SELECT s - FROM All_Nodes:s -({links_to_edge_name}:e)- :t - WHERE e.layer_weight_map.containsKey(layer) - ACCUM DOUBLE weight = e.layer_weight_map.get(layer), - IF s.@{community_id_attribute_name} != t.@{community_id_attribute_name} THEN - @@source_target_k_in_map += (s.@{community_id_attribute_name} -> (t.@{community_id_attribute_name} -> weight)) - END - POST-ACCUM - IF @@source_target_k_in_map.containsKey(s) THEN - FOREACH (target_community, k_in) IN @@source_target_k_in_map.get(s) DO - //f_links_to.println(s.uniq_id, target_community, k_in, new_layer) - INSERT INTO {links_to_edge_name} VALUES (s,target_community, (new_layer -> k_in)) - END - END - ; - LOG(TRUE, @@source_target_k_in_map.size()); - @@source_target_k_in_map.clear(); - PRINT @@links_to_check; - LOG(TRUE, "Query finished!"); -}} diff --git a/common/gsql/graphRAG/louvain_old/louvain_3_final_community.gsql b/common/gsql/graphRAG/louvain_old/louvain_3_final_community.gsql deleted file mode 
100644 index 75cbad7e..00000000 --- a/common/gsql/graphRAG/louvain_old/louvain_3_final_community.gsql +++ /dev/null @@ -1,44 +0,0 @@ -USE GRAPH {graph_name} -DROP QUERY {query_name} -CREATE OR REPLACE DISTRIBUTED QUERY graphRAG_louvain_3( - UINT top_layer = 2 -) FOR GRAPH {graph_name} SYNTAX v1 {{ - MinAccum @{community_id_attribute_name}; // the community ID of the node - INT layer = top_layer; - - // Initialization - LOG(TRUE, "Query started!"); - All_Nodes = {{{entity_vertex_name}.*}}; - - // Top layer - Nodes = - SELECT t - FROM All_Nodes:s -(reverse_{belongs_to_edge_name}:e)- :t - WHERE layer IN e.layer_set - ACCUM t.@{community_id_attribute_name} = to_string(s.id) - ; - LOG(TRUE, layer, Nodes.size()); - - // Other layers - WHILE Nodes.size() > 0 AND layer > 0 DO - layer = layer - 1; - Nodes = - SELECT t - FROM Nodes:s -(reverse_{belongs_to_edge_name}:e)- :t - WHERE layer IN e.layer_set - ACCUM t.@{community_id_attribute_name} = s.@{community_id_attribute_name} - ; - LOG(TRUE, layer, Nodes.size()); - END; - - // Write to the file - Nodes = - SELECT s - FROM Nodes:s - POST-ACCUM - //f.println(s.uniq_id, s.@{community_id_attribute_name}) - s.{community_id_attribute_name} = s.@{community_id_attribute_name} - - ; - LOG(TRUE, "Query finished!"); -}} diff --git a/common/gsql/graphRAG/louvain_old/louvain_4_modularity_1_for_pass.gsql b/common/gsql/graphRAG/louvain_old/louvain_4_modularity_1_for_pass.gsql deleted file mode 100644 index 0058d0ee..00000000 --- a/common/gsql/graphRAG/louvain_old/louvain_4_modularity_1_for_pass.gsql +++ /dev/null @@ -1,39 +0,0 @@ -USE GRAPH {graph_name} -DROP QUERY {query_name} -CREATE OR REPLACE DISTRIBUTED QUERY graphRAG_louvain_4a( - UINT layer=0 -) FOR GRAPH {graph_name} SYNTAX v1 {{ - SumAccum @@sum_weight; // the sum of the weights of all the links in the network - MapAccum, SumAccum> @@community_total_weight_map; // community ID C -> the sum of the weights of the links incident to nodes in C - MapAccum, SumAccum> @@community_in_weight_map; // community ID -> the sum of the weights of the links inside the community - SumAccum @@modularity; - - All_Nodes = {{{entity_vertex_name}.*}}; - All_Nodes = - SELECT s - FROM All_Nodes:s -({links_to_edge_name}:e)- :t - WHERE e.layer_weight_map.containsKey(layer) - ACCUM DOUBLE weight = e.layer_weight_map.get(layer), - IF s == t THEN - @@community_in_weight_map += (s -> weight) - END, - @@community_total_weight_map += (s -> weight), - @@sum_weight += weight - ; - LOG(TRUE, All_Nodes.size()); - @@modularity = 0; - FOREACH (community, total_weight) IN @@community_total_weight_map DO - DOUBLE in_weight = 0; - IF @@community_in_weight_map.containsKey(community) THEN - in_weight = @@community_in_weight_map.get(community); - END; - @@modularity += in_weight / @@sum_weight - pow(total_weight / @@sum_weight, 2); - END; - // PRINT @@modularity, @@community_in_weight_map, @@community_total_weight_map, @@sum_weight; - PRINT layer; - PRINT @@modularity AS modularity; - PRINT @@community_total_weight_map.size() AS community_number; - PRINT All_Nodes.size(); - @@community_in_weight_map.clear(); - @@community_total_weight_map.clear(); -}} diff --git a/common/gsql/graphRAG/louvain_old/louvain_4_modularity_2_final.gsql b/common/gsql/graphRAG/louvain_old/louvain_4_modularity_2_final.gsql deleted file mode 100644 index 31ba4d0b..00000000 --- a/common/gsql/graphRAG/louvain_old/louvain_4_modularity_2_final.gsql +++ /dev/null @@ -1,52 +0,0 @@ -USE GRAPH {graph_name} -DROP QUERY {query_name} -CREATE OR REPLACE DISTRIBUTED QUERY 
graphRAG_louvain_4b( -) FOR GRAPH {graph_name} SYNTAX v1 {{ - SumAccum @@sum_weight; // the sum of the weights of all the links in the network - MapAccum> @@community_total_weight_map; // community ID C -> the sum of the weights of the links incident to nodes in C - MapAccum> @@community_in_weight_map; // community ID -> the sum of the weights of the links inside the community - SumAccum @@modularity; - MapAccum> @@Community_sizes; - MapAccum> @@count_of_sizes; - AvgAccum @@avg_community_size; - - DOUBLE wt = 1.0; - All_Nodes = {{{entity_vertex_name}.*}}; - Nodes = - SELECT s - FROM All_Nodes:s -({relation_edge_name}:e)- :t - ACCUM IF s.{community_id_attribute_name} == t.{community_id_attribute_name} THEN - @@community_in_weight_map += (s.{community_id_attribute_name} -> wt) - END, - @@community_total_weight_map += (s.{community_id_attribute_name} -> wt), - @@sum_weight += wt - ; - @@modularity = 0; - FOREACH (community, total_weight) IN @@community_total_weight_map DO - DOUBLE in_weight = 0; - IF @@community_in_weight_map.containsKey(community) THEN - in_weight = @@community_in_weight_map.get(community); - END; - @@modularity += in_weight / @@sum_weight - pow(total_weight / @@sum_weight, 2); - END; - - _tmp = - SELECT s - FROM All_Nodes:s - POST-ACCUM - @@Community_sizes += (s.{community_id_attribute_name} -> 1); - - FOREACH (comm, cnt) IN @@Community_sizes DO - @@count_of_sizes += (cnt -> 1); - @@avg_community_size += cnt; - END; - - // PRINT @@modularity, @@community_in_weight_map, @@community_total_weight_map, @@sum_weight; - PRINT @@modularity AS modularity; - PRINT @@community_total_weight_map.size() AS community_number; - PRINT @@count_of_sizes AS num_communities_by_size; - PRINT @@avg_community_size AS avg_community_size; - - @@community_in_weight_map.clear(); - @@community_total_weight_map.clear(); -}} diff --git a/common/gsql/graphRAG/louvain_old/louvain_5_reset.gsql b/common/gsql/graphRAG/louvain_old/louvain_5_reset.gsql deleted file mode 100644 index 7590935a..00000000 --- a/common/gsql/graphRAG/louvain_old/louvain_5_reset.gsql +++ /dev/null @@ -1,13 +0,0 @@ -USE GRAPH {graph_name} -DROP QUERY {query_name} -CREATE OR REPLACE DISTRIBUTED QUERY graphRAG_louvain_5_reset( -) FOR GRAPH {graph_name} SYNTAX v1 {{ - - // Initialization - Nodes = {{{entity_vertex_name}.*}}; - - // Top layer - DELETE e - FROM Nodes:s -(({belongs_to_edge_name}|{links_to_edge_name}):e)- :t - ; -}} diff --git a/copilot/docs/notebooks/graphrag.ipynb b/copilot/docs/notebooks/graphrag.ipynb index e915f392..411f5d62 100644 --- a/copilot/docs/notebooks/graphrag.ipynb +++ b/copilot/docs/notebooks/graphrag.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 16, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -24,7 +24,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -33,7 +33,7 @@ "'The graph GraphRAG_pytgdocs is created.'" ] }, - "execution_count": 17, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -45,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -54,18 +54,18 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'host_name': 'https://algotesting.i.tgcloud.io',\n", - " 'schema_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: 
[add_supportai_schema].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_schema\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 0)\\\\nTrying to add local vertex \\'DocumentChunk\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Document\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Concept\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Entity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Relationship\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'DocumentCollection\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Content\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Community\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'ResolvedEntity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CONTENT\\' and its reverse edge \\'reverse_HAS_CONTENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_CHILD_OF\\' and its reverse edge \\'reverse_IS_CHILD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_HEAD_OF\\' and its reverse edge \\'reverse_IS_HEAD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_TAIL\\' and its reverse edge \\'reverse_HAS_TAIL\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_RELATIONSHIP\\' and its reverse edge \\'reverse_DESCRIBES_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_ENTITY\\' and its reverse edge \\'reverse_DESCRIBES_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_ENTITY\\' and its reverse edge \\'reverse_CONTAINS_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'MENTIONS_RELATIONSHIP\\' and its reverse edge \\'reverse_MENTIONS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_AFTER\\' and its reverse edge \\'reverse_IS_AFTER\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CHILD\\' and its reverse edge \\'reverse_HAS_CHILD\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_RELATIONSHIP\\' and its reverse edge \\'reverse_HAS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_DOCUMENT\\' and its reverse edge \\'reverse_CONTAINS_DOCUMENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RELATIONSHIP\\' and its reverse edge \\'reverse_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVES_TO\\' and its reverse edge \\'reverse_RESOLVES_TO\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVED_RELATIONSHIP\\' and its reverse edge \\'reverse_RESOLVED_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IN_COMMUNITY\\' and its reverse edge \\'reverse_IN_COMMUNITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'LINKS_TO\\' and its reverse edge \\'reverse_LINKS_TO\\' to the graph 
\\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_PARENT\\' and its reverse edge \\'reverse_HAS_PARENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 1\\\\nThe job add_supportai_schema completes in 1.043 seconds!\\\\nLocal schema change succeeded.\"',\n", - " 'index_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_indexes].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_indexes\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 1)\\\\nTrying to add index \\'doc_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_indexepoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 2\\\\nThe job add_supportai_indexes completes in 1.066 seconds!\\\\nLocal schema change succeeded.\"'}" + " 'schema_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_schema].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. 
To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_schema\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 0)\\\\nTrying to add local vertex \\'DocumentChunk\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Document\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Concept\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Entity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Relationship\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'DocumentCollection\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Content\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Community\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'ResolvedEntity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CONTENT\\' and its reverse edge \\'reverse_HAS_CONTENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_CHILD_OF\\' and its reverse edge \\'reverse_IS_CHILD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_HEAD_OF\\' and its reverse edge \\'reverse_IS_HEAD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_TAIL\\' and its reverse edge \\'reverse_HAS_TAIL\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_RELATIONSHIP\\' and its reverse edge \\'reverse_DESCRIBES_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_ENTITY\\' and its reverse edge \\'reverse_DESCRIBES_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_ENTITY\\' and its reverse edge \\'reverse_CONTAINS_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'MENTIONS_RELATIONSHIP\\' and its reverse edge \\'reverse_MENTIONS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_AFTER\\' and its reverse edge \\'reverse_IS_AFTER\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CHILD\\' and its reverse edge \\'reverse_HAS_CHILD\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_RELATIONSHIP\\' and its reverse edge \\'reverse_HAS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_DOCUMENT\\' and its reverse edge \\'reverse_CONTAINS_DOCUMENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RELATIONSHIP\\' and its reverse edge \\'reverse_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVES_TO\\' and its reverse edge \\'reverse_RESOLVES_TO\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVED_RELATIONSHIP\\' and its reverse edge \\'reverse_RESOLVED_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IN_COMMUNITY\\' and its reverse edge \\'reverse_IN_COMMUNITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'LINKS_TO\\' and its reverse edge \\'reverse_LINKS_TO\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_PARENT\\' and its reverse edge \\'reverse_HAS_PARENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 
1\\\\nThe job add_supportai_schema completes in 1.845 seconds!\\\\nLocal schema change succeeded.\"',\n", + " 'index_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_indexes].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_indexes\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 1)\\\\nTrying to add index \\'doc_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_indexepoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 2\\\\nThe job add_supportai_indexes completes in 1.085 seconds!\\\\nLocal schema change succeeded.\"'}" ] }, - "execution_count": 19, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -79,7 +79,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -95,18 +95,18 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'job_name': 'load_documents_content_json_852f54bfd00a475fa4efc3ba9319f0ac',\n", - " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_852f54bfd00a475fa4efc3ba9319f0ac.stream.SupportAI_GraphRAG_pytgdocs_6a6331e3e5e248eaae389788c9bab325.1723217024268',\n", - " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_852f54bfd00a475fa4efc3ba9319f0ac.stream.SupportAI_GraphRAG_pytgdocs_6a6331e3e5e248eaae389788c9bab325.1723217024268'}" + "{'job_name': 'load_documents_content_json_cde7e4db979b4ba8a0b6ec5eb927f875',\n", + " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_cde7e4db979b4ba8a0b6ec5eb927f875.stream.SupportAI_GraphRAG_pytgdocs_48ee36da7b7644e4995722a6e057d446.1723494758507',\n", + " 'log_location': 
'/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_cde7e4db979b4ba8a0b6ec5eb927f875.stream.SupportAI_GraphRAG_pytgdocs_48ee36da7b7644e4995722a6e057d446.1723494758507'}" ] }, - "execution_count": 21, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -121,7 +121,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -145,7 +145,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -155,7 +155,7 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[23], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43masdf\u001b[49m\n", + "Cell \u001b[0;32mIn[32], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43masdf\u001b[49m\n", "\u001b[0;31mNameError\u001b[0m: name 'asdf' is not defined" ] } @@ -166,7 +166,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -183,22 +183,28 @@ "execution_count": null, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sleep\n" + ] + }, { "data": { "text/plain": [ - "{'job_name': 'load_documents_content_json_3e62fb87723945ea9a0380956694b7ec',\n", - " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_3e62fb87723945ea9a0380956694b7ec.stream.SupportAI_GraphRAG_pytgdocs_cc751adab29643b28af1b7bf13b6515b.1723213722186',\n", - " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_3e62fb87723945ea9a0380956694b7ec.stream.SupportAI_GraphRAG_pytgdocs_cc751adab29643b28af1b7bf13b6515b.1723213722186'}" + "{'job_name': 'load_documents_content_json_8a4ea730f21c43abbb58d818b9dd4d5a',\n", + " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_8a4ea730f21c43abbb58d818b9dd4d5a.stream.SupportAI_GraphRAG_pytgdocs_7aed8a01c9c1432b8026ea6c708bf08b.1723490129603',\n", + " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_8a4ea730f21c43abbb58d818b9dd4d5a.stream.SupportAI_GraphRAG_pytgdocs_7aed8a01c9c1432b8026ea6c708bf08b.1723490129603'}" ] }, - "execution_count": 11, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# for v in [\"Document\", \"Content\", \"DocumentChunk\", \"Entity\",\"ResolvedEntity\"]:\n", - "# for v in [\"ResolvedEntity\"]:\n", + "for v in [\"Document\", \"Content\", \"DocumentChunk\", \"Entity\",\"ResolvedEntity\",\"Community\"]:\n", "# for v in [\"ResolvedEntity\"]:\n", " try:\n", " conn.delVertices(v)\n", @@ -207,6 +213,7 @@ "\n", "import time\n", "\n", + "print('sleep')\n", "time.sleep(3)\n", "conn.ai.runDocumentIngest(\n", " res[\"load_job_id\"],\n", @@ -273,6 +280,33 @@ "r[\"results\"][0][\"attributes\"][\"description\"]" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "def check_vertex_has_desc(conn, comm: str):\n", + " headers = make_headers(conn)\n", + " with httpx.Client(timeout=None) as client:\n", + " resp = client.get(\n", + " f\"{conn.restppUrl}/graph/{conn.graphname}/vertices/Community/{comm}\",\n", + " headers=headers,\n", + " )\n", + " resp.raise_for_status()\n", + "\n", + " print(json.dumps(resp.json(),indent=2))\n", + " desc = 
resp.json()[\"results\"][0][\"attributes\"][\"description\"]\n", + " print(f\">>>*****{comm}:{desc}********\", flush=True)\n", + "\n", + " return len(desc) > 0\n", + "check_vertex_has_desc(conn,'Value_Property_1_2')\n", + "conn.upsertVertex(\"Community\",\"Rmse_1_2\",{\n", + " \"description\":\"asdf\"\n", + "})" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/eventual-consistency-service/app/graphrag/community_summarizer.py b/eventual-consistency-service/app/graphrag/community_summarizer.py index d250b1f3..2bef4095 100644 --- a/eventual-consistency-service/app/graphrag/community_summarizer.py +++ b/eventual-consistency-service/app/graphrag/community_summarizer.py @@ -1,7 +1,5 @@ -import json +import re -from langchain.output_parsers import PydanticOutputParser -from langchain.prompts import ChatPromptTemplate from langchain_core.prompts import PromptTemplate from common.llm_services import LLM_Model @@ -12,17 +10,17 @@ You are a helpful assistant responsible for generating a comprehensive summary of the data provided below. Given one or two entities, and a list of descriptions, all related to the same entity or group of entities. Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions. -If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary. +If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary, but do not add any information that is not in the description. Make sure it is written in third person, and include the entity names so we the have full context. ####### -Data- -Entities: {entity_name} +Commuinty Title: {entity_name} Description List: {description_list} -####### -Output: """) +id_pat = re.compile(r"[_\d]*") + class CommunitySummarizer: def __init__( @@ -31,108 +29,16 @@ def __init__( ): self.llm_service = llm_service - def _extract_kg_from_doc(self, doc, chain, parser): - try: - out = chain.invoke( - {"input": doc, "format_instructions": parser.get_format_instructions()} - ) - except Exception as e: - print("Error: ", e) - return {"nodes": [], "rels": []} - try: - if "```json" not in out.content: - json_out = json.loads(out.content.strip("content=")) - else: - json_out = json.loads( - out.content.split("```")[1].strip("```").strip("json").strip() - ) - - formatted_rels = [] - for rels in json_out["rels"]: - if isinstance(rels["source"], str) and isinstance(rels["target"], str): - formatted_rels.append( - { - "source": rels["source"], - "target": rels["target"], - "type": rels["relation_type"].replace(" ", "_").upper(), - "definition": rels["definition"], - } - ) - elif isinstance(rels["source"], dict) and isinstance( - rels["target"], str - ): - formatted_rels.append( - { - "source": rels["source"]["id"], - "target": rels["target"], - "type": rels["relation_type"].replace(" ", "_").upper(), - "definition": rels["definition"], - } - ) - elif isinstance(rels["source"], str) and isinstance( - rels["target"], dict - ): - formatted_rels.append( - { - "source": rels["source"], - "target": rels["target"]["id"], - "type": rels["relation_type"].replace(" ", "_").upper(), - "definition": rels["definition"], - } - ) - elif isinstance(rels["source"], dict) and isinstance( - rels["target"], dict - ): - formatted_rels.append( - { - "source": rels["source"]["id"], - "target": rels["target"]["id"], - "type": rels["relation_type"].replace(" ", "_").upper(), - 
"definition": rels["definition"], - } - ) - else: - raise Exception("Relationship parsing error") - formatted_nodes = [] - for node in json_out["nodes"]: - formatted_nodes.append( - { - "id": node["id"], - "type": node["node_type"].replace(" ", "_").capitalize(), - "definition": node["definition"], - } - ) - - # filter relationships and nodes based on allowed types - if self.strict_mode: - if self.allowed_vertex_types: - formatted_nodes = [ - node - for node in formatted_nodes - if node["type"] in self.allowed_vertex_types - ] - if self.allowed_edge_types: - formatted_rels = [ - rel - for rel in formatted_rels - if rel["type"] in self.allowed_edge_types - ] - return {"nodes": formatted_nodes, "rels": formatted_rels} - except: - print("Error Processing: ", out) - return {"nodes": [], "rels": []} - async def summarize(self, name: str, text: list[str]) -> CommunitySummary: - # parser = PydanticOutputParser(pydantic_object=CommunitySummary) structured_llm = self.llm_service.model.with_structured_output(CommunitySummary) chain = SUMMARIZE_PROMPT | structured_llm + + # remove iteration tags from name + name = id_pat.sub("", name) summary = await chain.ainvoke( { "entity_name": name, "description_list": text, - # "format_instructions": parser.get_format_instructions(), } ) - # summary = self._extract_kg_from_doc(text, chain, parser) - # summary = None return summary.summary diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py index d4e3a7d6..d1e7fdc0 100644 --- a/eventual-consistency-service/app/graphrag/graph_rag.py +++ b/eventual-consistency-service/app/graphrag/graph_rag.py @@ -6,7 +6,13 @@ import httpx from aiochannel import Channel from graphrag import workers -from graphrag.util import http_timeout, init, make_headers, stream_ids +from graphrag.util import ( + check_vertex_has_desc, + http_timeout, + init, + make_headers, + stream_ids, +) from pyTigerGraph import TigerGraphConnection from common.config import embedding_service @@ -210,7 +216,6 @@ async def resolve_entities( async with asyncio.TaskGroup() as grp: # for every entity async for entity_id in entity_chan: - print(f"***Entity ID from chan {entity_id}", flush=True) grp.create_task( workers.resolve_entity(conn, upsert_chan, emb_store, entity_id) ) @@ -227,7 +232,7 @@ async def resolve_entities( res.raise_for_status() -async def communities(conn: TigerGraphConnection, community_chan: Channel): +async def communities(conn: TigerGraphConnection, comm_process_chan: Channel): """ Run louvain """ @@ -250,23 +255,21 @@ async def communities(conn: TigerGraphConnection, community_chan: Channel): ) res.raise_for_status() mod = res.json()["results"][0]["mod"] - print(f"****mod 1: {mod}", flush=True) - await community_chan.put(1) + logger.info(f"****mod pass 1: {mod}") + await stream_communities(conn, 1, comm_process_chan) # nth pass: Iterate on Resolved Entities until modularity stops increasing prev_mod = -10 i = 0 - # for _ in range(1, 5): - prev_mod = 0 while abs(prev_mod - mod) > 0.0000001 and prev_mod != 0: prev_mod = mod - logger.info(f"Running louvain on Communities (iteration: {i})") i += 1 + logger.info(f"Running louvain on Communities (iteration: {i})") # louvain pass async with httpx.AsyncClient(timeout=None) as client: res = await client.get( f"{conn.restppUrl}/query/{conn.graphname}/graphrag_louvain_communities", - params={"n_batches": 1}, + params={"n_batches": 1, "iteration": i}, headers=headers, ) @@ -281,20 +284,20 @@ async def communities(conn: 
TigerGraphConnection, community_chan: Channel): ) res.raise_for_status() mod = res.json()["results"][0]["mod"] - print(f"*** mod {i+1}: {mod}", flush=True) - print(f"****** mod diff: {abs(prev_mod - mod)}", flush=True) + logger.info(f"*** mod pass {i+1}: {mod} (diff= {abs(prev_mod - mod)})") # write iter to chan for layer to be processed - await community_chan.put(i + 1) + await stream_communities(conn, i + 1, comm_process_chan) # TODO: erase last run since it's ∆q to the run before it will be small logger.info("closing communities chan") - community_chan.close() + comm_process_chan.close() async def stream_communities( conn: TigerGraphConnection, - community_chan: Channel, + # community_chan: Channel, + i: int, comm_process_chan: Channel, ): """ @@ -305,37 +308,48 @@ async def stream_communities( headers = make_headers(conn) # TODO: # can only do one layer at a time to ensure that every child community has their descriptions - async for i in community_chan: - # get the community from that layer - async with httpx.AsyncClient(timeout=None) as client: - resp = await client.get( - f"{conn.restppUrl}/query/{conn.graphname}/stream_community", - params={"iter": i}, - headers=headers, - ) - resp.raise_for_status() - comms = resp.json()["results"][0]["Comms"] - for c in comms: - await comm_process_chan.put((i, c["v_id"])) + # async for i in community_chan: + # get the community from that layer + async with httpx.AsyncClient(timeout=None) as client: + resp = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/stream_community", + params={"iter": i}, + headers=headers, + ) + resp.raise_for_status() + comms = resp.json()["results"][0]["Comms"] + + for c in comms: + await comm_process_chan.put((i, c["v_id"])) + + # Wait for all communities for layer i to be processed before doing next layer + # all community descriptions must be populated before the next layer can be processed + if len(comms) > 0: + while not await check_vertex_has_desc(conn, i): + logger.info(f"Waiting for layer{i} to finish processing") + await asyncio.sleep(5) + await asyncio.sleep(3) logger.info("stream_communities done") logger.info("closing comm_process_chan") - comm_process_chan.close() + # comm_process_chan.close() async def summarize_communities( conn: TigerGraphConnection, comm_process_chan: Channel, upsert_chan: Channel, + embed_chan: Channel, ): async with asyncio.TaskGroup() as tg: async for c in comm_process_chan: - tg.create_task(workers.process_community(conn, upsert_chan, *c)) - break + tg.create_task(workers.process_community(conn, upsert_chan, embed_chan, *c)) + # break logger.info("closing upsert_chan") upsert_chan.close() + embed_chan.close() async def run(graphname: str, conn: TigerGraphConnection): @@ -347,14 +361,17 @@ async def run(graphname: str, conn: TigerGraphConnection): - embeddings - entities/relationships (and their embeddings) - upsert everything to the graph + - Resolve Entities + Ex: "Vincent van Gogh" and "van Gogh" should be resolved to "Vincent van Gogh" """ extractor, index_stores = await init(conn) init_start = time.perf_counter() - abc = True - abc = False - if abc: + doc_process_switch = True + entity_resolution_switch = True + community_detection_switch = True + if doc_process_switch: logger.info("Doc Processing Start") docs_chan = Channel(1) embed_chan = Channel(100) @@ -381,7 +398,7 @@ async def run(graphname: str, conn: TigerGraphConnection): # Entity Resolution entity_start = time.perf_counter() - if abc: + if entity_resolution_switch: logger.info("Entity Processing Start") 
entities_chan = Channel(100) upsert_chan = Channel(100) @@ -401,7 +418,7 @@ async def run(graphname: str, conn: TigerGraphConnection): # Community Detection community_start = time.perf_counter() - if True: + if community_detection_switch: # FIXME: delete community delete for v in ["Community"]: try: @@ -409,20 +426,22 @@ async def run(graphname: str, conn: TigerGraphConnection): except: pass logger.info("Community Processing Start") - communities_chan = Channel(1) upsert_chan = Channel(10) comm_process_chan = Channel(100) upsert_chan = Channel(100) + embed_chan = Channel(100) async with asyncio.TaskGroup() as grp: # run louvain - grp.create_task(communities(conn, communities_chan)) + # grp.create_task(communities(conn, communities_chan)) + grp.create_task(communities(conn, comm_process_chan)) # get the communities + # grp.create_task( stream_communities(conn, communities_chan, comm_process_chan)) + # summarize each community grp.create_task( - stream_communities(conn, communities_chan, comm_process_chan) + summarize_communities(conn, comm_process_chan, upsert_chan, embed_chan) ) - # summarize each community - grp.create_task(summarize_communities(conn, comm_process_chan, upsert_chan)) grp.create_task(upsert(upsert_chan)) + grp.create_task(embed(embed_chan, index_stores, graphname)) community_end = time.perf_counter() logger.info("Community Processing End") diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index 6876b5de..bcf1befe 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -2,6 +2,7 @@ import base64 import json import logging +import re import traceback from glob import glob @@ -65,6 +66,7 @@ async def init( "common/gsql/graphRAG/SetEpochProcessing", "common/gsql/graphRAG/ResolveRelationships", "common/gsql/graphRAG/get_community_children", + "common/gsql/graphRAG/communities_have_desc", "common/gsql/graphRAG/louvain/graphrag_louvain_init", "common/gsql/graphRAG/louvain/graphrag_louvain_communities", "common/gsql/graphRAG/louvain/modularity", @@ -91,6 +93,7 @@ async def init( "Entity", "Relationship", # "Concept", + "Community", ], ) index_stores = {} @@ -108,7 +111,7 @@ async def init( vector_field=milvus_config.get("vector_field", "document_vector"), text_field=milvus_config.get("text_field", "document_content"), vertex_field=vertex_field, - drop_old=False, + drop_old=True, ) LogWriter.info(f"Initializing {name}") @@ -174,6 +177,10 @@ def map_attrs(attributes: dict): def process_id(v_id: str): v_id = v_id.replace(" ", "_").replace("/", "") + + has_func = re.compile(r"(.*)\(").findall(v_id) + if len(has_func) > 0: + v_id = has_func[0] if v_id == "''" or v_id == '""': return "" @@ -186,6 +193,7 @@ async def upsert_vertex( vertex_id: str, attributes: dict, ): + logger.info(f"Upsert vertex: {vertex_type} {vertex_id}") vertex_id = vertex_id.replace(" ", "_") attrs = map_attrs(attributes) data = json.dumps({"vertices": {vertex_type: {vertex_id: attrs}}}) @@ -260,10 +268,26 @@ async def get_commuinty_children(conn, i: int, c: str): descrs = [] for d in resp.json()["results"][0]["children"]: desc = d["attributes"]["description"] - if len(desc) == 0: + if i == 1 and all(len(x) == 0 for x in desc): + desc = [d["v_id"]] + elif len(desc) == 0: desc = d["v_id"] descrs.append(desc) - print(">>>", descrs, flush=True) return descrs + + +async def check_vertex_has_desc(conn, i: int): + headers = make_headers(conn) + async with httpx.AsyncClient(timeout=None) as 
client: + resp = await client.get( + f"{conn.restppUrl}/query/{conn.graphname}/communities_have_desc", + params={"iter": i}, + headers=headers, + ) + resp.raise_for_status() + + res = resp.json()["results"][0]["all_have_desc"] + + return res diff --git a/eventual-consistency-service/app/graphrag/workers.py b/eventual-consistency-service/app/graphrag/workers.py index 22980d96..77f3d6d8 100644 --- a/eventual-consistency-service/app/graphrag/workers.py +++ b/eventual-consistency-service/app/graphrag/workers.py @@ -299,9 +299,6 @@ async def resolve_entity( f"aget_k_closest should, minimally, return the entity itself.\n{results}" ) raise Exception() - # FIXME: deleteme - # if entity_id == "Dataframe": - # print("result:", entity_id, results) # merge all entities into the ResolvedEntity vertex # use the longest v_id as the resolved entity's v_id @@ -346,8 +343,9 @@ async def resolve_entity( async def process_community( conn: TigerGraphConnection, upsert_chan: Channel, + embed_chan: Channel, i: int, - c: str, + comm_id: str, ): """ https://github.com/microsoft/graphrag/blob/main/graphrag/prompt_tune/template/community_report_summarization.py @@ -357,33 +355,39 @@ async def process_community( embed summaries """ - print(i, c, flush=True) + logger.info(f"Processing Community: {comm_id}") # get the children of the community - children = await util.get_commuinty_children(conn, i, c) + children = await util.get_commuinty_children(conn, i, comm_id) if i == 1: tmp = [] for c in children: tmp.extend(c) children = list(filter(lambda x: len(x) > 0, tmp)) - print(">>>", children, flush=True) - llm = ecc_util.get_llm_service() - summarizer = community_summarizer.CommunitySummarizer(llm) - summary = await summarizer.summarize(c, children) - await upsert_chan.put((upsert_summary, (conn,summary))) - - -async def upsert_summary(conn: TigerGraphConnection, summary: str): - print(f"SUMMARY:> {summary}", flush=True) - - # vertex_id = vertex_id.replace(" ", "_") - # attrs = map_attrs(attributes) - # data = json.dumps({"vertices": {vertex_type: {vertex_id: attrs}}}) - # headers = make_headers(conn) - # async with httpx.AsyncClient(timeout=http_timeout) as client: - # res = await client.post( - # f"{conn.restppUrl}/graph/{conn.graphname}", data=data, headers=headers - # ) - # - # res.raise_for_status() - # + comm_id = util.process_id(comm_id) + + # if the community only has one child, use its description + if len(children) == 1: + summary = children[0] + else: + llm = ecc_util.get_llm_service() + summarizer = community_summarizer.CommunitySummarizer(llm) + summary = await summarizer.summarize(comm_id, children) + + await upsert_chan.put( + ( + util.upsert_vertex, # func to call + ( + conn, + "Community", # v_type + comm_id, # v_id + { # attrs + "description": summary, + "iteration": i, + }, + ), + ) + ) + + # (v_id, content, index_name) + await embed_chan.put((comm_id, summary, "Community")) From 08aca044b071352020cb3bfec3e743e8e178aaa0 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Mon, 12 Aug 2024 16:54:16 -0400 Subject: [PATCH 11/53] cleanup --- .../app/graphrag/graph_rag.py | 15 +-------------- .../app/graphrag/workers.py | 3 +-- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py index d1e7fdc0..86f172b8 100644 --- a/eventual-consistency-service/app/graphrag/graph_rag.py +++ b/eventual-consistency-service/app/graphrag/graph_rag.py 
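# A doctest-style restatement of the process_id() helper added to graphrag/util.py above,
# so the effect of the new r"(.*)\(" pattern is easier to see. The expected results in the
# trailing comments are inferred from the replace() calls and the regex, not taken from
# test output; treat this as an illustrative sketch rather than the canonical helper.
import re


def process_id(v_id: str) -> str:
    v_id = v_id.replace(" ", "_").replace("/", "")
    # if the id looks like a call signature, keep only the part before the "("
    has_func = re.compile(r"(.*)\(").findall(v_id)
    if len(has_func) > 0:
        v_id = has_func[0]
    if v_id == "''" or v_id == '""':
        return ""
    return v_id


# process_id("Post /Requesttoken")  -> "Post_Requesttoken"
# process_id("getToken(secret)")    -> "getToken"
# process_id("''")                  -> ""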
@@ -19,8 +19,6 @@ from common.embeddings.milvus_embedding_store import MilvusEmbeddingStore from common.extractors.BaseExtractor import BaseExtractor -# http_logs = logging.getLogger("httpx") -# http_logs.setLevel(logging.WARNING) logger = logging.getLogger(__name__) consistency_checkers = {} @@ -192,8 +190,6 @@ async def stream_entities( for i in ids["ids"]: if len(i) > 0: await entity_chan.put(i) - # break - # break # one batch logger.info("stream_enities done") # close the docs chan -- this function is the only sender @@ -296,7 +292,6 @@ async def communities(conn: TigerGraphConnection, comm_process_chan: Channel): async def stream_communities( conn: TigerGraphConnection, - # community_chan: Channel, i: int, comm_process_chan: Channel, ): @@ -333,7 +328,6 @@ async def stream_communities( logger.info("stream_communities done") logger.info("closing comm_process_chan") - # comm_process_chan.close() async def summarize_communities( @@ -345,7 +339,6 @@ async def summarize_communities( async with asyncio.TaskGroup() as tg: async for c in comm_process_chan: tg.create_task(workers.process_community(conn, upsert_chan, embed_chan, *c)) - # break logger.info("closing upsert_chan") upsert_chan.close() @@ -369,7 +362,7 @@ async def run(graphname: str, conn: TigerGraphConnection): init_start = time.perf_counter() doc_process_switch = True - entity_resolution_switch = True + entity_resolution_switch = True community_detection_switch = True if doc_process_switch: logger.info("Doc Processing Start") @@ -419,12 +412,6 @@ async def run(graphname: str, conn: TigerGraphConnection): # Community Detection community_start = time.perf_counter() if community_detection_switch: - # FIXME: delete community delete - for v in ["Community"]: - try: - conn.delVertices(v) - except: - pass logger.info("Community Processing Start") upsert_chan = Channel(10) comm_process_chan = Channel(100) diff --git a/eventual-consistency-service/app/graphrag/workers.py b/eventual-consistency-service/app/graphrag/workers.py index 77f3d6d8..755b1085 100644 --- a/eventual-consistency-service/app/graphrag/workers.py +++ b/eventual-consistency-service/app/graphrag/workers.py @@ -151,7 +151,7 @@ async def get_vert_desc(conn, v_id, node: Node): exists = await util.check_vertex_exists(conn, v_id) # if vertex exists, get description content and append this description to it if not exists["error"]: - # dedup descriptions + # deduplicate descriptions desc.extend(exists["results"][0]["attributes"]["description"]) desc = list(set(desc)) return desc @@ -316,7 +316,6 @@ async def resolve_entity( "ResolvedEntity", # v_type resolved_entity_id, # v_id { # attrs - # "id": resolved_entity_id, }, ), ) From f2828406ee10fd71b486a32fb0e704d6db895cb3 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Mon, 12 Aug 2024 17:01:26 -0400 Subject: [PATCH 12/53] fmt after merge conflicts --- common/embeddings/milvus_embedding_store.py | 8 +++----- eventual-consistency-service/app/main.py | 7 +++++-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/common/embeddings/milvus_embedding_store.py b/common/embeddings/milvus_embedding_store.py index ae352c9e..9302f6f8 100644 --- a/common/embeddings/milvus_embedding_store.py +++ b/common/embeddings/milvus_embedding_store.py @@ -5,9 +5,10 @@ import Levenshtein as lev from asyncer import asyncify -from langchain_milvus.vectorstores import Milvus +from langchain_community.vectorstores import Milvus from langchain_core.documents.base import Document -from 
pymilvus import connections, utility +from langchain_milvus.vectorstores import Milvus +from pymilvus import MilvusException, connections, utility from pymilvus.exceptions import MilvusException from common.embeddings.base_embedding_store import EmbeddingStore @@ -15,9 +16,6 @@ from common.logs.log import req_id_cv from common.logs.logwriter import LogWriter from common.metrics.prometheus_metrics import metrics -from langchain_community.vectorstores import Milvus -from langchain_core.documents.base import Document -from pymilvus import MilvusException, connections, utility logger = logging.getLogger(__name__) diff --git a/eventual-consistency-service/app/main.py b/eventual-consistency-service/app/main.py index ce7a2e04..701e363e 100644 --- a/eventual-consistency-service/app/main.py +++ b/eventual-consistency-service/app/main.py @@ -96,7 +96,7 @@ def initialize_eventual_consistency_checker( vector_field=milvus_config.get("vector_field", "document_vector"), text_field=milvus_config.get("text_field", "document_content"), vertex_field=vertex_field, - alias=milvus_config.get("alias", "default") + alias=milvus_config.get("alias", "default"), ) chunker = ecc_util.get_chunker() @@ -190,7 +190,10 @@ def consistency_status( background.add_task(graphrag.run, graphname, conn) # asyncio.run(graphrag.run(graphname, conn)) import time - ecc_status = f"hi from graph rag ecc: {conn.graphname} ({graphname}) {time.ctime()}" + + ecc_status = ( + f"hi from graph rag ecc: {conn.graphname} ({graphname}) {time.ctime()}" + ) case _: response.status_code = status.HTTP_404_NOT_FOUND return f"Method unsupported, must be {SupportAIMethod.SUPPORTAI}, {SupportAIMethod.GRAPHRAG}" From 50a4fd516cd4195a2693f4693dbbc545e5524326 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Mon, 12 Aug 2024 20:27:30 -0400 Subject: [PATCH 13/53] rm clang dotfiles --- common/gsql/graphRAG/.clang-format | 269 ----------------------------- common/gsql/graphRAG/.clangd | 2 - 2 files changed, 271 deletions(-) delete mode 100644 common/gsql/graphRAG/.clang-format delete mode 100644 common/gsql/graphRAG/.clangd diff --git a/common/gsql/graphRAG/.clang-format b/common/gsql/graphRAG/.clang-format deleted file mode 100644 index f0dcec6c..00000000 --- a/common/gsql/graphRAG/.clang-format +++ /dev/null @@ -1,269 +0,0 @@ ---- -Language: Cpp -# BasedOnStyle: Google -AccessModifierOffset: -1 -AlignAfterOpenBracket: Align -AlignArrayOfStructures: None -AlignConsecutiveAssignments: - Enabled: false - AcrossEmptyLines: false - AcrossComments: false - AlignCompound: false - PadOperators: true -AlignConsecutiveBitFields: - Enabled: false - AcrossEmptyLines: false - AcrossComments: false - AlignCompound: false - PadOperators: false -AlignConsecutiveDeclarations: - Enabled: false - AcrossEmptyLines: false - AcrossComments: false - AlignCompound: false - PadOperators: false -AlignConsecutiveMacros: - Enabled: false - AcrossEmptyLines: false - AcrossComments: false - AlignCompound: false - PadOperators: false -AlignConsecutiveShortCaseStatements: - Enabled: false - AcrossEmptyLines: false - AcrossComments: false - AlignCaseColons: false -AlignEscapedNewlines: Left -AlignOperands: Align -AlignTrailingComments: - Kind: Always - OverEmptyLines: 0 -AllowAllArgumentsOnNextLine: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortBlocksOnASingleLine: Never -AllowShortCaseLabelsOnASingleLine: false -AllowShortEnumsOnASingleLine: true -AllowShortFunctionsOnASingleLine: All 
-AllowShortIfStatementsOnASingleLine: WithoutElse -AllowShortLambdasOnASingleLine: All -AllowShortLoopsOnASingleLine: true -AlwaysBreakAfterDefinitionReturnType: None -AlwaysBreakAfterReturnType: None -AlwaysBreakBeforeMultilineStrings: true -AlwaysBreakTemplateDeclarations: Yes -AttributeMacros: - - __capability -BinPackArguments: true -BinPackParameters: true -BitFieldColonSpacing: Both -BraceWrapping: - AfterCaseLabel: false - AfterClass: false - AfterControlStatement: Never - AfterEnum: false - AfterExternBlock: false - AfterFunction: false - AfterNamespace: false - AfterObjCDeclaration: false - AfterStruct: false - AfterUnion: false - BeforeCatch: false - BeforeElse: false - BeforeLambdaBody: false - BeforeWhile: false - IndentBraces: false - SplitEmptyFunction: true - SplitEmptyRecord: true - SplitEmptyNamespace: true -BreakAfterAttributes: Never -BreakAfterJavaFieldAnnotations: false -BreakArrays: true -BreakBeforeBinaryOperators: None -BreakBeforeConceptDeclarations: Always -BreakBeforeBraces: Attach -BreakBeforeInlineASMColon: OnlyMultiline -BreakBeforeTernaryOperators: true -BreakConstructorInitializers: BeforeColon -BreakInheritanceList: BeforeColon -BreakStringLiterals: true -ColumnLimit: 80 -CommentPragmas: '^ IWYU pragma:' -CompactNamespaces: false -ConstructorInitializerIndentWidth: 4 -ContinuationIndentWidth: 4 -Cpp11BracedListStyle: true -DerivePointerAlignment: true -DisableFormat: false -EmptyLineAfterAccessModifier: Never -EmptyLineBeforeAccessModifier: LogicalBlock -ExperimentalAutoDetectBinPacking: false -FixNamespaceComments: true -ForEachMacros: - - foreach - - Q_FOREACH - - BOOST_FOREACH -IfMacros: - - KJ_IF_MAYBE -IncludeBlocks: Regroup -IncludeCategories: - - Regex: '^' - Priority: 2 - SortPriority: 0 - CaseSensitive: false - - Regex: '^<.*\.h>' - Priority: 1 - SortPriority: 0 - CaseSensitive: false - - Regex: '^<.*' - Priority: 2 - SortPriority: 0 - CaseSensitive: false - - Regex: '.*' - Priority: 3 - SortPriority: 0 - CaseSensitive: false -IncludeIsMainRegex: '([-_](test|unittest))?$' -IncludeIsMainSourceRegex: '' -IndentAccessModifiers: false -IndentCaseBlocks: false -IndentCaseLabels: true -IndentExternBlock: AfterExternBlock -IndentGotoLabels: true -IndentPPDirectives: None -IndentRequiresClause: true -IndentWidth: 4 -IndentWrappedFunctionNames: false -InsertBraces: false -InsertNewlineAtEOF: false -InsertTrailingCommas: None -IntegerLiteralSeparator: - Binary: 0 - BinaryMinDigits: 0 - Decimal: 0 - DecimalMinDigits: 0 - Hex: 0 - HexMinDigits: 0 -JavaScriptQuotes: Leave -JavaScriptWrapImports: true -KeepEmptyLinesAtTheStartOfBlocks: false -KeepEmptyLinesAtEOF: false -LambdaBodyIndentation: Signature -LineEnding: DeriveLF -MacroBlockBegin: '' -MacroBlockEnd: '' -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -ObjCBinPackProtocolList: Never -ObjCBlockIndentWidth: 4 -ObjCBreakBeforeNestedBlockParam: true -ObjCSpaceAfterProperty: false -ObjCSpaceBeforeProtocolList: true -PackConstructorInitializers: NextLine -PenaltyBreakAssignment: 2 -PenaltyBreakBeforeFirstCallParameter: 1 -PenaltyBreakComment: 300 -PenaltyBreakFirstLessLess: 120 -PenaltyBreakOpenParenthesis: 0 -PenaltyBreakString: 1000 -PenaltyBreakTemplateDeclaration: 10 -PenaltyExcessCharacter: 1000000 -PenaltyIndentedWhitespace: 0 -PenaltyReturnTypeOnItsOwnLine: 200 -PointerAlignment: Left -PPIndentWidth: -1 -QualifierAlignment: Leave -RawStringFormats: - - Language: Cpp - Delimiters: - - cc - - CC - - cpp - - Cpp - - CPP - - 'c++' - - 'C++' - CanonicalDelimiter: '' - BasedOnStyle: google - - Language: 
TextProto - Delimiters: - - pb - - PB - - proto - - PROTO - EnclosingFunctions: - - EqualsProto - - EquivToProto - - PARSE_PARTIAL_TEXT_PROTO - - PARSE_TEST_PROTO - - PARSE_TEXT_PROTO - - ParseTextOrDie - - ParseTextProtoOrDie - - ParseTestProto - - ParsePartialTestProto - CanonicalDelimiter: pb - BasedOnStyle: google -ReferenceAlignment: Pointer -ReflowComments: true -RemoveBracesLLVM: false -RemoveParentheses: Leave -RemoveSemicolon: false -RequiresClausePosition: OwnLine -RequiresExpressionIndentation: OuterScope -SeparateDefinitionBlocks: Leave -ShortNamespaceLines: 1 -SortIncludes: CaseSensitive -SortJavaStaticImport: Before -SortUsingDeclarations: LexicographicNumeric -SpaceAfterCStyleCast: false -SpaceAfterLogicalNot: false -SpaceAfterTemplateKeyword: true -SpaceAroundPointerQualifiers: Default -SpaceBeforeAssignmentOperators: true -SpaceBeforeCaseColon: false -SpaceBeforeCpp11BracedList: false -SpaceBeforeCtorInitializerColon: true -SpaceBeforeInheritanceColon: true -SpaceBeforeJsonColon: false -SpaceBeforeParens: ControlStatements -SpaceBeforeParensOptions: - AfterControlStatements: true - AfterForeachMacros: true - AfterFunctionDefinitionName: false - AfterFunctionDeclarationName: false - AfterIfMacros: true - AfterOverloadedOperator: false - AfterRequiresInClause: false - AfterRequiresInExpression: false - BeforeNonEmptyParentheses: false -SpaceBeforeRangeBasedForLoopColon: true -SpaceBeforeSquareBrackets: false -SpaceInEmptyBlock: false -SpacesBeforeTrailingComments: 1 -SpacesInAngles: Never -SpacesInContainerLiterals: true -SpacesInLineCommentPrefix: - Minimum: 1 - Maximum: -1 -SpacesInParens: Never -SpacesInParensOptions: - InCStyleCasts: false - InConditionalStatements: false - InEmptyParentheses: false - Other: false -SpacesInSquareBrackets: false -Standard: Auto -StatementAttributeLikeMacros: - - Q_EMIT -StatementMacros: - - Q_UNUSED - - QT_REQUIRE_VERSION -TabWidth: 8 -UseTab: Never -VerilogBreakBetweenInstancePorts: true -WhitespaceSensitiveMacros: - - BOOST_PP_STRINGIZE - - CF_SWIFT_NAME - - NS_SWIFT_NAME - - PP_STRINGIZE - - STRINGIZE -... 
diff --git a/common/gsql/graphRAG/.clangd b/common/gsql/graphRAG/.clangd deleted file mode 100644 index ec3be0d8..00000000 --- a/common/gsql/graphRAG/.clangd +++ /dev/null @@ -1,2 +0,0 @@ -CompileFlags: - Add: [ -std=c++23 ] From f007c8aac453d7e00009d10d69fb0b49ed174acf Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 10:12:05 -0400 Subject: [PATCH 14/53] final cleanup --- common/embeddings/milvus_embedding_store.py | 5 +- common/extractors/GraphExtractor.py | 3 +- .../louvain/graphrag_louvain_communities.gsql | 5 +- .../louvain/graphrag_louvain_init.gsql | 17 +- common/gsql/supportai/Scan_For_Updates.gsql | 8 +- common/gsql/supportai/SupportAI_Schema.gsql | 2 +- common/logs/logwriter.py | 2 +- common/py_schemas/schemas.py | 4 - copilot/docs/notebooks/graphrag.ipynb | 398 ------------------ eventual-consistency-service/app/main.py | 3 +- 10 files changed, 17 insertions(+), 430 deletions(-) delete mode 100644 copilot/docs/notebooks/graphrag.ipynb diff --git a/common/embeddings/milvus_embedding_store.py b/common/embeddings/milvus_embedding_store.py index 9302f6f8..7169379e 100644 --- a/common/embeddings/milvus_embedding_store.py +++ b/common/embeddings/milvus_embedding_store.py @@ -596,7 +596,7 @@ def query(self, expr: str, output_fields: List[str]): return query_result - def edit_dist_check(self, a: str, b: str, edit_dist_threshold: float, p=False): + def edit_dist_check(self, a: str, b: str, edit_dist_threshold: float): a = a.lower() b = b.lower() # if the words are short, they should be the same @@ -605,8 +605,6 @@ def edit_dist_check(self, a: str, b: str, edit_dist_threshold: float, p=False): # edit_dist_threshold (as a percent) of word must match threshold = int(min(len(a), len(b)) * (1 - edit_dist_threshold)) - if p: - print(a, b, threshold, lev.distance(a, b)) return lev.distance(a, b) < threshold async def aget_k_closest( @@ -641,7 +639,6 @@ async def aget_k_closest( doc.metadata["vertex_id"], v_id, edit_dist_threshold_pct, - # v_id == "Dataframe", ) # don't have to merge verts with the same id (they're the same) and doc.metadata["vertex_id"] != v_id diff --git a/common/extractors/GraphExtractor.py b/common/extractors/GraphExtractor.py index 282729a4..2a7ba505 100644 --- a/common/extractors/GraphExtractor.py +++ b/common/extractors/GraphExtractor.py @@ -40,8 +40,7 @@ def extract(self, text) -> list[GraphDocument]: """ doc = Document(page_content=text) graph_docs = self.transformer.convert_to_graph_documents([doc]) - translated_docs = self.translate(graph_docs) - return translated_docs + return graph_docs async def aextract(self, text:str) -> list[GraphDocument]: """ diff --git a/common/gsql/graphRAG/louvain/graphrag_louvain_communities.gsql b/common/gsql/graphRAG/louvain/graphrag_louvain_communities.gsql index 366b7ea7..4137ca68 100644 --- a/common/gsql/graphRAG/louvain/graphrag_louvain_communities.gsql +++ b/common/gsql/graphRAG/louvain/graphrag_louvain_communities.gsql @@ -166,14 +166,13 @@ CREATE DISTRIBUTED QUERY graphrag_louvain_communities(UINT iteration=1, UINT max @@community_sum_in_map += (s.@community_id -> e.weight) ELSE // get LINKS_TO edge weights (how many edges are between communities) - // s.@community_k_in_map += (t.@community_id -> 1) @@source_target_k_in_map += (s.@community_vid -> (t.@community_vid -> e.weight)) END, t.@has_parent += TRUE // Used to help find unattached partitions POST-ACCUM // Write the results to a new community vertex (iteration + 1) // ID , iter, edges within the 
community - INSERT INTO Community VALUES (s.id+"_"+to_string(iteration+1), iteration+1, s.k_in + @@community_sum_in_map.get(s.@community_id), ""), + INSERT INTO Community VALUES (s.id+"_"+to_string(iteration+1), iteration+1), ""), INSERT INTO HAS_PARENT VALUES (s, s.@community_vid+"_"+to_string(iteration+1)) // link Community's child/parent community ; @@ -183,7 +182,7 @@ CREATE DISTRIBUTED QUERY graphrag_louvain_communities(UINT iteration=1, UINT max AND NOT s.@has_parent POST-ACCUM // if s is a part of an unattached partition, add to its community hierarchy to maintain parity with rest of graph - INSERT INTO Community VALUES (s.id+"_"+to_string(iteration+1), iteration+1, s.k_in + @@community_sum_in_map.get(s.@community_id), ""), + INSERT INTO Community VALUES (s.id+"_"+to_string(iteration+1), iteration+1, ""), INSERT INTO HAS_PARENT VALUES (s, s.id+"_"+to_string(iteration+1)) // link Community's child/parent community ; diff --git a/common/gsql/graphRAG/louvain/graphrag_louvain_init.gsql b/common/gsql/graphRAG/louvain/graphrag_louvain_init.gsql index 2ccbaf2c..42e9108d 100644 --- a/common/gsql/graphRAG/louvain/graphrag_louvain_init.gsql +++ b/common/gsql/graphRAG/louvain/graphrag_louvain_init.gsql @@ -26,11 +26,11 @@ CREATE DISTRIBUTED QUERY graphrag_louvain_init(UINT max_hop = 10, UINT n_batches DOUBLE wt = 1.0; // prevent multiple init runs - // z = SELECT s FROM AllNodes:s -(_)-> Community:t; - // IF z.size() > 0 THEN - // EXCEPTION reinit(400001); - // RAISE reinit("ERROR: the hierarchical communities have already been initialized"); - // END; + z = SELECT s FROM AllNodes:s -(_)-> Community:t; + IF z.size() > 0 THEN + EXCEPTION reinit(400001); + RAISE reinit("ERROR: the hierarchical communities have already been initialized"); + END; // init z = SELECT s FROM AllNodes:s @@ -42,11 +42,6 @@ CREATE DISTRIBUTED QUERY graphrag_louvain_init(UINT max_hop = 10, UINT n_batches z = SELECT s FROM AllNodes:s -(_)-> ResolvedEntity:t ACCUM s.@k += wt, @@m += 1; - // POST-ACCUM - // s.@community_id = s, // assign node to its own community - // s.@community_vid = s.id, // external id - // s.@vid = getvid(s), // internal id (used in batching) - // s.@batch_id = s.@vid % n_batches; // get batch number PRINT z.size(); PRINT z; @@ -166,7 +161,7 @@ CREATE DISTRIBUTED QUERY graphrag_louvain_init(UINT max_hop = 10, UINT n_batches END POST-ACCUM // ID , iter, edges within the community - INSERT INTO Community VALUES (s.@community_vid+"_1", 1, @@community_sum_in_map.get(s.@community_id), ""), + INSERT INTO Community VALUES (s.@community_vid+"_1", 1, ""), INSERT INTO IN_COMMUNITY VALUES (s, s.@community_vid+"_1") // link entity to it's first community ; diff --git a/common/gsql/supportai/Scan_For_Updates.gsql b/common/gsql/supportai/Scan_For_Updates.gsql index 7d9d1b83..ba5444bd 100644 --- a/common/gsql/supportai/Scan_For_Updates.gsql +++ b/common/gsql/supportai/Scan_For_Updates.gsql @@ -24,10 +24,10 @@ CREATE DISTRIBUTED QUERY Scan_For_Updates(STRING v_type = "Document", res = SELECT s FROM start:s -(HAS_CONTENT)-> Content:c ACCUM @@v_and_text += (s.id -> c.text) POST-ACCUM s.epoch_processing = datetime_to_epoch(now()); - // ELSE IF v_type == "Concept" THEN - // res = SELECT s FROM start:s - // POST-ACCUM @@v_and_text += (s.id -> s.description), - // s.epoch_processing = datetime_to_epoch(now()); + ELSE IF v_type == "Concept" THEN + res = SELECT s FROM start:s + POST-ACCUM @@v_and_text += (s.id -> s.description), + s.epoch_processing = datetime_to_epoch(now()); ELSE IF v_type == "Entity" THEN res = SELECT s 
FROM start:s POST-ACCUM @@v_and_text += (s.id -> s.definition), diff --git a/common/gsql/supportai/SupportAI_Schema.gsql b/common/gsql/supportai/SupportAI_Schema.gsql index 3e127d82..718ab1a7 100644 --- a/common/gsql/supportai/SupportAI_Schema.gsql +++ b/common/gsql/supportai/SupportAI_Schema.gsql @@ -20,7 +20,7 @@ CREATE SCHEMA_CHANGE JOB add_supportai_schema { ADD DIRECTED EDGE CONTAINS_DOCUMENT(FROM DocumentCollection, TO Document) WITH REVERSE_EDGE="reverse_CONTAINS_DOCUMENT"; // GraphRAG - ADD VERTEX Community (PRIMARY_ID id STRING, iteration UINT, k_in UINT, description STRING) WITH PRIMARY_ID_AS_ATTRIBUTE="true"; + ADD VERTEX Community (PRIMARY_ID id STRING, iteration UINT, description STRING) WITH PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX ResolvedEntity(PRIMARY_ID id STRING, entity_type STRING) WITH PRIMARY_ID_AS_ATTRIBUTE="true"; ADD DIRECTED EDGE RELATIONSHIP(FROM Entity, TO Entity, relation_type STRING) WITH REVERSE_EDGE="reverse_RELATIONSHIP"; diff --git a/common/logs/logwriter.py b/common/logs/logwriter.py index f75be00c..ff13feed 100644 --- a/common/logs/logwriter.py +++ b/common/logs/logwriter.py @@ -142,7 +142,7 @@ def log(level, message, mask_pii=True, **kwargs): LogWriter.general_logger.info(message) @staticmethod - def info(message, mask_pii=False, **kwargs): + def info(message, mask_pii=True, **kwargs): LogWriter.log("info", message, mask_pii, **kwargs) @staticmethod diff --git a/common/py_schemas/schemas.py b/common/py_schemas/schemas.py index 07a2113f..a58d4660 100644 --- a/common/py_schemas/schemas.py +++ b/common/py_schemas/schemas.py @@ -20,10 +20,6 @@ class SupportAIMethod(enum.StrEnum): GRAPHRAG = enum.auto() -class EccConfig(BaseModel): - method: SupportAIMethod = SupportAIMethod.SUPPORTAI - - class GSQLQueryInfo(BaseModel): function_header: str description: str diff --git a/copilot/docs/notebooks/graphrag.ipynb b/copilot/docs/notebooks/graphrag.ipynb deleted file mode 100644 index 411f5d62..00000000 --- a/copilot/docs/notebooks/graphrag.ipynb +++ /dev/null @@ -1,398 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "from pyTigerGraph import TigerGraphConnection\n", - "from dotenv import load_dotenv\n", - "\n", - "load_dotenv()\n", - "# We first create a connection to the database\n", - "host = os.environ[\"HOST\"]\n", - "username = os.getenv(\"USERNAME\", \"tigergraph\")\n", - "password = os.getenv(\"PASS\", \"tigergraph\")\n", - "conn = TigerGraphConnection(\n", - " host=host,\n", - " username=username,\n", - " password=password,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'The graph GraphRAG_pytgdocs is created.'" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "conn.graphname = \"GraphRAG_pytgdocs\"\n", - "conn.gsql(\"\"\"CREATE GRAPH GraphRAG_pytgdocs()\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "_ = conn.getToken()" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'host_name': 'https://algotesting.i.tgcloud.io',\n", - " 'schema_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_schema].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and 
the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_schema\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 0)\\\\nTrying to add local vertex \\'DocumentChunk\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Document\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Concept\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Entity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Relationship\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'DocumentCollection\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Content\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'Community\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local vertex \\'ResolvedEntity\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CONTENT\\' and its reverse edge \\'reverse_HAS_CONTENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_CHILD_OF\\' and its reverse edge \\'reverse_IS_CHILD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_HEAD_OF\\' and its reverse edge \\'reverse_IS_HEAD_OF\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_TAIL\\' and its reverse edge \\'reverse_HAS_TAIL\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_RELATIONSHIP\\' and its reverse edge \\'reverse_DESCRIBES_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'DESCRIBES_ENTITY\\' and its reverse edge \\'reverse_DESCRIBES_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_ENTITY\\' and its reverse edge \\'reverse_CONTAINS_ENTITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'MENTIONS_RELATIONSHIP\\' and its reverse edge \\'reverse_MENTIONS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IS_AFTER\\' and its reverse edge \\'reverse_IS_AFTER\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_CHILD\\' and its reverse edge \\'reverse_HAS_CHILD\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_RELATIONSHIP\\' and its reverse edge \\'reverse_HAS_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'CONTAINS_DOCUMENT\\' and its reverse edge \\'reverse_CONTAINS_DOCUMENT\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RELATIONSHIP\\' and its reverse edge \\'reverse_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVES_TO\\' and its reverse edge \\'reverse_RESOLVES_TO\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'RESOLVED_RELATIONSHIP\\' and its reverse edge \\'reverse_RESOLVED_RELATIONSHIP\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'IN_COMMUNITY\\' and its reverse edge \\'reverse_IN_COMMUNITY\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'LINKS_TO\\' and its reverse edge \\'reverse_LINKS_TO\\' to the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add local edge \\'HAS_PARENT\\' and its reverse edge \\'reverse_HAS_PARENT\\' to 
the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 1\\\\nThe job add_supportai_schema completes in 1.845 seconds!\\\\nLocal schema change succeeded.\"',\n", - " 'index_creation_status': '\"Using graph \\'GraphRAG_pytgdocs\\'\\\\nSuccessfully created schema change jobs: [add_supportai_indexes].\\\\nWARNING: When modifying the graph schema, reinstalling all affected queries is required, and the duration of this process may vary based on the number and complexity of the queries. To skip query reinstallation, you can run with the \\'-N\\' option, but manual reinstallation of queries will be necessary afterwards.\\\\nKick off schema change job add_supportai_indexes\\\\nDoing schema change on graph \\'GraphRAG_pytgdocs\\' (current version: 1)\\\\nTrying to add index \\'doc_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_epoch_processing_indexepoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Document\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'doc_chunk_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'DocumentChunk\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_added_index\\' on the attribute \\'epoch_added\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processing_index\\' on the attribute \\'epoch_processing\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\nTrying to add index \\'concept_epoch_processed_index\\' on the attribute \\'epoch_processed\\' of local vertex \\'Concept\\' on the graph \\'GraphRAG_pytgdocs\\'.\\\\n\\\\nGraph GraphRAG_pytgdocs updated to new version 2\\\\nThe job add_supportai_indexes completes in 1.085 seconds!\\\\nLocal schema change succeeded.\"'}" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# # And then add CoPilot's address to the connection. 
This address\n", - "# # is the host's address where the CoPilot container is running.\n", - "conn.ai.configureCoPilotHost(\"http://localhost:8000\")\n", - "conn.ai.initializeSupportAI()" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "access = os.environ[\"AWS_ACCESS_KEY_ID\"]\n", - "sec = os.environ[\"AWS_SECRET_ACCESS_KEY\"]\n", - "res = conn.ai.createDocumentIngest(\n", - " data_source=\"s3\",\n", - " data_source_config={\"aws_access_key\": access, \"aws_secret_key\": sec},\n", - " loader_config={\"doc_id_field\": \"url\", \"content_field\": \"content\"},\n", - " file_format=\"json\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'job_name': 'load_documents_content_json_cde7e4db979b4ba8a0b6ec5eb927f875',\n", - " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_cde7e4db979b4ba8a0b6ec5eb927f875.stream.SupportAI_GraphRAG_pytgdocs_48ee36da7b7644e4995722a6e057d446.1723494758507',\n", - " 'log_location': '/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_cde7e4db979b4ba8a0b6ec5eb927f875.stream.SupportAI_GraphRAG_pytgdocs_48ee36da7b7644e4995722a6e057d446.1723494758507'}" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "conn.ai.runDocumentIngest(\n", - " res[\"load_job_id\"],\n", - " res[\"data_source_id\"],\n", - " \"s3://tg-documentation/pytg_current/pytg_current.jsonl\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "# import httpx\n", - "# import base64\n", - "\n", - "\n", - "# def make_headers(conn: TigerGraphConnection):\n", - "# tkn = base64.b64encode(f\"{conn.username}:{conn.password}\".encode()).decode()\n", - "# headers = {\"Authorization\": f\"Basic {tkn}\"}\n", - "# return headers\n", - "\n", - "\n", - "# httpx.get(\n", - "# \"http://localhost:8001/GraphRAG_pytgdocs/consistency_status/graphrag\",\n", - "# headers=make_headers(conn),\n", - "# timeout=None,\n", - "# )\n", - "# # conn.ai.forceConsistencyUpdate()" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'asdf' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[32], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43masdf\u001b[49m\n", - "\u001b[0;31mNameError\u001b[0m: name 'asdf' is not defined" - ] - } - ], - "source": [ - "asdf" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for v in [\"Community\"]:\n", - " try:\n", - " conn.delVertices(v)\n", - " except:\n", - " pass\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sleep\n" - ] - }, - { - "data": { - "text/plain": [ - "{'job_name': 'load_documents_content_json_8a4ea730f21c43abbb58d818b9dd4d5a',\n", - " 'job_id': 'GraphRAG_pytgdocs.load_documents_content_json_8a4ea730f21c43abbb58d818b9dd4d5a.stream.SupportAI_GraphRAG_pytgdocs_7aed8a01c9c1432b8026ea6c708bf08b.1723490129603',\n", - " 'log_location': 
'/home/tigergraph/tigergraph/log/kafkaLoader/GraphRAG_pytgdocs.load_documents_content_json_8a4ea730f21c43abbb58d818b9dd4d5a.stream.SupportAI_GraphRAG_pytgdocs_7aed8a01c9c1432b8026ea6c708bf08b.1723490129603'}" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "for v in [\"Document\", \"Content\", \"DocumentChunk\", \"Entity\",\"ResolvedEntity\",\"Community\"]:\n", - "# for v in [\"ResolvedEntity\"]:\n", - " try:\n", - " conn.delVertices(v)\n", - " except:\n", - " pass\n", - "\n", - "import time\n", - "\n", - "print('sleep')\n", - "time.sleep(3)\n", - "conn.ai.runDocumentIngest(\n", - " res[\"load_job_id\"],\n", - " res[\"data_source_id\"],\n", - " \"s3://tg-documentation/pytg_current/pytg_current.jsonl\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "conn.gsql(f\"\"\"\n", - "USE GRAPH {conn.graphname}\n", - "DROP QUERY ResolveRelationships\n", - "\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import base64\n", - "import json\n", - "import httpx\n", - "import logging\n", - "\n", - "_ = logging.getLogger(__name__)\n", - "\n", - "\n", - "http_timeout = None\n", - "\n", - "\n", - "def make_headers(conn: TigerGraphConnection):\n", - " if conn.apiToken is None or conn.apiToken == \"\":\n", - " tkn = base64.b64encode(f\"{conn.username}:{conn.password}\".encode()).decode()\n", - " headers = {\"Authorization\": f\"Basic {tkn}\"}\n", - " else:\n", - " headers = {\"Authorization\": f\"Bearer {conn.apiToken}\"}\n", - "\n", - " return headers\n", - "\n", - "\n", - "def check_vertex_exists(conn, id):\n", - " headers = make_headers(conn)\n", - " with httpx.Client(timeout=http_timeout) as client:\n", - " res = client.get(\n", - " f\"{conn.restppUrl}/graph/{conn.graphname}/vertices/Entity/{id}\",\n", - " headers=headers,\n", - " )\n", - "\n", - " res.raise_for_status()\n", - " return res.json()\n", - "\n", - "\n", - "# r = check_vertex_exists(conn, \"asdfTigergraphexception\")\n", - "# print(json.dumps(r, indent=2), r[\"error\"])\n", - "r = check_vertex_exists(conn, \"Tigergraphexception\")\n", - "print(json.dumps(r, indent=2), r[\"error\"])\n", - "r[\"results\"][0][\"attributes\"][\"description\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "def check_vertex_has_desc(conn, comm: str):\n", - " headers = make_headers(conn)\n", - " with httpx.Client(timeout=None) as client:\n", - " resp = client.get(\n", - " f\"{conn.restppUrl}/graph/{conn.graphname}/vertices/Community/{comm}\",\n", - " headers=headers,\n", - " )\n", - " resp.raise_for_status()\n", - "\n", - " print(json.dumps(resp.json(),indent=2))\n", - " desc = resp.json()[\"results\"][0][\"attributes\"][\"description\"]\n", - " print(f\">>>*****{comm}:{desc}********\", flush=True)\n", - "\n", - " return len(desc) > 0\n", - "check_vertex_has_desc(conn,'Value_Property_1_2')\n", - "conn.upsertVertex(\"Community\",\"Rmse_1_2\",{\n", - " \"description\":\"asdf\"\n", - "})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def map_attrs(attributes: dict):\n", - " # map attrs\n", - " attrs = {}\n", - " for k, v in attributes.items():\n", - " if isinstance(v, tuple):\n", - " attrs[k] = {\"value\": v[0], \"op\": v[1]}\n", - " elif isinstance(v, dict):\n", - " attrs[k] = {\n", - " \"value\": 
{\"keylist\": list(v.keys()), \"valuelist\": list(v.values())}\n", - " }\n", - " else:\n", - " attrs[k] = {\"value\": v}\n", - " return attrs\n", - "\n", - "\n", - "def process_id(v_id: str):\n", - " return v_id.replace(\" \", \"_\").replace(\"/\", \"\")\n", - "\n", - "\n", - "def a(vertex_id=\"Post /Requesttoken\"):\n", - " vertex_id = process_id(vertex_id)\n", - " attributes = { # attrs\n", - " \"description\": [\"test\"],\n", - " \"epoch_added\": int(time.time()),\n", - " }\n", - "\n", - " vertex_id = vertex_id.replace(\" \", \"_\")\n", - " attrs = map_attrs(attributes)\n", - " data = json.dumps({\"vertices\": {\"Entity\": {vertex_id: attrs}}})\n", - " headers = make_headers(conn)\n", - " with httpx.Client(timeout=http_timeout) as client:\n", - " res = client.post(\n", - " f\"{conn.restppUrl}/graph/{conn.graphname}\", data=data, headers=headers\n", - " )\n", - "\n", - " res.raise_for_status()\n", - "\n", - " return res.json()\n", - "\n", - "\n", - "a()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from urllib import parse\n", - "\n", - "v_id = \"Post_/Requesttoken\"\n", - "v_id = process_id(v_id)\n", - "print(v_id)\n", - "\n", - "r = check_vertex_exists(conn, v_id)\n", - "print(json.dumps(r, indent=2), r[\"error\"])\n", - "r[\"results\"][0][\"attributes\"][\"description\"]" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "ml", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/eventual-consistency-service/app/main.py b/eventual-consistency-service/app/main.py index 701e363e..34403f1e 100644 --- a/eventual-consistency-service/app/main.py +++ b/eventual-consistency-service/app/main.py @@ -188,11 +188,10 @@ def consistency_status( LogWriter.info(f"Returning consistency status for {graphname}: {status}") case SupportAIMethod.GRAPHRAG: background.add_task(graphrag.run, graphname, conn) - # asyncio.run(graphrag.run(graphname, conn)) import time ecc_status = ( - f"hi from graph rag ecc: {conn.graphname} ({graphname}) {time.ctime()}" + f"GraphRAG initialization: {conn.graphname} ({graphname}) {time.ctime()}" ) case _: response.status_code = status.HTTP_404_NOT_FOUND From 2d1e98b16a759100686e115f35c3c479ad537ddb Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 12:34:16 -0400 Subject: [PATCH 15/53] reqs to fix unit tests --- copilot/requirements.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index 7a8bd83f..03157f17 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -68,15 +68,15 @@ joblib==1.3.2 jq==1.6.0 jsonpatch==1.33 jsonpointer==2.4 -langchain==0.2.11 -langchain-community==0.2.10 -langchain-core==0.2.25 -langchain-experimental==0.0.63 +langchain==0.2.12 +langchain-community==0.2.11 +langchain-core==0.2.29 +langchain-experimental==0.0.64 langchain-groq==0.1.8 langchain-ibm==0.1.11 langchain-text-splitters==0.2.2 langchain_milvus==0.1.3 -langchain_openai==0.1.19 +langchain-openai==0.1.20 langchainhub==0.1.20 langdetect==1.0.9 langgraph==0.1.16 @@ -152,4 +152,4 @@ wandb==0.15.12 watchfiles==0.20.0 websockets==11.0.3 
yarl==1.9.2 -zipp==3.19.2 \ No newline at end of file +zipp==3.19.2 From e0065ee60b85b42b483e28ce0603c4ef2451c05b Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 12:43:42 -0400 Subject: [PATCH 16/53] reqs to fix unit test --- copilot/requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index 03157f17..3035d7c1 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -70,7 +70,6 @@ jsonpatch==1.33 jsonpointer==2.4 langchain==0.2.12 langchain-community==0.2.11 -langchain-core==0.2.29 langchain-experimental==0.0.64 langchain-groq==0.1.8 langchain-ibm==0.1.11 From 2a5434abd2dcffac69e689d097e232f25be1ca09 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 12:47:01 -0400 Subject: [PATCH 17/53] reqs to fix unit test --- copilot/requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index 3035d7c1..302c9b44 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -79,7 +79,6 @@ langchain-openai==0.1.20 langchainhub==0.1.20 langdetect==1.0.9 langgraph==0.1.16 -langsmith==0.1.94 lomond==0.3.3 lxml==4.9.3 marshmallow==3.20.1 From a43490a852729c076a7aa0a11d685298ce66c3da Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 12:51:28 -0400 Subject: [PATCH 18/53] reqs to fix unit test --- copilot/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index 302c9b44..7df43165 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -68,17 +68,17 @@ joblib==1.3.2 jq==1.6.0 jsonpatch==1.33 jsonpointer==2.4 -langchain==0.2.12 langchain-community==0.2.11 langchain-experimental==0.0.64 langchain-groq==0.1.8 langchain-ibm==0.1.11 langchain-text-splitters==0.2.2 langchain_milvus==0.1.3 -langchain-openai==0.1.20 langchainhub==0.1.20 langdetect==1.0.9 langgraph==0.1.16 +langchain==0.2.12 +langchain-openai==0.1.20 lomond==0.3.3 lxml==4.9.3 marshmallow==3.20.1 From 4b76e73d5a284b90bb923707e52e72dae3c1d040 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 12:57:39 -0400 Subject: [PATCH 19/53] reqs to fix unit test --- copilot/requirements.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index 7df43165..ba1f04e3 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -68,17 +68,19 @@ joblib==1.3.2 jq==1.6.0 jsonpatch==1.33 jsonpointer==2.4 +langchain==0.2.12 langchain-community==0.2.11 +langchain-core==0.2.3 langchain-experimental==0.0.64 langchain-groq==0.1.8 langchain-ibm==0.1.11 langchain-text-splitters==0.2.2 langchain_milvus==0.1.3 +langchain-openai==0.1.20 langchainhub==0.1.20 langdetect==1.0.9 langgraph==0.1.16 -langchain==0.2.12 -langchain-openai==0.1.20 +langsmith==0.1.94 lomond==0.3.3 lxml==4.9.3 marshmallow==3.20.1 From 115b1b3f9f5c046b1f8d03761dccdf43a9b32320 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 13:02:42 -0400 Subject: [PATCH 20/53] reqs to fix unit test --- copilot/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/copilot/requirements.txt 
b/copilot/requirements.txt index ba1f04e3..5e475767 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -68,7 +68,7 @@ joblib==1.3.2 jq==1.6.0 jsonpatch==1.33 jsonpointer==2.4 -langchain==0.2.12 +langchain==0.2.13 langchain-community==0.2.11 langchain-core==0.2.3 langchain-experimental==0.0.64 From 58b5cbe6694f24f46f5e669e85b2e3abde0a1598 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 13:04:33 -0400 Subject: [PATCH 21/53] reqs to fix unit test --- copilot/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index 5e475767..7b30e5b5 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -70,7 +70,7 @@ jsonpatch==1.33 jsonpointer==2.4 langchain==0.2.13 langchain-community==0.2.11 -langchain-core==0.2.3 +langchain-core==0.2.30 langchain-experimental==0.0.64 langchain-groq==0.1.8 langchain-ibm==0.1.11 From fa960394b2acb3f88ef9171218445c5c57915b84 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 13:11:50 -0400 Subject: [PATCH 22/53] reqs to fix unit test --- copilot/requirements.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index 7b30e5b5..632a6eba 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -68,15 +68,15 @@ joblib==1.3.2 jq==1.6.0 jsonpatch==1.33 jsonpointer==2.4 -langchain==0.2.13 -langchain-community==0.2.11 -langchain-core==0.2.30 +langchain==0.2.11 +langchain-community==0.2.10 +langchain-core==0.2.25 langchain-experimental==0.0.64 langchain-groq==0.1.8 langchain-ibm==0.1.11 langchain-text-splitters==0.2.2 langchain_milvus==0.1.3 -langchain-openai==0.1.20 +langchain_openai==0.1.19 langchainhub==0.1.20 langdetect==1.0.9 langgraph==0.1.16 From 905d5cfa324d373af3dd7f9266c6d795ec122b1c Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 13:30:37 -0400 Subject: [PATCH 23/53] reqs to fix unit test --- copilot/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index 632a6eba..e69f2be6 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -70,7 +70,7 @@ jsonpatch==1.33 jsonpointer==2.4 langchain==0.2.11 langchain-community==0.2.10 -langchain-core==0.2.25 +langchain-core==0.2.29 langchain-experimental==0.0.64 langchain-groq==0.1.8 langchain-ibm==0.1.11 From 5e8b0aeaf569ffa9570ac871fd804dce21b89414 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 14:09:27 -0400 Subject: [PATCH 24/53] reqs to fix unit test --- copilot/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index e69f2be6..e6fb3718 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -81,6 +81,7 @@ langchainhub==0.1.20 langdetect==1.0.9 langgraph==0.1.16 langsmith==0.1.94 +Levenshtein==0.25.1 lomond==0.3.3 lxml==4.9.3 marshmallow==3.20.1 @@ -118,7 +119,7 @@ pyTigerDriver==1.0.15 pyTigerGraph==1.6.2 pytz==2023.3.post1 PyYAML==6.0.1 -rapidfuzz==3.4.0 +rapidfuzz==3.8.0 regex==2023.10.3 requests==2.32.2 rsa==4.9 From be0177e9b5d9dd50231d07fb43c8c5b6dd69b377 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG 
<165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 14:29:45 -0400 Subject: [PATCH 25/53] reqs to fix unit test --- copilot/requirements.txt | 237 +++++++++++++++++++++------------------ 1 file changed, 129 insertions(+), 108 deletions(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index e6fb3718..af45c357 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -1,156 +1,177 @@ -aiohttp==3.9.3 +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 aiosignal==1.3.1 -annotated-types==0.5.0 -anyio==3.7.1 +annotated-types==0.7.0 +anyio==4.4.0 appdirs==1.4.4 argon2-cffi==23.1.0 argon2-cffi-bindings==21.2.0 async-timeout==4.0.3 asyncer==0.0.7 -attrs==23.1.0 -azure-core==1.30.1 -azure-storage-blob==12.19.1 +attrs==24.2.0 +azure-core==1.30.2 +azure-storage-blob==12.22.0 backoff==2.2.1 -beautifulsoup4==4.12.2 -boto3==1.28.83 -botocore==1.31.83 -cachetools==5.3.2 -certifi==2023.7.22 -cffi==1.16.0 +beautifulsoup4==4.12.3 +boto3==1.34.159 +botocore==1.34.159 +cachetools==5.4.0 +certifi==2024.7.4 +cffi==1.17.0 chardet==5.2.0 -charset-normalizer==3.2.0 +charset-normalizer==3.3.2 click==8.1.7 -cryptography==42.0.5 -dataclasses-json==0.5.14 -distro==1.8.0 +contourpy==1.2.1 +cryptography==43.0.0 +cycler==0.12.1 +dataclasses-json==0.6.7 +deepdiff==7.0.1 +distro==1.9.0 docker-pycreds==0.4.0 docstring_parser==0.16 -emoji==2.8.0 +emoji==2.12.1 environs==9.5.0 -exceptiongroup==1.1.3 -fastapi==0.103.1 +exceptiongroup==1.2.2 +fastapi==0.112.0 filelock==3.15.4 filetype==1.2.0 -frozenlist==1.4.0 -fsspec==2024.6.0 +fonttools==4.53.1 +frozenlist==1.4.1 +fsspec==2024.6.1 gitdb==4.0.11 -GitPython==3.1.40 -google-api-core==2.14.0 -google-auth==2.23.4 -google-cloud-aiplatform==1.52.0 -google-cloud-bigquery==3.13.0 -google-cloud-core==2.3.3 -google-cloud-resource-manager==1.10.4 -google-cloud-storage==2.13.0 +GitPython==3.1.43 +google-api-core==2.19.1 +google-auth==2.33.0 +google-cloud-aiplatform==1.61.0 +google-cloud-bigquery==3.25.0 +google-cloud-core==2.4.1 +google-cloud-resource-manager==1.12.5 +google-cloud-storage==2.18.2 google-crc32c==1.5.0 -google-resumable-media==2.6.0 -googleapis-common-protos==1.61.0 -greenlet==2.0.2 -groq==0.5.0 -grpc-google-iam-v1==0.12.7 -grpcio==1.59.2 -grpcio-status==1.59.2 +google-resumable-media==2.7.2 +googleapis-common-protos==1.63.2 +greenlet==3.0.3 +groq==0.9.0 +grpc-google-iam-v1==0.13.1 +grpcio==1.63.0 +grpcio-status==1.63.0 h11==0.14.0 -httpcore==0.18.0 -httptools==0.6.0 -httpx==0.25.0 -huggingface-hub==0.23.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +huggingface-hub==0.24.5 ibm-cos-sdk==2.13.6 ibm-cos-sdk-core==2.13.6 ibm-cos-sdk-s3transfer==2.13.6 -ibm_watsonx_ai==1.0.11 -idna==3.4 -importlib_metadata==8.0.0 +ibm_watsonx_ai==1.1.5 +idna==3.7 +importlib_metadata==8.2.0 iniconfig==2.0.0 isodate==0.6.1 +jiter==0.5.0 jmespath==1.0.1 -joblib==1.3.2 -jq==1.6.0 +joblib==1.4.2 +jq==1.7.0 jsonpatch==1.33 -jsonpointer==2.4 -langchain==0.2.11 -langchain-community==0.2.10 -langchain-core==0.2.29 +jsonpath-python==1.0.6 +jsonpointer==3.0.0 +kiwisolver==1.4.5 +langchain==0.2.13 +langchain-community==0.2.12 +langchain-core==0.2.30 langchain-experimental==0.0.64 -langchain-groq==0.1.8 -langchain-ibm==0.1.11 +langchain-groq==0.1.9 +langchain-ibm==0.1.12 +langchain-milvus==0.1.4 +langchain-openai==0.1.21 langchain-text-splitters==0.2.2 -langchain_milvus==0.1.3 -langchain_openai==0.1.19 -langchainhub==0.1.20 +langchainhub==0.1.21 langdetect==1.0.9 -langgraph==0.1.16 -langsmith==0.1.94 +langgraph==0.2.3 +langgraph-checkpoint==1.0.2 
+langsmith==0.1.99 Levenshtein==0.25.1 lomond==0.3.3 -lxml==4.9.3 -marshmallow==3.20.1 -matplotlib==3.9.1 -minio==7.2.5 -multidict==6.0.4 +lxml==5.3.0 +marshmallow==3.21.3 +matplotlib==3.9.2 +milvus-lite==2.4.9 +minio==7.2.7 +multidict==6.0.5 mypy-extensions==1.0.0 -nltk==3.8.1 +nest-asyncio==1.6.0 +nltk==3.8.2 numpy==1.26.4 -openai==1.37.1 -orjson==3.9.15 -packaging==23.2 -pandas==2.1.1 +openai==1.40.6 +ordered-set==4.1.0 +orjson==3.10.7 +packaging==24.1 +pandas==2.1.4 pathtools==0.1.2 +pillow==10.4.0 +platformdirs==4.2.2 pluggy==1.5.0 prometheus_client==0.20.0 -proto-plus==1.22.3 -protobuf==4.24.4 -psutil==5.9.6 -pyarrow==15.0.1 -pyasn1==0.5.0 -pyasn1-modules==0.3.0 -pycparser==2.21 +proto-plus==1.24.0 +protobuf==5.27.3 +psutil==6.0.0 +pyarrow==17.0.0 +pyasn1==0.6.0 +pyasn1_modules==0.4.0 +pycparser==2.22 pycryptodome==3.20.0 -pydantic==2.3.0 -pydantic_core==2.6.3 -pygit2==1.13.2 -pymilvus==2.4.4 -pytest==8.2.0 +pydantic==2.8.2 +pydantic_core==2.20.1 +pygit2==1.15.1 +pymilvus==2.4.5 +pyparsing==3.1.2 +pypdf==4.3.1 +pytest==8.3.2 python-dateutil==2.9.0.post0 -python-dotenv==1.0.0 -python-iso639==2023.6.15 +python-dotenv==1.0.1 +python-iso639==2024.4.27 python-magic==0.4.27 pyTigerDriver==1.0.15 -pyTigerGraph==1.6.2 -pytz==2023.3.post1 -PyYAML==6.0.1 -rapidfuzz==3.8.0 -regex==2023.10.3 +pyTigerGraph==1.6.5 +pytz==2024.1 +PyYAML==6.0.2 +rapidfuzz==3.9.6 +regex==2024.7.24 requests==2.32.2 +requests-toolbelt==1.0.0 rsa==4.9 -s3transfer==0.7.0 +s3transfer==0.10.2 scikit-learn==1.5.1 -sentry-sdk==1.32.0 +scipy==1.14.0 +sentry-sdk==2.13.0 setproctitle==1.3.3 -shapely==2.0.2 +shapely==2.0.5 six==1.16.0 smmap==5.0.1 -sniffio==1.3.0 -soupsieve==2.5 -SQLAlchemy==2.0.20 -starlette==0.27.0 +sniffio==1.3.1 +soupsieve==2.6 +SQLAlchemy==2.0.32 +starlette==0.37.2 tabulate==0.9.0 -tenacity==8.2.3 +tenacity==8.5.0 +threadpoolctl==3.5.0 tiktoken==0.7.0 -tqdm==4.66.1 -types-requests==2.31.0.6 +tqdm==4.66.5 +types-requests==2.32.0.20240712 types-urllib3==1.26.25.14 typing-inspect==0.9.0 -typing_extensions==4.8.0 -tzdata==2023.3 -ujson==5.9.0 -unstructured==0.10.23 -urllib3==1.26.18 -uvicorn==0.23.2 -uvloop==0.17.0 -validators==0.22.0 -wandb==0.15.12 -watchfiles==0.20.0 -websockets==11.0.3 -yarl==1.9.2 -zipp==3.19.2 +typing_extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +unstructured==0.15.1 +unstructured-client==0.25.5 +urllib3==2.2.2 +uvicorn==0.30.6 +uvloop==0.19.0 +validators==0.33.0 +wandb==0.17.6 +watchfiles==0.23.0 +websockets==12.0 +wrapt==1.16.0 +yarl==1.9.4 +zipp==3.20.0 From cb43815468caf756311d087c03b25dc2395184fb Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 14:35:57 -0400 Subject: [PATCH 26/53] reqs to fix unit test --- common/requirements.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/common/requirements.txt b/common/requirements.txt index bb20e5b9..a8cc3d51 100644 --- a/common/requirements.txt +++ b/common/requirements.txt @@ -70,15 +70,14 @@ joblib==1.3.2 jq==1.6.0 jsonpatch==1.33 jsonpointer==2.4 -langchain==0.2.12 +langchain==0.2.13 langchain-community==0.2.11 -langchain-core==0.2.29 +langchain-core==0.2.3 langchain-experimental==0.0.64 langchain-openai==0.1.20 langchain-text-splitters==0.2.2 langsmith==0.1.98 Levenshtein==0.25.1 -langchain==0.2.11 langchain-groq==0.1.8 langchain-ibm==0.1.11 langchain_milvus==0.1.3 From ac6d3fe8d910eee102af6bab204437fc45626486 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 
14:52:41 -0400 Subject: [PATCH 27/53] reqs to fix unit test --- .github/workflows/pull-test-merge.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pull-test-merge.yaml b/.github/workflows/pull-test-merge.yaml index 654703d8..3a61ecaf 100644 --- a/.github/workflows/pull-test-merge.yaml +++ b/.github/workflows/pull-test-merge.yaml @@ -33,7 +33,7 @@ jobs: python -m venv venv source venv/bin/activate python -m pip install --upgrade pip - pip install -r copilot/requirements.txt + pip install --no-cache-dir -r copilot/requirements.txt pip install pytest - name: Create db config From 60aa569ef12749af9b36c09684c62b12fda7231a Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 15:07:02 -0400 Subject: [PATCH 28/53] reqs to fix unit test --- copilot/requirements.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index af45c357..7ee3073f 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -78,7 +78,6 @@ jsonpointer==3.0.0 kiwisolver==1.4.5 langchain==0.2.13 langchain-community==0.2.12 -langchain-core==0.2.30 langchain-experimental==0.0.64 langchain-groq==0.1.9 langchain-ibm==0.1.12 @@ -89,7 +88,6 @@ langchainhub==0.1.21 langdetect==1.0.9 langgraph==0.2.3 langgraph-checkpoint==1.0.2 -langsmith==0.1.99 Levenshtein==0.25.1 lomond==0.3.3 lxml==5.3.0 From 2d377569d5aadac4396abe456320de39d4106966 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 15:17:56 -0400 Subject: [PATCH 29/53] reqs to fix unit test --- copilot/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index 7ee3073f..7f6269f2 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -82,7 +82,7 @@ langchain-experimental==0.0.64 langchain-groq==0.1.9 langchain-ibm==0.1.12 langchain-milvus==0.1.4 -langchain-openai==0.1.21 +langchain-openai langchain-text-splitters==0.2.2 langchainhub==0.1.21 langdetect==1.0.9 From 1929aa22cba052b004a335d61941fe7f5deb0d9b Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 15:51:26 -0400 Subject: [PATCH 30/53] reqs to fix unit test --- .github/workflows/pull-test-merge.yaml | 2 +- common/requirements.txt | 4 +- copilot/requirements.txt | 239 ++++++++++++------------- 3 files changed, 113 insertions(+), 132 deletions(-) diff --git a/.github/workflows/pull-test-merge.yaml b/.github/workflows/pull-test-merge.yaml index 3a61ecaf..654703d8 100644 --- a/.github/workflows/pull-test-merge.yaml +++ b/.github/workflows/pull-test-merge.yaml @@ -33,7 +33,7 @@ jobs: python -m venv venv source venv/bin/activate python -m pip install --upgrade pip - pip install --no-cache-dir -r copilot/requirements.txt + pip install -r copilot/requirements.txt pip install pytest - name: Create db config diff --git a/common/requirements.txt b/common/requirements.txt index a8cc3d51..2d9a90ba 100644 --- a/common/requirements.txt +++ b/common/requirements.txt @@ -70,9 +70,9 @@ joblib==1.3.2 jq==1.6.0 jsonpatch==1.33 jsonpointer==2.4 -langchain==0.2.13 +langchain==0.2.12 langchain-community==0.2.11 -langchain-core==0.2.3 +langchain-core==0.2.29 langchain-experimental==0.0.64 langchain-openai==0.1.20 langchain-text-splitters==0.2.2 diff --git a/copilot/requirements.txt b/copilot/requirements.txt index 7f6269f2..df06f401 
100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -1,175 +1,156 @@ -aiohappyeyeballs==2.3.5 -aiohttp==3.10.3 +aiohttp==3.9.3 aiosignal==1.3.1 -annotated-types==0.7.0 -anyio==4.4.0 +annotated-types==0.5.0 +anyio==3.7.1 appdirs==1.4.4 argon2-cffi==23.1.0 argon2-cffi-bindings==21.2.0 async-timeout==4.0.3 asyncer==0.0.7 -attrs==24.2.0 -azure-core==1.30.2 -azure-storage-blob==12.22.0 +attrs==23.1.0 +azure-core==1.30.1 +azure-storage-blob==12.19.1 backoff==2.2.1 -beautifulsoup4==4.12.3 -boto3==1.34.159 -botocore==1.34.159 -cachetools==5.4.0 -certifi==2024.7.4 -cffi==1.17.0 +beautifulsoup4==4.12.2 +boto3==1.28.83 +botocore==1.31.83 +cachetools==5.3.2 +certifi==2023.7.22 +cffi==1.16.0 chardet==5.2.0 -charset-normalizer==3.3.2 +charset-normalizer==3.2.0 click==8.1.7 -contourpy==1.2.1 -cryptography==43.0.0 -cycler==0.12.1 -dataclasses-json==0.6.7 -deepdiff==7.0.1 -distro==1.9.0 +cryptography==42.0.5 +dataclasses-json==0.5.14 +distro==1.8.0 docker-pycreds==0.4.0 docstring_parser==0.16 -emoji==2.12.1 +emoji==2.8.0 environs==9.5.0 -exceptiongroup==1.2.2 -fastapi==0.112.0 +exceptiongroup==1.1.3 +fastapi==0.103.1 filelock==3.15.4 filetype==1.2.0 -fonttools==4.53.1 -frozenlist==1.4.1 -fsspec==2024.6.1 +frozenlist==1.4.0 +fsspec==2024.6.0 gitdb==4.0.11 -GitPython==3.1.43 -google-api-core==2.19.1 -google-auth==2.33.0 -google-cloud-aiplatform==1.61.0 -google-cloud-bigquery==3.25.0 -google-cloud-core==2.4.1 -google-cloud-resource-manager==1.12.5 -google-cloud-storage==2.18.2 +GitPython==3.1.40 +google-api-core==2.14.0 +google-auth==2.23.4 +google-cloud-aiplatform==1.52.0 +google-cloud-bigquery==3.13.0 +google-cloud-core==2.3.3 +google-cloud-resource-manager==1.10.4 +google-cloud-storage==2.13.0 google-crc32c==1.5.0 -google-resumable-media==2.7.2 -googleapis-common-protos==1.63.2 -greenlet==3.0.3 -groq==0.9.0 -grpc-google-iam-v1==0.13.1 -grpcio==1.63.0 -grpcio-status==1.63.0 +google-resumable-media==2.6.0 +googleapis-common-protos==1.61.0 +greenlet==2.0.2 +groq==0.5.0 +grpc-google-iam-v1==0.12.7 +grpcio==1.59.2 +grpcio-status==1.59.2 h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -huggingface-hub==0.24.5 +httpcore==0.18.0 +httptools==0.6.0 +httpx==0.25.0 +huggingface-hub==0.23.0 ibm-cos-sdk==2.13.6 ibm-cos-sdk-core==2.13.6 ibm-cos-sdk-s3transfer==2.13.6 -ibm_watsonx_ai==1.1.5 -idna==3.7 -importlib_metadata==8.2.0 +ibm_watsonx_ai==1.0.11 +idna==3.4 +importlib_metadata==8.0.0 iniconfig==2.0.0 isodate==0.6.1 -jiter==0.5.0 jmespath==1.0.1 -joblib==1.4.2 -jq==1.7.0 +joblib==1.3.2 +jq==1.6.0 jsonpatch==1.33 -jsonpath-python==1.0.6 -jsonpointer==3.0.0 -kiwisolver==1.4.5 -langchain==0.2.13 -langchain-community==0.2.12 -langchain-experimental==0.0.64 -langchain-groq==0.1.9 -langchain-ibm==0.1.12 -langchain-milvus==0.1.4 -langchain-openai +jsonpointer==2.4 +langchain==0.2.11 +langchain-community==0.2.10 +langchain-core==0.2.25 +langchain-experimental==0.0.63 +langchain-groq==0.1.8 +langchain-ibm==0.1.11 langchain-text-splitters==0.2.2 -langchainhub==0.1.21 -langdetect==1.0.9 -langgraph==0.2.3 -langgraph-checkpoint==1.0.2 +langchain_milvus==0.1.3 +langchain_openai==0.1.19 +langchainhub==0.1.20 Levenshtein==0.25.1 +langdetect==1.0.9 +langgraph==0.1.16 +langsmith==0.1.94 lomond==0.3.3 -lxml==5.3.0 -marshmallow==3.21.3 -matplotlib==3.9.2 -milvus-lite==2.4.9 -minio==7.2.7 -multidict==6.0.5 +lxml==4.9.3 +marshmallow==3.20.1 +matplotlib==3.9.1 +minio==7.2.5 +multidict==6.0.4 mypy-extensions==1.0.0 -nest-asyncio==1.6.0 -nltk==3.8.2 +nltk==3.8.1 numpy==1.26.4 -openai==1.40.6 
-ordered-set==4.1.0 -orjson==3.10.7 -packaging==24.1 -pandas==2.1.4 +openai==1.37.1 +orjson==3.9.15 +packaging==23.2 +pandas==2.1.1 pathtools==0.1.2 -pillow==10.4.0 -platformdirs==4.2.2 pluggy==1.5.0 prometheus_client==0.20.0 -proto-plus==1.24.0 -protobuf==5.27.3 -psutil==6.0.0 -pyarrow==17.0.0 -pyasn1==0.6.0 -pyasn1_modules==0.4.0 -pycparser==2.22 +proto-plus==1.22.3 +protobuf==4.24.4 +psutil==5.9.6 +pyarrow==15.0.1 +pyasn1==0.5.0 +pyasn1-modules==0.3.0 +pycparser==2.21 pycryptodome==3.20.0 -pydantic==2.8.2 -pydantic_core==2.20.1 -pygit2==1.15.1 -pymilvus==2.4.5 -pyparsing==3.1.2 -pypdf==4.3.1 -pytest==8.3.2 +pydantic==2.3.0 +pydantic_core==2.6.3 +pygit2==1.13.2 +pymilvus==2.4.4 +pytest==8.2.0 python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-iso639==2024.4.27 +python-dotenv==1.0.0 +python-iso639==2023.6.15 python-magic==0.4.27 pyTigerDriver==1.0.15 -pyTigerGraph==1.6.5 -pytz==2024.1 -PyYAML==6.0.2 -rapidfuzz==3.9.6 -regex==2024.7.24 +pyTigerGraph==1.6.2 +pytz==2023.3.post1 +PyYAML==6.0.1 +rapidfuzz==3.4.0 +regex==2023.10.3 requests==2.32.2 -requests-toolbelt==1.0.0 rsa==4.9 -s3transfer==0.10.2 +s3transfer==0.7.0 scikit-learn==1.5.1 -scipy==1.14.0 -sentry-sdk==2.13.0 +sentry-sdk==1.32.0 setproctitle==1.3.3 -shapely==2.0.5 +shapely==2.0.2 six==1.16.0 smmap==5.0.1 -sniffio==1.3.1 -soupsieve==2.6 -SQLAlchemy==2.0.32 -starlette==0.37.2 +sniffio==1.3.0 +soupsieve==2.5 +SQLAlchemy==2.0.20 +starlette==0.27.0 tabulate==0.9.0 -tenacity==8.5.0 -threadpoolctl==3.5.0 +tenacity==8.2.3 tiktoken==0.7.0 -tqdm==4.66.5 -types-requests==2.32.0.20240712 +tqdm==4.66.1 +types-requests==2.31.0.6 types-urllib3==1.26.25.14 typing-inspect==0.9.0 -typing_extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -unstructured==0.15.1 -unstructured-client==0.25.5 -urllib3==2.2.2 -uvicorn==0.30.6 -uvloop==0.19.0 -validators==0.33.0 -wandb==0.17.6 -watchfiles==0.23.0 -websockets==12.0 -wrapt==1.16.0 -yarl==1.9.4 -zipp==3.20.0 +typing_extensions==4.8.0 +tzdata==2023.3 +ujson==5.9.0 +unstructured==0.10.23 +urllib3==1.26.18 +uvicorn==0.23.2 +uvloop==0.17.0 +validators==0.22.0 +wandb==0.15.12 +watchfiles==0.20.0 +websockets==11.0.3 +yarl==1.9.2 +zipp==3.19.2 From f33ddef95e855bf1b400966ff4615ae64b89cdbc Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 15:52:49 -0400 Subject: [PATCH 31/53] reqs to fix unit test --- copilot/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index df06f401..fad0e729 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -119,7 +119,7 @@ pyTigerDriver==1.0.15 pyTigerGraph==1.6.2 pytz==2023.3.post1 PyYAML==6.0.1 -rapidfuzz==3.4.0 +rapidfuzz==3.9.6 regex==2023.10.3 requests==2.32.2 rsa==4.9 From 1a971813609fb9eca7849826025f1d04ea1e85b4 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 15:57:19 -0400 Subject: [PATCH 32/53] langchain-openai conflicts --- common/llm_services/openai_service.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/llm_services/openai_service.py b/common/llm_services/openai_service.py index 22b32380..7b166398 100644 --- a/common/llm_services/openai_service.py +++ b/common/llm_services/openai_service.py @@ -1,6 +1,7 @@ import logging import os -from langchain_openai import ChatOpenAI + +from langchain_community.chat_models.openai import ChatOpenAI from common.llm_services import LLM_Model from 
common.logs.log import req_id_cv From e9f7468e44ec311e1621fb91d4abe7b4665137f1 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 16:31:30 -0400 Subject: [PATCH 33/53] reqs to fix unit test --- common/requirements.txt | 8 +- copilot/requirements.txt | 239 +++++++++++++++++++++------------------ 2 files changed, 134 insertions(+), 113 deletions(-) diff --git a/common/requirements.txt b/common/requirements.txt index 2d9a90ba..122b1b73 100644 --- a/common/requirements.txt +++ b/common/requirements.txt @@ -70,11 +70,11 @@ joblib==1.3.2 jq==1.6.0 jsonpatch==1.33 jsonpointer==2.4 -langchain==0.2.12 -langchain-community==0.2.11 -langchain-core==0.2.29 +langchain==0.2.13 +langchain-community==0.2.12 +langchain-core==0.2.30 langchain-experimental==0.0.64 -langchain-openai==0.1.20 +langchain-openai==0.1.21 langchain-text-splitters==0.2.2 langsmith==0.1.98 Levenshtein==0.25.1 diff --git a/copilot/requirements.txt b/copilot/requirements.txt index fad0e729..af45c357 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -1,156 +1,177 @@ -aiohttp==3.9.3 +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 aiosignal==1.3.1 -annotated-types==0.5.0 -anyio==3.7.1 +annotated-types==0.7.0 +anyio==4.4.0 appdirs==1.4.4 argon2-cffi==23.1.0 argon2-cffi-bindings==21.2.0 async-timeout==4.0.3 asyncer==0.0.7 -attrs==23.1.0 -azure-core==1.30.1 -azure-storage-blob==12.19.1 +attrs==24.2.0 +azure-core==1.30.2 +azure-storage-blob==12.22.0 backoff==2.2.1 -beautifulsoup4==4.12.2 -boto3==1.28.83 -botocore==1.31.83 -cachetools==5.3.2 -certifi==2023.7.22 -cffi==1.16.0 +beautifulsoup4==4.12.3 +boto3==1.34.159 +botocore==1.34.159 +cachetools==5.4.0 +certifi==2024.7.4 +cffi==1.17.0 chardet==5.2.0 -charset-normalizer==3.2.0 +charset-normalizer==3.3.2 click==8.1.7 -cryptography==42.0.5 -dataclasses-json==0.5.14 -distro==1.8.0 +contourpy==1.2.1 +cryptography==43.0.0 +cycler==0.12.1 +dataclasses-json==0.6.7 +deepdiff==7.0.1 +distro==1.9.0 docker-pycreds==0.4.0 docstring_parser==0.16 -emoji==2.8.0 +emoji==2.12.1 environs==9.5.0 -exceptiongroup==1.1.3 -fastapi==0.103.1 +exceptiongroup==1.2.2 +fastapi==0.112.0 filelock==3.15.4 filetype==1.2.0 -frozenlist==1.4.0 -fsspec==2024.6.0 +fonttools==4.53.1 +frozenlist==1.4.1 +fsspec==2024.6.1 gitdb==4.0.11 -GitPython==3.1.40 -google-api-core==2.14.0 -google-auth==2.23.4 -google-cloud-aiplatform==1.52.0 -google-cloud-bigquery==3.13.0 -google-cloud-core==2.3.3 -google-cloud-resource-manager==1.10.4 -google-cloud-storage==2.13.0 +GitPython==3.1.43 +google-api-core==2.19.1 +google-auth==2.33.0 +google-cloud-aiplatform==1.61.0 +google-cloud-bigquery==3.25.0 +google-cloud-core==2.4.1 +google-cloud-resource-manager==1.12.5 +google-cloud-storage==2.18.2 google-crc32c==1.5.0 -google-resumable-media==2.6.0 -googleapis-common-protos==1.61.0 -greenlet==2.0.2 -groq==0.5.0 -grpc-google-iam-v1==0.12.7 -grpcio==1.59.2 -grpcio-status==1.59.2 +google-resumable-media==2.7.2 +googleapis-common-protos==1.63.2 +greenlet==3.0.3 +groq==0.9.0 +grpc-google-iam-v1==0.13.1 +grpcio==1.63.0 +grpcio-status==1.63.0 h11==0.14.0 -httpcore==0.18.0 -httptools==0.6.0 -httpx==0.25.0 -huggingface-hub==0.23.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +huggingface-hub==0.24.5 ibm-cos-sdk==2.13.6 ibm-cos-sdk-core==2.13.6 ibm-cos-sdk-s3transfer==2.13.6 -ibm_watsonx_ai==1.0.11 -idna==3.4 -importlib_metadata==8.0.0 +ibm_watsonx_ai==1.1.5 +idna==3.7 +importlib_metadata==8.2.0 iniconfig==2.0.0 isodate==0.6.1 +jiter==0.5.0 jmespath==1.0.1 -joblib==1.3.2 
-jq==1.6.0 +joblib==1.4.2 +jq==1.7.0 jsonpatch==1.33 -jsonpointer==2.4 -langchain==0.2.11 -langchain-community==0.2.10 -langchain-core==0.2.25 -langchain-experimental==0.0.63 -langchain-groq==0.1.8 -langchain-ibm==0.1.11 +jsonpath-python==1.0.6 +jsonpointer==3.0.0 +kiwisolver==1.4.5 +langchain==0.2.13 +langchain-community==0.2.12 +langchain-core==0.2.30 +langchain-experimental==0.0.64 +langchain-groq==0.1.9 +langchain-ibm==0.1.12 +langchain-milvus==0.1.4 +langchain-openai==0.1.21 langchain-text-splitters==0.2.2 -langchain_milvus==0.1.3 -langchain_openai==0.1.19 -langchainhub==0.1.20 -Levenshtein==0.25.1 +langchainhub==0.1.21 langdetect==1.0.9 -langgraph==0.1.16 -langsmith==0.1.94 +langgraph==0.2.3 +langgraph-checkpoint==1.0.2 +langsmith==0.1.99 +Levenshtein==0.25.1 lomond==0.3.3 -lxml==4.9.3 -marshmallow==3.20.1 -matplotlib==3.9.1 -minio==7.2.5 -multidict==6.0.4 +lxml==5.3.0 +marshmallow==3.21.3 +matplotlib==3.9.2 +milvus-lite==2.4.9 +minio==7.2.7 +multidict==6.0.5 mypy-extensions==1.0.0 -nltk==3.8.1 +nest-asyncio==1.6.0 +nltk==3.8.2 numpy==1.26.4 -openai==1.37.1 -orjson==3.9.15 -packaging==23.2 -pandas==2.1.1 +openai==1.40.6 +ordered-set==4.1.0 +orjson==3.10.7 +packaging==24.1 +pandas==2.1.4 pathtools==0.1.2 +pillow==10.4.0 +platformdirs==4.2.2 pluggy==1.5.0 prometheus_client==0.20.0 -proto-plus==1.22.3 -protobuf==4.24.4 -psutil==5.9.6 -pyarrow==15.0.1 -pyasn1==0.5.0 -pyasn1-modules==0.3.0 -pycparser==2.21 +proto-plus==1.24.0 +protobuf==5.27.3 +psutil==6.0.0 +pyarrow==17.0.0 +pyasn1==0.6.0 +pyasn1_modules==0.4.0 +pycparser==2.22 pycryptodome==3.20.0 -pydantic==2.3.0 -pydantic_core==2.6.3 -pygit2==1.13.2 -pymilvus==2.4.4 -pytest==8.2.0 +pydantic==2.8.2 +pydantic_core==2.20.1 +pygit2==1.15.1 +pymilvus==2.4.5 +pyparsing==3.1.2 +pypdf==4.3.1 +pytest==8.3.2 python-dateutil==2.9.0.post0 -python-dotenv==1.0.0 -python-iso639==2023.6.15 +python-dotenv==1.0.1 +python-iso639==2024.4.27 python-magic==0.4.27 pyTigerDriver==1.0.15 -pyTigerGraph==1.6.2 -pytz==2023.3.post1 -PyYAML==6.0.1 +pyTigerGraph==1.6.5 +pytz==2024.1 +PyYAML==6.0.2 rapidfuzz==3.9.6 -regex==2023.10.3 +regex==2024.7.24 requests==2.32.2 +requests-toolbelt==1.0.0 rsa==4.9 -s3transfer==0.7.0 +s3transfer==0.10.2 scikit-learn==1.5.1 -sentry-sdk==1.32.0 +scipy==1.14.0 +sentry-sdk==2.13.0 setproctitle==1.3.3 -shapely==2.0.2 +shapely==2.0.5 six==1.16.0 smmap==5.0.1 -sniffio==1.3.0 -soupsieve==2.5 -SQLAlchemy==2.0.20 -starlette==0.27.0 +sniffio==1.3.1 +soupsieve==2.6 +SQLAlchemy==2.0.32 +starlette==0.37.2 tabulate==0.9.0 -tenacity==8.2.3 +tenacity==8.5.0 +threadpoolctl==3.5.0 tiktoken==0.7.0 -tqdm==4.66.1 -types-requests==2.31.0.6 +tqdm==4.66.5 +types-requests==2.32.0.20240712 types-urllib3==1.26.25.14 typing-inspect==0.9.0 -typing_extensions==4.8.0 -tzdata==2023.3 -ujson==5.9.0 -unstructured==0.10.23 -urllib3==1.26.18 -uvicorn==0.23.2 -uvloop==0.17.0 -validators==0.22.0 -wandb==0.15.12 -watchfiles==0.20.0 -websockets==11.0.3 -yarl==1.9.2 -zipp==3.19.2 +typing_extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +unstructured==0.15.1 +unstructured-client==0.25.5 +urllib3==2.2.2 +uvicorn==0.30.6 +uvloop==0.19.0 +validators==0.33.0 +wandb==0.17.6 +watchfiles==0.23.0 +websockets==12.0 +wrapt==1.16.0 +yarl==1.9.4 +zipp==3.20.0 From c8248d72e614d055f4fc1e5969373fa25d134f1e Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 16:48:48 -0400 Subject: [PATCH 34/53] reqs to fix unit test --- copilot/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/copilot/requirements.txt b/copilot/requirements.txt index af45c357..e1a28c91 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -82,7 +82,7 @@ langchain-core==0.2.30 langchain-experimental==0.0.64 langchain-groq==0.1.9 langchain-ibm==0.1.12 -langchain-milvus==0.1.4 +langchain-milvus==0.1.3 langchain-openai==0.1.21 langchain-text-splitters==0.2.2 langchainhub==0.1.21 From 210d0fc74c57bb7919f36798822286461402fd95 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 16:58:24 -0400 Subject: [PATCH 35/53] reqs to fix unit test --- common/requirements.txt | 223 ++++++++++++++++++++++------------------ 1 file changed, 121 insertions(+), 102 deletions(-) diff --git a/common/requirements.txt b/common/requirements.txt index 122b1b73..af45c357 100644 --- a/common/requirements.txt +++ b/common/requirements.txt @@ -1,158 +1,177 @@ -aiochannel==1.2.1 -aiohttp==3.9.3 +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 aiosignal==1.3.1 -annotated-types==0.5.0 -anyio==3.7.1 +annotated-types==0.7.0 +anyio==4.4.0 appdirs==1.4.4 argon2-cffi==23.1.0 argon2-cffi-bindings==21.2.0 async-timeout==4.0.3 asyncer==0.0.7 -attrs==23.1.0 -azure-core==1.30.1 -azure-storage-blob==12.19.1 +attrs==24.2.0 +azure-core==1.30.2 +azure-storage-blob==12.22.0 backoff==2.2.1 -beautifulsoup4==4.12.2 -boto3==1.28.83 -botocore==1.31.83 -cachetools==5.3.2 -certifi==2023.7.22 -cffi==1.16.0 +beautifulsoup4==4.12.3 +boto3==1.34.159 +botocore==1.34.159 +cachetools==5.4.0 +certifi==2024.7.4 +cffi==1.17.0 chardet==5.2.0 -charset-normalizer==3.2.0 +charset-normalizer==3.3.2 click==8.1.7 -cryptography==42.0.5 -dataclasses-json==0.5.14 -distro==1.8.0 +contourpy==1.2.1 +cryptography==43.0.0 +cycler==0.12.1 +dataclasses-json==0.6.7 +deepdiff==7.0.1 +distro==1.9.0 docker-pycreds==0.4.0 docstring_parser==0.16 -emoji==2.8.0 +emoji==2.12.1 environs==9.5.0 -exceptiongroup==1.1.3 -fastapi==0.103.1 +exceptiongroup==1.2.2 +fastapi==0.112.0 filelock==3.15.4 filetype==1.2.0 -frozenlist==1.4.0 +fonttools==4.53.1 +frozenlist==1.4.1 fsspec==2024.6.1 gitdb==4.0.11 -GitPython==3.1.40 -google-api-core==2.14.0 -google-auth==2.23.4 -google-cloud-aiplatform==1.52.0 -google-cloud-bigquery==3.13.0 -google-cloud-core==2.3.3 -google-cloud-resource-manager==1.10.4 -google-cloud-storage==2.13.0 +GitPython==3.1.43 +google-api-core==2.19.1 +google-auth==2.33.0 +google-cloud-aiplatform==1.61.0 +google-cloud-bigquery==3.25.0 +google-cloud-core==2.4.1 +google-cloud-resource-manager==1.12.5 +google-cloud-storage==2.18.2 google-crc32c==1.5.0 -google-resumable-media==2.6.0 -googleapis-common-protos==1.61.0 -greenlet==2.0.2 -groq==0.5.0 -grpc-google-iam-v1==0.12.7 -grpcio==1.59.2 -grpcio-status==1.59.2 +google-resumable-media==2.7.2 +googleapis-common-protos==1.63.2 +greenlet==3.0.3 +groq==0.9.0 +grpc-google-iam-v1==0.13.1 +grpcio==1.63.0 +grpcio-status==1.63.0 h11==0.14.0 -httpcore==0.18.0 -httptools==0.6.0 -httpx==0.25.0 -huggingface-hub==0.23.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +huggingface-hub==0.24.5 ibm-cos-sdk==2.13.6 ibm-cos-sdk-core==2.13.6 ibm-cos-sdk-s3transfer==2.13.6 -ibm_watsonx_ai==1.0.11 -idna==3.4 -importlib_metadata==8.0.0 +ibm_watsonx_ai==1.1.5 +idna==3.7 +importlib_metadata==8.2.0 iniconfig==2.0.0 isodate==0.6.1 jiter==0.5.0 jmespath==1.0.1 -joblib==1.3.2 -jq==1.6.0 +joblib==1.4.2 +jq==1.7.0 jsonpatch==1.33 -jsonpointer==2.4 +jsonpath-python==1.0.6 +jsonpointer==3.0.0 +kiwisolver==1.4.5 langchain==0.2.13 langchain-community==0.2.12 langchain-core==0.2.30 
langchain-experimental==0.0.64 +langchain-groq==0.1.9 +langchain-ibm==0.1.12 +langchain-milvus==0.1.4 langchain-openai==0.1.21 langchain-text-splitters==0.2.2 -langsmith==0.1.98 -Levenshtein==0.25.1 -langchain-groq==0.1.8 -langchain-ibm==0.1.11 -langchain_milvus==0.1.3 -langchainhub==0.1.20 +langchainhub==0.1.21 langdetect==1.0.9 -langgraph==0.1.16 +langgraph==0.2.3 +langgraph-checkpoint==1.0.2 +langsmith==0.1.99 +Levenshtein==0.25.1 lomond==0.3.3 -lxml==4.9.3 -marshmallow==3.20.1 -matplotlib==3.9.1 -minio==7.2.5 -multidict==6.0.4 +lxml==5.3.0 +marshmallow==3.21.3 +matplotlib==3.9.2 +milvus-lite==2.4.9 +minio==7.2.7 +multidict==6.0.5 mypy-extensions==1.0.0 -nltk==3.8.1 +nest-asyncio==1.6.0 +nltk==3.8.2 numpy==1.26.4 -openai==1.40.2 -orjson==3.9.15 -packaging==23.2 -pandas==2.1.1 +openai==1.40.6 +ordered-set==4.1.0 +orjson==3.10.7 +packaging==24.1 +pandas==2.1.4 pathtools==0.1.2 +pillow==10.4.0 +platformdirs==4.2.2 pluggy==1.5.0 prometheus_client==0.20.0 -proto-plus==1.22.3 -protobuf==4.24.4 -psutil==5.9.6 -pyarrow==15.0.1 -pyasn1==0.5.0 -pyasn1-modules==0.3.0 -pycparser==2.21 +proto-plus==1.24.0 +protobuf==5.27.3 +psutil==6.0.0 +pyarrow==17.0.0 +pyasn1==0.6.0 +pyasn1_modules==0.4.0 +pycparser==2.22 pycryptodome==3.20.0 -pydantic==2.3.0 -pydantic_core==2.6.3 -pygit2==1.13.2 -pymilvus==2.4.4 -pytest==8.2.0 +pydantic==2.8.2 +pydantic_core==2.20.1 +pygit2==1.15.1 +pymilvus==2.4.5 +pyparsing==3.1.2 +pypdf==4.3.1 +pytest==8.3.2 python-dateutil==2.9.0.post0 -python-dotenv==1.0.0 -python-iso639==2023.6.15 +python-dotenv==1.0.1 +python-iso639==2024.4.27 python-magic==0.4.27 pyTigerDriver==1.0.15 pyTigerGraph==1.6.5 -pytz==2023.3.post1 -PyYAML==6.0.1 +pytz==2024.1 +PyYAML==6.0.2 rapidfuzz==3.9.6 -regex==2023.10.3 +regex==2024.7.24 requests==2.32.2 +requests-toolbelt==1.0.0 rsa==4.9 -s3transfer==0.7.0 +s3transfer==0.10.2 scikit-learn==1.5.1 -sentry-sdk==1.32.0 +scipy==1.14.0 +sentry-sdk==2.13.0 setproctitle==1.3.3 -shapely==2.0.2 +shapely==2.0.5 six==1.16.0 smmap==5.0.1 -sniffio==1.3.0 -soupsieve==2.5 -SQLAlchemy==2.0.20 -starlette==0.27.0 +sniffio==1.3.1 +soupsieve==2.6 +SQLAlchemy==2.0.32 +starlette==0.37.2 tabulate==0.9.0 -tenacity==8.2.3 +tenacity==8.5.0 +threadpoolctl==3.5.0 tiktoken==0.7.0 -tqdm==4.66.1 -types-requests==2.31.0.6 +tqdm==4.66.5 +types-requests==2.32.0.20240712 types-urllib3==1.26.25.14 typing-inspect==0.9.0 typing_extensions==4.12.2 -tzdata==2023.3 -ujson==5.9.0 -unstructured==0.10.23 -urllib3==1.26.18 -uvicorn==0.23.2 -uvloop==0.17.0 -validators==0.22.0 -wandb==0.15.12 -watchfiles==0.20.0 -websockets==11.0.3 -yarl==1.9.2 -zipp==3.19.2 +tzdata==2024.1 +ujson==5.10.0 +unstructured==0.15.1 +unstructured-client==0.25.5 +urllib3==2.2.2 +uvicorn==0.30.6 +uvloop==0.19.0 +validators==0.33.0 +wandb==0.17.6 +watchfiles==0.23.0 +websockets==12.0 +wrapt==1.16.0 +yarl==1.9.4 +zipp==3.20.0 From 9c8b183273e5649d36c4a6e8bdd0f42c198df77c Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 17:06:59 -0400 Subject: [PATCH 36/53] reqs to fix unit test --- common/requirements.txt | 2 -- copilot/requirements.txt | 2 -- 2 files changed, 4 deletions(-) diff --git a/common/requirements.txt b/common/requirements.txt index af45c357..97fe5736 100644 --- a/common/requirements.txt +++ b/common/requirements.txt @@ -95,7 +95,6 @@ lomond==0.3.3 lxml==5.3.0 marshmallow==3.21.3 matplotlib==3.9.2 -milvus-lite==2.4.9 minio==7.2.7 multidict==6.0.5 mypy-extensions==1.0.0 @@ -123,7 +122,6 @@ pycryptodome==3.20.0 pydantic==2.8.2 
pydantic_core==2.20.1 pygit2==1.15.1 -pymilvus==2.4.5 pyparsing==3.1.2 pypdf==4.3.1 pytest==8.3.2 diff --git a/copilot/requirements.txt b/copilot/requirements.txt index e1a28c91..d2426a03 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -95,7 +95,6 @@ lomond==0.3.3 lxml==5.3.0 marshmallow==3.21.3 matplotlib==3.9.2 -milvus-lite==2.4.9 minio==7.2.7 multidict==6.0.5 mypy-extensions==1.0.0 @@ -123,7 +122,6 @@ pycryptodome==3.20.0 pydantic==2.8.2 pydantic_core==2.20.1 pygit2==1.15.1 -pymilvus==2.4.5 pyparsing==3.1.2 pypdf==4.3.1 pytest==8.3.2 From e4d8168dfe4d3c44ba57844a3e6abbe2472ac8a2 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 17:11:28 -0400 Subject: [PATCH 37/53] reqs to fix unit test --- common/requirements.txt | 2 ++ copilot/requirements.txt | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/common/requirements.txt b/common/requirements.txt index 97fe5736..af45c357 100644 --- a/common/requirements.txt +++ b/common/requirements.txt @@ -95,6 +95,7 @@ lomond==0.3.3 lxml==5.3.0 marshmallow==3.21.3 matplotlib==3.9.2 +milvus-lite==2.4.9 minio==7.2.7 multidict==6.0.5 mypy-extensions==1.0.0 @@ -122,6 +123,7 @@ pycryptodome==3.20.0 pydantic==2.8.2 pydantic_core==2.20.1 pygit2==1.15.1 +pymilvus==2.4.5 pyparsing==3.1.2 pypdf==4.3.1 pytest==8.3.2 diff --git a/copilot/requirements.txt b/copilot/requirements.txt index d2426a03..af45c357 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -82,7 +82,7 @@ langchain-core==0.2.30 langchain-experimental==0.0.64 langchain-groq==0.1.9 langchain-ibm==0.1.12 -langchain-milvus==0.1.3 +langchain-milvus==0.1.4 langchain-openai==0.1.21 langchain-text-splitters==0.2.2 langchainhub==0.1.21 @@ -95,6 +95,7 @@ lomond==0.3.3 lxml==5.3.0 marshmallow==3.21.3 matplotlib==3.9.2 +milvus-lite==2.4.9 minio==7.2.7 multidict==6.0.5 mypy-extensions==1.0.0 @@ -122,6 +123,7 @@ pycryptodome==3.20.0 pydantic==2.8.2 pydantic_core==2.20.1 pygit2==1.15.1 +pymilvus==2.4.5 pyparsing==3.1.2 pypdf==4.3.1 pytest==8.3.2 From 538653f2f09c3abd0d1df456d501758b776e9f57 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 17:24:20 -0400 Subject: [PATCH 38/53] reqs to fix unit tests --- common/embeddings/milvus_embedding_store.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/embeddings/milvus_embedding_store.py b/common/embeddings/milvus_embedding_store.py index 7169379e..de7812fd 100644 --- a/common/embeddings/milvus_embedding_store.py +++ b/common/embeddings/milvus_embedding_store.py @@ -7,7 +7,8 @@ from asyncer import asyncify from langchain_community.vectorstores import Milvus from langchain_core.documents.base import Document -from langchain_milvus.vectorstores import Milvus +# from langchain_milvus.vectorstores import Milvus +from langchain_community.vectorstores.milvus import Milvus from pymilvus import MilvusException, connections, utility from pymilvus.exceptions import MilvusException From a63d3768971f6ddd01dfc59e7cd33a15ec073aa4 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 17:28:32 -0400 Subject: [PATCH 39/53] reqs to fix unit tests --- common/embeddings/embedding_services.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/embeddings/embedding_services.py b/common/embeddings/embedding_services.py index 13c2cfd0..8020b97f 100644 --- 
a/common/embeddings/embedding_services.py +++ b/common/embeddings/embedding_services.py @@ -134,7 +134,8 @@ def __init__(self, config): super().__init__( config, model_name=config.get("model_name", "OpenAI gpt-4-0613") ) - from langchain_openai import OpenAIEmbeddings + # from langchain_openai import OpenAIEmbeddings + from langchain_community.embeddings.openai import OpenAIEmbeddings self.embeddings = OpenAIEmbeddings() From fe6643c79af599316cc6bf397c3ffac4053fa361 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 19:55:04 -0400 Subject: [PATCH 40/53] smoke test --- .../graphRAG/louvain/graphrag_louvain_communities.gsql | 2 +- common/gsql/supportai/Scan_For_Updates.gsql | 8 ++++---- common/llm_services/openai_service.py | 2 +- copilot/app/routers/supportai.py | 7 ------- copilot/requirements.txt | 1 + eventual-consistency-service/app/graphrag/util.py | 2 +- 6 files changed, 8 insertions(+), 14 deletions(-) diff --git a/common/gsql/graphRAG/louvain/graphrag_louvain_communities.gsql b/common/gsql/graphRAG/louvain/graphrag_louvain_communities.gsql index 4137ca68..241ccaf0 100644 --- a/common/gsql/graphRAG/louvain/graphrag_louvain_communities.gsql +++ b/common/gsql/graphRAG/louvain/graphrag_louvain_communities.gsql @@ -172,7 +172,7 @@ CREATE DISTRIBUTED QUERY graphrag_louvain_communities(UINT iteration=1, UINT max POST-ACCUM // Write the results to a new community vertex (iteration + 1) // ID , iter, edges within the community - INSERT INTO Community VALUES (s.id+"_"+to_string(iteration+1), iteration+1), ""), + INSERT INTO Community VALUES (s.id+"_"+to_string(iteration+1), iteration+1, ""), INSERT INTO HAS_PARENT VALUES (s, s.@community_vid+"_"+to_string(iteration+1)) // link Community's child/parent community ; diff --git a/common/gsql/supportai/Scan_For_Updates.gsql b/common/gsql/supportai/Scan_For_Updates.gsql index ba5444bd..7d9d1b83 100644 --- a/common/gsql/supportai/Scan_For_Updates.gsql +++ b/common/gsql/supportai/Scan_For_Updates.gsql @@ -24,10 +24,10 @@ CREATE DISTRIBUTED QUERY Scan_For_Updates(STRING v_type = "Document", res = SELECT s FROM start:s -(HAS_CONTENT)-> Content:c ACCUM @@v_and_text += (s.id -> c.text) POST-ACCUM s.epoch_processing = datetime_to_epoch(now()); - ELSE IF v_type == "Concept" THEN - res = SELECT s FROM start:s - POST-ACCUM @@v_and_text += (s.id -> s.description), - s.epoch_processing = datetime_to_epoch(now()); + // ELSE IF v_type == "Concept" THEN + // res = SELECT s FROM start:s + // POST-ACCUM @@v_and_text += (s.id -> s.description), + // s.epoch_processing = datetime_to_epoch(now()); ELSE IF v_type == "Entity" THEN res = SELECT s FROM start:s POST-ACCUM @@v_and_text += (s.id -> s.definition), diff --git a/common/llm_services/openai_service.py b/common/llm_services/openai_service.py index 7b166398..4f70b8cf 100644 --- a/common/llm_services/openai_service.py +++ b/common/llm_services/openai_service.py @@ -1,7 +1,7 @@ import logging import os -from langchain_community.chat_models.openai import ChatOpenAI +from langchain_openai.chat_models import ChatOpenAI from common.llm_services import LLM_Model from common.logs.log import req_id_cv diff --git a/copilot/app/routers/supportai.py b/copilot/app/routers/supportai.py index 7b09acc9..0eff3c41 100644 --- a/copilot/app/routers/supportai.py +++ b/copilot/app/routers/supportai.py @@ -18,13 +18,6 @@ HNSWSiblingRetriever, ) -from common.config import ( - db_config, - embedding_service, - embedding_store, - get_llm_service, - llm_config, -) from 
common.config import ( db_config, embedding_service, diff --git a/copilot/requirements.txt b/copilot/requirements.txt index af45c357..d287660f 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -1,3 +1,4 @@ +aiochannel==1.2.1 aiohappyeyeballs==2.3.5 aiohttp==3.10.3 aiosignal==1.3.1 diff --git a/eventual-consistency-service/app/graphrag/util.py b/eventual-consistency-service/app/graphrag/util.py index bcf1befe..186ab11a 100644 --- a/eventual-consistency-service/app/graphrag/util.py +++ b/eventual-consistency-service/app/graphrag/util.py @@ -111,7 +111,7 @@ async def init( vector_field=milvus_config.get("vector_field", "document_vector"), text_field=milvus_config.get("text_field", "document_content"), vertex_field=vertex_field, - drop_old=True, + drop_old=False, ) LogWriter.info(f"Initializing {name}") From 64b3998e3d1a3838e46848eb9d69954ccf12b763 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 20:03:03 -0400 Subject: [PATCH 41/53] smoke test --- .github/workflows/pull-test-merge.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pull-test-merge.yaml b/.github/workflows/pull-test-merge.yaml index 654703d8..a7c93c7e 100644 --- a/.github/workflows/pull-test-merge.yaml +++ b/.github/workflows/pull-test-merge.yaml @@ -34,6 +34,7 @@ jobs: source venv/bin/activate python -m pip install --upgrade pip pip install -r copilot/requirements.txt + pip install -U langchain-core pip install pytest - name: Create db config From e08d42a5d498615679b9859e93b3f67e94d70d0f Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 20:07:55 -0400 Subject: [PATCH 42/53] smoke test --- .github/workflows/pull-test-merge.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pull-test-merge.yaml b/.github/workflows/pull-test-merge.yaml index a7c93c7e..e7cdd5a1 100644 --- a/.github/workflows/pull-test-merge.yaml +++ b/.github/workflows/pull-test-merge.yaml @@ -34,7 +34,7 @@ jobs: source venv/bin/activate python -m pip install --upgrade pip pip install -r copilot/requirements.txt - pip install -U langchain-core + pip install -U langchain-core langchain pip install pytest - name: Create db config From 17b09df8611321363012db71a75d3fc404ee0e54 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 20:12:16 -0400 Subject: [PATCH 43/53] smoke test --- copilot/requirements.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index d287660f..662ec077 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -79,12 +79,11 @@ jsonpointer==3.0.0 kiwisolver==1.4.5 langchain==0.2.13 langchain-community==0.2.12 -langchain-core==0.2.30 langchain-experimental==0.0.64 langchain-groq==0.1.9 langchain-ibm==0.1.12 langchain-milvus==0.1.4 -langchain-openai==0.1.21 +langchain-openai==0.1.20 langchain-text-splitters==0.2.2 langchainhub==0.1.21 langdetect==1.0.9 From 6ce885f341bef5ae1fa5b1216d7dfa7254d3a17c Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 20:22:12 -0400 Subject: [PATCH 44/53] smoke test --- .github/workflows/pull-test-merge.yaml | 1 - copilot/requirements.txt | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pull-test-merge.yaml 
b/.github/workflows/pull-test-merge.yaml index e7cdd5a1..654703d8 100644 --- a/.github/workflows/pull-test-merge.yaml +++ b/.github/workflows/pull-test-merge.yaml @@ -34,7 +34,6 @@ jobs: source venv/bin/activate python -m pip install --upgrade pip pip install -r copilot/requirements.txt - pip install -U langchain-core langchain pip install pytest - name: Create db config diff --git a/copilot/requirements.txt b/copilot/requirements.txt index 662ec077..e4da4613 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -87,6 +87,7 @@ langchain-openai==0.1.20 langchain-text-splitters==0.2.2 langchainhub==0.1.21 langdetect==1.0.9 +langchain-core==0.2.29 langgraph==0.2.3 langgraph-checkpoint==1.0.2 langsmith==0.1.99 From 442564bde03c69974ed8a953c5baa04d68681964 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 20:25:14 -0400 Subject: [PATCH 45/53] smoke test --- copilot/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index e4da4613..56b5f71f 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -77,7 +77,7 @@ jsonpatch==1.33 jsonpath-python==1.0.6 jsonpointer==3.0.0 kiwisolver==1.4.5 -langchain==0.2.13 +langchain==0.2.12 langchain-community==0.2.12 langchain-experimental==0.0.64 langchain-groq==0.1.9 From 2d8675eb540caa4fca57283f24f4ded62ec90752 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 20:32:40 -0400 Subject: [PATCH 46/53] smoke test --- copilot/requirements.txt | 240 ++++++++++++++++++--------------------- 1 file changed, 109 insertions(+), 131 deletions(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index 56b5f71f..f737ed1f 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -1,178 +1,156 @@ -aiochannel==1.2.1 -aiohappyeyeballs==2.3.5 -aiohttp==3.10.3 +aiohttp==3.9.3 aiosignal==1.3.1 -annotated-types==0.7.0 -anyio==4.4.0 +annotated-types==0.5.0 +anyio==3.7.1 appdirs==1.4.4 argon2-cffi==23.1.0 argon2-cffi-bindings==21.2.0 async-timeout==4.0.3 asyncer==0.0.7 -attrs==24.2.0 -azure-core==1.30.2 -azure-storage-blob==12.22.0 +attrs==23.1.0 +azure-core==1.30.1 +azure-storage-blob==12.19.1 backoff==2.2.1 -beautifulsoup4==4.12.3 -boto3==1.34.159 -botocore==1.34.159 -cachetools==5.4.0 -certifi==2024.7.4 -cffi==1.17.0 +beautifulsoup4==4.12.2 +boto3==1.28.83 +botocore==1.31.83 +cachetools==5.3.2 +certifi==2023.7.22 +cffi==1.16.0 chardet==5.2.0 -charset-normalizer==3.3.2 +charset-normalizer==3.2.0 click==8.1.7 -contourpy==1.2.1 -cryptography==43.0.0 -cycler==0.12.1 -dataclasses-json==0.6.7 -deepdiff==7.0.1 -distro==1.9.0 +cryptography==42.0.5 +dataclasses-json==0.5.14 +distro==1.8.0 docker-pycreds==0.4.0 docstring_parser==0.16 -emoji==2.12.1 +emoji==2.8.0 environs==9.5.0 -exceptiongroup==1.2.2 -fastapi==0.112.0 +exceptiongroup==1.1.3 +fastapi==0.103.1 filelock==3.15.4 filetype==1.2.0 -fonttools==4.53.1 -frozenlist==1.4.1 -fsspec==2024.6.1 +frozenlist==1.4.0 +fsspec==2024.6.0 gitdb==4.0.11 -GitPython==3.1.43 -google-api-core==2.19.1 -google-auth==2.33.0 -google-cloud-aiplatform==1.61.0 -google-cloud-bigquery==3.25.0 -google-cloud-core==2.4.1 -google-cloud-resource-manager==1.12.5 -google-cloud-storage==2.18.2 +GitPython==3.1.40 +google-api-core==2.14.0 +google-auth==2.23.4 +google-cloud-aiplatform==1.52.0 +google-cloud-bigquery==3.13.0 +google-cloud-core==2.3.3 
+google-cloud-resource-manager==1.10.4 +google-cloud-storage==2.13.0 google-crc32c==1.5.0 -google-resumable-media==2.7.2 -googleapis-common-protos==1.63.2 -greenlet==3.0.3 -groq==0.9.0 -grpc-google-iam-v1==0.13.1 -grpcio==1.63.0 -grpcio-status==1.63.0 +google-resumable-media==2.6.0 +googleapis-common-protos==1.61.0 +greenlet==2.0.2 +groq==0.5.0 +grpc-google-iam-v1==0.12.7 +grpcio==1.59.2 +grpcio-status==1.59.2 h11==0.14.0 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -huggingface-hub==0.24.5 +httpcore==0.18.0 +httptools==0.6.0 +httpx==0.25.0 +huggingface-hub==0.23.0 ibm-cos-sdk==2.13.6 ibm-cos-sdk-core==2.13.6 ibm-cos-sdk-s3transfer==2.13.6 -ibm_watsonx_ai==1.1.5 -idna==3.7 -importlib_metadata==8.2.0 +ibm_watsonx_ai==1.0.11 +idna==3.4 +importlib_metadata==8.0.0 iniconfig==2.0.0 isodate==0.6.1 -jiter==0.5.0 jmespath==1.0.1 -joblib==1.4.2 -jq==1.7.0 +joblib==1.3.2 +jq==1.6.0 jsonpatch==1.33 -jsonpath-python==1.0.6 -jsonpointer==3.0.0 -kiwisolver==1.4.5 -langchain==0.2.12 -langchain-community==0.2.12 -langchain-experimental==0.0.64 -langchain-groq==0.1.9 -langchain-ibm==0.1.12 -langchain-milvus==0.1.4 -langchain-openai==0.1.20 +jsonpointer==2.4 +langchain==0.2.11 +langchain-community==0.2.10 +langchain-core==0.2.25 +langchain-experimental==0.0.63 +langchain-groq==0.1.8 +langchain-ibm==0.1.11 langchain-text-splitters==0.2.2 -langchainhub==0.1.21 +langchain_milvus==0.1.3 +langchain_openai==0.1.19 +langchainhub==0.1.20 langdetect==1.0.9 -langchain-core==0.2.29 -langgraph==0.2.3 -langgraph-checkpoint==1.0.2 -langsmith==0.1.99 +langgraph==0.1.16 +langsmith==0.1.94 Levenshtein==0.25.1 lomond==0.3.3 -lxml==5.3.0 -marshmallow==3.21.3 -matplotlib==3.9.2 -milvus-lite==2.4.9 -minio==7.2.7 -multidict==6.0.5 +lxml==4.9.3 +marshmallow==3.20.1 +matplotlib==3.9.1 +minio==7.2.5 +multidict==6.0.4 mypy-extensions==1.0.0 -nest-asyncio==1.6.0 -nltk==3.8.2 +nltk==3.8.1 numpy==1.26.4 -openai==1.40.6 -ordered-set==4.1.0 -orjson==3.10.7 -packaging==24.1 -pandas==2.1.4 +openai==1.37.1 +orjson==3.9.15 +packaging==23.2 +pandas==2.1.1 pathtools==0.1.2 -pillow==10.4.0 -platformdirs==4.2.2 pluggy==1.5.0 prometheus_client==0.20.0 -proto-plus==1.24.0 -protobuf==5.27.3 -psutil==6.0.0 -pyarrow==17.0.0 -pyasn1==0.6.0 -pyasn1_modules==0.4.0 -pycparser==2.22 +proto-plus==1.22.3 +protobuf==4.24.4 +psutil==5.9.6 +pyarrow==15.0.1 +pyasn1==0.5.0 +pyasn1-modules==0.3.0 +pycparser==2.21 pycryptodome==3.20.0 -pydantic==2.8.2 -pydantic_core==2.20.1 -pygit2==1.15.1 -pymilvus==2.4.5 -pyparsing==3.1.2 -pypdf==4.3.1 -pytest==8.3.2 +pydantic==2.3.0 +pydantic_core==2.6.3 +pygit2==1.13.2 +pymilvus==2.4.4 +pytest==8.2.0 python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-iso639==2024.4.27 +python-dotenv==1.0.0 +python-iso639==2023.6.15 python-magic==0.4.27 pyTigerDriver==1.0.15 -pyTigerGraph==1.6.5 -pytz==2024.1 -PyYAML==6.0.2 -rapidfuzz==3.9.6 -regex==2024.7.24 +pyTigerGraph==1.6.2 +pytz==2023.3.post1 +PyYAML==6.0.1 +rapidfuzz==3.8.0 +regex==2023.10.3 requests==2.32.2 -requests-toolbelt==1.0.0 rsa==4.9 -s3transfer==0.10.2 +s3transfer==0.7.0 scikit-learn==1.5.1 -scipy==1.14.0 -sentry-sdk==2.13.0 +sentry-sdk==1.32.0 setproctitle==1.3.3 -shapely==2.0.5 +shapely==2.0.2 six==1.16.0 smmap==5.0.1 -sniffio==1.3.1 -soupsieve==2.6 -SQLAlchemy==2.0.32 -starlette==0.37.2 +sniffio==1.3.0 +soupsieve==2.5 +SQLAlchemy==2.0.20 +starlette==0.27.0 tabulate==0.9.0 -tenacity==8.5.0 -threadpoolctl==3.5.0 +tenacity==8.2.3 tiktoken==0.7.0 -tqdm==4.66.5 -types-requests==2.32.0.20240712 +tqdm==4.66.1 +types-requests==2.31.0.6 types-urllib3==1.26.25.14 
typing-inspect==0.9.0 -typing_extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -unstructured==0.15.1 -unstructured-client==0.25.5 -urllib3==2.2.2 -uvicorn==0.30.6 -uvloop==0.19.0 -validators==0.33.0 -wandb==0.17.6 -watchfiles==0.23.0 -websockets==12.0 -wrapt==1.16.0 -yarl==1.9.4 -zipp==3.20.0 +typing_extensions==4.8.0 +tzdata==2023.3 +ujson==5.9.0 +unstructured==0.10.23 +urllib3==1.26.18 +uvicorn==0.23.2 +uvloop==0.17.0 +validators==0.22.0 +wandb==0.15.12 +watchfiles==0.20.0 +websockets==11.0.3 +yarl==1.9.2 +zipp==3.19.2 From e9f5e9d2719e68022f742f5dd2d50943669eb051 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 20:42:56 -0400 Subject: [PATCH 47/53] smoke test --- common/embeddings/milvus_embedding_store.py | 4 ++-- copilot/requirements.txt | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/common/embeddings/milvus_embedding_store.py b/common/embeddings/milvus_embedding_store.py index de7812fd..c60a8e2e 100644 --- a/common/embeddings/milvus_embedding_store.py +++ b/common/embeddings/milvus_embedding_store.py @@ -7,8 +7,8 @@ from asyncer import asyncify from langchain_community.vectorstores import Milvus from langchain_core.documents.base import Document -# from langchain_milvus.vectorstores import Milvus -from langchain_community.vectorstores.milvus import Milvus +from langchain_milvus.vectorstores import Milvus +# from langchain_community.vectorstores.milvus import Milvus from pymilvus import MilvusException, connections, utility from pymilvus.exceptions import MilvusException diff --git a/copilot/requirements.txt b/copilot/requirements.txt index f737ed1f..98af8b4b 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -70,13 +70,13 @@ jsonpatch==1.33 jsonpointer==2.4 langchain==0.2.11 langchain-community==0.2.10 -langchain-core==0.2.25 +# langchain-core==0.2.25 langchain-experimental==0.0.63 langchain-groq==0.1.8 langchain-ibm==0.1.11 langchain-text-splitters==0.2.2 langchain_milvus==0.1.3 -langchain_openai==0.1.19 +langchain_openai==0.1.20 langchainhub==0.1.20 langdetect==1.0.9 langgraph==0.1.16 From 0ca73a31dfb653b8cab07111a4db019e000171d9 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 20:50:02 -0400 Subject: [PATCH 48/53] smoke test --- .github/workflows/pull-test-merge.yaml | 2 ++ copilot/requirements.txt | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pull-test-merge.yaml b/.github/workflows/pull-test-merge.yaml index 654703d8..20024b34 100644 --- a/.github/workflows/pull-test-merge.yaml +++ b/.github/workflows/pull-test-merge.yaml @@ -64,6 +64,8 @@ jobs: - name: Run pytest run: | source venv/bin/activate + pip install -r copilot/requirements.txt + pip install -U langchain langchain-core cp -r copilot/tests/*test* copilot/tests/create_wandb_report.py copilot/app/ cd copilot/app python -m pytest --disable-warnings diff --git a/copilot/requirements.txt b/copilot/requirements.txt index 98af8b4b..ac0a6ba6 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -70,7 +70,6 @@ jsonpatch==1.33 jsonpointer==2.4 langchain==0.2.11 langchain-community==0.2.10 -# langchain-core==0.2.25 langchain-experimental==0.0.63 langchain-groq==0.1.8 langchain-ibm==0.1.11 From 8252c1ecb193f11d570f27263afa1ff990814806 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 20:58:48 -0400 Subject: 
[PATCH 49/53] smoke test --- copilot/requirements.txt | 247 +++++++++++++++++++++------------------ 1 file changed, 135 insertions(+), 112 deletions(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index ac0a6ba6..5aed6147 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -1,155 +1,178 @@ -aiohttp==3.9.3 +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 aiosignal==1.3.1 -annotated-types==0.5.0 -anyio==3.7.1 +annotated-types==0.7.0 +anyio==4.4.0 appdirs==1.4.4 argon2-cffi==23.1.0 argon2-cffi-bindings==21.2.0 async-timeout==4.0.3 asyncer==0.0.7 -attrs==23.1.0 -azure-core==1.30.1 -azure-storage-blob==12.19.1 +attrs==24.2.0 +azure-core==1.30.2 +azure-storage-blob==12.22.0 backoff==2.2.1 -beautifulsoup4==4.12.2 -boto3==1.28.83 -botocore==1.31.83 -cachetools==5.3.2 -certifi==2023.7.22 -cffi==1.16.0 +beautifulsoup4==4.12.3 +boto3==1.34.160 +botocore==1.34.160 +cachetools==5.4.0 +certifi==2024.7.4 +cffi==1.17.0 chardet==5.2.0 -charset-normalizer==3.2.0 +charset-normalizer==3.3.2 click==8.1.7 -cryptography==42.0.5 -dataclasses-json==0.5.14 -distro==1.8.0 +contourpy==1.2.1 +cryptography==43.0.0 +cycler==0.12.1 +dataclasses-json==0.6.7 +deepdiff==7.0.1 +distro==1.9.0 docker-pycreds==0.4.0 -docstring_parser==0.16 -emoji==2.8.0 +docstring-parser==0.16 +emoji==2.12.1 environs==9.5.0 -exceptiongroup==1.1.3 -fastapi==0.103.1 +exceptiongroup==1.2.2 +fastapi==0.112.0 filelock==3.15.4 filetype==1.2.0 -frozenlist==1.4.0 -fsspec==2024.6.0 +fonttools==4.53.1 +frozenlist==1.4.1 +fsspec==2024.6.1 gitdb==4.0.11 -GitPython==3.1.40 -google-api-core==2.14.0 -google-auth==2.23.4 -google-cloud-aiplatform==1.52.0 -google-cloud-bigquery==3.13.0 -google-cloud-core==2.3.3 -google-cloud-resource-manager==1.10.4 -google-cloud-storage==2.13.0 +gitpython==3.1.43 +google-api-core==2.19.1 +google-auth==2.33.0 +google-cloud-aiplatform==1.62.0 +google-cloud-bigquery==3.25.0 +google-cloud-core==2.4.1 +google-cloud-resource-manager==1.12.5 +google-cloud-storage==2.18.2 google-crc32c==1.5.0 -google-resumable-media==2.6.0 -googleapis-common-protos==1.61.0 -greenlet==2.0.2 -groq==0.5.0 -grpc-google-iam-v1==0.12.7 -grpcio==1.59.2 -grpcio-status==1.59.2 +google-resumable-media==2.7.2 +googleapis-common-protos==1.63.2 +greenlet==3.0.3 +groq==0.9.0 +grpc-google-iam-v1==0.13.1 +grpcio==1.63.0 +grpcio-status==1.63.0 h11==0.14.0 -httpcore==0.18.0 -httptools==0.6.0 -httpx==0.25.0 -huggingface-hub==0.23.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +huggingface-hub==0.24.5 ibm-cos-sdk==2.13.6 ibm-cos-sdk-core==2.13.6 ibm-cos-sdk-s3transfer==2.13.6 -ibm_watsonx_ai==1.0.11 -idna==3.4 -importlib_metadata==8.0.0 +ibm-watsonx-ai==1.1.5 +idna==3.7 +importlib-metadata==8.2.0 iniconfig==2.0.0 isodate==0.6.1 +jiter==0.5.0 jmespath==1.0.1 -joblib==1.3.2 -jq==1.6.0 +joblib==1.4.2 +jq==1.7.0 jsonpatch==1.33 -jsonpointer==2.4 -langchain==0.2.11 -langchain-community==0.2.10 -langchain-experimental==0.0.63 -langchain-groq==0.1.8 -langchain-ibm==0.1.11 +jsonpath-python==1.0.6 +jsonpointer==3.0.0 +kiwisolver==1.4.5 +langchain==0.2.13 +langchain-community==0.2.12 +langchain-core==0.2.30 +langchain-experimental==0.0.64 +langchain-groq==0.1.9 +langchain-ibm==0.1.12 +langchain-milvus==0.1.4 +langchain-openai==0.1.21 langchain-text-splitters==0.2.2 -langchain_milvus==0.1.3 -langchain_openai==0.1.20 -langchainhub==0.1.20 +langchainhub==0.1.21 langdetect==1.0.9 -langgraph==0.1.16 -langsmith==0.1.94 -Levenshtein==0.25.1 +langgraph==0.2.3 +langgraph-checkpoint==1.0.2 +langsmith==0.1.99 +levenshtein==0.25.1 lomond==0.3.3 
-lxml==4.9.3 -marshmallow==3.20.1 -matplotlib==3.9.1 -minio==7.2.5 -multidict==6.0.4 +lxml==5.3.0 +marshmallow==3.21.3 +matplotlib==3.9.2 +milvus-lite==2.4.9 +minio==7.2.7 +multidict==6.0.5 mypy-extensions==1.0.0 -nltk==3.8.1 +nest-asyncio==1.6.0 +nltk==3.8.2 numpy==1.26.4 -openai==1.37.1 -orjson==3.9.15 -packaging==23.2 -pandas==2.1.1 +openai==1.40.6 +ordered-set==4.1.0 +orjson==3.10.7 +packaging==24.1 +pandas==2.1.4 pathtools==0.1.2 +pillow==10.4.0 +platformdirs==4.2.2 pluggy==1.5.0 -prometheus_client==0.20.0 -proto-plus==1.22.3 -protobuf==4.24.4 -psutil==5.9.6 -pyarrow==15.0.1 -pyasn1==0.5.0 -pyasn1-modules==0.3.0 -pycparser==2.21 +prometheus-client==0.20.0 +proto-plus==1.24.0 +protobuf==5.27.3 +psutil==6.0.0 +pyarrow==17.0.0 +pyasn1==0.6.0 +pyasn1-modules==0.4.0 +pycparser==2.22 pycryptodome==3.20.0 -pydantic==2.3.0 -pydantic_core==2.6.3 -pygit2==1.13.2 -pymilvus==2.4.4 -pytest==8.2.0 +pydantic==2.8.2 +pydantic-core==2.20.1 +pygit2==1.15.1 +pymilvus==2.4.5 +pyparsing==3.1.2 +pypdf==4.3.1 +pytest==8.3.2 python-dateutil==2.9.0.post0 -python-dotenv==1.0.0 -python-iso639==2023.6.15 +python-dotenv==1.0.1 +python-iso639==2024.4.27 python-magic==0.4.27 -pyTigerDriver==1.0.15 -pyTigerGraph==1.6.2 -pytz==2023.3.post1 -PyYAML==6.0.1 -rapidfuzz==3.8.0 -regex==2023.10.3 +pytigerdriver==1.0.15 +pytigergraph==1.6.5 +pytz==2024.1 +pyyaml==6.0.2 +rapidfuzz==3.9.6 +regex==2024.7.24 requests==2.32.2 +requests-toolbelt==1.0.0 rsa==4.9 -s3transfer==0.7.0 +s3transfer==0.10.2 scikit-learn==1.5.1 -sentry-sdk==1.32.0 +scipy==1.14.0 +sentry-sdk==2.13.0 setproctitle==1.3.3 -shapely==2.0.2 +setuptools==72.2.0 +shapely==2.0.5 six==1.16.0 smmap==5.0.1 -sniffio==1.3.0 -soupsieve==2.5 -SQLAlchemy==2.0.20 -starlette==0.27.0 +sniffio==1.3.1 +soupsieve==2.6 +sqlalchemy==2.0.32 +starlette==0.37.2 tabulate==0.9.0 -tenacity==8.2.3 +tenacity==8.5.0 +threadpoolctl==3.5.0 tiktoken==0.7.0 -tqdm==4.66.1 -types-requests==2.31.0.6 +tqdm==4.66.5 +types-requests==2.32.0.20240712 types-urllib3==1.26.25.14 +typing-extensions==4.12.2 typing-inspect==0.9.0 -typing_extensions==4.8.0 -tzdata==2023.3 -ujson==5.9.0 -unstructured==0.10.23 -urllib3==1.26.18 -uvicorn==0.23.2 -uvloop==0.17.0 -validators==0.22.0 -wandb==0.15.12 -watchfiles==0.20.0 -websockets==11.0.3 -yarl==1.9.2 -zipp==3.19.2 +tzdata==2024.1 +ujson==5.10.0 +unstructured==0.15.1 +unstructured-client==0.25.5 +urllib3==2.2.2 +uvicorn==0.30.6 +uvloop==0.19.0 +validators==0.33.0 +wandb==0.17.6 +watchfiles==0.23.0 +websockets==12.0 +wrapt==1.16.0 +yarl==1.9.4 +zipp==3.20.0 From 8777b3c0927348a5021b437da0614ee10de02c00 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 21:35:37 -0400 Subject: [PATCH 50/53] smoke test --- .github/workflows/pull-test-merge.yaml | 16 ++++++++-------- common/embeddings/milvus_embedding_store.py | 1 - common/llm_services/openai_service.py | 5 ++++- eventual-consistency-service/app/main.py | 7 ++++--- 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/.github/workflows/pull-test-merge.yaml b/.github/workflows/pull-test-merge.yaml index 20024b34..2c032524 100644 --- a/.github/workflows/pull-test-merge.yaml +++ b/.github/workflows/pull-test-merge.yaml @@ -12,12 +12,12 @@ jobs: test: runs-on: [ self-hosted, dind ] - services: - milvus: - image: milvusdb/milvus:latest - ports: - - 19530:19530 - - 19121:19121 + # services: + # milvus: + # image: milvusdb/milvus:latest + # ports: + # - 19530:19530 + # - 19121:19121 steps: - name: Checkout code @@ -30,6 +30,8 @@ jobs: - name: 
Install and Check Python Setup run: | + pip install uv + alias pip='uv pip' python -m venv venv source venv/bin/activate python -m pip install --upgrade pip @@ -64,8 +66,6 @@ jobs: - name: Run pytest run: | source venv/bin/activate - pip install -r copilot/requirements.txt - pip install -U langchain langchain-core cp -r copilot/tests/*test* copilot/tests/create_wandb_report.py copilot/app/ cd copilot/app python -m pytest --disable-warnings diff --git a/common/embeddings/milvus_embedding_store.py b/common/embeddings/milvus_embedding_store.py index c60a8e2e..7169379e 100644 --- a/common/embeddings/milvus_embedding_store.py +++ b/common/embeddings/milvus_embedding_store.py @@ -8,7 +8,6 @@ from langchain_community.vectorstores import Milvus from langchain_core.documents.base import Document from langchain_milvus.vectorstores import Milvus -# from langchain_community.vectorstores.milvus import Milvus from pymilvus import MilvusException, connections, utility from pymilvus.exceptions import MilvusException diff --git a/common/llm_services/openai_service.py b/common/llm_services/openai_service.py index 4f70b8cf..aad5d44f 100644 --- a/common/llm_services/openai_service.py +++ b/common/llm_services/openai_service.py @@ -1,7 +1,10 @@ import logging import os -from langchain_openai.chat_models import ChatOpenAI +if os.getenv("ECC"): + from langchain_openai.chat_models import ChatOpenAI +else: + from langchain_community.chat_models import ChatOpenAI from common.llm_services import LLM_Model from common.logs.log import req_id_cv diff --git a/eventual-consistency-service/app/main.py b/eventual-consistency-service/app/main.py index 34403f1e..2c308074 100644 --- a/eventual-consistency-service/app/main.py +++ b/eventual-consistency-service/app/main.py @@ -1,3 +1,6 @@ +import os + +os.environ["ECC"] = True import json import logging from contextlib import asynccontextmanager @@ -190,9 +193,7 @@ def consistency_status( background.add_task(graphrag.run, graphname, conn) import time - ecc_status = ( - f"GraphRAG initialization: {conn.graphname} ({graphname}) {time.ctime()}" - ) + ecc_status = f"GraphRAG initialization: {conn.graphname} ({graphname}) {time.ctime()}" case _: response.status_code = status.HTTP_404_NOT_FOUND return f"Method unsupported, must be {SupportAIMethod.SUPPORTAI}, {SupportAIMethod.GRAPHRAG}" From 69a7db449135d2a9413d53238b3a23906043da54 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 21:38:32 -0400 Subject: [PATCH 51/53] smoke test --- .github/workflows/pull-test-merge.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/pull-test-merge.yaml b/.github/workflows/pull-test-merge.yaml index 2c032524..19e1ab08 100644 --- a/.github/workflows/pull-test-merge.yaml +++ b/.github/workflows/pull-test-merge.yaml @@ -12,12 +12,12 @@ jobs: test: runs-on: [ self-hosted, dind ] - # services: - # milvus: - # image: milvusdb/milvus:latest - # ports: - # - 19530:19530 - # - 19121:19121 + services: + milvus: + image: milvusdb/milvus:latest + ports: + - 19530:19530 + - 19121:19121 steps: - name: Checkout code From 4dfa51cefb6b5364894920efb58ecedc54760ef6 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 21:41:41 -0400 Subject: [PATCH 52/53] smoke test --- .github/workflows/pull-test-merge.yaml | 2 -- common/embeddings/milvus_embedding_store.py | 3 ++- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git 
a/.github/workflows/pull-test-merge.yaml b/.github/workflows/pull-test-merge.yaml index 19e1ab08..654703d8 100644 --- a/.github/workflows/pull-test-merge.yaml +++ b/.github/workflows/pull-test-merge.yaml @@ -30,8 +30,6 @@ jobs: - name: Install and Check Python Setup run: | - pip install uv - alias pip='uv pip' python -m venv venv source venv/bin/activate python -m pip install --upgrade pip diff --git a/common/embeddings/milvus_embedding_store.py b/common/embeddings/milvus_embedding_store.py index 7169379e..de7812fd 100644 --- a/common/embeddings/milvus_embedding_store.py +++ b/common/embeddings/milvus_embedding_store.py @@ -7,7 +7,8 @@ from asyncer import asyncify from langchain_community.vectorstores import Milvus from langchain_core.documents.base import Document -from langchain_milvus.vectorstores import Milvus +# from langchain_milvus.vectorstores import Milvus +from langchain_community.vectorstores.milvus import Milvus from pymilvus import MilvusException, connections, utility from pymilvus.exceptions import MilvusException From 56f8e16bc72fa5dbb0985db81bef71419c274ec2 Mon Sep 17 00:00:00 2001 From: RobRossmiller-TG <165701656+RobRossmiller-TG@users.noreply.github.com> Date: Tue, 13 Aug 2024 22:10:57 -0400 Subject: [PATCH 53/53] working --- copilot/requirements.txt | 1 + eventual-consistency-service/app/graphrag/graph_rag.py | 2 +- eventual-consistency-service/app/main.py | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/copilot/requirements.txt b/copilot/requirements.txt index 5aed6147..4a5ac3d1 100644 --- a/copilot/requirements.txt +++ b/copilot/requirements.txt @@ -1,3 +1,4 @@ +aiochannel==1.2.1 aiohappyeyeballs==2.3.5 aiohttp==3.10.3 aiosignal==1.3.1 diff --git a/eventual-consistency-service/app/graphrag/graph_rag.py b/eventual-consistency-service/app/graphrag/graph_rag.py index 86f172b8..ecca36b2 100644 --- a/eventual-consistency-service/app/graphrag/graph_rag.py +++ b/eventual-consistency-service/app/graphrag/graph_rag.py @@ -437,5 +437,5 @@ async def run(graphname: str, conn: TigerGraphConnection): end = time.perf_counter() logger.info(f"DONE. graphrag system initializer dT: {init_end-init_start}") logger.info(f"DONE. graphrag entity resolution dT: {entity_end-entity_start}") - logger.info(f"DONE. graphrag initializer dT: {community_end-community_start}") + logger.info(f"DONE. graphrag community initializer dT: {community_end-community_start}") logger.info(f"DONE. graphrag.run() total time elaplsed: {end-init_start}") diff --git a/eventual-consistency-service/app/main.py b/eventual-consistency-service/app/main.py index 2c308074..2ccc10e2 100644 --- a/eventual-consistency-service/app/main.py +++ b/eventual-consistency-service/app/main.py @@ -1,6 +1,6 @@ import os -os.environ["ECC"] = True +os.environ["ECC"] = "true" import json import logging from contextlib import asynccontextmanager @@ -193,7 +193,7 @@ def consistency_status( background.add_task(graphrag.run, graphname, conn) import time - ecc_status = f"GraphRAG initialization: {conn.graphname} ({graphname}) {time.ctime()}" + ecc_status = f"GraphRAG initialization on {conn.graphname} {time.ctime()}" case _: response.status_code = status.HTTP_404_NOT_FOUND return f"Method unsupported, must be {SupportAIMethod.SUPPORTAI}, {SupportAIMethod.GRAPHRAG}"
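
Note on the ECC flag introduced in PATCH 50/53 and corrected in PATCH 53/53: os.environ only accepts string values, so the earlier os.environ["ECC"] = True raises TypeError at import time, which is why the final patch stores the flag as the string "true". The snippet below is a minimal, standalone sketch of that flag pattern, not part of the patch series; it uses only the standard library and merely prints which import path would be taken instead of importing the real langchain packages.

import os

# os.environ values must be strings; assigning a bool raises TypeError.
# This is the bug the last patch fixes (True -> "true").
try:
    os.environ["ECC"] = True  # type: ignore[assignment]
except TypeError as err:
    print(f"bool rejected by os.environ: {err}")

os.environ["ECC"] = "true"  # store the flag as a string instead

# Downstream code can then branch on the flag, as openai_service.py does in
# PATCH 50/53. The real module chooses between langchain_openai and
# langchain_community imports here; this sketch only reports the chosen path.
if os.getenv("ECC"):
    print("ECC set: eventual-consistency-service import path")
else:
    print("ECC unset: copilot import path")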