forked from onlyphantom/llm-python
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path10_journal.py
103 lines (82 loc) · 2.89 KB
/
10_journal.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
from dotenv import load_dotenv
load_dotenv()
import argparse
import logging
import sys
from pathlib import Path
from llama_index import (
ObsidianReader,
GPTVectorStoreIndex,
StorageContext,
load_index_from_storage
)
# Send INFO-level log records to stdout to see the token counter and token
# usage for the LLM and Embedding.
# basicConfig() already attaches a stdout StreamHandler to the root logger;
# adding a second one on top would print every record twice.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Directory containing the Obsidian journal notes (*.md) to index.
OBSIDIAN_DIR = "/home/samuel/vaults/fragments/journals"
def read_journal_md(file_path):
    """Extract the pipe-delimited journal entry from a markdown file.

    The file is rendered to HTML with ``markdown`` and parsed with
    BeautifulSoup. The first ``<p>`` element whose text contains at least
    two ``|`` characters is selected, and everything up to and including
    its first ``|`` is removed.

    Args:
        file_path: Path of the markdown file to read.

    Returns:
        The text of the selected paragraph that follows its first ``|``.

    Raises:
        ValueError: If no paragraph contains at least two ``|`` characters.
    """
    from bs4 import BeautifulSoup
    import markdown

    # Decode explicitly instead of relying on the platform's locale encoding
    # (assumes the notes are stored as UTF-8 -- TODO confirm for this vault).
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    html = markdown.markdown(text)
    soup = BeautifulSoup(html, "html.parser")

    # Take the first paragraph that has at least 2 `|` characters.
    entry = next(
        (tag.text for tag in soup.find_all("p") if tag.text.count("|") > 1),
        None,
    )
    if entry is None:
        raise ValueError(
            f"No paragraph with at least two '|' characters in {file_path}"
        )

    # Drop everything up to and including the first `|`. The separator is
    # guaranteed to be present because of the count check above.
    result = entry.partition("|")[2]
    print(f"Finished processing {file_path}")
    return result
def create_journal_nodes(dir_path):
    """
    Turn every ``*.md`` file directly inside ``dir_path`` into a llama_index
    Document (via ``read_journal_md``) and parse those documents into nodes.

    Returns a ``(nodes, docs)`` tuple.

    Examples: https://gpt-index.readthedocs.io/en/stable/guides/primer/usage_pattern.html
    """
    from llama_index import Document
    from llama_index.node_parser import SimpleNodeParser

    node_parser = SimpleNodeParser()

    # One manually constructed, lower-level Document per markdown file.
    journal_docs = [
        Document(read_journal_md(md_path))
        for md_path in Path(dir_path).glob("*.md")
    ]

    journal_nodes = node_parser.get_nodes_from_documents(journal_docs)
    return journal_nodes, journal_docs
# Build the vector index from the journals on the first run and persist it
# under ./storage; on later runs, load the persisted index instead.
if not Path("./storage").exists():
    nodes, docs = create_journal_nodes(OBSIDIAN_DIR)
    index = GPTVectorStoreIndex(nodes)
    index.storage_context.persist(persist_dir="./storage")
else:
    storage_context = StorageContext.from_defaults(persist_dir="./storage")
    index = load_index_from_storage(storage_context)
if __name__ == "__main__":
    # Usage: python 10_journal.py -q "what are places I ate at in March and April?"

    # cli argument parser
    parser = argparse.ArgumentParser(
        prog="QueryJournal",
        description="Query my bullet journals in Obsidian using Llama Index.",
    )
    parser.add_argument(
        "-q",
        "--query",
        type=str,
        help="Ask a question answerable in my journals",
        required=True,
    )
    args = parser.parse_args()
    query = args.query

    # `required=True` only guarantees the flag is present; `-q ""` still
    # yields an empty query, so guard against it explicitly.
    if not query:
        print("No query provided. Exiting...")
        # sys.exit() rather than the `exit` helper, which the `site` module
        # provides for the interactive shell and may be absent (python -S).
        sys.exit(0)

    # Create the query engine only once there is a query to run.
    query_engine = index.as_query_engine()
    res = query_engine.query(query)
    print(f"Query: {query}")
    print(f"Results: \n {res}")