-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathflask_bigsis.py
113 lines (99 loc) · 4.67 KB
/
flask_bigsis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
from flask import Flask, request, jsonify
testapp = Flask(__name__)
import json
import csv
import matplotlib.pyplot as plt
import os
from langchain.llms import OpenAI
import ast
import openai
import pandas as pd
import tiktoken
from scipy import spatial
os.environ["OPENAI_API_KEY"] = "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"
# models
EMBEDDING_MODEL = "text-embedding-ada-002"
# GPT_MODEL = "gpt-3.5-turbo"
GPT_MODEL = "gpt-4"
# download pre-chunked text and pre-computed embeddings
embeddings_path = "bigsiscontent_embeddings.csv"
embedding_df = pd.read_csv(embeddings_path)
# convert embeddings from CSV str type back to list type
embedding_df['embedding'] = embedding_df['embedding'].apply(ast.literal_eval)
# llm initialization
llm = OpenAI(openai_api_key="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")
# search function
def strings_ranked_by_relatedness(
query: str,
df: pd.DataFrame,
relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
top_n: int = 100
) -> tuple[list[str], list[float]]:
"""Returns a list of strings and relatednesses, sorted from most related to least."""
query_embedding_response = openai.Embedding.create(
model=EMBEDDING_MODEL,
input=query,
)
query_embedding = query_embedding_response["data"][0]["embedding"]
strings_and_relatednesses = [
(row["text"], relatedness_fn(query_embedding, row["embedding"]))
for i, row in df.iterrows()
]
strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
strings, relatednesses = zip(*strings_and_relatednesses)
return strings[:top_n], relatednesses[:top_n]
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
"""Return the number of tokens in a string."""
encoding = tiktoken.encoding_for_model(model)
return len(encoding.encode(text))
def query_message(
query: str,
df: pd.DataFrame,
model: str,
token_budget: int
) -> str:
"""Return a message for GPT, with relevant source texts pulled from a dataframe."""
strings, relatednesses = strings_ranked_by_relatedness(query, embedding_df)
introduction = 'Use the attached content sets on topics related to sexual and reproductive health to answer the subsequent question. If the answer cannot be found in the content sets, write "I could not find an answer." DO NOT RESPOND WITH ANYTHING OUTSIDE OF THE CONTENT SETS.'
question = f"\n\nQuestion: {query}"
message = introduction
for string in strings:
next_set = f'\n\nContent set:\n"""\n{string}\n"""'
if (
num_tokens(message + next_set + question, model=model)
> token_budget
):
break
else:
message += next_set
return message + question
def ask(
query: str,
df: pd.DataFrame = embedding_df,
model: str = GPT_MODEL,
token_budget: int = 4096 - 500,
print_message: bool = False,
) -> str:
"""Answers a query using GPT and a dataframe of relevant texts and embeddings."""
message = query_message(query, df, model=model, token_budget=token_budget)
if print_message:
print(message)
messages = [
{"role": "system", "content": "You answer questions about topics related to sexual and reproductive health. Use a casual tone similar to the content sets that young people can relate to. Use relevant emojis. An example response might sound like the following examples: 1. '🙌 Glad ur asking about safe sex. There's so much info out there it can be confusing sometimes. I'm here to help give u the accurate information 🤓so u can make ur own choice on what might be right for u if the time comes 👌' or 2. 'The 📖 penis is a male sex organ. It releases sperm during ejaculation. (It's also used for peeing but don't worry, it doesn't happen at the same time! 😂) It's attached to the scrotum, (also called balls). This houses the testicles which are full of sperm & hormones'. Make sure you do only respond with 'I could not find an answer' if the content is not available in the content sets provided to you."},
{"role": "user", "content": message},
]
response = openai.ChatCompletion.create(
model=model,
messages=messages,
temperature=0.2
)
response_message = response["choices"][0]["message"]["content"]
return response_message
@testapp.route('/myfunction', methods=['POST'])
def my_function():
data = request.get_json(force=True)
input_text = data['input']
output = ask(input_text) # replace with your function
return jsonify({'output': output})
if __name__ == '__main__':
testapp.run(debug=True)