diff --git a/templates/ui/app-config-map.yml b/templates/ui/app-config-map.yml
index d4f1c63..59185bc 100644
--- a/templates/ui/app-config-map.yml
+++ b/templates/ui/app-config-map.yml
@@ -5,4 +5,55 @@ metadata:
   labels:
     {{- include "azimuth-llm.labels" . | nindent 4 }}
 data:
-{{ (.Files.Glob "web-app/*").AsConfig | indent 2 }}
\ No newline at end of file
+  app.py: |
+    import huggingface_hub
+    from huggingface_hub import InferenceClient
+    import gradio as gr
+    from startup import wait_for_backend
+
+    backend_url = "http://{{ .Values.api.service.name }}.{{ .Release.Namespace }}.svc"
+    wait_for_backend(backend_url)
+
+    client = InferenceClient(model=backend_url)
+
+    def inference(message, history):
+
+        if message == "":
+            yield ""
+
+        partial_message = ""
+        try:
+            for token in client.text_generation(message, max_new_tokens=500, stream=True):
+                partial_message += token
+                # Strip text marker from generated output
+                partial_message = partial_message.replace('<|endoftext|>', '')
+                yield partial_message
+        except huggingface_hub.inference._text_generation.ValidationError as e:
+            raise gr.Error("Context length exceeded. Please clear the chat window.")
+
+    gr.ChatInterface(
+        inference,
+        chatbot=gr.Chatbot(
+            height=500,
+            show_copy_button=True,
+        ),
+        title="Azimuth LLM",
+        description="This is the demo UI for the Azimuth LLM application.",
+        textbox=gr.Textbox(placeholder="Ask me anything...", container=False, scale=7),
+        retry_btn="Retry",
+        undo_btn="Undo",
+        clear_btn="Clear",
+    ).queue().launch(server_name="0.0.0.0")
+  startup.py: |
+    import requests, time
+
+    def wait_for_backend(url):
+        ready = False
+        while not ready:
+            try:
+                ready = (requests.get(f'{url}/health').status_code == 200)
+                print('Waiting for backend API to start')
+                time.sleep(5)
+            except requests.exceptions.ConnectionError as e:
+                pass
+        return
\ No newline at end of file
diff --git a/web-app/app.py b/web-app/app.py
deleted file mode 100644
index 20d6422..0000000
--- a/web-app/app.py
+++ /dev/null
@@ -1,42 +0,0 @@
-import huggingface_hub
-from huggingface_hub import InferenceClient
-import gradio as gr
-from startup import wait_for_backend
-
-backend_url = "http://text-generation-inference.default.svc"
-wait_for_backend(backend_url)
-
-
-client = InferenceClient(model=backend_url)
-
-def inference(message, history):
-
-    if message == "":
-        yield ""
-
-    partial_message = ""
-    try:
-        for token in client.text_generation(message, max_new_tokens=500, stream=True):
-            partial_message += token
-            # Strip text marker from generated output
-            partial_message = partial_message.replace('<|endoftext|>', '')
-            yield partial_message
-    except huggingface_hub.inference._text_generation.ValidationError as e:
-        # yield "Context length exceeded. Please clear the chat window."
-        raise gr.Error("Context length exceeded. 
Please clear the chat window.")
-
-gr.ChatInterface(
-    inference,
-    chatbot=gr.Chatbot(
-        height=500,
-        show_copy_button=True,
-        # layout='panel',
-    ),
-    textbox=gr.Textbox(placeholder="Ask me anything...", container=False, scale=7),
-    # description="This is the demo for Gradio UI consuming TGI endpoint.",
-    title="Azimuth LLM",
-    # examples=["What is OpenStack?", "Who are StackHPC?", "Give me the k8s pod yaml for an ubuntu container."],
-    retry_btn="Retry",
-    undo_btn="Undo",
-    clear_btn="Clear",
-).queue().launch(server_name="0.0.0.0")
\ No newline at end of file
diff --git a/web-app/startup.py b/web-app/startup.py
deleted file mode 100644
index 465a038..0000000
--- a/web-app/startup.py
+++ /dev/null
@@ -1,12 +0,0 @@
-import requests, time
-
-def wait_for_backend(url):
-    ready = False
-    while not ready:
-        try:
-            ready = (requests.get(f'{url}/health').status_code == 200)
-            print('Waiting for backend API to start')
-            time.sleep(5)
-        except requests.exceptions.ConnectionError as e:
-            pass
-    return
\ No newline at end of file