Skip to content

Commit

Permalink
support google cloud storage
Browse files Browse the repository at this point in the history
  • Loading branch information
CodingWithTim committed Jan 4, 2025
1 parent f1c6185 commit f7e92e1
Show file tree
Hide file tree
Showing 6 changed files with 119 additions and 8 deletions.
3 changes: 3 additions & 0 deletions fastchat/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@
IMAGE_MODERATION_MSG = (
"$MODERATION$ YOUR IMAGE VIOLATES OUR CONTENT MODERATION GUIDELINES."
)
PDF_MODERATION_MSG = (
"$MODERATION$ YOUR PDF VIOLATES OUR CONTENT MODERATION GUIDELINES."
)
MODERATION_MSG = "$MODERATION$ YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES."
CONVERSATION_LIMIT_MSG = "YOU HAVE REACHED THE CONVERSATION LENGTH LIMIT. PLEASE CLEAR HISTORY AND START A NEW CONVERSATION."
INACTIVE_MSG = "THIS SESSION HAS BEEN INACTIVE FOR TOO LONG. PLEASE REFRESH THIS PAGE."
Expand Down
41 changes: 39 additions & 2 deletions fastchat/serve/gradio_block_arena_vision.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,44 @@ def wrap_pdfchat_query(query, document):
}


# TODO: P1: Integrate this.
def pdf_moderator(images):
    """Check rendered PDF pages against OpenAI's moderation endpoint.

    Each page is JPEG-encoded, wrapped in a base64 ``data:`` URL and sent in
    its own request to the ``omni-moderation-latest`` model.

    Args:
        images: Iterable of page images. Assumed to be PIL images (for
            example the output of ``pdf2image``) -- TODO confirm with callers.

    Returns:
        True if at least one page is flagged, otherwise False. An empty
        input returns False. A page whose request fails is reported with
        ``print`` and skipped (best effort), so it never counts as flagged.
    """
    import base64
    from io import BytesIO

    payloads = []
    for image in images:
        # JPEG cannot store alpha or palette data; normalize the mode so that
        # save() does not fail on RGBA / P pages.
        if getattr(image, "mode", "RGB") != "RGB":
            image = image.convert("RGB")

        buffer = BytesIO()
        image.save(buffer, format="JPEG")
        image_b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")

        # convert to openai format
        payloads.append(
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{image_b64}",
                },
            }
        )

    # Nothing to moderate: do not build a client or report a violation.
    if not payloads:
        return False

    # Imported lazily so the dependency is only needed when there is work.
    from openai import OpenAI

    # OpenAI's maximum number of images is 1 at the moment, so every page is
    # sent as its own single-element input list.
    client = OpenAI()
    for payload in payloads:
        try:
            response = client.moderations.create(
                model="omni-moderation-latest",
                input=[payload],
            )
        except Exception as e:
            # Deliberate best effort: a failed request must not break the
            # caller, but it must not mark the document as flagged either.
            print(e)
            continue

        # One violating page is enough to reject the whole document.
        if any(result.flagged for result in response.results):
            return True

    return False


def detect_language_from_doc(pdf_file_path):
from pdf2image import convert_from_path
from polyglot.detect import Detector
Expand All @@ -272,6 +310,7 @@ def detect_language_from_doc(pdf_file_path):

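# Convert the PDF into page images; only the first page is OCR'd below.
# NOTE(review): convert_from_path is called without a page limit, so it
# presumably renders every page -- confirm and restrict it if that matters.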
images = convert_from_path(pdf_file_path)

extracted_text = pytesseract.image_to_string(
images[0], lang=TESSERACT_SUPPORTED_LANGS
)
Expand All @@ -291,8 +330,6 @@ def parse_pdf(file_path):
doc_lang = detect_language_from_doc(file_path)
doc_lang = LLAMAPARSE_SUPPORTED_LANGS[doc_lang[0]]

print(doc_lang)

for _ in range(LLAMA_PARSE_MAX_RETRY):
try:
documents = LlamaParse(
Expand Down
27 changes: 22 additions & 5 deletions fastchat/serve/gradio_block_arena_vision_anony.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from fastchat.constants import (
TEXT_MODERATION_MSG,
IMAGE_MODERATION_MSG,
PDF_MODERATION_MSG,
MODERATION_MSG,
CONVERSATION_LIMIT_MSG,
SLOW_MODEL_MSG,
Expand Down Expand Up @@ -77,6 +78,8 @@
build_logger,
moderation_filter,
image_moderation_filter,
upload_pdf_file_to_gcs,
hash_pdf,
)

logger = build_logger("gradio_web_server_multi", "gradio_web_server_multi.log")
Expand Down Expand Up @@ -297,10 +300,26 @@ def add_text(
PDFCHAT_SAMPLING_WEIGHTS,
PDFCHAT_SAMPLING_BOOST_MODELS,
)

# Save a unique ID for mapping the conversation back to the file on Google Cloud.
unique_id = hash_pdf(pdfs[0])

states = [
State(model_left, is_vision=False),
State(model_right, is_vision=False),
State(
model_left,
is_vision=False,
pdf_id=unique_id
),
State(
model_right,
is_vision=False,
pdf_id=unique_id
),
]
upload_pdf_file_to_gcs(
pdf_file_path=pdfs[0],
filename=unique_id,
)
else:
model_left, model_right = get_battle_pair(
context.all_text_models,
Expand All @@ -309,7 +328,6 @@ def add_text(
SAMPLING_WEIGHTS,
SAMPLING_BOOST_MODELS,
)

states = [
State(model_left, is_vision=False),
State(model_right, is_vision=False),
Expand All @@ -333,12 +351,11 @@ def add_text(

images = convert_images_to_conversation_format(images)

# TODO: add PDF moderator
text, image_flagged, csam_flag = moderate_input(
state0, text, text, model_list, images, ip
)

# TODO: add PDF moderator

conv = states[0].conv
if (len(conv.messages) - conv.offset) // 2 >= CONVERSATION_TURN_LIMIT:
logger.info(f"conversation turn limit. ip: {get_ip(request)}. text: {text}")
Expand Down
4 changes: 3 additions & 1 deletion fastchat/serve/gradio_web_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,13 +112,14 @@


class State:
def __init__(self, model_name, is_vision=False):
def __init__(self, model_name, is_vision=False, pdf_id=None):
self.conv = get_conversation_template(model_name)
self.conv_id = uuid.uuid4().hex
self.skip_next = False
self.model_name = model_name
self.oai_thread_id = None
self.is_vision = is_vision
self.pdf_id = pdf_id # NOTE(Tim): Version 1 PDFChat Architecture, could be revised later.

# NOTE(chris): This could be sort of a hack since it assumes the user only uploads one image. If they can upload multiple, we should store a list of image hashes.
self.has_csam_image = False
Expand Down Expand Up @@ -151,6 +152,7 @@ def dict(self):
{
"conv_id": self.conv_id,
"model_name": self.model_name,
"pdf_id": self.pdf_id,
}
)

Expand Down
19 changes: 19 additions & 0 deletions fastchat/serve/setup_pdfchat.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env bash
#
# Install the Python packages and Tesseract language data used by PDFChat.
#
# TESSDATA_PREFIX is exported at the end. An exported variable only reaches
# your interactive shell when this file is sourced:
#     source setup_pdfchat.sh
# When executed normally, the variable is visible only inside this script.

# Install Python packages
pip install llama-index-core llama-parse llama-index-readers-file python-dotenv
pip install polyglot
pip install PyICU
pip install pycld2
pip install pytesseract

pip install pdf2image

# Clone the Tesseract tessdata repository.
# Skipped when the directory already exists so the script can be rerun;
# --depth 1 avoids downloading the repository history.
if [ ! -d tessdata ]; then
    git clone --depth 1 https://github.com/tesseract-ocr/tessdata
fi

# cd into tessdata and set TESSDATA_PREFIX to that directory.
# Only export once the cd has succeeded; otherwise the prefix would silently
# point at whatever directory the script was started from.
if cd tessdata; then
    TESSDATA_PREFIX="$(pwd)"
    export TESSDATA_PREFIX
    echo "TESSDATA_PREFIX is set to: $TESSDATA_PREFIX"
else
    echo "tessdata directory is missing; TESSDATA_PREFIX was not set" >&2
    # `return` works when sourced, `exit` when executed.
    return 1 2>/dev/null || exit 1
fi
33 changes: 33 additions & 0 deletions fastchat/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,20 @@ def upload_image_file_to_gcs(image, filename):
return blob.public_url


def upload_pdf_file_to_gcs(pdf_file_path, filename, bucket_name="arena-pdf-dev"):
    """Upload a local PDF file to a Google Cloud Storage bucket.

    Args:
        pdf_file_path: Path of the PDF on the local filesystem.
        filename: Name of the object (blob) to create in the bucket.
        bucket_name: Target bucket. Defaults to the bucket that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        The ``public_url`` attribute of the uploaded blob.
    """
    from google.cloud import storage

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)

    blob = bucket.blob(filename)
    # upload_from_filename opens the file in binary read mode itself.
    blob.upload_from_filename(pdf_file_path, content_type="application/pdf")

    return blob.public_url


def get_image_file_from_gcs(filename):
from google.cloud import storage

Expand All @@ -441,6 +455,25 @@ def get_image_file_from_gcs(filename):
return contents


def get_pdf_file_from_gcs(filename, bucket_name="arena-pdf-dev"):
    """Download a PDF object from a Google Cloud Storage bucket.

    Args:
        filename: Name of the object (blob) to fetch. It is converted to a
            string before the lookup, as the original f-string did.
        bucket_name: Source bucket. Defaults to the bucket that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        The object's contents as ``bytes``.
    """
    from google.cloud import storage

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(f"{filename}")

    return blob.download_as_bytes()


def hash_pdf(file_path):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The digest is used as a content-derived identifier, not for security.
    The file is read in fixed-size chunks so that a large PDF is never held
    in memory in full.

    Args:
        file_path: Path of the file to hash.

    Returns:
        The 32-character lowercase hex digest of the file's bytes.
    """
    import hashlib

    chunk_size = 1024 * 1024  # 1 MiB per read
    digest = hashlib.md5()
    with open(file_path, "rb") as f:
        # iter() with a sentinel stops at the empty bytes object read at EOF.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def image_moderation_request(image_bytes, endpoint, api_key):
headers = {"Content-Type": "image/jpeg", "Ocp-Apim-Subscription-Key": api_key}

Expand Down

0 comments on commit f7e92e1

Please sign in to comment.