From 7de608bbcdce23ed0e2a0e6fcf3849a20d921489 Mon Sep 17 00:00:00 2001 From: smallketchup82 Date: Wed, 14 Aug 2024 00:55:27 -0400 Subject: [PATCH] move to c# --- .dockerignore | 7 - .env.example | 11 - .gitignore | 488 +++++++++++++++++- .idea/.idea.galaxygpt/.idea/.gitignore | 13 + .idea/.idea.galaxygpt/.idea/dataSources.xml | 15 + .idea/.idea.galaxygpt/.idea/encodings.xml | 4 + .idea/.idea.galaxygpt/.idea/indexLayout.xml | 8 + .../inspectionProfiles/Project_Default.xml | 19 + .idea/.idea.galaxygpt/.idea/vcs.xml | 6 + .vscode/settings.json | 3 - Dockerfile.api | 13 - Dockerfile.web | 1 - README.md | 1 - api.py | 134 ----- dataset-assistant/Program.cs | 243 +++++++++ dataset-assistant/dataset-assistant.csproj | 19 + dataset.py | 339 ------------ docker-compose.yaml | 18 - dump-database.sh | 24 - galaxygpt-api/ConfigureSwaggerOptions.cs | 23 + galaxygpt-api/Program.cs | 94 ++++ galaxygpt-api/Properties/launchSettings.json | 41 ++ galaxygpt-api/SwaggerDefaultValues.cs | 60 +++ galaxygpt-api/appsettings.Development.json | 8 + galaxygpt-api/galaxygpt-api.csproj | 21 + galaxygpt-api/galaxygpt-api.http | 6 + galaxygpt-tests/GlobalUsings.cs | 1 + galaxygpt-tests/UnitTest1.cs | 11 + galaxygpt-tests/galaxygpt-tests.csproj | 30 ++ galaxygpt.sln | 34 ++ galaxygpt/Database/Chunk.cs | 11 + galaxygpt/Database/Page.cs | 22 + galaxygpt/Database/VectorDB.cs | 12 + galaxygpt/GalaxyGpt.cs | 183 +++++++ galaxygpt/galaxygpt.csproj | 23 + main.py | 362 ------------- requirements.txt | 22 - ruff.toml | 9 - ui/index.css | 97 ---- ui/index.html | 53 -- ui/index.js | 72 --- 41 files changed, 1387 insertions(+), 1174 deletions(-) delete mode 100644 .dockerignore delete mode 100644 .env.example create mode 100644 .idea/.idea.galaxygpt/.idea/.gitignore create mode 100644 .idea/.idea.galaxygpt/.idea/dataSources.xml create mode 100644 .idea/.idea.galaxygpt/.idea/encodings.xml create mode 100644 .idea/.idea.galaxygpt/.idea/indexLayout.xml create mode 100644 .idea/.idea.galaxygpt/.idea/inspectionProfiles/Project_Default.xml create mode 100644 .idea/.idea.galaxygpt/.idea/vcs.xml delete mode 100644 .vscode/settings.json delete mode 100644 Dockerfile.api delete mode 100644 Dockerfile.web delete mode 100644 README.md delete mode 100644 api.py create mode 100644 dataset-assistant/Program.cs create mode 100644 dataset-assistant/dataset-assistant.csproj delete mode 100644 dataset.py delete mode 100644 docker-compose.yaml delete mode 100644 dump-database.sh create mode 100644 galaxygpt-api/ConfigureSwaggerOptions.cs create mode 100644 galaxygpt-api/Program.cs create mode 100644 galaxygpt-api/Properties/launchSettings.json create mode 100644 galaxygpt-api/SwaggerDefaultValues.cs create mode 100644 galaxygpt-api/appsettings.Development.json create mode 100644 galaxygpt-api/galaxygpt-api.csproj create mode 100644 galaxygpt-api/galaxygpt-api.http create mode 100644 galaxygpt-tests/GlobalUsings.cs create mode 100644 galaxygpt-tests/UnitTest1.cs create mode 100644 galaxygpt-tests/galaxygpt-tests.csproj create mode 100644 galaxygpt.sln create mode 100644 galaxygpt/Database/Chunk.cs create mode 100644 galaxygpt/Database/Page.cs create mode 100644 galaxygpt/Database/VectorDB.cs create mode 100644 galaxygpt/GalaxyGpt.cs create mode 100644 galaxygpt/galaxygpt.csproj delete mode 100644 main.py delete mode 100644 requirements.txt delete mode 100644 ruff.toml delete mode 100644 ui/index.css delete mode 100644 ui/index.html delete mode 100644 ui/index.js diff --git a/.dockerignore b/.dockerignore deleted file mode 100644 index 6ae4539..0000000 --- a/.dockerignore +++ /dev/null @@ -1,7 +0,0 @@ -venv -.env -ui -.vscode -.git -**/.tar.gz -__pycache__ \ No newline at end of file diff --git a/.env.example b/.env.example deleted file mode 100644 index 781f9d5..0000000 --- a/.env.example +++ /dev/null @@ -1,11 +0,0 @@ -OPENAI_API_KEY= -OPENAI_ORG_ID= -DATASET= -DEBUG=True -VERSION=0.3.0 -DATABASE_PASSWORD= -DISCORD_WEBHOOK_URL= -ADCS=False -MAX_CONTEXT_LEN= -MODEL=gpt-3.5-turbo -EMBEDDING_MODEL=text-embedding-3-small \ No newline at end of file diff --git a/.gitignore b/.gitignore index 462f3d3..9c07e29 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,481 @@ -venv/ +### dotenv template .env -__pycache__ -processed/ -.idea/ -*.tar.gz -*.old -galaxypedia*.csv -dataset-*/ \ No newline at end of file + +### Csharp template +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. +## +## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore + +# User-specific files +*.rsuser +*.suo +*.user +*.userosscache +*.sln.docstates + +# User-specific files (MonoDevelop/Xamarin Studio) +*.userprefs + +# Mono auto generated files +mono_crash.* + +# Build results +[Dd]ebug/ +[Dd]ebugPublic/ +[Rr]elease/ +[Rr]eleases/ +x64/ +x86/ +[Ww][Ii][Nn]32/ +[Aa][Rr][Mm]/ +[Aa][Rr][Mm]64/ +bld/ +[Bb]in/ +[Oo]bj/ +[Ll]og/ +[Ll]ogs/ + +# Visual Studio 2015/2017 cache/options directory +.vs/ +# Uncomment if you have tasks that create the project's static files in wwwroot +#wwwroot/ + +# Visual Studio 2017 auto generated files +Generated\ Files/ + +# MSTest test Results +[Tt]est[Rr]esult*/ +[Bb]uild[Ll]og.* + +# NUnit +*.VisualState.xml +TestResult.xml +nunit-*.xml + +# Build Results of an ATL Project +[Dd]ebugPS/ +[Rr]eleasePS/ +dlldata.c + +# Benchmark Results +BenchmarkDotNet.Artifacts/ + +# .NET Core +project.lock.json +project.fragment.lock.json +artifacts/ + +# ASP.NET Scaffolding +ScaffoldingReadMe.txt + +# StyleCop +StyleCopReport.xml + +# Files built by Visual Studio +*_i.c +*_p.c +*_h.h +*.ilk +*.meta +*.obj +*.iobj +*.pch +*.pdb +*.ipdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.tmp_proj +*_wpftmp.csproj +*.log +*.tlog +*.vspscc +*.vssscc +.builds +*.pidb +*.svclog +*.scc + +# Chutzpah Test files +_Chutzpah* + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opendb +*.opensdf +*.sdf +*.cachefile +*.VC.db +*.VC.VC.opendb + +# Visual Studio profiler +*.psess +*.vsp +*.vspx +*.sap + +# Visual Studio Trace Files +*.e2e + +# TFS 2012 Local Workspace +$tf/ + +# Guidance Automation Toolkit +*.gpState + +# ReSharper is a .NET coding add-in +_ReSharper*/ +*.[Rr]e[Ss]harper +*.DotSettings.user + +# TeamCity is a build add-in +_TeamCity* + +# DotCover is a Code Coverage Tool +*.dotCover + +# AxoCover is a Code Coverage Tool +.axoCover/* +!.axoCover/settings.json + +# Coverlet is a free, cross platform Code Coverage Tool +coverage*.json +coverage*.xml +coverage*.info + +# Visual Studio code coverage results +*.coverage +*.coveragexml + +# NCrunch +_NCrunch_* +.*crunch*.local.xml +nCrunchTemp_* + +# MightyMoose +*.mm.* +AutoTest.Net/ + +# Web workbench (sass) +.sass-cache/ + +# Installshield output folder +[Ee]xpress/ + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish/ + +# Publish Web Output +*.[Pp]ublish.xml +*.azurePubxml +# Note: Comment the next line if you want to checkin your web deploy settings, +# but database connection strings (with potential passwords) will be unencrypted +*.pubxml +*.publishproj + +# Microsoft Azure Web App publish settings. Comment the next line if you want to +# checkin your Azure Web App publish settings, but sensitive information contained +# in these scripts will be unencrypted +PublishScripts/ + +# NuGet Packages +*.nupkg +# NuGet Symbol Packages +*.snupkg +# The packages folder can be ignored because of Package Restore +**/[Pp]ackages/* +# except build/, which is used as an MSBuild target. +!**/[Pp]ackages/build/ +# Uncomment if necessary however generally it will be regenerated when needed +#!**/[Pp]ackages/repositories.config +# NuGet v3's project.json files produces more ignorable files +*.nuget.props +*.nuget.targets + +# Microsoft Azure Build Output +csx/ +*.build.csdef + +# Microsoft Azure Emulator +ecf/ +rcf/ + +# Windows Store app package directories and files +AppPackages/ +BundleArtifacts/ +Package.StoreAssociation.xml +_pkginfo.txt +*.appx +*.appxbundle +*.appxupload + +# Visual Studio cache files +# files ending in .cache can be ignored +*.[Cc]ache +# but keep track of directories ending in .cache +!?*.[Cc]ache/ + +# Others +ClientBin/ +~$* +*~ +*.dbmdl +*.dbproj.schemaview +*.jfm +*.pfx +*.publishsettings +orleans.codegen.cs + +# Including strong name files can present a security risk +# (https://github.com/github/gitignore/pull/2483#issue-259490424) +#*.snk + +# Since there are multiple workflows, uncomment next line to ignore bower_components +# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) +#bower_components/ + +# RIA/Silverlight projects +Generated_Code/ + +# Backup & report files from converting an old project file +# to a newer Visual Studio version. Backup files are not needed, +# because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML +UpgradeLog*.htm +ServiceFabricBackup/ +*.rptproj.bak + +# SQL Server files +*.mdf +*.ldf +*.ndf + +# Business Intelligence projects +*.rdl.data +*.bim.layout +*.bim_*.settings +*.rptproj.rsuser +*- [Bb]ackup.rdl +*- [Bb]ackup ([0-9]).rdl +*- [Bb]ackup ([0-9][0-9]).rdl + +# Microsoft Fakes +FakesAssemblies/ + +# GhostDoc plugin setting file +*.GhostDoc.xml + +# Node.js Tools for Visual Studio +.ntvs_analysis.dat +node_modules/ + +# Visual Studio 6 build log +*.plg + +# Visual Studio 6 workspace options file +*.opt + +# Visual Studio 6 auto-generated workspace file (contains which files were open etc.) +*.vbw + +# Visual Studio 6 auto-generated project file (contains which files were open etc.) +*.vbp + +# Visual Studio 6 workspace and project file (working project files containing files to include in project) +*.dsw +*.dsp + +# Visual Studio 6 technical files + +# Visual Studio LightSwitch build output +**/*.HTMLClient/GeneratedArtifacts +**/*.DesktopClient/GeneratedArtifacts +**/*.DesktopClient/ModelManifest.xml +**/*.Server/GeneratedArtifacts +**/*.Server/ModelManifest.xml +_Pvt_Extensions + +# Paket dependency manager +.paket/paket.exe +paket-files/ + +# FAKE - F# Make +.fake/ + +# CodeRush personal settings +.cr/personal + +# Python Tools for Visual Studio (PTVS) +__pycache__/ +*.pyc + +# Cake - Uncomment if you are using it +# tools/** +# !tools/packages.config + +# Tabs Studio +*.tss + +# Telerik's JustMock configuration file +*.jmconfig + +# BizTalk build output +*.btp.cs +*.btm.cs +*.odx.cs +*.xsd.cs + +# OpenCover UI analysis results +OpenCover/ + +# Azure Stream Analytics local run output +ASALocalRun/ + +# MSBuild Binary and Structured Log +*.binlog + +# NVidia Nsight GPU debugger configuration file +*.nvuser + +# MFractors (Xamarin productivity tool) working folder +.mfractor/ + +# Local History for Visual Studio +.localhistory/ + +# Visual Studio History (VSHistory) files +.vshistory/ + +# BeatPulse healthcheck temp database +healthchecksdb + +# Backup folder for Package Reference Convert tool in Visual Studio 2017 +MigrationBackup/ + +# Ionide (cross platform F# VS Code tools) working folder +.ionide/ + +# Fody - auto-generated XML schema +FodyWeavers.xsd + +# VS Code files for those working on multiple tools +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +*.code-workspace + +# Local History for Visual Studio Code +.history/ + +# Windows Installer files from build outputs +*.cab +*.msi +*.msix +*.msm +*.msp + +# JetBrains Rider +*.sln.iml + +### Rider template +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# AWS User-specific +.idea/**/aws.xml + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. +# .idea/artifacts +# .idea/compiler.xml +# .idea/jarRepositories.xml +# .idea/modules.xml +# .idea/*.iml +# .idea/modules +# *.iml +# *.ipr + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# SonarLint plugin +.idea/sonarlint/ + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + +.idea/.idea.galaxygpt/.idea/discord.xml diff --git a/.idea/.idea.galaxygpt/.idea/.gitignore b/.idea/.idea.galaxygpt/.idea/.gitignore new file mode 100644 index 0000000..4eabb2d --- /dev/null +++ b/.idea/.idea.galaxygpt/.idea/.gitignore @@ -0,0 +1,13 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Rider ignored files +/modules.xml +/projectSettingsUpdater.xml +/contentModel.xml +/.idea.galaxygpt.iml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/.idea.galaxygpt/.idea/dataSources.xml b/.idea/.idea.galaxygpt/.idea/dataSources.xml new file mode 100644 index 0000000..5b49746 --- /dev/null +++ b/.idea/.idea.galaxygpt/.idea/dataSources.xml @@ -0,0 +1,15 @@ + + + + + sqlite.xerial + true + org.sqlite.JDBC + jdbc:sqlite:$PROJECT_DIR$/galaxygpt/embeddings.db + + + + $ProjectFileDir$ + + + \ No newline at end of file diff --git a/.idea/.idea.galaxygpt/.idea/encodings.xml b/.idea/.idea.galaxygpt/.idea/encodings.xml new file mode 100644 index 0000000..df87cf9 --- /dev/null +++ b/.idea/.idea.galaxygpt/.idea/encodings.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/.idea.galaxygpt/.idea/indexLayout.xml b/.idea/.idea.galaxygpt/.idea/indexLayout.xml new file mode 100644 index 0000000..7b08163 --- /dev/null +++ b/.idea/.idea.galaxygpt/.idea/indexLayout.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/.idea.galaxygpt/.idea/inspectionProfiles/Project_Default.xml b/.idea/.idea.galaxygpt/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..29bd053 --- /dev/null +++ b/.idea/.idea.galaxygpt/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,19 @@ + + + + \ No newline at end of file diff --git a/.idea/.idea.galaxygpt/.idea/vcs.xml b/.idea/.idea.galaxygpt/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/.idea.galaxygpt/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index a6735e5..0000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "python.analysis.typeCheckingMode": "off" -} \ No newline at end of file diff --git a/Dockerfile.api b/Dockerfile.api deleted file mode 100644 index c7afe4d..0000000 --- a/Dockerfile.api +++ /dev/null @@ -1,13 +0,0 @@ -FROM python:3.10 - -WORKDIR /app - -COPY requirements.txt requirements.txt - -RUN apt update && apt install build-essential -y - -RUN pip install --no-cache-dir -U -r requirements.txt - -COPY . . - -CMD ["python", "api.py"] diff --git a/Dockerfile.web b/Dockerfile.web deleted file mode 100644 index 8bd0a7f..0000000 --- a/Dockerfile.web +++ /dev/null @@ -1 +0,0 @@ -FROM nginx diff --git a/README.md b/README.md deleted file mode 100644 index 16f5b3a..0000000 --- a/README.md +++ /dev/null @@ -1 +0,0 @@ -# GalaxyGPT \ No newline at end of file diff --git a/api.py b/api.py deleted file mode 100644 index 0862931..0000000 --- a/api.py +++ /dev/null @@ -1,134 +0,0 @@ -import logging -import os -import traceback - -import flask -from flask_limiter import Limiter -from waitress import serve - -from main import ADCS, answer_question, df, strtobool - - -# Load Flask -def get_proxy_remote_address(): - """ - :return: the ip address for the current request (or 127.0.0.1 if none found) - """ - if flask.request.headers.get("X-Forwarded-For") is not None: - return str(flask.request.headers.get("X-Forwarded-For")) - return flask.request.remote_addr or "127.0.0.1" - - -__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) -app = flask.Flask(__name__) -limiter = Limiter( - get_proxy_remote_address, - app=app, - storage_uri="memory://", -) - -# Check if the dataset is set in the environment variables -if not os.getenv("DATASET"): - raise Exception("Please set the DATASET environment variable") - -adcsservice = ADCS() - -# Create the API to answer questions -@app.route("/api/v1/ask", methods=["POST"]) -@limiter.limit("10/30 seconds") -def ask(): - data: dict = flask.request.get_json() - - if data is not None: - question: str = data["prompt"] - - if question is not None: - print(question + " - " + get_proxy_remote_address()) - - username: str = data["username"] or None - - # If the username is provided, use it to answer the question - if username is not None: - try: - answer = answer_question( - df, question=question, username=username, debug=True - ) - return flask.jsonify(answer), 200 - except Exception as e: - print(traceback.format_exc(), flush=True) - return flask.jsonify({"error": str(e)}), 500 - else: - # Otherwise, answer the question without a username - try: - answer = answer_question(df, question=question, debug=True) - return flask.jsonify(answer), 200 - except Exception as e: - print(traceback.format_exc(), flush=True) - return flask.jsonify({"error": str(e)}), 500 - - - else: - return flask.jsonify({"error": "No prompt"}), 400 - else: - return flask.jsonify({"error": "No data"}), 400 - - -# Create the Web UI -""" @app.route('/') -def index(path): - print(path + "index.html") - try: - return flask.send_from_directory('ui', path) - except werkzeug.exceptions.NotFound as e: - if path.endswith("/"): - return flask.send_from_directory('ui', path + "index.html") - raise e """ - -# ADCS API -@app.route("/api/ADCS/start", methods=["POST"]) -def startADCS(): - adcsservice.start() - return flask.jsonify({"status": "started"}), 200 - -@app.route("/api/ADCS/stop", methods=["POST"]) -def stopADCS(): - adcsservice.stop() - return flask.jsonify({"status": "stopped"}), 200 - -@app.route("/api/ADCS/force-create", methods=["POST"]) -def forceCreateADCS(): - reload = strtobool(flask.request.args.get("reload", default="True")) - noembeddings = strtobool(flask.request.args.get("noembeddings", default="False")) - - # For the time being, set noembeddings to True manually - noembeddings = True - - adcsservice.createDataset(reload=reload, noembeddings=noembeddings) - if reload: - return flask.jsonify({"status": "created a new dataset & reloaded"}), 200 - elif not reload: - return flask.jsonify({"status": "created a new dataset"}), 200 - -@app.route("/api/ADCS/status", methods=["GET"]) -def statusADCS(): - return flask.jsonify({"status": adcsservice.status}), 200 - -if __name__ == "__main__": - debug = os.environ.get("DEBUG", True) - use_waitress = os.getenv("USE_WAITRESS", False) - print("Debug: " + str(debug)) - - if debug is not bool: - debug = strtobool(debug) - - if use_waitress is not bool: - use_waitress = strtobool(use_waitress) - - if debug is True and not use_waitress: - app.run(host="0.0.0.0", port=3636, debug=True) - print("Started with flask", flush=True) - else: - logger = logging.getLogger("waitress") - logger.setLevel(logging.INFO) - serve(app, host="0.0.0.0", port=3636) - print("Started with waitress", flush=True) diff --git a/dataset-assistant/Program.cs b/dataset-assistant/Program.cs new file mode 100644 index 0000000..82dca3f --- /dev/null +++ b/dataset-assistant/Program.cs @@ -0,0 +1,243 @@ +using System.ClientModel; +using System.CommandLine; +using System.Globalization; +using System.Text; +using System.Text.RegularExpressions; +using CsvHelper; +using CsvHelper.Configuration; +using galaxygpt; +using galaxygpt.Database; +using Microsoft.EntityFrameworkCore; +using OpenAI.Embeddings; + +namespace dataset_assistant; + +partial class Program +{ + // Gallery tag regex + [GeneratedRegex(@"(\|image.?=.?)?.*?<\/gallery>\\?\n?", RegexOptions.Singleline)] + private static partial Regex GalleryTagRegex(); + + // File link regex + [GeneratedRegex(@"\[\[File:.*?\]\]\\?", RegexOptions.Singleline)] + private static partial Regex FileLinkRegex(); + + // Magic word regex + [GeneratedRegex(@"__.*?__", RegexOptions.Singleline)] + private static partial Regex MagicWordRegex(); + + // HTML comments regex + [GeneratedRegex(@"\\?\n?", RegexOptions.Singleline)] + private static partial Regex HtmlCommentRegex(); + + // Span & br regex + [GeneratedRegex(@"|<\/span>\\?\n?|\\?\n?", RegexOptions.Singleline)] + private static partial Regex SpanBrRegex(); + + // Div tags regex + [GeneratedRegex(@"|<\/div>\\?\n?", RegexOptions.Singleline)] + private static partial Regex DivTagRegex(); + + private static async Task Main(string[] args) + { + #region Options + + var datasetDirectory = new Option( + ["--directory", "-d"], + "The directory that stores the datasets" + ) + { + IsRequired = true + }; + + var cleanDirOption = new Option( + ["--cleanDir", "-c"], + "Clean the output directory before writing the dataset" + ); + + var noEmbeddingsOption = new Option( + ["--noEmbeddings", "-n"], + "Do not include embeddings in the dataset" + ); + + var dumpDatabaseOption = new Option( + ["--dumpDatabase", "-dd"], + "Dump the database to the output directory" + ); + + var maxLengthOption = new Option( + ["--maxLength", "-m"], + "The maximum length of the content" + ) + { + IsRequired = true + }; + + var compressOldDatasetsOption = new Option( + ["--compressOldDatasets", "-C"], + "Compress old datasets in the output directory" + ); + + var datasetNameOption = new Option( + ["--datasetName", "-N"], + "The name of the dataset" + ) + { + IsRequired = true + }; + + var dumpPathOption = new Option( + ["--dbDumpPath", "-D"], + "The path to the database dump" + ) + { + IsRequired = true + }; + + #endregion + + var rootCommand = new RootCommand("GalaxyGPT Dataset Management Assistant") + { + datasetDirectory, + cleanDirOption, + noEmbeddingsOption, + dumpDatabaseOption, + maxLengthOption, + compressOldDatasetsOption, + datasetNameOption, + dumpPathOption + }; + + rootCommand.SetHandler(async handler => + { + string? datasetDirectoryValue = handler.ParseResult.GetValueForOption(datasetDirectory); + bool? cleanDirOptionValue = handler.ParseResult.GetValueForOption(cleanDirOption); + bool? noEmbeddingsOptionValue = handler.ParseResult.GetValueForOption(noEmbeddingsOption); + bool? dumpDatabaseOptionValue = handler.ParseResult.GetValueForOption(dumpDatabaseOption); + int? maxLengthOptionValue = handler.ParseResult.GetValueForOption(maxLengthOption); + bool? compressOldDatasetsOptionValue = handler.ParseResult.GetValueForOption(compressOldDatasetsOption); + string? datasetNameOptionValue = handler.ParseResult.GetValueForOption(datasetNameOption); + string dumpPathOptionValue = handler.ParseResult.GetValueForOption(dumpPathOption)!; + + await GalaxyGpt.Db.Database.EnsureDeletedAsync(); + await GalaxyGpt.Db.Database.MigrateAsync(); + + string csvData = await File.ReadAllTextAsync(dumpPathOptionValue); + csvData = csvData.Replace("\\\n", "\n"); + + // Read the database dump, which is a csv + using var reader = new StringReader(csvData); + using var csv = new CsvReader(reader, new CsvConfiguration(CultureInfo.InvariantCulture) + { + HasHeaderRecord = true, + BadDataFound = null, + MissingFieldFound = null, + TrimOptions = TrimOptions.Trim, + Encoding = Encoding.UTF8, + Escape = '\\', + Quote = '"', + NewLine = Environment.NewLine, + Mode = CsvMode.RFC4180, + AllowComments = false + }); + + await csv.ReadAsync(); + csv.ReadHeader(); + + // Could possibly run some of this in parallel + while (await csv.ReadAsync()) + { + string? title = csv.GetField("page_title"); + string? content = csv.GetField("content"); + + if (string.IsNullOrEmpty(title) || string.IsNullOrEmpty(content)) + continue; + + // Page title sanitization + title = title.Replace("_", " ").Trim(); + + // Content sanitization + content = content.Replace("\n", " "); + content = GalleryTagRegex().Replace(content, ""); + content = FileLinkRegex().Replace(content, ""); + content = MagicWordRegex().Replace(content, ""); + content = HtmlCommentRegex().Replace(content, ""); + content = SpanBrRegex().Replace(content, ""); + content = DivTagRegex().Replace(content, ""); + content = content.Trim(); + + if (string.IsNullOrEmpty(title) || string.IsNullOrEmpty(content)) + continue; + + var page = new Page + { + Title = title, + Content = content + }; + + GalaxyGpt.Db.Add(page); + } + + await GalaxyGpt.Db.SaveChangesAsync(); + + // Finished adding all the pages to the database. + + const int maxtokens = 8192; + + // Chunk the pages into smaller pages + foreach (Page page in GalaxyGpt.Db.Pages) + { + List chunks = []; + string content = page.Content; + + if (page.Tokens <= maxtokens) continue; + while (true) // Loop until the content is empty + { + int splitIndex = GalaxyGpt.EmbeddingsTokenizer.GetIndexByTokenCount(content, maxtokens, out string? _, out int tokencount); + string chunk = content[..splitIndex]; + Console.WriteLine("Splitting page " + page.Title + " at index " + splitIndex + " with token count " + tokencount); + chunks.Add(new Chunk { Content = chunk }); + + // The last chunk will be the remainder of the content. So we break the loop here + if (splitIndex == content.Length) + break; + + content = content[splitIndex..]; + } + + page.Chunks = chunks; + } + + await GalaxyGpt.Db.SaveChangesAsync(); + + // Create embeddings for each page, or chunks if they exist. Can also be done in parallel + EmbeddingClient? embeddingsClient = GalaxyGpt.OpenAiClient.GetEmbeddingClient("text-embedding-3-small"); + + await Parallel.ForEachAsync(GalaxyGpt.Db.Pages, async (page, cancellationToken) => + { + // Handle the case where the page has no chunks + if (page.Chunks == null || page.Chunks.Count == 0) + { + Console.WriteLine("generating embeddings for " + page.Title); + ClientResult? embedding = await embeddingsClient.GenerateEmbeddingAsync(page.Content, cancellationToken: cancellationToken); + page.Embeddings = embedding.Value.Vector.ToArray().ToList(); + return; + } + + int chunkNumber = 0; + // Handle the case where the page has chunks + foreach (Chunk chunk in page.Chunks) + { + Console.WriteLine($"generating embeddings for {page.Title} chunk {chunkNumber} with token count {GalaxyGpt.GptTokenizer.CountTokens(chunk.Content)}"); + ClientResult? embedding = await embeddingsClient.GenerateEmbeddingAsync(chunk.Content, cancellationToken: cancellationToken); + chunk.Embeddings = embedding.Value.Vector.ToArray().ToList(); + chunkNumber++; + } + }); + + await GalaxyGpt.Db.SaveChangesAsync(); + }); + + return await rootCommand.InvokeAsync(args); + } +} \ No newline at end of file diff --git a/dataset-assistant/dataset-assistant.csproj b/dataset-assistant/dataset-assistant.csproj new file mode 100644 index 0000000..11394ee --- /dev/null +++ b/dataset-assistant/dataset-assistant.csproj @@ -0,0 +1,19 @@ + + + + Exe + net8.0 + dataset_assistant + enable + enable + + + + + + + + + + + diff --git a/dataset.py b/dataset.py deleted file mode 100644 index b6db0a8..0000000 --- a/dataset.py +++ /dev/null @@ -1,339 +0,0 @@ -# Dataset Preparation for GalaxyGPT - -import argparse -import os -import pathlib -import re -import shutil -import subprocess -import time -from glob import glob - -import colorama -import pandas as pd -import tiktoken -from colorama import Fore -from dotenv import load_dotenv -from halo import Halo -from openai import OpenAI -from tqdm import tqdm - -colorama.init() -tqdm.pandas() -load_dotenv() - -olddatasets = [f for f in os.listdir(".") if re.match(r"^dataset-v\d$", f, flags=re.MULTILINE | re.IGNORECASE)] - -parser = argparse.ArgumentParser("GalaxyGPT Dataset Assistant", description="Generate a dataset for use with GalaxyGPT") -parser.add_argument("--outdir", "-o", help="The output directory", type=str, required=True) -parser.add_argument("--cleandir", help="Delete the contents of the output directory if it exists", action=argparse.BooleanOptionalAction, default=None) -parser.add_argument("--no-embeddings", help="Don't generate embeddings", action="store_true") -parser.add_argument("--api-key", help="The OpenAI API key to use (defaults to env.OPENAI_API_KEY)", type=str, default=os.getenv("OPENAI_API_KEY")) -parser.add_argument("--org-id", help="The OpenAI organization ID to use (defaults to env.OPENAI_ORG_ID)", type=str, default=os.getenv("OPENAI_ORG_ID")) -parser.add_argument("--dump-database", help="Generate a new database dump for use with this script", action="store_true", default=False) -parser.add_argument("--max-len", help="The maximum token length of a chunk (HIGHLY ADVISED TO SET THIS AS THE (MAXIMUM CONTEXT LIMIT / 2))", type=int, required=True) -parser.add_argument("--compress-old-datasets", help="Compress old datasets into their own respective tar.gz files so long as they follow the dataset-vX naming scheme", action="store_true", default=False) -parser.add_argument("--dataset", help="The path to the datset to use. Defaults to galaxypedia-*.csv, where * is the highest number found in the directory, or to galaxypedia.csv", type=pathlib.Path, default=None, nargs="?") -args = parser.parse_args() - -if args.compress_old_datasets and len(olddatasets) != 0: - print("Compressing old datasets...") - for f in olddatasets: - spin = Halo(text=f"Compressing {f}...", spinner="dots") - spin.start() - subprocess.run(["tar", "-czf", f"{f}.tar.gz", f], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT) - spin.succeed(f"Compressed {f}!") - print("Done!") - deleteq = input("Would you like to delete the old datasets? (Y/n): ") - if deleteq == "y" or deleteq == "": - for f in olddatasets: - spin = Halo(text=f"Deleting {f}...", spinner="dots") - spin.start() - shutil.rmtree(f) - spin.succeed(f"Deleted {f}!") - -if args.api_key is None: - raise Exception("No OpenAI API key specified!") -if args.org_id is None: - raise Exception("No OpenAI organization ID specified!") -openai_client = OpenAI( - api_key=os.getenv("OPENAI_API_KEY"), - organization=os.getenv("OPENAI_ORG_ID") -) -embeddings_model = str(os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")) - -__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) - - -outdir: str = args.outdir - -if outdir == "" or outdir is None: - raise Exception("No output directory specified!") -print(Fore.GREEN + "Saving results to " + outdir + "!") - -if not os.path.exists(outdir): - os.makedirs(outdir) - -if os.listdir(outdir): - if args.cleandir is None: - os.system("cls" if os.name == "nt" else "clear") - clear_existing_files = input(f"{Fore.YELLOW}{outdir} contains existing files!{Fore.RESET}\nWould you like to delete the contents of {outdir}? (Y/n): ") - - if str(clear_existing_files).strip() == "y" or str(clear_existing_files).strip() == "": - shutil.rmtree(outdir) - os.makedirs(outdir) - print("Deleted the contents of " + outdir + "!") - os.system("cls" if os.name == "nt" else "clear") - elif args.cleandir: - shutil.rmtree(outdir) - os.makedirs(outdir) - print("Deleted the contents of " + outdir + "!") - - -if not args.dump_database: - if args.dataset is None: - # Get the first file that matches the glob, prioritizing the file with the largest number by sorting the list numerically then reversing the list - pathlist = sorted(glob(__location__ + "/galaxypedia*.csv"), reverse=True) - if pathlist == []: - raise Exception("Dataset starting with \'galaxypedia\' and ending with \'.csv\' could not be found!") - datasetpath: str = pathlist[0] - else: - if not str(args.dataset).endswith(".csv"): - raise Exception("Dataset must be a csv file!") - if not os.path.exists(args.dataset): - raise Exception("Dataset does not exist!") - datasetpath: str = args.dataset - - -if args.dump_database: - for file in glob(__location__ + "/galaxypedia*.csv"): - print("Renaming galaxypedia*.csv to galaxypedia*.csv.old") - os.rename(os.path.join(file), os.path.join(file + ".old")) - - print("Generating dataset...") - try: - subprocess.run(["/bin/bash", __location__ + "/dump-database.sh"], cwd=__location__, capture_output=True, check=True) - except Exception as e: - raise Exception("Failed to generate dataset! " + str(e)) - print("Generated dataset!") - - pathlist = sorted(glob(__location__ + "/galaxypedia*.csv"), reverse=True) - datasetpath: str = pathlist[0] - -datasetname = os.path.basename(datasetpath) - -if not os.path.isabs(datasetpath): - datasetpath = os.path.join(__location__, datasetpath) - -############################################################################### -############################################################################### - -def remove_newlines(serie): - serie = serie.str.replace("\n", " ") - serie = serie.str.replace("\\n", " ") - serie = serie.str.replace(" ", " ") - serie = serie.str.replace(" ", " ") - return serie - -spinner = Halo(text=f"Loading {str(datasetname)}", spinner="dots") -spinner.start() -df = pd.read_csv( - datasetpath, - escapechar="\\", - header=0, - names=["page_namespace", "page_title", "content"], -) - -page_titles = df.page_title.str.lower().str.replace("_", " ").str.strip() - -spinner.succeed(f"Loaded {str(datasetname)}!") - -# Sanitize the dataset's contents to make it more readable for the model -spinner = Halo(text="Sanitizing dataset", spinner="dots") - -# Remove newlines -contentprocessed = remove_newlines(df.content) - -# Remove Gallery tags -galleryregex = re.compile(r"(\|image.?=.?)?.*?<\/gallery>\\?\n?", re.S) -contentprocessed = contentprocessed.str.replace( - galleryregex, - "", regex=True, -) - -# Remove links to files -spinner.text = "Sanitizing dataset (removing links to files)" -fileregex = re.compile(r"\[\[File:.*?\]\]\\?", re.S) -contentprocessed = contentprocessed.str.replace(fileregex, "", regex=True) - -# Remove magic words -spinner.text = "Sanitizing dataset (removing magic words)" -magicregex = re.compile(r"__.*?__", re.S) -contentprocessed = contentprocessed.str.replace(magicregex, "", regex=True) - -# Remove HTML comments () -spinner.text = "Sanitizing dataset (removing HTML comments)" -commentregex = re.compile(r"\\?\n?", re.S) -contentprocessed = contentprocessed.str.replace(commentregex, "", regex=True) - -# Remove span and br tags -spinner.text = "Sanitizing dataset (removing span and br tags)" -spanregex = re.compile(r"|<\/span>\\?\n?|\\?\n?", re.S) -contentprocessed = contentprocessed.str.replace(spanregex, "", regex=True) - -# Remove div tags -spinner.text = "Sanitizing dataset (removing div tags)" -divregex = re.compile(r"|<\/div>\\?\n?", re.S) -contentprocessed = contentprocessed.str.replace(divregex, "", regex=True) -spinner.succeed() - -spinner = Halo(text="Saving sanitized dataset", spinner="dots") - -# Remove rows with empty content -df["content"] = contentprocessed.str.strip() -rows_to_drop = df[df["content"]==""].index -df.drop(rows_to_drop, inplace=True) - -def includeNamespaceInTitle(row): - namespace = "" - - if row["page_namespace"] == 4: - namespace = "Galaxypedia:" - - row["page_title"] = namespace + row["page_title"] - return row - -df = df.apply(includeNamespaceInTitle, axis=1) - -df["content"] = df.page_title.str.lower().str.replace("_", " ").str.strip() + ". " + df.content.str.strip() - -df.to_csv(os.path.join(__location__, outdir, "processed.csv")) -spinner.succeed("Saved sanitized dataset!") - -################################################################################ - -tokenizer = tiktoken.get_encoding("cl100k_base") - -spinner = Halo(text="Loading sanitized dataset", spinner="dots") -df = pd.read_csv(os.path.join(__location__, outdir, "processed.csv"), index_col=0) -df.columns = ["page_namespace", "page_title", "content"] -spinner.succeed("Loaded sanitized dataset!") - -# Tokenize the text and save the number of tokens to a new column -tqdm.pandas(desc="Tokenizing", leave=False) -df["n_tokens"] = df.content.progress_apply(lambda x: len(tokenizer.encode(x))) -print(Fore.GREEN + "✔ " + Fore.RESET + "Tokenized!") - -# Max tokens per chunk -max_tokens: int = args.max_len - -# Function to split the text into chunks of a maximum number of tokens -# TODO: Move the comment into docstring -def split_into_many(text, max_tokens=max_tokens): - sentences: str = text.split(". ") - - n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences] - - chunks = [] - tokens_so_far = 0 - chunk = [] - - # Loop through the sentences and tokens joined together in a tuple - for sentence, token in zip(sentences, n_tokens): - # If the number of tokens so far plus the number of tokens in the current sentence is greater - # than the max number of tokens, then add the chunk to the list of chunks and reset - # the chunk and tokens so far - if tokens_so_far + token > max_tokens: - chunks.append(". ".join(chunk) + ".") - chunk = [] - tokens_so_far = 0 - - # If the number of tokens in the current sentence is greater than the max number of - # tokens, go to the next sentence - if token > max_tokens: - continue - - # Otherwise, add the sentence to the chunk and add the number of tokens to the total - chunk.append(sentence) - tokens_so_far += token + 1 - - return chunks - - -shortened = [] - -# in embeddings_pages_by_row, each index corresponds to a page title. -# for each iteration, append a number to embeddings_pages_by_row -# that is the amount of pages in that index -embeddings_pages_by_row = [] - -itrows = tqdm(df.iterrows(), total=df.shape[0], desc="Splitting dataset into chunks", leave=False) - -for row in itrows: - if row[1]["content"] is None: - embeddings_pages_by_row.append(0) # there is 0 pages for this pagetitle index - continue - - # If the number of tokens is greater than the max number of tokens, split the text into chunks and - # add them to the list of shortened texts. Append the number of chunks to the embeddings pages by row - if row[1]["n_tokens"] > max_tokens: - chunks = split_into_many(row[1]["content"]) - shortened += chunks - embeddings_pages_by_row.append(chunks.__len__()) # there is chunks.len pages for this pagetitle index - - # Otherwise, add the text to the list of shortened texts - else: - shortened.append(row[1]["content"]) - embeddings_pages_by_row.append(1) # there is 1 page for this pagetitle index - -print(Fore.GREEN + "✔ " + Fore.RESET + "Dataset split into chunks!") - -############################################################################### - -df = pd.DataFrame(shortened, columns=["content"]) - -tqdm.pandas(desc="Tokenizing", leave=False) - -# make a list of the number of embeddings rows for each page -embedding_page_titles = [] -for i, repeats in enumerate(embeddings_pages_by_row): - for j in list(range(repeats)): - embedding_page_titles.append(page_titles[i + 1]) # +1 because page_titles[0] is the column name, not the first entry - -df["page_title"] = embedding_page_titles - -df["n_tokens"] = df.content.progress_apply(lambda x: len(tokenizer.encode(x))) -print(Fore.GREEN + "✔ " + Fore.RESET + "Tokenized!") - -spinner = Halo(text="Saving tokenized dataset", spinner="dots") -df.to_csv(os.path.join(__location__, outdir, "tokenized.csv")) -spinner.succeed("Saved tokenized dataset!") - -if args.no_embeddings is False: - cost = 0 - - baller = tqdm(total=df.shape[0], desc="Embedding", leave=False) - def idk(x): - global cost - cost += (len(tokenizer.encode(x)) / 1000) * 0.0001 - baller.set_postfix_str(str(round(cost, 8))) - baller.update(1) - - return openai_client.embeddings.create(input=x, model=embeddings_model).data[0].embedding - - df["embeddings"] = df.content.apply(idk) - baller.close() - print(Fore.GREEN + "✔ " + Fore.RESET + "Embedded!") - - spinner = Halo(text="Saving embedded dataset", spinner="dots") - df.to_csv(os.path.join(__location__, outdir, "embeddings.csv")) - spinner.succeed("Saved embedded dataset!") - -spinner = Halo(text="Copying initial dataset to output directory", spinner="dots") -shutil.copyfile(datasetpath, os.path.join(__location__, outdir, datasetname)) -spinner.succeed("Copied initial dataset to output directory!") - -with open(os.path.join(__location__, outdir, "METADATA.txt"), "w") as file: - file.write(f"Dataset: {datasetname}\nTimestamp: {time.ctime(time.time())}\nMax_len: {max_tokens}") - -print("Done!") diff --git a/docker-compose.yaml b/docker-compose.yaml deleted file mode 100644 index 3b41c7c..0000000 --- a/docker-compose.yaml +++ /dev/null @@ -1,18 +0,0 @@ -version: '3.1' -services: - galaxygpt-api: - container_name: galaxygpt-api - image: galaxygpt-api - build: - context: . - dockerfile: Dockerfile.api -# ports: -# - "3636:3636" - volumes: - - ./dataset-ADCS:/app/dataset-ADCS - restart: always - network_mode: host - env_file: .env - environment: - - DEBUG=True - - USE_WAITRESS=True diff --git a/dump-database.sh b/dump-database.sh deleted file mode 100644 index 7782749..0000000 --- a/dump-database.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -# Set the -e option so that if any command fails the script will exit immediately -set -e - -# Dump the sql database to a csv file -echo "Dumping the sql database to a csv file..." -mysql -u root -p$1 -e "USE galaxypedia; SELECT page_namespace, page_title \"page_name\", old_text \"content\" FROM page INNER JOIN slots on page_latest = slot_revision_id INNER JOIN slot_roles on slot_role_id = role_id AND role_name = 'main' INNER JOIN content on slot_content_id = content_id INNER JOIN text on substring( content_address, 4 ) = old_id AND left( content_address, 3 ) = \"tt:\" WHERE (page.page_namespace = 0 OR page.page_namespace = 4) AND page.page_is_redirect = 0 into outfile '/tmp/galaxypedia.csv' FIELDS TERMINATED BY ',' ENCLOSED BY '\"' LINES TERMINATED BY '\n';" - -# Move the csv file to the current directory -echo "Moving the csv file to the current directory..." -sudo mv -f /tmp/galaxypedia.csv ./galaxypedia.csv.temp - -# Change the owner of the file to ubuntu -echo "Changing the owner of the file to ubuntu..." -sudo chown $(whoami):$(whoami) galaxypedia.csv.temp - -# Add the header to the csv file -echo "Adding the header to the csv file..." -(echo "page_namespace, page_title,content"; cat galaxypedia.csv.temp) > galaxypedia-$(date '+%Y-%m-%d').csv - -# Remove the temporary file -echo "Removing the temporary file..." -rm galaxypedia.csv.temp diff --git a/galaxygpt-api/ConfigureSwaggerOptions.cs b/galaxygpt-api/ConfigureSwaggerOptions.cs new file mode 100644 index 0000000..d7b6793 --- /dev/null +++ b/galaxygpt-api/ConfigureSwaggerOptions.cs @@ -0,0 +1,23 @@ +using Asp.Versioning.ApiExplorer; +using Microsoft.Extensions.Options; +using Microsoft.OpenApi.Models; +using Swashbuckle.AspNetCore.SwaggerGen; + +namespace galaxygpt_api; + +public class ConfigureSwaggerOptions(IApiVersionDescriptionProvider provider) : IConfigureOptions +{ + public void Configure( SwaggerGenOptions options ) + { + foreach ( ApiVersionDescription description in provider.ApiVersionDescriptions ) + { + options.SwaggerDoc( + description.GroupName, + new OpenApiInfo + { + Title = "GalaxyGPT API", + Version = description.ApiVersion.ToString(), + } ); + } + } +} \ No newline at end of file diff --git a/galaxygpt-api/Program.cs b/galaxygpt-api/Program.cs new file mode 100644 index 0000000..a3b1d84 --- /dev/null +++ b/galaxygpt-api/Program.cs @@ -0,0 +1,94 @@ + +using Asp.Versioning.ApiExplorer; +using Asp.Versioning.Builder; +using galaxygpt; +using Microsoft.Extensions.Options; +using Swashbuckle.AspNetCore.SwaggerGen; + +namespace galaxygpt_api; + +public class Program +{ + public static void Main(string[] args) + { + WebApplicationBuilder builder = WebApplication.CreateBuilder(args); + + // Add services to the container. + builder.Logging.AddConsole(); + builder.Services.AddProblemDetails(); + builder.Services.AddEndpointsApiExplorer(); + builder.Services.AddApiVersioning(options => + { + options.ReportApiVersions = true; + }).AddApiExplorer(options => + { + options.GroupNameFormat = "'v'VVV"; + options.SubstituteApiVersionInUrl = true; + }).EnableApiVersionBinding(); + + builder.Services.AddTransient, ConfigureSwaggerOptions>(); + builder.Services.AddSwaggerGen(options => options.OperationFilter()); + + builder.Services.AddMemoryCache(); + + builder.Configuration.AddJsonFile("appsettings.json", optional: false, reloadOnChange: true); + + WebApplication app = builder.Build(); + IVersionedEndpointRouteBuilder versionedApi = app.NewVersionedApi("galaxygpt"); + IVersionedEndpointRouteBuilder adcsApi = app.NewVersionedApi("adcs"); + + app.UseHttpsRedirection(); + + app.UseExceptionHandler(exceptionHandlerApp => + exceptionHandlerApp.Run(async context => await Results.Problem().ExecuteAsync(context))); + + #region API + RouteGroupBuilder v1 = versionedApi.MapGroup("/api/v{version:apiVersion}").HasApiVersion(1.0); + + v1.MapPost("ask", async (string prompt, string? model, string? username) => + { + if (string.IsNullOrEmpty(prompt)) + { + return Results.BadRequest("The question cannot be empty."); + } + + string answer = await GalaxyGpt.AnswerQuestion(prompt, model ?? app.Configuration["MODEL"] ?? throw new InvalidOperationException(), 4096, 4096, 4096, username: username); + + var results = new Dictionary + { + { "answer", answer.Trim() }, + { "version", Environment.Version.ToString() } + }; + + return Results.Json(results); + }).WithName("AskQuestion").WithOpenApi(); + + #endregion + + #region ADCS + + RouteGroupBuilder adcsGroup = adcsApi.MapGroup("/api/v{version:apiVersion}/adcs").HasApiVersion(1.0); + adcsGroup.MapPost("start", new Func(() => throw new NotImplementedException())); + + adcsGroup.MapPost("stop", new Func(() => throw new NotImplementedException())); + + adcsGroup.MapPost("force-create", new Func(() => throw new NotImplementedException())); + + #endregion + + app.UseSwagger(); + if (app.Environment.IsDevelopment()) + { + app.UseSwaggerUI(options => + { + foreach ( ApiVersionDescription description in app.DescribeApiVersions() ) + { + options.SwaggerEndpoint( + $"/swagger/{description.GroupName}/swagger.json", + description.GroupName ); + } + }); + } + app.Run(); + } +} diff --git a/galaxygpt-api/Properties/launchSettings.json b/galaxygpt-api/Properties/launchSettings.json new file mode 100644 index 0000000..bdad764 --- /dev/null +++ b/galaxygpt-api/Properties/launchSettings.json @@ -0,0 +1,41 @@ +{ + "$schema": "http://json.schemastore.org/launchsettings.json", + "iisSettings": { + "windowsAuthentication": false, + "anonymousAuthentication": true, + "iisExpress": { + "applicationUrl": "http://localhost:46577", + "sslPort": 44343 + } + }, + "profiles": { + "http": { + "commandName": "Project", + "dotnetRunMessages": true, + "launchBrowser": true, + "launchUrl": "swagger", + "applicationUrl": "http://localhost:5288", + "environmentVariables": { + "ASPNETCORE_ENVIRONMENT": "Development" + } + }, + "https": { + "commandName": "Project", + "dotnetRunMessages": true, + "launchBrowser": true, + "launchUrl": "swagger", + "applicationUrl": "https://localhost:7254;http://localhost:5288", + "environmentVariables": { + "ASPNETCORE_ENVIRONMENT": "Development" + } + }, + "IIS Express": { + "commandName": "IISExpress", + "launchBrowser": true, + "launchUrl": "swagger", + "environmentVariables": { + "ASPNETCORE_ENVIRONMENT": "Development" + } + } + } +} diff --git a/galaxygpt-api/SwaggerDefaultValues.cs b/galaxygpt-api/SwaggerDefaultValues.cs new file mode 100644 index 0000000..934b8cc --- /dev/null +++ b/galaxygpt-api/SwaggerDefaultValues.cs @@ -0,0 +1,60 @@ +using System.Text.Json; +using Microsoft.AspNetCore.Mvc.ApiExplorer; +using Microsoft.AspNetCore.Mvc.ModelBinding; +using Microsoft.OpenApi.Models; +using Swashbuckle.AspNetCore.SwaggerGen; + +namespace galaxygpt_api; + +public class SwaggerDefaultValues : IOperationFilter +{ + /// + public void Apply( OpenApiOperation operation, OperationFilterContext context ) + { + var apiDescription = context.ApiDescription; + + operation.Deprecated |= apiDescription.IsDeprecated(); + + // REF: https://github.com/domaindrivendev/Swashbuckle.AspNetCore/issues/1752#issue-663991077 + foreach ( var responseType in context.ApiDescription.SupportedResponseTypes ) + { + // REF: https://github.com/domaindrivendev/Swashbuckle.AspNetCore/blob/b7cf75e7905050305b115dd96640ddd6e74c7ac9/src/Swashbuckle.AspNetCore.SwaggerGen/SwaggerGenerator/SwaggerGenerator.cs#L383-L387 + var responseKey = responseType.IsDefaultResponse ? "default" : responseType.StatusCode.ToString(); + var response = operation.Responses[responseKey]; + + foreach ( var contentType in response.Content.Keys ) + { + if ( !responseType.ApiResponseFormats.Any( x => x.MediaType == contentType ) ) + { + response.Content.Remove( contentType ); + } + } + } + + if ( operation.Parameters == null ) + { + return; + } + + // REF: https://github.com/domaindrivendev/Swashbuckle.AspNetCore/issues/412 + // REF: https://github.com/domaindrivendev/Swashbuckle.AspNetCore/pull/413 + foreach ( var parameter in operation.Parameters ) + { + var description = apiDescription.ParameterDescriptions.First( p => p.Name == parameter.Name ); + + parameter.Description ??= description.ModelMetadata?.Description; + + if ( parameter.Schema.Default == null && + description.DefaultValue != null && + description.DefaultValue is not DBNull && + description.ModelMetadata is ModelMetadata modelMetadata ) + { + // REF: https://github.com/Microsoft/aspnet-api-versioning/issues/429#issuecomment-605402330 + var json = JsonSerializer.Serialize( description.DefaultValue, modelMetadata.ModelType ); + parameter.Schema.Default = OpenApiAnyFactory.CreateFromJson( json ); + } + + parameter.Required |= description.IsRequired; + } + } +} diff --git a/galaxygpt-api/appsettings.Development.json b/galaxygpt-api/appsettings.Development.json new file mode 100644 index 0000000..0c208ae --- /dev/null +++ b/galaxygpt-api/appsettings.Development.json @@ -0,0 +1,8 @@ +{ + "Logging": { + "LogLevel": { + "Default": "Information", + "Microsoft.AspNetCore": "Warning" + } + } +} diff --git a/galaxygpt-api/galaxygpt-api.csproj b/galaxygpt-api/galaxygpt-api.csproj new file mode 100644 index 0000000..0203a9c --- /dev/null +++ b/galaxygpt-api/galaxygpt-api.csproj @@ -0,0 +1,21 @@ + + + + net8.0 + enable + enable + galaxygpt_api + + + + + + + + + + + + + + diff --git a/galaxygpt-api/galaxygpt-api.http b/galaxygpt-api/galaxygpt-api.http new file mode 100644 index 0000000..12fbd9a --- /dev/null +++ b/galaxygpt-api/galaxygpt-api.http @@ -0,0 +1,6 @@ +@galaxygpt_api_HostAddress = http://localhost:5288 + +GET {{galaxygpt_api_HostAddress}}/ +Accept: application/json + +### diff --git a/galaxygpt-tests/GlobalUsings.cs b/galaxygpt-tests/GlobalUsings.cs new file mode 100644 index 0000000..8c927eb --- /dev/null +++ b/galaxygpt-tests/GlobalUsings.cs @@ -0,0 +1 @@ +global using Xunit; \ No newline at end of file diff --git a/galaxygpt-tests/UnitTest1.cs b/galaxygpt-tests/UnitTest1.cs new file mode 100644 index 0000000..5b5f8ad --- /dev/null +++ b/galaxygpt-tests/UnitTest1.cs @@ -0,0 +1,11 @@ +using galaxygpt; + +namespace galaxygpt_tests; + +public class UnitTest1 +{ + [Fact] + public void Test1() + { + } +} \ No newline at end of file diff --git a/galaxygpt-tests/galaxygpt-tests.csproj b/galaxygpt-tests/galaxygpt-tests.csproj new file mode 100644 index 0000000..bb4d9b4 --- /dev/null +++ b/galaxygpt-tests/galaxygpt-tests.csproj @@ -0,0 +1,30 @@ + + + + net8.0 + galaxygpt_tests + enable + enable + + false + true + + + + + + + runtime; build; native; contentfiles; analyzers; buildtransitive + all + + + runtime; build; native; contentfiles; analyzers; buildtransitive + all + + + + + + + + diff --git a/galaxygpt.sln b/galaxygpt.sln new file mode 100644 index 0000000..db2c364 --- /dev/null +++ b/galaxygpt.sln @@ -0,0 +1,34 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "galaxygpt", "galaxygpt\galaxygpt.csproj", "{166D8A8A-3D45-4473-B81F-822074C27B8F}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "dataset-assistant", "dataset-assistant\dataset-assistant.csproj", "{DBF1F8D2-14CA-42CB-9BA7-4C2B72B9A681}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "galaxygpt-api", "galaxygpt-api\galaxygpt-api.csproj", "{EDB6CCC4-DCE5-4658-9F4B-A4F37E4C505D}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "galaxygpt-tests", "galaxygpt-tests\galaxygpt-tests.csproj", "{29711175-B0FE-45BF-BEAB-BBA248F7C6DA}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {166D8A8A-3D45-4473-B81F-822074C27B8F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {166D8A8A-3D45-4473-B81F-822074C27B8F}.Debug|Any CPU.Build.0 = Debug|Any CPU + {166D8A8A-3D45-4473-B81F-822074C27B8F}.Release|Any CPU.ActiveCfg = Release|Any CPU + {166D8A8A-3D45-4473-B81F-822074C27B8F}.Release|Any CPU.Build.0 = Release|Any CPU + {DBF1F8D2-14CA-42CB-9BA7-4C2B72B9A681}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {DBF1F8D2-14CA-42CB-9BA7-4C2B72B9A681}.Debug|Any CPU.Build.0 = Debug|Any CPU + {DBF1F8D2-14CA-42CB-9BA7-4C2B72B9A681}.Release|Any CPU.ActiveCfg = Release|Any CPU + {DBF1F8D2-14CA-42CB-9BA7-4C2B72B9A681}.Release|Any CPU.Build.0 = Release|Any CPU + {EDB6CCC4-DCE5-4658-9F4B-A4F37E4C505D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {EDB6CCC4-DCE5-4658-9F4B-A4F37E4C505D}.Debug|Any CPU.Build.0 = Debug|Any CPU + {EDB6CCC4-DCE5-4658-9F4B-A4F37E4C505D}.Release|Any CPU.ActiveCfg = Release|Any CPU + {EDB6CCC4-DCE5-4658-9F4B-A4F37E4C505D}.Release|Any CPU.Build.0 = Release|Any CPU + {29711175-B0FE-45BF-BEAB-BBA248F7C6DA}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {29711175-B0FE-45BF-BEAB-BBA248F7C6DA}.Debug|Any CPU.Build.0 = Debug|Any CPU + {29711175-B0FE-45BF-BEAB-BBA248F7C6DA}.Release|Any CPU.ActiveCfg = Release|Any CPU + {29711175-B0FE-45BF-BEAB-BBA248F7C6DA}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection +EndGlobal diff --git a/galaxygpt/Database/Chunk.cs b/galaxygpt/Database/Chunk.cs new file mode 100644 index 0000000..269e675 --- /dev/null +++ b/galaxygpt/Database/Chunk.cs @@ -0,0 +1,11 @@ +using Microsoft.EntityFrameworkCore; + +namespace galaxygpt.Database; + +public class Chunk +{ + public int Id { get; init; } + public string Content { get; init; } + + public List? Embeddings { get; set; } +} \ No newline at end of file diff --git a/galaxygpt/Database/Page.cs b/galaxygpt/Database/Page.cs new file mode 100644 index 0000000..ccb707f --- /dev/null +++ b/galaxygpt/Database/Page.cs @@ -0,0 +1,22 @@ +using System.ComponentModel.DataAnnotations; + +namespace galaxygpt.Database; + +public class Page +{ + public int Id { get; init; } + + [Required] + public required string Title { get; init; } + + // In the future, we might want to store the entire content as a chunk to simplify logic. + [Required] + public required string Content { get; init; } + + // TODO: Remove the nullable and instead initialize it as an empty list. + public List? Chunks { get; set; } + + public int Tokens => GalaxyGpt.GptTokenizer.CountTokens(Content); + + public List? Embeddings { get; set; } +} \ No newline at end of file diff --git a/galaxygpt/Database/VectorDB.cs b/galaxygpt/Database/VectorDB.cs new file mode 100644 index 0000000..c139784 --- /dev/null +++ b/galaxygpt/Database/VectorDB.cs @@ -0,0 +1,12 @@ +using Microsoft.EntityFrameworkCore; + +namespace galaxygpt.Database; + +public class VectorDb : DbContext +{ + public DbSet Pages { get; set; } + public DbSet Chunks { get; set; } + + protected override void OnConfiguring(DbContextOptionsBuilder options) + => options.UseSqlite("Data Source=" + Path.Join(Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData), "embeddings.db")); +} \ No newline at end of file diff --git a/galaxygpt/GalaxyGpt.cs b/galaxygpt/GalaxyGpt.cs new file mode 100644 index 0000000..753b421 --- /dev/null +++ b/galaxygpt/GalaxyGpt.cs @@ -0,0 +1,183 @@ +using System.ClientModel; +using System.Data; +using System.Numerics.Tensors; +using System.Text; +using galaxygpt.Database; +using Microsoft.EntityFrameworkCore; +using Microsoft.Extensions.Configuration; +using Microsoft.ML.Tokenizers; +using OpenAI; +using OpenAI.Chat; +using OpenAI.Embeddings; +using OpenAI.Moderations; + +namespace galaxygpt; + +public static class GalaxyGpt +{ + // TODO: Migrate to dependency injection + public static readonly VectorDb Db = new(); + private static readonly IConfigurationRoot Configuration = new ConfigurationBuilder().AddJsonFile("appsettings.json", optional: false, true).Build(); + public static readonly OpenAIClient OpenAiClient = new(Configuration["OPENAI_API_KEY"] ?? throw new InvalidOperationException()); + public static readonly TiktokenTokenizer GptTokenizer = TiktokenTokenizer.CreateForModel("gpt-4o-mini"); + public static readonly TiktokenTokenizer EmbeddingsTokenizer = TiktokenTokenizer.CreateForModel("text-embedding-3-small"); + + /// + /// Answer a question using the specified model. + /// + /// What questions to ask + /// A string of which model to use + /// + /// The maximum amount of tokens to return. ds this number. + /// The maximum length that the context should be. The higher the number, the more context there will be, but also the more cost for the request. + /// The model to use for generating embeddings of the question. Typically, this should be the same as your dataset. + /// + /// The username to pass to the bot, used for personalizing the response. + /// The database to use in the context. This defaults to the GalaxyGPT-wide database, but can be manually set to another as needed. + public static async Task AnswerQuestion(string question, string model, int maxInputTokens, int maxOutputTokens, + int maxLength, string embeddingsModel = "text-embedding-3-small", + string moderationModel = "text-moderation-stable", string? username = null, VectorDb? db = null) + { + db ??= Db; + + #region Sanitize & Check the question + + question = question.Trim(); + + if (string.IsNullOrEmpty(question)) + { + throw new ArgumentException("The question cannot be empty."); + } + + if (GptTokenizer.CountTokens(question) > maxInputTokens) + { + throw new ArgumentException("The question is too long to be answered."); + } + + // Check if database is empty + if (!db.Pages.Any()) + { + throw new InvalidOperationException("The database is empty. Please load a dataset first."); + } + + // Throw the question into the moderation API + ClientResult? moderation = await OpenAiClient.GetModerationClient(moderationModel).ClassifyTextInputAsync(question); + + if (moderation.Value.Flagged) + { + throw new InvalidOperationException("The question was flagged by the moderation API."); + } + + #endregion + + // Fetch the context + (string context, int _) = await FetchContext(question, maxLength, embeddingsModel); + + ChatClient? chatClient = OpenAiClient.GetChatClient(model); + + List messages = + [ + new SystemChatMessage(""" + You are GalaxyGPT, a helpful assistant that answers questions about Galaxy, a ROBLOX Space Game. + The Galaxypedia is the game's official wiki and it is your creator. + The Galaxypedia's slogans are "The new era of the Galaxy Wiki" and "A hub for all things Galaxy". + Answer the question based on the supplied context. If the question cannot be answered, politely say you don't know the answer and ask the user for clarification, or if they have any further questions about Galaxy. + If the user has a username, it will be provided and you can address them by it. If a username is not provided (it shows as N/A), do not address/refer the user apart from "you" or "your". + Do not reference or mention the "context provided" in your response, no matter what. + The context will be given in the format of wikitext. You will be given multiple different pages in your context to work with. The different pages will be separated by "###". + If a ship infobox is present in the context, prefer using data from within the infobox. An infobox can be found by looking for a wikitext template that has the word "infobox" in its name. + If the user is not asking a question (e.g. "thank you", "thanks for the help"): Respond to it and ask the user if they have any further questions. + Respond to greetings (e.g. "hi", "hello") with (in this exact order): A greeting, a brief description of yourself, and a question addressed to the user if they have a question or need assistance. + Above all, be polite and helpful to the user. + + Steps for responding: + First check if the user is asking about a ship (e.g. "what is the deity?", "how much shield does the theia have?"), if so, use the ship's wiki page (supplied in the context) and the statistics from the ship's infobox to answer the question. + If you determine the user is not asking about a ship (e.g. "who is ?", "what is ?"), do your best to answer the question based on the context provided. + """), + new UserChatMessage($"Context:\n{context.Trim()}\n\n---\n\nQuestion: {question}\nUsername: {username ?? "N/A"}") + { + ParticipantName = username ?? null + } + ]; + + ClientResult? idk = await chatClient.CompleteChatAsync(messages, new ChatCompletionOptions + { + MaxTokens = maxOutputTokens, + Temperature = 0 + + }); + messages.Add(new AssistantChatMessage(idk)); + + foreach (ChatMessage message in messages) + { + string role = message.GetType().Name; + string text = message.Content[0].Text; + + Console.WriteLine($"{role}: {text}"); + } + + return messages[^1].Content[0].Text; + } + + private static async Task<(string, int)> FetchContext(string question, int maxLength, string model, VectorDb? db = null) + { + db ??= Db; + question = question.Trim(); + + if (string.IsNullOrEmpty(question)) + throw new ArgumentException("The question cannot be empty."); + + if (!db.Pages.Any()) + throw new InvalidOperationException("The database is empty. Please load a dataset first."); + + EmbeddingClient? embeddingsClient = OpenAiClient.GetEmbeddingClient(model); + ClientResult? questionEmbeddings = await embeddingsClient.GenerateEmbeddingAsync(question); + + // TODO: Optimize this to work with the database directly, instead of exporting the data to a list + List pages = db.Pages.Include(page => page.Chunks).ToList(); + DataTable pageEmbeddings = new(); + pageEmbeddings.Columns.Add("Page", typeof(Page)); + pageEmbeddings.Columns.Add("Embeddings", typeof(float[])); + pageEmbeddings.Columns.Add("ChunkId", typeof(int)); + pageEmbeddings.Columns.Add("Distance", typeof(float)); + + foreach (Page page in pages) + { + if ((page.Chunks == null || page.Chunks.Count == 0) && page.Embeddings != null) + pageEmbeddings.Rows.Add(page, page.Embeddings.ToArray(), -1); + else if (page.Chunks != null) + foreach (Chunk chunk in page.Chunks) + { + if (chunk.Embeddings != null) pageEmbeddings.Rows.Add(page, chunk.Embeddings.ToArray(), chunk.Id); + } + } + + foreach (DataRow row in pageEmbeddings.Rows) + { + float[] embeddings = (float[])row["Embeddings"]; + float distance = TensorPrimitives.CosineSimilarity(questionEmbeddings.Value.Vector.ToArray(), embeddings); + row["Distance"] = distance; + } + + pageEmbeddings.DefaultView.Sort = "Distance DESC"; + pageEmbeddings = pageEmbeddings.DefaultView.ToTable(); + + StringBuilder context = new(); + int tokenCount = GptTokenizer.CountTokens(question); + + foreach (DataRow row in pageEmbeddings.Rows) + { + var page = (Page)row["Page"]; + int? chunkId = (int?)row["ChunkId"]; + string content = chunkId == -1|| page.Chunks == null || page.Chunks.Count == 0 ? page.Content : page.Chunks.First(chunk => chunk.Id == chunkId).Content; + + tokenCount += GptTokenizer.CountTokens(content); + if (tokenCount > maxLength) + break; + + context.Append($"Page: {page.Title}\nContent: {content}\n\n###\n\n"); + } + + return (context.ToString(), EmbeddingsTokenizer.CountTokens(question)); + } +} \ No newline at end of file diff --git a/galaxygpt/galaxygpt.csproj b/galaxygpt/galaxygpt.csproj new file mode 100644 index 0000000..43da8b3 --- /dev/null +++ b/galaxygpt/galaxygpt.csproj @@ -0,0 +1,23 @@ + + + + net8.0 + enable + enable + + + + + + all + runtime; build; native; contentfiles; analyzers; buildtransitive + + + + + + + + + + diff --git a/main.py b/main.py deleted file mode 100644 index 9bfd862..0000000 --- a/main.py +++ /dev/null @@ -1,362 +0,0 @@ -# Entrypoint for GalaxyGPT - -## Initalization -import hashlib -import os -import subprocess -import threading -import time -import traceback -import warnings - -import colorama -import numpy as np -import pandas as pd -import schedule -import tiktoken -from discord_webhook import DiscordWebhook -from dotenv import load_dotenv -from openai import OpenAI -from scipy.spatial.distance import cosine - -load_dotenv() - -GalaxyGPTVersion = os.getenv("VERSION") -if GalaxyGPTVersion is None: - raise Exception("Please set VERSION in .env") - -if not os.getenv("OPENAI_ORG_ID") or not os.getenv("OPENAI_API_KEY"): - raise Exception("Please set OPENAI_ORG_ID and OPENAI_API_KEY in .env") - -openai_client = OpenAI( - api_key=os.getenv("OPENAI_API_KEY"), - organization=os.getenv("OPENAI_ORG_ID") -) - -__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) - -dataset = str(os.getenv("DATASET")) # Dataset to use -if dataset is None: - raise Exception("Please set DATASET in .env") -context_len = int(os.getenv("MAX_CONTEXT_LEN")) # Maximum context length in tokens -if context_len is None: - raise Exception("Please set MAX_CONTEXT_LEN in .env") -gpt_model = str(os.getenv("MODEL")) # Model to use -if gpt_model is None: - gpt_model = "gpt-3.5-turbo-0125" -embeddings_model = str(os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")) - -print("GalaxyGPT v" + GalaxyGPTVersion + " - " + dataset + " - " + str(context_len) + " max len") - -################################################################################ -# Load datasets - -def loadDataset(): - print("Loading dataset...") - global df - - df = pd.read_csv(os.path.join(__location__, dataset, "embeddings.csv"), index_col=0) - df["embeddings"] = df["embeddings"].apply(eval).apply(np.array) - - df["page_titles"] = pd.read_csv(os.path.join(__location__, dataset, "processed.csv"), index_col=0)["page_title"] - print("Dataset loaded!") - -loadDataset() - -################################################################################ -# Functions - -def strtobool(string: str): - """ - Returns a boolean value based on the given string. - True values are 'y', 'yes', 't', 'true', 'on', and '1', - False values are 'n', 'no', 'f', 'false', 'off', and '0'. - Raises ValueError if 'string' is anything else. - """ - if string.lower() in ["y", "yes", "t", "true", "on", "1"]: - return True - elif string.lower() in ["n", "no", "f", "false", "off", "0"]: - return False - else: - raise ValueError(f"invalid string given: {string}") - -def create_context(question: str, df: pd.DataFrame, max_len: int=context_len, model: str=gpt_model, debug: bool=True): - """ - Create a context for a question by finding the most similar context from the dataframe - """ - - # Get the embeddings for the question - embeddings = openai_client.embeddings.create(input=question, model=model) - - q_embeddings = embeddings.data[0].embedding - - embeddingsusage = embeddings.usage - # Get the distances from the embeddings - df["distances"] = df["embeddings"].apply(lambda x: cosine(q_embeddings, x)) - - returns: list = [] - cur_len: int = 0 - context_page_titles: list = [] - - # Sort by distance and add the text to the context until the context is too long - for i, row in df.sort_values("distances", ascending=True).iterrows(): - # Add the length of the text to the current length - cur_len += row["n_tokens"] + 4 - - # If the context is too long, break - if cur_len > max_len: - break - - # Else add it to the text that is being returned - returns.append(row["content"].strip()) - - # keep track of the page titles (note: df["page_title"] comes from the embeddings.csv file, not the processed.csv file) - context_page_titles.append(df["page_title"][int(row.name)]) - - if debug: - print("bingus:", flush=True) - print(returns, flush=True) - print("-------------------", flush=True) - print(df["distances"].values.tolist(), flush=True) - - # Return the context - return "\n\n###\n\n".join(returns), embeddingsusage, context_page_titles - - -def answer_question( - df: pd.DataFrame, - model: str=gpt_model, - question: str="Hello!", - max_len: int=context_len, - embeddings_model: str=embeddings_model, - debug: bool=True, - max_tokens: int=250, - stop_sequence=None, - username: str | None = None, -): - """ - Answer a question based on the most similar context from the dataframe texts - """ - - # Make sure the question is not empty - if question == "": - raise Exception("Question is empty") - - # Make sure the dataframe is not empty - if df.empty: - raise Exception("Dataframe is empty") - - # Make sure the question is under 250 tokens - enc = tiktoken.get_encoding("cl100k_base") - questiontokens = enc.encode(question) - if len(questiontokens) > max_tokens: - raise Exception("Question is too long") - - moderation = openai_client.moderations.create(input=question) - - import json - - if debug: - print( - "Moderation:\n" + str(moderation.results[0]), flush=True - ) - print( - "-----------------------------------------------------------------------------", - flush=True, - ) - - if moderation.results[0].flagged: - raise Exception("Flagged by OpenAI Moderation System") - - context = create_context(question, df, max_len=max_len, model=embeddings_model, debug=debug) - embeddingsusage = context[1] - page_titles: list[str] = context[2] - context = context[0].strip() - - if context == "": - warnings.warn("Context is empty") - - - # If debug, print the raw model response - if debug: - print("Context:\n" + context, flush=True) - print( - "-----------------------------------------------------------------------------", - flush=True, - ) - - try: - # Create a completions using the question and context - raah = "\n\n" - response = openai_client.chat.completions.create( - messages=[ - { - "role": "system", - "content": "You are GalaxyGPT, a helpful assistant that answers questions about Galaxy, a ROBLOX Space Game.\n" - + "The Galaxypedia is the game's official wiki and it owns you\n" - + 'The Galaxypedia\'s slogans are "The new era of the Galaxy Wiki" and "A hub for all things Galaxy".\n' - + "Answer the question based on the supplied context. If the question cannot be answered, politely say you don't know the answer and ask the user if they have any further questions about Galaxy.\n" - + "If the user has a username, it will be provided and you can address them by it. If a username is not provided, do not address the user.\n" - + 'Do not reference or mention the "context provided" in your response, no matter what.\n' - + 'If a ship infobox is present in the context, prefer using data from within the infobox. An infobox can be found by looking for a wikitext template that has the word "infobox" in its name.\n' - + 'If the user is not asking a question (e.g. "thank you", "thanks for the help"): Respond to it and ask the user if they have any further questions.\n' - + 'Respond to greetings (e.g. "hi", "hello") with (in this exact order): A greeting, a brief description of yourself, and a question addressed to the user if they have a question or need assistance.\n\n' - + 'Steps for responding:\nFirst check if the user is asking about a ship (e.g. "what is the deity?", "how much shield does the theia have?"), if so, use the ship\'s wiki page (supplied in the context) and the stats from the ship\'s infobox to answer the question. If you determine the user is not asking about a ship (e.g. "who is ?", "what is ?"), do your best to answer the question based on the context provided.', - }, - { - "role": "user", - "content": f'Context: {context}\n\n---\n\nQuestion: {question}{f"{raah}Username: {str(username)}" if username else ""}', - "name": str(username) if username else "None", - }, - ], - temperature=0.2, - max_tokens=max_tokens, - top_p=1, - frequency_penalty=0, - presence_penalty=0, - stop=stop_sequence, - model=model, - user=( - (hashlib.sha256(str(username).encode()).hexdigest()) - if username is not None - else "" - ), - ) - - response = json.loads(response.model_dump_json()) - - if debug: - print( - "User: " - + ( - (hashlib.sha256(str(username).encode()).hexdigest()) - if username is not None - else "" - ), - flush=True, - ) - - return { - "answer": response["choices"][0]["message"]["content"].strip(), - "context": context, - "tokens": response["usage"], - "embeddings_usage": {"prompt_tokens": embeddingsusage.prompt_tokens, "total_tokens": embeddingsusage.total_tokens}, - "stop_reason": response["choices"][0]["finish_reason"], - "dataset": dataset, - "version": GalaxyGPTVersion, - "model": model, - "page_titles": page_titles, - } - else: - return { - "answer": response["choices"][0]["message"]["content"].strip(), - "dataset": dataset, - "version": GalaxyGPTVersion, - } - except Exception as e: - print(traceback.format_exc(), flush=True) - raise e - -# Automatic Dataset Creation System -class ADCS: - timer = None - timerbreak = False - status = "Stopped" # Acceptable Values: "Stopped", "Running" - webhook = DiscordWebhook(url=os.getenv("DISCORD_WEBHOOK_URL")) - - @staticmethod - def reloadDataset(newdataset): - global df, dataset - - if not os.path.exists(os.path.join(__location__, newdataset, "embeddings.csv")): - raise Exception("Embeddings were not generated") - - try: - del df - dataset = newdataset - loadDataset() - except Exception as e: - print(traceback.format_exc(), flush=True) - raise e - - @staticmethod - def createDataset(reload=False, webhook=None, noembeddings=False): - global df, dataset - if os.getenv("DATABASE_PASSWORD") is None: - raise Exception("Please set DATABASE_PASSWORD in .env") - - # Generate the dataset - print(colorama.Fore.CYAN + "ADCS:" + colorama.Fore.RESET + " Generating a new dataset...") - subprocess.run(["./dump-database.sh", os.getenv("DATABASE_PASSWORD")], cwd=os.path.join(__location__)) - - print(colorama.Fore.CYAN + "ADCS:" + colorama.Fore.RESET + " Preparing the dataset...") - # Prepare the dataset - if noembeddings is True: - print("Won't be generating embeddings for this dataset") - subprocess.run(["python3", "dataset.py", "-o", "dataset-ADCS", "--max-len", str(int(context_len/2)), "--no-embeddings", "--cleandir"], cwd=os.path.join(__location__)) - else: - subprocess.run(["python3", "dataset.py", "-o", "dataset-ADCS", "--max-len", str(int(context_len/2)), "--cleandir"], cwd=os.path.join(__location__)) - - if reload is True and noembeddings is False: - ADCS.reloadDataset("dataset-ADCS") - - print("Dataset created!") - - if ADCS.webhook: - ADCS.webhook.set_content("Created new dataset") - ADCS.webhook.execute() - - @staticmethod - def start(): - if ADCS.status == "Running": - print(colorama.Fore.CYAN + "ADCS:" + colorama.Fore.RESET + " Already running!") - return - - print(colorama.Fore.CYAN + "ADCS:" + colorama.Fore.RESET + " Starting scheduler to run at 00:00...") - ADCS.timer = schedule.every().day.at("00:00").do(ADCS.createDataset, True) - - def loop(): - while ADCS.timerbreak is False: - schedule.run_pending() - time.sleep(1) - print(colorama.Fore.CYAN + "ADCS:" + colorama.Fore.RESET + " Stopped!") - - threading.Thread(target=loop).start() - print(colorama.Fore.CYAN + "ADCS:" + colorama.Fore.RESET + " Started!") - ADCS.status = "Running" - - if ADCS.webhook: - ADCS.webhook.set_content("ADCS started") - ADCS.webhook.execute() - - @staticmethod - def stop(): - if ADCS.status == "Stopped": - print(colorama.Fore.CYAN + "ADCS:" + colorama.Fore.RESET + " Already stopped!") - return - print(colorama.Fore.CYAN + "ADCS:" + colorama.Fore.RESET + " Stopping...") - ADCS.timerbreak = True - ADCS.status = "Stopped" - - if ADCS.webhook: - ADCS.webhook.set_content("ADCS stopped") - ADCS.webhook.execute() - -adcsdefault = strtobool(os.getenv("ADCS", "False")) -if adcsdefault is True: - scheduler = ADCS() - if scheduler.status == "Stopped": - scheduler.start() -elif adcsdefault is False: - print("ADCS is currently disabled") - -if __name__ == "__main__": - print("Type exit or quit to exit") - while True: - question = str(input("\n" + "\x1B[4m" + "User" + "\x1B[0m" + "\n")) - if not question or question.lower() == "exit" or question.lower() == "quit": - break - - question = question.strip() - response = answer_question(df, question=question, debug=False) - print("\n" + "\x1B[4m" + "GalaxyGPT" + "\x1B[0m" + "\n" + response["answer"]) diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 2b24e9a..0000000 --- a/requirements.txt +++ /dev/null @@ -1,22 +0,0 @@ -numpy==1.24.3 -pandas==2.2.1 -regex==2023.12.25 -requests==2.31.0 -tiktoken==0.6.0 - -# my own custom ones -flask==2.3.2 -python-dotenv==1.0.1 -openai==1.14.1 -scipy==1.11.4 -#openai[embeddings]==0.27.8 -#openai[datalib]==0.27.8 -#chromadb -#mysql-connector-python==8.0.33 -Flask-Limiter==3.3.1 -waitress==2.1.2 -tqdm==4.65 -halo==0.0.31 -colorama==0.4.6 -schedule==1.2.0 -discord-webhook==1.3.1 diff --git a/ruff.toml b/ruff.toml deleted file mode 100644 index 2e983f9..0000000 --- a/ruff.toml +++ /dev/null @@ -1,9 +0,0 @@ -[lint] -extend-select = ["Q", "I", "SIM"] - -[lint.flake8-quotes] -inline-quotes = "double" - -[format] -# Prefer single quotes over double quotes. -quote-style = "double" \ No newline at end of file diff --git a/ui/index.css b/ui/index.css deleted file mode 100644 index c47886f..0000000 --- a/ui/index.css +++ /dev/null @@ -1,97 +0,0 @@ -html { - background-color: #010409; - color: #c9d1d9; -} - -.footer { - position: fixed; - left: 0; - bottom: 0; - width: 100%; - text-align: center; - font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; -} - -.titlething { - font-family: "Mona Sans", "Mona Sans Fallback", -apple-system, BlinkMacSystemFont, "Segoe UI", Helvetica, Arial, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol"; - font-size: 72px !important; - line-height: 76px !important; - transition: 1s ease, transform 1s cubic-bezier(0.25, 1, 0.5, 1); - opacity: 0; - transform: translateY(-25px) scale(0.8); - color: white; -} - -#titlecard { - text-align: center; - padding: 15px; - border: 1px solid transparent; - border-radius: 12px; - box-shadow: 0px 0px 0px #404244; - width: max-content; - transition: box-shadow 0.5s cubic-bezier(0.85, 0, 0.15, 1); - position: relative; - left: 50%; - transform: translateX(-50%); -} - -#subtitle { - opacity: 0; - font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; - transition: 1s, opacity 0.5s ease-out; - color: #8b949e; -} - -.pageinput { - font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; - font-size: 16px; - color: #c9d1d9; - background-color: #010409; - border: 1px solid #404244; - border-radius: 6px; - padding: 6px; - transition: 0.5s ease; - width: 100%; -} - -.promptinput { - font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; - font-size: 16px; - color: #c9d1d9; - background-color: #010409; - border: 1px solid #404244; - border-radius: 6px; - padding: 6px; - transition: 0.5s ease; - width: 100%; - padding-right: 1.75rem; - box-sizing: border-box; -} - -.textthingywoo { - position: relative; - width: 25%; - text-align: center; - margin: auto; - display: flex; - gap: .5rem; - -} - -.funnybutton { - background: transparent; - color: white; - border: none; - transition: 0.5s; - padding: .25rem; - box-sizing: border-box; - right: .5rem; - border-radius: .375rem; - bottom: .625rem; - cursor: pointer; -} - -.funnybutton:hover { - background: rgba(255, 255, 255, 0.25); - color: white; -} \ No newline at end of file diff --git a/ui/index.html b/ui/index.html deleted file mode 100644 index 4b01ce0..0000000 --- a/ui/index.html +++ /dev/null @@ -1,53 +0,0 @@ - - - - - GalaxyGPT - - - - - - - -
-

GalaxyGPT

-

Introducing GalaxyGPT, the first Galaxy AI Model to be built off of OpenAI's - ChatGPT-3.5
GalaxyGPT is - trained off of data gathered on the Galaxypedia, and can quickly respond to a question you might have

-
- - - -
-

Prompt

-
- - -
-
- -
-

Response

-

-
- - - - - - - \ No newline at end of file diff --git a/ui/index.js b/ui/index.js deleted file mode 100644 index 06d14a3..0000000 --- a/ui/index.js +++ /dev/null @@ -1,72 +0,0 @@ -async function submitthedataidk() { - // var page = document.getElementById("pageinputthing").value; - var promptinput = document.getElementById("ballin") - var prompt = promptinput.value; - var submitbutton = document.getElementById("tomfoolery"); - var responsesection = document.getElementById("response"); - - promptinput.disabled = true; - submitbutton.disabled = true; - submitbutton.style.cursor = "not-allowed"; - responsesection.innerHTML = "Thinking..."; - - - var data = await fetch("/api/v1/ask", { - method: "POST", - body: JSON.stringify({prompt: prompt}), - headers: { - "Content-Type": "application/json" - } - }); - - if (data.status != 200) { - responsesection.innerHTML = "Error: " + (await data.json()); - promptinput.disabled = false; - submitbutton.disabled = false; - submitbutton.style.cursor = "pointer"; - promptinput.focus() - return; - } - - var datajson = await data.json(); - responsesection.innerHTML = datajson.answer; - - promptinput.disabled = false; - submitbutton.disabled = false; - submitbutton.style.cursor = "pointer"; - promptinput.focus() -} - -async function animateTitle() { - const title = document.getElementById("title"); - const titlecard = document.getElementById("titlecard"); - const subtitle = document.getElementById("subtitle"); - const sleep = (milliseconds) => { return new Promise(resolve => setTimeout(resolve, milliseconds)) } - await sleep(100) - - title.style.opacity = 1; - title.style.transform = "translateY(0px) scale(0.8)"; - - await sleep(500); - - title.style.transform = "translateY(0px) scale(1)" - title.style.letterSpacing = "5px"; - titlecard.style.boxShadow = "0px 8px 10px #404244"; - titlecard.style.border = "#404244 solid 1px" - // title.style.textShadow = "0px 0px 10px #fff"; - subtitle.style.opacity = 1; - - -} - -document.onload = animateTitle(); - -const amongus = document.getElementById("ballin"); - -amongus.addEventListener("keypress", (key) => { - - if (key.key === "Enter") { - submitthedataidk(); - } - -}) \ No newline at end of file