-
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
2e9bde6
commit af67dd7
Showing
7 changed files
with
254 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
// Copyright (c) smallketchup82. Licensed under the GPLv3 Licence. | ||
// See the LICENCE file in the repository root for full licence text. | ||
|
||
using System.ClientModel; | ||
using Microsoft.Extensions.DependencyInjection; | ||
using Microsoft.ML.Tokenizers; | ||
using OpenAI.Chat; | ||
using OpenAI.Moderations; | ||
|
||
namespace galaxygpt; | ||
|
||
/// <summary>
/// Answers questions about Galaxy by sending the question, together with supporting wiki context,
/// to a chat completion model.
/// </summary>
/// <param name="chatClient">Chat client used to generate the answer.</param>
/// <param name="gptTokenizer">Tokenizer (keyed service "gptTokenizer") used to measure the question against the input token limit.</param>
/// <param name="moderationClient">
/// Optional moderation client. When it is <c>null</c> the moderation check is skipped and a warning is written to the console.
/// </param>
public class AiClient(ChatClient chatClient, [FromKeyedServices("gptTokenizer")] TiktokenTokenizer gptTokenizer, ModerationClient? moderationClient = null)
{
    /// <summary>
    /// Validates and moderates the question, then asks the chat model to answer it using the supplied context.
    /// </summary>
    /// <param name="question">The user's question. Surrounding whitespace is trimmed before use.</param>
    /// <param name="context">Wiki context the answer should be based on. Surrounding whitespace is trimmed before use.</param>
    /// <param name="maxInputTokens">Maximum number of tokens the question may contain, as counted by the GPT tokenizer.</param>
    /// <param name="username">Optional name of the user; shown to the model as "N/A" when not supplied.</param>
    /// <param name="maxOutputTokens">Optional cap on the number of tokens the model may generate.</param>
    /// <returns>The text of the first content part of the model's reply.</returns>
    /// <exception cref="ArgumentException">The question is null, empty, whitespace, or longer than <paramref name="maxInputTokens"/> tokens.</exception>
    /// <exception cref="ArgumentNullException"><paramref name="context"/> is null.</exception>
    /// <exception cref="InvalidOperationException">The question was flagged by moderation, or the model returned no content.</exception>
    public async Task<string> AnswerQuestion(string question, string context, int maxInputTokens, string? username = null, int? maxOutputTokens = null)
    {
        #region Sanitize & Check the question

        // Validate before trimming so a null question is reported as an ArgumentException
        // instead of surfacing as a NullReferenceException from Trim().
        if (string.IsNullOrWhiteSpace(question))
            throw new ArgumentException("The question cannot be empty.");

        // Fail fast on a missing context, before any network call is made.
        ArgumentNullException.ThrowIfNull(context);

        question = question.Trim();

        if (gptTokenizer.CountTokens(question) > maxInputTokens)
            throw new ArgumentException("The question is too long to be answered.");

        // Throw the question into the moderation API
        if (moderationClient != null)
        {
            ClientResult<ModerationResult> moderation = await moderationClient.ClassifyTextInputAsync(question);

            if (moderation.Value.Flagged)
                throw new InvalidOperationException("The question was flagged by the moderation API.");
        }
        else
            Console.WriteLine("Warning: No moderation client was provided. Skipping moderation check. This can be dangerous");

        #endregion

        List<ChatMessage> messages =
        [
            new SystemChatMessage("""
                You are GalaxyGPT, a helpful assistant that answers questions about Galaxy, a ROBLOX Space Game.
                The Galaxypedia is the game's official wiki and it is your creator.
                The Galaxypedia's slogans are "The new era of the Galaxy Wiki" and "A hub for all things Galaxy".
                Answer the question based on the supplied context. If the question cannot be answered, politely say you don't know the answer and ask the user for clarification, or if they have any further questions about Galaxy.
                If the user has a username, it will be provided and you can address them by it. If a username is not provided (it shows as N/A), do not address/refer the user apart from "you" or "your".
                Do not reference or mention the "context provided" in your response, no matter what.
                The context will be given in the format of wikitext. You will be given multiple different pages in your context to work with. The different pages will be separated by "###".
                If a ship infobox is present in the context, prefer using data from within the infobox. An infobox can be found by looking for a wikitext template that has the word "infobox" in its name.
                If the user is not asking a question (e.g. "thank you", "thanks for the help"): Respond to it and ask the user if they have any further questions.
                Respond to greetings (e.g. "hi", "hello") with (in this exact order): A greeting, a brief description of yourself, and a question addressed to the user if they have a question or need assistance.
                Above all, be polite and helpful to the user.
                Steps for responding:
                First check if the user is asking about a ship (e.g. "what is the deity?", "how much shield does the theia have?"), if so, use the ship's wiki page (supplied in the context) and the statistics from the ship's infobox to answer the question.
                If you determine the user is not asking about a ship (e.g. "who is <player>?", "what is <item>?"), do your best to answer the question based on the context provided.
                """),
            new UserChatMessage($"Context:\n{context.Trim()}\n\n---\n\nQuestion: {question}\nUsername: {username ?? "N/A"}")
            {
                ParticipantName = username
            }
        ];

        ClientResult<ChatCompletion> completion = await chatClient.CompleteChatAsync(messages, new ChatCompletionOptions
        {
            MaxTokens = maxOutputTokens
        });

        // Read the answer straight from the completion; the message list is local and is not returned.
        var answerParts = completion.Value.Content;

        if (answerParts.Count == 0)
            throw new InvalidOperationException("The model returned no content.");

        return answerParts[0].Text;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
// Copyright (c) smallketchup82. Licensed under the GPLv3 Licence. | ||
// See the LICENCE file in the repository root for full licence text. | ||
|
||
using System.ClientModel; | ||
using System.Numerics.Tensors; | ||
using System.Text; | ||
using galaxygpt.Database; | ||
using Microsoft.EntityFrameworkCore; | ||
using Microsoft.Extensions.DependencyInjection; | ||
using Microsoft.ML.Tokenizers; | ||
using OpenAI; | ||
using OpenAI.Embeddings; | ||
|
||
namespace galaxygpt; | ||
|
||
/// <summary>
/// Handles context management: ranks stored pages and chunks against a question by embedding
/// similarity and assembles the best matches into a context string.
/// </summary>
/// <param name="db">Database holding the pages and their chunks.</param>
/// <param name="embeddingClient">Client used to embed the question.</param>
/// <param name="gptTokenizer">Tokenizer (keyed service "gptTokenizer") used to apply the context token budget.</param>
/// <param name="embeddingsTokenizer">Tokenizer (keyed service "embeddingsTokenizer") used to report the question's token count.</param>
public class ContextManager(VectorDb db, EmbeddingClient embeddingClient, [FromKeyedServices("gptTokenizer")] TiktokenTokenizer gptTokenizer, [FromKeyedServices("embeddingsTokenizer")] TiktokenTokenizer embeddingsTokenizer)
{
    /// <summary>
    /// Number of entries added to the context when no token budget is supplied.
    /// </summary>
    private const int DefaultMaxEntries = 5;

    /// <summary>
    /// Load all pages from the database into memory
    /// </summary>
    /// <remarks>
    /// Honestly, I tried to avoid this, but considering we'll be doing cosine similarity on everything anyway, it's better to load everything into memory.
    /// NOTE(review): FetchContext queries the database directly and never reads this field — confirm whether it should use this snapshot instead.
    /// </remarks>
    private List<Page> _pages = db.Pages.Include(page => page.Chunks).ToList();

    /// <summary>
    /// Builds the context for a question from the pages and chunks whose embeddings are most similar to it.
    /// </summary>
    /// <param name="question">The user's question. Surrounding whitespace is trimmed before use.</param>
    /// <param name="maxLength">
    /// Optional token budget, counted with the GPT tokenizer over the question plus the content of each
    /// included entry. When <c>null</c>, the five most similar entries are used instead.
    /// </param>
    /// <returns>
    /// The assembled context (entries separated by "###") and the number of tokens in the question
    /// as counted by the embeddings tokenizer.
    /// </returns>
    /// <exception cref="ArgumentException">The question is null, empty, or whitespace.</exception>
    /// <exception cref="InvalidOperationException">The database contains no pages.</exception>
    public async Task<(string, int)> FetchContext(string question, int? maxLength = null)
    {
        // Validate before trimming so a null question is reported as an ArgumentException
        // instead of surfacing as a NullReferenceException from Trim().
        if (string.IsNullOrWhiteSpace(question))
            throw new ArgumentException("The question cannot be empty.");

        question = question.Trim();

        if (!db.Pages.Any())
            throw new InvalidOperationException("The database is empty. Please load a dataset first.");

        ClientResult<Embedding> questionEmbedding = await embeddingClient.GenerateEmbeddingAsync(question);

        // Materialise the question vector once; it is the same for every comparison below.
        float[] questionVector = questionEmbedding.Value.Vector.ToArray();

        // Each candidate carries the text it would contribute, so nothing has to be looked up again later.
        var candidates = new List<(Page page, string content, float similarity)>();

        foreach (Page page in db.Pages.Include(p => p.Chunks))
        {
            if (page.Chunks == null || page.Chunks.Count == 0)
            {
                // Pages without chunks are ranked by their own embedding.
                if (page.Embeddings == null) continue;

                float similarity = TensorPrimitives.CosineSimilarity(questionVector, page.Embeddings.ToArray());
                candidates.Add((page, page.Content, similarity));
                continue;
            }

            foreach (Chunk chunk in page.Chunks)
            {
                if (chunk.Embeddings == null) continue;

                float similarity = TensorPrimitives.CosineSimilarity(questionVector, chunk.Embeddings.ToArray());
                candidates.Add((page, chunk.Content, similarity));
            }
        }

        // Highest cosine similarity first.
        candidates.Sort((a, b) => b.similarity.CompareTo(a.similarity));

        StringBuilder context = new();
        int tokenCount = gptTokenizer.CountTokens(question);
        int included = 0;

        foreach ((Page page, string content, float _) in candidates)
        {
            if (maxLength == null)
            {
                if (included >= DefaultMaxEntries)
                    break;
            }
            else
            {
                // Stop at the first entry that would push the running total over the budget.
                tokenCount += gptTokenizer.CountTokens(content);
                if (tokenCount > maxLength)
                    break;
            }

            context.Append($"Page: {page.Title}\nContent: {content}\n\n###\n\n");
            included++;
        }

        return (context.ToString(), embeddingsTokenizer.CountTokens(question));
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
// Copyright (c) smallketchup82. Licensed under the GPLv3 Licence. | ||
// See the LICENCE file in the repository root for full licence text. | ||
|
||
namespace galaxygpt.Database; | ||
|
||
/// <summary>
/// Describes a dataset: its name, when it was created, and the chunk size limit it was built with.
/// </summary>
public class Metadata
{
    /// <summary>
    /// Identifier of this metadata record.
    /// </summary>
    public int Id { get; init; }

    /// <summary>
    /// Name of the dataset, typically of the form "dataset-v1".
    /// </summary>
    public required string DatasetName { get; init; }

    /// <summary>
    /// When the dataset was created. Callers are expected to supply a UTC timestamp.
    /// </summary>
    public required DateTime CreatedAt { get; init; }

    /// <summary>
    /// Upper bound on the size of each chunk in the dataset.
    /// </summary>
    public required int ChunkMaxSize { get; init; }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters