From 49af3be30208f4140afa509c1f99ac8b2dc8f252 Mon Sep 17 00:00:00 2001 From: Jonaspng Date: Mon, 9 Dec 2024 22:02:21 +0800 Subject: [PATCH 01/18] chore(deps): add libraries required for RAG - pgvector for psql to support vector storage and operations - neighbor for easier db migrations including vectors - langchainrb and ruby-openai for LLM services - pdf-reader for reading text from pdf --- Gemfile | 7 +++++++ Gemfile.lock | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/Gemfile b/Gemfile index 513c25e3906..2fe6966273f 100644 --- a/Gemfile +++ b/Gemfile @@ -214,3 +214,10 @@ gem 'rails-html-sanitizer', '>= 1.0.4' gem 'mimemagic', '0.4.3' gem 'ffi', '>= 1.14.2' + +# Retreival Augmented Generation (RAG) Support +gem 'pgvector' +gem 'neighbor' +gem 'langchainrb' +gem 'ruby-openai' +gem 'pdf-reader' diff --git a/Gemfile.lock b/Gemfile.lock index 4fdacf04fda..4227d4887c5 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -53,6 +53,7 @@ GIT GEM remote: https://rubygems.org/ specs: + Ascii85 (2.0.1) actioncable (7.2.1) actionpack (= 7.2.1) activesupport (= 7.2.1) @@ -129,6 +130,7 @@ GEM rails (>= 6.0) addressable (2.8.7) public_suffix (>= 2.0.2, < 7.0) + afm (0.2.2) after_commit_action (1.1.0) activerecord (>= 3.0.0) activesupport (>= 3.0.0) @@ -149,6 +151,7 @@ GEM aws-sigv4 (~> 1.5) aws-sigv4 (1.10.1) aws-eventstream (~> 1, >= 1.0.2) + baran (0.1.12) base64 (0.2.0) bcrypt (3.1.20) bigdecimal (3.1.8) @@ -215,6 +218,7 @@ GEM erubi (1.13.0) et-orbi (1.2.11) tzinfo + event_stream_parser (1.0.0) excon (1.2.2) exifr (1.4.0) factory_bot (6.5.0) @@ -222,6 +226,14 @@ GEM factory_bot_rails (6.4.3) factory_bot (~> 6.4) railties (>= 5.0.0) + faraday (2.12.0) + faraday-net_http (>= 2.0, < 3.4) + json + logger + faraday-multipart (1.0.4) + multipart-post (~> 2) + faraday-net_http (3.3.0) + net-http ffi (1.17.0) filename (0.1.2) flamegraph (0.9.5) @@ -250,6 +262,7 @@ GEM raabro (~> 1.4) globalid (1.2.1) activesupport (>= 6.1) + 
hashery (2.1.2) highline (3.0.1) html-pipeline (2.14.3) activesupport (>= 2) @@ -295,6 +308,8 @@ GEM activesupport (>= 5.0.0) jmespath (1.6.2) json (2.7.4) + json-schema (4.3.1) + addressable (>= 2.8) jwt (2.9.3) base64 kaminari (1.2.2) @@ -313,6 +328,12 @@ GEM json (~> 2.6) jwt (~> 2.4) rest-client (~> 2.1) + langchainrb (0.19.2) + baran (~> 0.1.9) + json-schema (~> 4) + matrix + pragmatic_segmenter (~> 0.3.0) + zeitwerk (~> 2.5) language_server-protocol (3.17.0.3) launchy (3.0.1) addressable (~> 2.8) @@ -355,6 +376,11 @@ GEM mini_portile2 (2.8.8) minitest (5.25.1) multi_json (1.15.0) + multipart-post (2.4.1) + neighbor (0.5.0) + activerecord (>= 7) + net-http (0.4.1) + uri net-imap (0.4.14) date net-protocol @@ -377,7 +403,15 @@ GEM parser (3.3.5.0) ast (~> 2.4.1) racc + pdf-reader (2.13.0) + Ascii85 (>= 1.0, < 3.0, != 2.0.0) + afm (~> 0.2.1) + hashery (~> 2.0) + ruby-rc4 + ttfunk pg (1.5.8) + pgvector (0.3.2) + pragmatic_segmenter (0.3.24) progress (3.6.0) psych (5.1.2) stringio @@ -527,7 +561,12 @@ GEM rubocop (>= 1.52.0, < 2.0) rubocop-ast (>= 1.31.1, < 2.0) ruby-oembed (0.18.1) + ruby-openai (7.3.1) + event_stream_parser (>= 0.3.0, < 2.0.0) + faraday (>= 1) + faraday-multipart (>= 1) ruby-progressbar (1.13.0) + ruby-rc4 (0.1.5) ruby-vips (2.2.2) ffi (~> 1.12) logger @@ -585,12 +624,15 @@ GEM timeout (0.4.1) traceroute (0.8.1) rails (>= 3.0.0) + ttfunk (1.8.0) + bigdecimal (~> 3.1) tzinfo (2.0.6) concurrent-ruby (~> 1.0) unicode-display_width (2.6.0) uniform_notifier (1.16.0) unread (0.14.0) activerecord (>= 6.1) + uri (0.13.1) useragent (0.16.10) validates_hostname (1.0.13) activerecord (>= 3.0) @@ -655,6 +697,7 @@ DEPENDENCIES jwt kaminari keycloak + langchainrb listen lograge lograge-sql @@ -662,10 +705,13 @@ DEPENDENCIES loofah (>= 2.2.1) mimemagic (= 0.4.3) mini_magick + neighbor nokogiri (>= 1.8.1) ostruct parallel_tests + pdf-reader pg + pgvector puma rack-cors rack-mini-profiler @@ -686,6 +732,7 @@ DEPENDENCIES rubocop (~> 1.67) rubocop-rails 
ruby-oembed + ruby-openai rubyzip rwordnet! sanitize (>= 4.6.3) From 038965e9a47fb1cfbb091a7a83d2a04803ce490f Mon Sep 17 00:00:00 2001 From: Jonaspng Date: Mon, 9 Dec 2024 22:04:17 +0800 Subject: [PATCH 02/18] chore(env): add openai api key - openai api key is not complete --- env | 1 + 1 file changed, 1 insertion(+) diff --git a/env b/env index c526f9bb65a..69d1f4ca015 100644 --- a/env +++ b/env @@ -14,3 +14,4 @@ KEYCLOAK_REALM = "coursemology" KEYCLOAK_BE_CLIENT_ID = "5b1af0e1-0dc5-44f6-8b69-13015fd318f5" KEYCLOAK_BE_CLIENT_SECRET = "DIELQjgeZ0UYIkVGwwTjCP7s6VoPYNfK" KEYCLOAK_FE_CLIENT_UUID = "308875ca-cc1a-4c15-921f-893faa1f1156" +OPENAI_API_KEY = "sk-..." From af2d6db43ce5e55fbd4eacd156fb6ca152ec071d Mon Sep 17 00:00:00 2001 From: Jonaspng Date: Mon, 9 Dec 2024 22:05:35 +0800 Subject: [PATCH 03/18] feat(initialiser): add llm initialiser - initialise LLM models that will be used in code - LANGCHAIN_OPENAI model is used for normal RAG operations - RAGAS (Retrieval Augmented Generation Assessment) model is used for evaluation of RAG --- config/initializers/llm_langchain.rb | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 config/initializers/llm_langchain.rb diff --git a/config/initializers/llm_langchain.rb b/config/initializers/llm_langchain.rb new file mode 100644 index 00000000000..13834312f0c --- /dev/null +++ b/config/initializers/llm_langchain.rb @@ -0,0 +1,15 @@ +if ENV['OPENAI_API_KEY'].present? 
+ require 'langchain' + # Create a global OpenAI client instance + LANGCHAIN_OPENAI = Langchain::LLM::OpenAI.new( + api_key: ENV['OPENAI_API_KEY'], + default_options: { temperature: 0.5, chat_completion_model_name: 'gpt-4o' } + ) + # RAGAS (Retrieval Augmented Generation Assessment) used to evaluate RAG response + RAGAS = Langchain::LLM::OpenAI.new( + api_key: ENV['OPENAI_API_KEY'], + default_options: { temperature: 0, chat_completion_model_name: 'gpt-4o' } + ) +else + Rails.logger.error('OPENAI_API_KEY is not set in the environment') +end From 897f879f9999562a78fed269b8483dbfab3c719b Mon Sep 17 00:00:00 2001 From: Jonaspng Date: Mon, 9 Dec 2024 22:06:17 +0800 Subject: [PATCH 04/18] feat(db): add and modify tables needed for course material text chunking - add course_material_text_chunks which belongs to course_materials - add course_material_text_chunkings which belongs to course_materials and trackable_jobs - add workflow_state column to course_materials table. --- ...1804_create_course_material_text_chunks.rb | 38 +++++++++++++++++++ ...6_create_course_material_text_chunkings.rb | 11 ++++++ ...1_add_workflow_state_to_course_material.rb | 7 ++++ db/schema.rb | 34 ++++++++++++++++- 4 files changed, 88 insertions(+), 2 deletions(-) create mode 100644 db/migrate/20241203141804_create_course_material_text_chunks.rb create mode 100644 db/migrate/20241203145856_create_course_material_text_chunkings.rb create mode 100644 db/migrate/20241203152111_add_workflow_state_to_course_material.rb diff --git a/db/migrate/20241203141804_create_course_material_text_chunks.rb b/db/migrate/20241203141804_create_course_material_text_chunks.rb new file mode 100644 index 00000000000..86225a94e6c --- /dev/null +++ b/db/migrate/20241203141804_create_course_material_text_chunks.rb @@ -0,0 +1,38 @@ +class CreateCourseMaterialTextChunks < ActiveRecord::Migration[7.2] + def change + # Ensure pgvector extension is enabled + enable_extension "vector" unless extension_enabled?("vector") + + 
create_table :course_material_text_chunks, id: :serial, force: :cascade do |t| + + # Main association + t.text :content, null: false + t.vector :embedding, limit: 1536, null: false + t.datetime :created_at, precision: nil, null: false + + # Foreign Keys + t.references :creator, + null: false, + foreign_key: { to_table: :users, name: "fk_course_material_text_chunks_creator_id" }, + index: { name: "fk__course_material_text_chunks_creator_id" } + t.references :course, + null: false, + foreign_key: { to_table: :courses, name: "fk_course_material_text_chunks_course_id" }, + index: { name: "fk__course_material_text_chunks_course_id" } + t.references :course_material, + null: false, + foreign_key: { to_table: :course_materials, name: "fk_course_material_text_chunks_material_id" }, + index: { name: "fk__course_material_text_chunks_material_id" } + + # Indexes + t.index :embedding, + name: "index_course_material_text_chunk_embedding", + opclass: :vector_cosine_ops, + using: :hnsw + + t.index [:course_material_id, :content], + unique: true, + name: 'index_text_chunks_on_text_chunk_id_and_content' + end + end +end diff --git a/db/migrate/20241203145856_create_course_material_text_chunkings.rb b/db/migrate/20241203145856_create_course_material_text_chunkings.rb new file mode 100644 index 00000000000..b674f0384b0 --- /dev/null +++ b/db/migrate/20241203145856_create_course_material_text_chunkings.rb @@ -0,0 +1,11 @@ +class CreateCourseMaterialTextChunkings < ActiveRecord::Migration[7.2] + def change + create_table :course_material_text_chunkings, id: :serial do |t| + t.datetime :created_at, null: false + t.datetime :updated_at, null: false + # Foreign Keys + t.references :material, null: false, foreign_key: { to_table: :course_materials, name: "fk_course_material_text_chunkings_material_id" }, index: { name: "fk__course_material_text_chunkings_material_id", unique: true } + t.references :job, type: :uuid, foreign_key: { to_table: :jobs, name: 
"fk_course_material_text_chunkings_job_id", on_delete: :nullify }, index: { name: "fk__course_material_text_chunkings_job_id", unique: true } + end + end +end diff --git a/db/migrate/20241203152111_add_workflow_state_to_course_material.rb b/db/migrate/20241203152111_add_workflow_state_to_course_material.rb new file mode 100644 index 00000000000..4d95e7ec5bc --- /dev/null +++ b/db/migrate/20241203152111_add_workflow_state_to_course_material.rb @@ -0,0 +1,7 @@ +class AddWorkflowStateToCourseMaterial < ActiveRecord::Migration[7.2] + def change + change_table :course_materials do |t| + t.string :workflow_state, limit: 255, null: false, default: "not_chunked" + end + end +end diff --git a/db/schema.rb b/db/schema.rb index 09e530d78fd..2f7a83e6842 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -10,10 +10,11 @@ # # It's strongly recommended that you check this file into your version control system. -ActiveRecord::Schema[7.2].define(version: 2024_11_18_152013) do +ActiveRecord::Schema[7.2].define(version: 2024_12_03_152111) do # These are extensions that must be enabled in order to support this database enable_extension "plpgsql" enable_extension "uuid-ossp" + enable_extension "vector" create_table "activities", id: :serial, force: :cascade do |t| t.integer "actor_id", null: false @@ -865,6 +866,29 @@ t.index ["updater_id"], name: "fk__course_material_folders_updater_id" end + create_table "course_material_text_chunkings", id: :serial, force: :cascade do |t| + t.datetime "created_at", null: false + t.datetime "updated_at", null: false + t.bigint "material_id", null: false + t.uuid "job_id" + t.index ["job_id"], name: "fk__course_material_text_chunkings_job_id", unique: true + t.index ["material_id"], name: "fk__course_material_text_chunkings_material_id", unique: true + end + + create_table "course_material_text_chunks", id: :serial, force: :cascade do |t| + t.text "content", null: false + t.vector "embedding", limit: 1536, null: false + t.datetime "created_at", 
precision: nil, null: false + t.bigint "creator_id", null: false + t.bigint "course_id", null: false + t.bigint "course_material_id", null: false + t.index ["course_id"], name: "fk__course_material_text_chunks_course_id" + t.index ["course_material_id", "content"], name: "index_text_chunks_on_text_chunk_id_and_content", unique: true + t.index ["course_material_id"], name: "fk__course_material_text_chunks_material_id" + t.index ["creator_id"], name: "fk__course_material_text_chunks_creator_id" + t.index ["embedding"], name: "index_course_material_text_chunk_embedding", opclass: :vector_cosine_ops, using: :hnsw + end + create_table "course_materials", id: :serial, force: :cascade do |t| t.integer "folder_id", null: false t.string "name", limit: 255, null: false @@ -873,6 +897,7 @@ t.integer "updater_id", null: false t.datetime "created_at", precision: nil, null: false t.datetime "updated_at", precision: nil, null: false + t.string "workflow_state", limit: 255, default: "not_chunked", null: false t.index "folder_id, lower((name)::text)", name: "index_course_materials_on_folder_id_and_name", unique: true t.index ["creator_id"], name: "fk__course_materials_creator_id" t.index ["folder_id"], name: "fk__course_materials_folder_id" @@ -1397,7 +1422,7 @@ t.string "type", limit: 255, null: false t.string "name", limit: 255, null: false t.integer "parent_id" - t.serial "weight" + t.serial "weight", null: false t.boolean "enabled", default: true, null: false t.boolean "default_evaluator_whitelisted", default: true, null: false t.boolean "codaveri_evaluator_whitelisted", default: false, null: false @@ -1610,6 +1635,11 @@ add_foreign_key "course_material_folders", "courses", name: "fk_course_material_folders_course_id" add_foreign_key "course_material_folders", "users", column: "creator_id", name: "fk_course_material_folders_creator_id" add_foreign_key "course_material_folders", "users", column: "updater_id", name: "fk_course_material_folders_updater_id" + add_foreign_key 
"course_material_text_chunkings", "course_materials", column: "material_id", name: "fk_course_material_text_chunkings_material_id" + add_foreign_key "course_material_text_chunkings", "jobs", name: "fk_course_material_text_chunkings_job_id", on_delete: :nullify + add_foreign_key "course_material_text_chunks", "course_materials", name: "fk_course_material_text_chunks_material_id" + add_foreign_key "course_material_text_chunks", "courses", name: "fk_course_material_text_chunks_course_id" + add_foreign_key "course_material_text_chunks", "users", column: "creator_id", name: "fk_course_material_text_chunks_creator_id" add_foreign_key "course_materials", "course_material_folders", column: "folder_id", name: "fk_course_materials_folder_id" add_foreign_key "course_materials", "users", column: "creator_id", name: "fk_course_materials_creator_id" add_foreign_key "course_materials", "users", column: "updater_id", name: "fk_course_materials_updater_id" From 901714d106afbdef577f7acaaba59ce999949bee Mon Sep 17 00:00:00 2001 From: Jonaspng Date: Mon, 9 Dec 2024 22:12:19 +0800 Subject: [PATCH 05/18] feat(model): add text_chunk and text_chunking models - add text_chunk model to represent segments of material after chunking, including content and associated embeddings - add text_chunking model to represent trackable text_chunking jobs --- app/models/course/material/text_chunk.rb | 10 ++++++++++ app/models/course/material/text_chunking.rb | 10 ++++++++++ 2 files changed, 20 insertions(+) create mode 100644 app/models/course/material/text_chunk.rb create mode 100644 app/models/course/material/text_chunking.rb diff --git a/app/models/course/material/text_chunk.rb b/app/models/course/material/text_chunk.rb new file mode 100644 index 00000000000..3975512082e --- /dev/null +++ b/app/models/course/material/text_chunk.rb @@ -0,0 +1,10 @@ +# frozen_string_literal: true +class Course::Material::TextChunk < ApplicationRecord + has_neighbors :embedding + belongs_to :material, inverse_of: 
:text_chunks, class_name: 'Course::Material', + foreign_key: :course_material_id, autosave: true + validates :creator, presence: true + validates :content, presence: true + validates :embedding, presence: true + validates :course_id, presence: true +end diff --git a/app/models/course/material/text_chunking.rb b/app/models/course/material/text_chunking.rb new file mode 100644 index 00000000000..6190cda8dd3 --- /dev/null +++ b/app/models/course/material/text_chunking.rb @@ -0,0 +1,10 @@ +# frozen_string_literal: true +class Course::Material::TextChunking < ApplicationRecord + validates :material, presence: true + validates :material_id, uniqueness: { if: :material_id_changed? } + validates :job_id, uniqueness: { if: :job_id_changed? }, allow_nil: true + belongs_to :material, class_name: 'Course::Material', inverse_of: :text_chunking + # @!attribute [r] job + # This might be null if the job has been cleared. + belongs_to :job, class_name: 'TrackableJob::Job', inverse_of: nil, optional: true +end From 0af32f90e5224f8ce680a209f0fd48d29794aa59 Mon Sep 17 00:00:00 2001 From: Jonaspng Date: Mon, 9 Dec 2024 22:12:46 +0800 Subject: [PATCH 06/18] feat(job): add text_chunk_job - trackable job that tracks text chunking of course material --- app/jobs/course/material/text_chunk_job.rb | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 app/jobs/course/material/text_chunk_job.rb diff --git a/app/jobs/course/material/text_chunk_job.rb b/app/jobs/course/material/text_chunk_job.rb new file mode 100644 index 00000000000..eef6b3b0819 --- /dev/null +++ b/app/jobs/course/material/text_chunk_job.rb @@ -0,0 +1,18 @@ +# frozen_string_literal: true +class Course::Material::TextChunkJob < ApplicationJob + include TrackableJob + queue_as :default + + protected + + def perform_tracked(material, current_user) + material.build_text_chunks(current_user) + material.finish_chunking! + material.save! + rescue StandardError => e + material.cancel_chunking! + material.save! 
+ # re-raise error to make the job have an error + raise e + end +end From 85897aa8f5c1843caa1eff05db5951eb1069b696 Mon Sep 17 00:00:00 2001 From: Jonaspng Date: Mon, 9 Dec 2024 22:14:02 +0800 Subject: [PATCH 07/18] feat(controller): update materials_controller - add create_text_chunks that creates text chunks from material - add destroy_text_chunks that destroys material's text chunks - modify update so that if the file contents were updated (i.e. new file upload) it will destroy current text chunks related to previous material - modify destroy to ensure that material cannot be deleted while material is still undergoing text chunk job --- .../course/material/materials_controller.rb | 41 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/app/controllers/course/material/materials_controller.rb b/app/controllers/course/material/materials_controller.rb index 273dacbeb74..712a94b8ccf 100644 --- a/app/controllers/course/material/materials_controller.rb +++ b/app/controllers/course/material/materials_controller.rb @@ -9,13 +9,16 @@ def show end def update - if @material.update(material_params) + if @material.workflow_state != 'chunking' && @material.update(material_params) + # deletes material's text chunk if file has been changed and file has been chunked + delete_material_text_chunks if material_params['file'] && @material.workflow_state == 'chunked' course_user = @material.attachment.updater.course_users.find_by(course: current_course) user = course_user || @material.attachment.updater render json: { id: @material.id, name: @material.name, description: @material.description, updatedAt: @material.attachment.updated_at, + workflowState: @material.workflow_state, updater: { id: user.id, name: user.name, userUrl: url_to_user_or_course_user(current_course, user) } }, status: :ok @@ -25,7 +28,27 @@ def update end def destroy - if @material.destroy + if @material.destroy && @material.workflow_state != 'chunking' + head :ok + else + render json: { 
errors: @material.errors.full_messages.to_sentence }, status: :bad_request + end + end + + def create_text_chunks + job = last_text_chunking_job + if job + render partial: 'jobs/submitted', locals: { job: job } + else + job = @material.text_chunking!(current_user) + render partial: 'jobs/submitted', locals: { job: job.job } + end + end + + def destroy_text_chunks + if @material.text_chunks.destroy_all && @material.workflow_state == 'chunked' + @material.delete_chunks! + @material.save head :ok else render json: { errors: @material.errors.full_messages.to_sentence }, status: :bad_request @@ -65,4 +88,18 @@ def log_service @log_service ||= Course::Assessment::SessionLogService.new(@assessment, current_session_id, @submission) end + + def last_text_chunking_job + job = @material.text_chunking&.job + (job&.status == 'submitted') ? job : nil + end + + def delete_material_text_chunks + if @material.text_chunks.destroy_all + @material.delete_chunks! + @material.save + else + render json: { errors: @material.errors.full_messages.to_sentence }, status: :bad_request + end + end end From 2b6735ca1b4e6bb2645cf47420ff280d1bf215cb Mon Sep 17 00:00:00 2001 From: Jonaspng Date: Mon, 9 Dec 2024 22:14:29 +0800 Subject: [PATCH 08/18] feat(ability): update material ability - only course owner or manager will be allowed to manage text chunks --- .../course/materials_ability_component.rb | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/app/models/components/course/materials_ability_component.rb b/app/models/components/course/materials_ability_component.rb index 9f43dc99ae7..efca7a41902 100644 --- a/app/models/components/course/materials_ability_component.rb +++ b/app/models/components/course/materials_ability_component.rb @@ -8,6 +8,8 @@ def define_permissions allow_upload_materials allow_staff_read_materials if course_user.staff? allow_teaching_staff_manage_materials if course_user.teaching_staff? + disallow_text_chunking if course_user.teaching_staff? 
+ manage_text_chunking if course_user.manager_or_owner? end disallow_superusers_change_root_and_linked_folders @@ -45,6 +47,16 @@ def allow_upload_materials can :manage, Course::Material, creator: user end + def manage_text_chunking + can :create_text_chunks, Course::Material, material_course_hash + can :destroy_text_chunks, Course::Material, material_course_hash + end + + def disallow_text_chunking + cannot :create_text_chunks, Course::Material, material_course_hash + cannot :destroy_text_chunks, Course::Material, material_course_hash + end + def allow_staff_read_materials can :read, Course::Material, material_course_hash can [:read, :download], Course::Material::Folder, { course_id: course.id } @@ -82,7 +94,7 @@ def concrete_folder_hash def opened_material_hashes max_start_at = Time.zone.now # Extend start_at time with self directed time from course settings. - max_start_at += (course.advance_start_at_duration || 0) if course + max_start_at += course.advance_start_at_duration || 0 if course # Add materials with parent assessments that open early due to personalized timeline # Dealing with personal times is too complicated to represent as a hash of conditions From 6cd6ce902086139c6d830094bb1f746b6cf8e2eb Mon Sep 17 00:00:00 2001 From: Jonaspng Date: Mon, 9 Dec 2024 22:15:49 +0800 Subject: [PATCH 09/18] feat(model): update material model - add workflow state to material model - material has_many text_chunks and has_one text_chunking --- app/models/course/material.rb | 56 +++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/app/models/course/material.rb b/app/models/course/material.rb index 8e15b3814cc..39610b2ac8c 100644 --- a/app/models/course/material.rb +++ b/app/models/course/material.rb @@ -2,8 +2,28 @@ class Course::Material < ApplicationRecord has_one_attachment include DuplicationStateTrackingConcern + include Workflow + + workflow do + state :not_chunked do + event :start_chunking, transitions_to: :chunking + end + # State 
where there is a job running to chunk course materials + state :chunking do + event :finish_chunking, transitions_to: :chunked + event :cancel_chunking, transitions_to: :not_chunked + end + # The state where chunking job is completed and course_materials is chunked + state :chunked do + event :delete_chunks, transitions_to: :not_chunked + end + end belongs_to :folder, inverse_of: :materials, class_name: 'Course::Material::Folder' + has_many :text_chunks, inverse_of: :material, class_name: 'Course::Material::TextChunk', + dependent: :destroy, foreign_key: :course_material_id, autosave: true + has_one :text_chunking, class_name: 'Course::Material::TextChunking', + dependent: :destroy, inverse_of: :material, autosave: true before_save :touch_folder @@ -17,6 +37,7 @@ class Course::Material < ApplicationRecord if: -> { folder_id? && name_changed? } } validates :folder_id, uniqueness: { scope: [:name], case_sensitive: false, if: -> { name? && folder_id_changed? } } + validates :workflow_state, presence: true scope :in_concrete_folder, -> { joins(:folder).merge(Folder.concrete) } @@ -66,6 +87,30 @@ def before_duplicate_save(_duplicator) self.name = next_valid_name end + def text_chunking!(current_user) + ensure_text_chunking! + Course::Material::TextChunkJob.perform_later(self, current_user).tap do |job| + text_chunking.update_column(:job_id, job.job_id) + end + end + + def build_text_chunks(current_user) + start_chunking! + save! + course_id = folder.course_id + File.open(attachment.path, 'r:ASCII-8BIT') do |file| + llm_service = Rag::LlmService.new + chunking_service = Rag::ChunkingService.new(file: file) + chunks = chunking_service.file_chunking + embeddings = llm_service.generate_embeddings_from_chunks(chunks) + chunks.each_with_index do |chunk, index| + text_chunks.build(embedding: embeddings[index], content: chunk, creator: current_user, + course_id: course_id) + end + end + save! 
+ end + private # TODO: Not threadsafe, consider making all folders as materials @@ -77,4 +122,15 @@ def validate_name_is_unique_among_folders conflicts = folder.children.where('name ILIKE ?', name) errors.add(:name, :taken) unless conflicts.empty? end + + def ensure_text_chunking! + ActiveRecord::Base.transaction(requires_new: true) do + text_chunking || create_text_chunking! + end + rescue ActiveRecord::RecordInvalid, ActiveRecord::RecordNotUnique => e + raise e if e.is_a?(ActiveRecord::RecordInvalid) && e.record.errors[:material_id].empty? + + association(:text_chunking).reload + text_chunking + end end From df52beaee41f3997efbcabcfe421df66f7e71f0f Mon Sep 17 00:00:00 2001 From: Jonaspng Date: Mon, 9 Dec 2024 22:16:28 +0800 Subject: [PATCH 10/18] feat(service): add chunking and llm service - add chunking service that handles the chunking of text and file - add llm service that handles services provided by llm which are text embedding and getting image caption --- app/services/rag/chunking_service.rb | 47 ++++++++++++++++++++++++++++ app/services/rag/llm_service.rb | 47 ++++++++++++++++++++++++++++ 2 files changed, 94 insertions(+) create mode 100644 app/services/rag/chunking_service.rb create mode 100644 app/services/rag/llm_service.rb diff --git a/app/services/rag/chunking_service.rb b/app/services/rag/chunking_service.rb new file mode 100644 index 00000000000..5589c367784 --- /dev/null +++ b/app/services/rag/chunking_service.rb @@ -0,0 +1,47 @@ +# frozen_string_literal: true +class Rag::ChunkingService + def initialize(text: nil, file: nil) + raise ArgumentError, 'Either text or file must be provided' if text.nil? && file.nil? 
+ + if file + @file = file + @file_type = File.extname(file.path).downcase + else + @text = text.gsub(/\s+/, ' ').strip + end + end + + def file_chunking + if @file_type == '.pdf' + reader = PDF::Reader.new(@file.path) + text = reader.pages.map(&:text).join(' ') + elsif @file_type == '.txt' + text = File.read(@file.path) + else + raise "Unsupported file type: #{@file_type}" + end + @text = text.gsub(/\s+/, ' ').strip + fixed_size_chunk_text(500, 100) + end + + def text_chunking + fixed_size_chunk_text(500, 100) + end + + private + + def fixed_size_chunk_text(chunk_size, overlap_size) + chunks = [] + start = 0 + ending = 0 + while ending < @text.length + # Define the chunk with overlap + chunk = @text[start, chunk_size] + chunks << chunk + ending = start + chunk_size + # Move the starting position forward, keeping the overlap + start += (chunk_size - overlap_size) + end + chunks + end +end diff --git a/app/services/rag/llm_service.rb b/app/services/rag/llm_service.rb new file mode 100644 index 00000000000..eb8f5fb260d --- /dev/null +++ b/app/services/rag/llm_service.rb @@ -0,0 +1,47 @@ +# frozen_string_literal: true +class Rag::LlmService + def initialize(_evaluation_service = nil) + @client = LANGCHAIN_OPENAI + end + + def get_image_caption(image) + # Base 64 encode image + base64_image = if image.is_a?(String) + Base64.strict_encode64(image) + else + Base64.strict_encode64(File.read(image.path)) + end + + messages = [ + { + role: 'user', + content: [ + { type: 'text', + text: 'What is in this image? Do not give a summary of image at the end. 
Make sure response is less than 80 words' }, + { + type: 'image_url', + image_url: { + url: "data:image/jpeg;base64,#{base64_image}" + } + } + ] + } + ] + + @client.chat(messages: messages).chat_completion + end + + def generate_embeddings_from_chunks(chunks) + result = [] + chunks.each_slice(10) do |chunk| + response = @client.embed( + text: chunk, + model: 'text-embedding-ada-002' + ) + response.raw_response['data'].each do |embedding| + result.push(embedding['embedding']) + end + end + result + end +end From 77bca32844cac6f62c09940b8fd17fc502baf178 Mon Sep 17 00:00:00 2001 From: Jonaspng Date: Mon, 9 Dec 2024 22:16:51 +0800 Subject: [PATCH 11/18] feat(view): update course folder view - update material view to include workflow state - update folder and subfolder permission view to include canManageKnowledgeBase --- app/views/course/material/folders/show.json.jbuilder | 3 +++ 1 file changed, 3 insertions(+) diff --git a/app/views/course/material/folders/show.json.jbuilder b/app/views/course/material/folders/show.json.jbuilder index 431ff316d33..92baf708628 100644 --- a/app/views/course/material/folders/show.json.jbuilder +++ b/app/views/course/material/folders/show.json.jbuilder @@ -26,12 +26,14 @@ json.subfolders @subfolders do |subfolder| json.showSdlWarning show_sdl_warning?(subfolder) json.canEdit can?(:edit, subfolder) json.canDelete can?(:destroy, subfolder) + json.canManageKnowledgeBase current_course_user&.manager_or_owner? end end json.materials @folder.materials.includes(:updater) do |material| json.id material.id json.name material.name + json.workflowState material.workflow_state json.description format_ckeditor_rich_text(material.description) json.materialUrl url_to_material(current_course, @folder, material) json.updatedAt material.attachment.updated_at @@ -59,6 +61,7 @@ json.advanceStartAt current_course.advance_start_at_duration json.permissions do json.isCurrentCourseStudent current_course_user&.student? 
+ json.canManageKnowledgeBase current_course_user&.manager_or_owner? json.canStudentUpload @folder.can_student_upload json.canCreateSubfolder can?(:new_subfolder, @folder) json.canUpload can?(:upload, @folder) From e0c2d65bab4a84ac101355e440e92752611a532b Mon Sep 17 00:00:00 2001 From: Jonaspng Date: Mon, 9 Dec 2024 22:17:13 +0800 Subject: [PATCH 12/18] feat(routes): add routes for create and destroy text chunks - create_text_chunks handles creation of text_chunks - destroy_text_chunks handles deletion of text_chunks --- config/routes.rb | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/config/routes.rb b/config/routes.rb index 3a81b9444b0..e3dec98fafb 100644 --- a/config/routes.rb +++ b/config/routes.rb @@ -414,7 +414,10 @@ post 'create/subfolder', on: :member, as: 'create_subfolder', action: 'create_subfolder' put 'upload_materials', on: :member get 'download', on: :member - resources :materials, path: 'files' + resources :materials, path: 'files' do + put 'create_text_chunks', on: :member + delete 'destroy_text_chunks', on: :member + end end end From b8146b091f8c2cac7130006a48d899eee2595eeb Mon Sep 17 00:00:00 2001 From: Jonaspng Date: Mon, 9 Dec 2024 22:17:40 +0800 Subject: [PATCH 13/18] feat(component): add KnowledgeBaseSwitch - switch that creates or destroys course material text chunks --- .../buttons/KnowledgeBaseSwitch.tsx | 105 ++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 client/app/bundles/course/material/folders/components/buttons/KnowledgeBaseSwitch.tsx diff --git a/client/app/bundles/course/material/folders/components/buttons/KnowledgeBaseSwitch.tsx b/client/app/bundles/course/material/folders/components/buttons/KnowledgeBaseSwitch.tsx new file mode 100644 index 00000000000..2e4548e08a2 --- /dev/null +++ b/client/app/bundles/course/material/folders/components/buttons/KnowledgeBaseSwitch.tsx @@ -0,0 +1,105 @@ +import { FC, useEffect, useState } from 'react'; +import { defineMessages } from 'react-intl'; 
+import { Switch } from '@mui/material'; + +import { useAppDispatch } from 'lib/hooks/store'; +import toast from 'lib/hooks/toast'; +import useTranslation from 'lib/hooks/useTranslation'; + +import { chunkMaterial, removeChunks } from '../../operations'; + +interface Props { + currFolderId: number; + itemId: number; + itemName: string; + isConcrete: boolean; + canEdit: boolean; + state: 'not_chunked' | 'chunking' | 'chunked' | null; + type: 'subfolder' | 'material'; +} + +const translations = defineMessages({ + addSuccess: { + id: 'course.material.folders.WorkbinTableButtons.addFailure', + defaultMessage: ' has been added to knowledge base', + }, + addFailure: { + id: 'course.material.folders.WorkbinTableButtons.addFailure', + defaultMessage: ' could not be added to knowledge base', + }, + removeSuccess: { + id: 'course.material.folders.WorkbinTableButtons.removeSuccess', + defaultMessage: ' has been removed from knowledge base', + }, + removeFailure: { + id: 'course.material.folders.WorkbinTableButtons.removeFailure', + defaultMessage: ' could not be removed from knowledge base', + }, +}); + +const KnowledgeBaseSwitch: FC = (props) => { + const { currFolderId, itemId, itemName, isConcrete, canEdit, state, type } = + props; + const { t } = useTranslation(); + const [isLoading, setIsLoading] = useState(false); + const dispatch = useAppDispatch(); + const onAdd = (): void => { + setIsLoading(true); + dispatch( + chunkMaterial( + currFolderId, + itemId, + () => { + setIsLoading(false); + toast.success(`"${itemName}" ${t(translations.addSuccess)}`); + }, + () => { + setIsLoading(false); + toast.error(`"${itemName}" ${t(translations.addFailure)}`); + }, + ), + ); + }; + + const onRemove = (): Promise => { + setIsLoading(true); + return dispatch(removeChunks(currFolderId, itemId)) + .then(() => { + setIsLoading(false); + toast.success(`"${itemName}" ${t(translations.removeSuccess)}`); + }) + .catch((error) => { + setIsLoading(false); + const errorMessage = 
error.response?.data?.errors + ? error.response.data.errors + : ''; + toast.error( + `"${itemName}" ${t(translations.removeFailure)} - ${errorMessage}`, + ); + throw error; + }); + }; + + useEffect(() => { + if (state === 'chunking' && !isLoading) { + onAdd(); + setIsLoading(true); + } + }, [isLoading]); + + return ( + type === 'material' && + canEdit && + isConcrete && ( + + ) + ); +}; + +export default KnowledgeBaseSwitch; From 7523030e49ebf93b5747c8aa912a7251c118161d Mon Sep 17 00:00:00 2001 From: Jonaspng Date: Mon, 9 Dec 2024 22:18:32 +0800 Subject: [PATCH 14/18] feat(UI): update components to support material text chunking Integrate new backend API changes by updating the UI components and related files: - Updated `store.ts` to handle new state management for the backend changes - Modified `operations.ts` to incorporate new API calls - Adjusted types in `types.ts` to reflect backend schema changes - Updated React components to work with the updated state and API logic --- client/app/api/course/Material/Folders.ts | 21 ++++++++ .../buttons/KnowledgeBaseSwitch.tsx | 4 +- .../buttons/WorkbinTableButtons.tsx | 5 +- .../components/tables/TableMaterialRow.tsx | 26 +++++++++- .../components/tables/TableSubfolderRow.tsx | 21 +++++++- .../components/tables/WorkbinTable.tsx | 20 ++++++- .../course/material/folders/operations.ts | 52 +++++++++++++++++++ .../folders/pages/FolderShow/index.tsx | 1 + .../bundles/course/material/folders/store.ts | 20 +++++++ .../bundles/course/material/folders/types.ts | 12 ++++- client/app/types/course/material/folders.ts | 12 ++++- 11 files changed, 186 insertions(+), 8 deletions(-) diff --git a/client/app/api/course/Material/Folders.ts b/client/app/api/course/Material/Folders.ts index 0088b53fd47..4e577ddf0c3 100644 --- a/client/app/api/course/Material/Folders.ts +++ b/client/app/api/course/Material/Folders.ts @@ -51,6 +51,27 @@ export default class FoldersAPI extends BaseCourseAPI { ); } + /** + * Chunks a material (file) + */ + 
chunkMaterial( + currFolderId: number, + materialId: number, + ): APIResponse { + return this.client.put( + `${this.#urlPrefix}/${currFolderId}/files/${materialId}/create_text_chunks`, + ); + } + + /** + * Deletes Chunks associated with a material (file) + */ + deleteMaterialChunks(currFolderId: number, materialId: number): APIResponse { + return this.client.delete( + `${this.#urlPrefix}/${currFolderId}/files/${materialId}/destroy_text_chunks`, + ); + } + /** * Uploads materials (files) */ diff --git a/client/app/bundles/course/material/folders/components/buttons/KnowledgeBaseSwitch.tsx b/client/app/bundles/course/material/folders/components/buttons/KnowledgeBaseSwitch.tsx index 2e4548e08a2..042f2100185 100644 --- a/client/app/bundles/course/material/folders/components/buttons/KnowledgeBaseSwitch.tsx +++ b/client/app/bundles/course/material/folders/components/buttons/KnowledgeBaseSwitch.tsx @@ -1,6 +1,8 @@ +// currently not in use import { FC, useEffect, useState } from 'react'; import { defineMessages } from 'react-intl'; import { Switch } from '@mui/material'; +import { MaterialWorkflowState } from 'types/course/material/folders'; import { useAppDispatch } from 'lib/hooks/store'; import toast from 'lib/hooks/toast'; @@ -14,7 +16,7 @@ interface Props { itemName: string; isConcrete: boolean; canEdit: boolean; - state: 'not_chunked' | 'chunking' | 'chunked' | null; + state: MaterialWorkflowState; type: 'subfolder' | 'material'; } diff --git a/client/app/bundles/course/material/folders/components/buttons/WorkbinTableButtons.tsx b/client/app/bundles/course/material/folders/components/buttons/WorkbinTableButtons.tsx index b8ff6a88b13..df8aae0a5ea 100644 --- a/client/app/bundles/course/material/folders/components/buttons/WorkbinTableButtons.tsx +++ b/client/app/bundles/course/material/folders/components/buttons/WorkbinTableButtons.tsx @@ -20,6 +20,7 @@ interface Props { canEdit: boolean; canDelete: boolean; type: 'subfolder' | 'material'; + state: 'not_chunked' | 
'chunking' | 'chunked' | null; folderInitialValues?: { name: string; description: string; @@ -61,6 +62,7 @@ const WorkbinTableButtons: FC = (props) => { isConcrete, canEdit, canDelete, + state, type, folderInitialValues, materialInitialValues, @@ -147,6 +149,7 @@ const WorkbinTableButtons: FC = (props) => { {canEdit && isConcrete && ( = (props) => { confirmMessage={`${t( translations.deleteConfirmation, )} "${itemName}"`} - disabled={isDeleting} + disabled={isDeleting || state === 'chunking'} id={`${type}-delete-button-${itemId}`} onClick={onDelete} style={{ padding: 5 }} diff --git a/client/app/bundles/course/material/folders/components/tables/TableMaterialRow.tsx b/client/app/bundles/course/material/folders/components/tables/TableMaterialRow.tsx index e4f735803c4..2f47738ceb9 100644 --- a/client/app/bundles/course/material/folders/components/tables/TableMaterialRow.tsx +++ b/client/app/bundles/course/material/folders/components/tables/TableMaterialRow.tsx @@ -8,6 +8,7 @@ import Link from 'lib/components/core/Link'; import { getCourseId } from 'lib/helpers/url-helpers'; import { formatFullDateTime } from 'lib/moment'; +import KnowledgeBaseSwitch from '../buttons/KnowledgeBaseSwitch'; import WorkbinTableButtons from '../buttons/WorkbinTableButtons'; interface Props { @@ -15,10 +16,17 @@ interface Props { material: MaterialMiniEntity; isCurrentCourseStudent: boolean; isConcrete: boolean; + canManageKnowledgeBase: boolean; } const TableMaterialRow: FC = (props) => { - const { currFolderId, material, isCurrentCourseStudent, isConcrete } = props; + const { + currFolderId, + material, + isCurrentCourseStudent, + isConcrete, + canManageKnowledgeBase, + } = props; return ( @@ -80,6 +88,21 @@ const TableMaterialRow: FC = (props) => { - )} + {/* {canManageKnowledgeBase && ( + + + + + + )} */} = (props) => { }`, }, }} + state={material.workflowState} type="material" /> diff --git a/client/app/bundles/course/material/folders/components/tables/TableSubfolderRow.tsx 
b/client/app/bundles/course/material/folders/components/tables/TableSubfolderRow.tsx index a98de079ed4..95441faddfd 100644 --- a/client/app/bundles/course/material/folders/components/tables/TableSubfolderRow.tsx +++ b/client/app/bundles/course/material/folders/components/tables/TableSubfolderRow.tsx @@ -21,6 +21,7 @@ interface Props { subfolder: FolderMiniEntity; isCurrentCourseStudent: boolean; isConcrete: boolean; + canManageKnowledgeBase: boolean; } const translations = defineMessages({ @@ -37,7 +38,13 @@ const translations = defineMessages({ }); const TableSubfolderRow: FC = (props) => { - const { currFolderId, subfolder, isCurrentCourseStudent, isConcrete } = props; + const { + currFolderId, + subfolder, + isCurrentCourseStudent, + isConcrete, + canManageKnowledgeBase, + } = props; const { t } = useTranslation(); return ( @@ -52,7 +59,9 @@ const TableSubfolderRow: FC = (props) => { whiteSpace: 'normal', wordBreak: 'break-word', }} - to={`/courses/${getCourseId()}/materials/folders/${subfolder.id}`} + to={`/courses/${getCourseId()}/materials/folders/${ + subfolder.id + }/`} underline="hover" > {`${subfolder.name} (${subfolder.itemCount})`} @@ -113,6 +122,13 @@ const TableSubfolderRow: FC = (props) => { )} + {/* {canManageKnowledgeBase && ( + + + - + + + )} */} = (props) => { isConcrete={isConcrete} itemId={subfolder.id} itemName={subfolder.name} + state={null} type="subfolder" /> diff --git a/client/app/bundles/course/material/folders/components/tables/WorkbinTable.tsx b/client/app/bundles/course/material/folders/components/tables/WorkbinTable.tsx index 270b603676a..98c7d5f834a 100644 --- a/client/app/bundles/course/material/folders/components/tables/WorkbinTable.tsx +++ b/client/app/bundles/course/material/folders/components/tables/WorkbinTable.tsx @@ -27,6 +27,7 @@ interface Props extends WrappedComponentProps { subfolders: FolderMiniEntity[]; materials: MaterialMiniEntity[]; isCurrentCourseStudent: boolean; + canManageKnowledgeBase: boolean; isConcrete: 
boolean; } @@ -36,6 +37,7 @@ const WorkbinTable: FC = (props) => { subfolders, materials, isCurrentCourseStudent, + canManageKnowledgeBase, isConcrete, } = props; @@ -126,6 +128,18 @@ const WorkbinTable: FC = (props) => { ); }; + // const columnHeaderWithoutSort = (columnName: string): JSX.Element => { + // return ( + // + // ); + // }; + return ( @@ -135,7 +149,9 @@ const WorkbinTable: FC = (props) => { {!isCurrentCourseStudent && ( {columnHeaderWithSort('Start At')} )} - + {/* {canManageKnowledgeBase && ( + {columnHeaderWithoutSort('Knowledge Base')} + )} */} @@ -143,6 +159,7 @@ const WorkbinTable: FC = (props) => { return ( = (props) => { return ( { const payload = new FormData(); @@ -181,6 +182,57 @@ export function deleteMaterial( }); } +export function removeChunks( + currFolderId: number, + materialId: number, +): Operation { + return async (dispatch) => + CourseAPI.folders + .deleteMaterialChunks(currFolderId, materialId) + .then(() => { + dispatch( + actions.updateMaterialWorkflowStateList(materialId, 'not_chunked'), + ); + }); +} + +export function chunkMaterial( + currFolderId: number, + materialId: number, + handleSuccess: () => void, + handleFailure: () => void, +): Operation { + return async (dispatch) => { + // Dispatch initial update to set workflow state to 'chunking' + dispatch(actions.updateMaterialWorkflowStateList(materialId, 'chunking')); + CourseAPI.folders + .chunkMaterial(currFolderId, materialId) + .then((response) => { + const jobUrl = response.data.jobUrl; + pollJob( + jobUrl, + () => { + dispatch( + actions.updateMaterialWorkflowStateList(materialId, 'chunked'), + ); + handleSuccess(); + }, + () => { + dispatch( + actions.updateMaterialWorkflowStateList( + materialId, + 'not_chunked', + ), + ); + handleFailure(); + }, + CHUNK_MATERIAL_JOB_POLL_INTERVAL_MS, + ); + }) + .catch(handleFailure); + }; +} + export function updateMaterial( formData: MaterialFormData, folderId: number, diff --git 
a/client/app/bundles/course/material/folders/pages/FolderShow/index.tsx b/client/app/bundles/course/material/folders/pages/FolderShow/index.tsx index 216151471ae..3f0124198d4 100644 --- a/client/app/bundles/course/material/folders/pages/FolderShow/index.tsx +++ b/client/app/bundles/course/material/folders/pages/FolderShow/index.tsx @@ -134,6 +134,7 @@ const FolderShow: FC = () => { > { break; } + case UPDATE_MATERIAL_WORKFLOW_STATE_LIST: { + const materialId = action.materialId; + const material = draft.materials.byId[materialId]; + if (material) { + material.workflowState = action.state; + saveListToStore(draft.materials, [material]); + } + break; + } + case DELETE_MATERIAL_LIST: { const materialId = action.materialId; if (draft.materials.byId[materialId]) { @@ -135,6 +149,12 @@ export const actions = { ): SaveMaterialListAction => { return { type: SAVE_MATERIAL_LIST, materialList }; }, + updateMaterialWorkflowStateList: ( + materialId: number, + state: MaterialWorkflowState, + ): UpdateMaterialWorkflowStateAction => { + return { type: UPDATE_MATERIAL_WORKFLOW_STATE_LIST, materialId, state }; + }, }; export default reducer; diff --git a/client/app/bundles/course/material/folders/types.ts b/client/app/bundles/course/material/folders/types.ts index f8309731629..b1bf3f639d0 100644 --- a/client/app/bundles/course/material/folders/types.ts +++ b/client/app/bundles/course/material/folders/types.ts @@ -4,6 +4,7 @@ import { FolderPermissions, MaterialListData, MaterialMiniEntity, + MaterialWorkflowState, } from 'types/course/material/folders'; import { EntityStore } from 'types/store'; @@ -13,6 +14,8 @@ export const DELETE_FOLDER_LIST = 'course/materials/folders/DELETE_FOLDER_LIST'; export const DELETE_MATERIAL_LIST = 'course/materials/folders/DELETE_MATERIAL_LIST'; export const SAVE_MATERIAL_LIST = 'course/materials/folders/SAVE_MATERIAL_LIST'; +export const UPDATE_MATERIAL_WORKFLOW_STATE_LIST = + 'course/materials/folders/UPDATE_MATERIAL_WORKFLOW_STATE_LIST'; // Action 
Types export interface SaveFolderAction { @@ -46,11 +49,18 @@ export interface DeleteMaterialListAction { materialId: number; } +export interface UpdateMaterialWorkflowStateAction { + type: typeof UPDATE_MATERIAL_WORKFLOW_STATE_LIST; + materialId: number; + state: MaterialWorkflowState; +} + export type FoldersActionType = | SaveFolderAction | DeleteFolderListAction | DeleteMaterialListAction - | SaveMaterialListAction; + | SaveMaterialListAction + | UpdateMaterialWorkflowStateAction; // State Types export interface FoldersState { diff --git a/client/app/types/course/material/folders.ts b/client/app/types/course/material/folders.ts index 7bacfdfad25..13e618bc824 100644 --- a/client/app/types/course/material/folders.ts +++ b/client/app/types/course/material/folders.ts @@ -7,6 +7,7 @@ import { // Permissions for rendering title bar buttons export type FolderPermissions = Permissions< + | 'canManageKnowledgeBase' | 'isCurrentCourseStudent' | 'canStudentUpload' | 'canCreateSubfolder' @@ -15,11 +16,17 @@ export type FolderPermissions = Permissions< >; export type SubfolderPermissions = Permissions< - 'canStudentUpload' | 'showSdlWarning' | 'canEdit' | 'canDelete' + | 'canStudentUpload' + | 'showSdlWarning' + | 'canEdit' + | 'canDelete' + | 'canManageKnowledgeBase' >; export type MaterialPermissions = Permissions<'canEdit' | 'canDelete'>; +export type MaterialWorkflowState = 'not_chunked' | 'chunking' | 'chunked'; + export interface FolderListData { id: number; name: string; @@ -41,6 +48,7 @@ export interface MaterialListData { updatedAt: string; updater: CourseUserBasicListData; permissions: MaterialPermissions; + workflowState: MaterialWorkflowState; } export interface FolderMiniEntity { @@ -63,6 +71,7 @@ export interface MaterialMiniEntity { updatedAt: string; updater: CourseUserBasicMiniEntity; permissions: MaterialPermissions; + workflowState: MaterialWorkflowState; } export interface FolderData { @@ -74,6 +83,7 @@ export interface FolderData { isConcrete: boolean; 
startAt: string; endAt: string | null; + workflowState: MaterialWorkflowState; }; subfolders: FolderListData[]; materials: MaterialListData[]; From ae457927d3583d83df698065253c13cf7cbdf057 Mon Sep 17 00:00:00 2001 From: Jonaspng Date: Mon, 9 Dec 2024 22:49:19 +0800 Subject: [PATCH 15/18] style(hound): fix hound violations --- app/services/rag/chunking_service.rb | 18 ++++++++++-------- app/services/rag/llm_service.rb | 3 ++- config/initializers/llm_langchain.rb | 1 + 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/app/services/rag/chunking_service.rb b/app/services/rag/chunking_service.rb index 5589c367784..abe249be4b3 100644 --- a/app/services/rag/chunking_service.rb +++ b/app/services/rag/chunking_service.rb @@ -12,14 +12,16 @@ def initialize(text: nil, file: nil) end def file_chunking - if @file_type == '.pdf' - reader = PDF::Reader.new(@file.path) - text = reader.pages.map(&:text).join(' ') - elsif @file_type == '.txt' - text = File.read(@file.path) - else - raise "Unsupported file type: #{@file_type}" - end + text = case @file_type + when '.pdf' + reader = PDF::Reader.new(@file.path) + reader.pages.map(&:text).join(' ') + when '.txt' + File.read(@file.path) + else + raise "Unsupported file type: #{@file_type}" + end + @text = text.gsub(/\s+/, ' ').strip fixed_size_chunk_text(500, 100) end diff --git a/app/services/rag/llm_service.rb b/app/services/rag/llm_service.rb index eb8f5fb260d..1bdece85cfd 100644 --- a/app/services/rag/llm_service.rb +++ b/app/services/rag/llm_service.rb @@ -17,7 +17,8 @@ def get_image_caption(image) role: 'user', content: [ { type: 'text', - text: 'What is in this image? Do not give a summary of image at the end. Make sure response is less than 80 words' }, + text: 'What is in this image? Do not give a summary of image at the end. 
+ Make sure response is less than 80 words' }, { type: 'image_url', image_url: { diff --git a/config/initializers/llm_langchain.rb b/config/initializers/llm_langchain.rb index 13834312f0c..2d3728ae44f 100644 --- a/config/initializers/llm_langchain.rb +++ b/config/initializers/llm_langchain.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true if ENV['OPENAI_API_KEY'].present? require 'langchain' # Create a global OpenAI client instance From ebcac56ac932a52013fd8bdfa3bfdaf57af749b2 Mon Sep 17 00:00:00 2001 From: Jonaspng Date: Tue, 7 Jan 2025 17:35:47 +0800 Subject: [PATCH 16/18] feat(db): add and update tables to introduce text_chunk_references - add course_material_text_chunk_references table - update course_material_text_chunks table --- ...e_course_material_text_chunk_references.rb | 24 ++++++++++++++ ...terial_from_course_material_text_chunks.rb | 5 +++ ...add_name_to_course_material_text_chunks.rb | 6 ++++ db/schema.rb | 32 ++++++++++++------- 4 files changed, 55 insertions(+), 12 deletions(-) create mode 100644 db/migrate/20250105150012_create_course_material_text_chunk_references.rb create mode 100644 db/migrate/20250106072815_remove_creator_course_material_from_course_material_text_chunks.rb create mode 100644 db/migrate/20250106073859_add_name_to_course_material_text_chunks.rb diff --git a/db/migrate/20250105150012_create_course_material_text_chunk_references.rb b/db/migrate/20250105150012_create_course_material_text_chunk_references.rb new file mode 100644 index 00000000000..fe97efe4a74 --- /dev/null +++ b/db/migrate/20250105150012_create_course_material_text_chunk_references.rb @@ -0,0 +1,24 @@ +class CreateCourseMaterialTextChunkReferences < ActiveRecord::Migration[7.2] + def change + create_table :course_material_text_chunk_references, id: :uuid, default: -> { "uuid_generate_v4()" } do |t| + t.datetime :created_at, precision: nil, null: false + t.datetime :updated_at, precision: nil, null: false + t.references :material, + null: false, + foreign_key: { 
to_table: :course_materials, name: "fk_course_material_text_chunk_references_material_id" }, + index: { name: "fk__course_material_text_chunk_references_material_id" } + t.references :text_chunk, + null: false, + foreign_key: { to_table: :course_material_text_chunks, name: "fk_course_material_text_chunk_references_text_chunk_id" }, + index: { name: "fk__course_material_text_chunk_references_text_chunk_id" } + t.references :creator, + null: false, + foreign_key: { to_table: :users, name: "fk_course_material_text_chunk_references_creator_id" }, + index: { name: "fk__course_material_text_chunk_references_creator_id" } + t.references :updater, + null: false, + foreign_key: { to_table: :users, name: "fk_course_material_text_chunk_references_updater_id" }, + index: { name: "fk__course_material_text_chunk_references_updater_id" } + end + end +end diff --git a/db/migrate/20250106072815_remove_creator_course_material_from_course_material_text_chunks.rb b/db/migrate/20250106072815_remove_creator_course_material_from_course_material_text_chunks.rb new file mode 100644 index 00000000000..deb0bba4885 --- /dev/null +++ b/db/migrate/20250106072815_remove_creator_course_material_from_course_material_text_chunks.rb @@ -0,0 +1,5 @@ +class RemoveCreatorCourseMaterialFromCourseMaterialTextChunks < ActiveRecord::Migration[7.2] + def change + remove_columns :course_material_text_chunks, :created_at, :creator_id, :course_material_id, :course_id + end +end \ No newline at end of file diff --git a/db/migrate/20250106073859_add_name_to_course_material_text_chunks.rb b/db/migrate/20250106073859_add_name_to_course_material_text_chunks.rb new file mode 100644 index 00000000000..011d7858391 --- /dev/null +++ b/db/migrate/20250106073859_add_name_to_course_material_text_chunks.rb @@ -0,0 +1,6 @@ +class AddNameToCourseMaterialTextChunks < ActiveRecord::Migration[7.2] + def change + add_column :course_material_text_chunks, :name, :string, limit: 255, null: false + add_index 
:course_material_text_chunks, :name + end +end diff --git a/db/schema.rb b/db/schema.rb index 2f7a83e6842..7e7f4514d3d 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system. -ActiveRecord::Schema[7.2].define(version: 2024_12_03_152111) do +ActiveRecord::Schema[7.2].define(version: 2025_01_06_073859) do # These are extensions that must be enabled in order to support this database enable_extension "plpgsql" enable_extension "uuid-ossp" @@ -866,6 +866,19 @@ t.index ["updater_id"], name: "fk__course_material_folders_updater_id" end + create_table "course_material_text_chunk_references", id: :uuid, default: -> { "uuid_generate_v4()" }, force: :cascade do |t| + t.datetime "created_at", precision: nil, null: false + t.datetime "updated_at", precision: nil, null: false + t.bigint "material_id", null: false + t.bigint "text_chunk_id", null: false + t.bigint "creator_id", null: false + t.bigint "updater_id", null: false + t.index ["creator_id"], name: "fk__course_material_text_chunk_references_creator_id" + t.index ["material_id"], name: "fk__course_material_text_chunk_references_material_id" + t.index ["text_chunk_id"], name: "fk__course_material_text_chunk_references_text_chunk_id" + t.index ["updater_id"], name: "fk__course_material_text_chunk_references_updater_id" + end + create_table "course_material_text_chunkings", id: :serial, force: :cascade do |t| t.datetime "created_at", null: false t.datetime "updated_at", null: false @@ -878,15 +891,9 @@ create_table "course_material_text_chunks", id: :serial, force: :cascade do |t| t.text "content", null: false t.vector "embedding", limit: 1536, null: false - t.datetime "created_at", precision: nil, null: false - t.bigint "creator_id", null: false - t.bigint "course_id", null: false - t.bigint "course_material_id", null: false - t.index ["course_id"], name: "fk__course_material_text_chunks_course_id" - t.index 
["course_material_id", "content"], name: "index_text_chunks_on_text_chunk_id_and_content", unique: true - t.index ["course_material_id"], name: "fk__course_material_text_chunks_material_id" - t.index ["creator_id"], name: "fk__course_material_text_chunks_creator_id" + t.string "name", limit: 255, null: false t.index ["embedding"], name: "index_course_material_text_chunk_embedding", opclass: :vector_cosine_ops, using: :hnsw + t.index ["name"], name: "index_course_material_text_chunks_on_name" end create_table "course_materials", id: :serial, force: :cascade do |t| @@ -1635,11 +1642,12 @@ add_foreign_key "course_material_folders", "courses", name: "fk_course_material_folders_course_id" add_foreign_key "course_material_folders", "users", column: "creator_id", name: "fk_course_material_folders_creator_id" add_foreign_key "course_material_folders", "users", column: "updater_id", name: "fk_course_material_folders_updater_id" + add_foreign_key "course_material_text_chunk_references", "course_material_text_chunks", column: "text_chunk_id", name: "fk_course_material_text_chunk_references_text_chunk_id" + add_foreign_key "course_material_text_chunk_references", "course_materials", column: "material_id", name: "fk_course_material_text_chunk_references_material_id" + add_foreign_key "course_material_text_chunk_references", "users", column: "creator_id", name: "fk_course_material_text_chunk_references_creator_id" + add_foreign_key "course_material_text_chunk_references", "users", column: "updater_id", name: "fk_course_material_text_chunk_references_updater_id" add_foreign_key "course_material_text_chunkings", "course_materials", column: "material_id", name: "fk_course_material_text_chunkings_material_id" add_foreign_key "course_material_text_chunkings", "jobs", name: "fk_course_material_text_chunkings_job_id", on_delete: :nullify - add_foreign_key "course_material_text_chunks", "course_materials", name: "fk_course_material_text_chunks_material_id" - add_foreign_key 
"course_material_text_chunks", "courses", name: "fk_course_material_text_chunks_course_id" - add_foreign_key "course_material_text_chunks", "users", column: "creator_id", name: "fk_course_material_text_chunks_creator_id" add_foreign_key "course_materials", "course_material_folders", column: "folder_id", name: "fk_course_materials_folder_id" add_foreign_key "course_materials", "users", column: "creator_id", name: "fk_course_materials_creator_id" add_foreign_key "course_materials", "users", column: "updater_id", name: "fk_course_materials_updater_id" From a2c32c45f81b8050dc1dd48adc2c0a0d39d55c53 Mon Sep 17 00:00:00 2001 From: Jonaspng Date: Tue, 7 Jan 2025 18:11:05 +0800 Subject: [PATCH 17/18] feat(chunk_references): add course_material_text_chunk_references --- .../course/material/materials_controller.rb | 6 +-- app/jobs/course/material/text_chunk_job.rb | 2 + app/models/course/material.rb | 51 ++++++++++++++----- app/models/course/material/text_chunk.rb | 22 ++++++-- .../course/material/text_chunk_reference.rb | 29 +++++++++++ 5 files changed, 91 insertions(+), 19 deletions(-) create mode 100644 app/models/course/material/text_chunk_reference.rb diff --git a/app/controllers/course/material/materials_controller.rb b/app/controllers/course/material/materials_controller.rb index 712a94b8ccf..f65148641e8 100644 --- a/app/controllers/course/material/materials_controller.rb +++ b/app/controllers/course/material/materials_controller.rb @@ -46,9 +46,9 @@ def create_text_chunks end def destroy_text_chunks - if @material.text_chunks.destroy_all && @material.workflow_state == 'chunked' + if @material.text_chunk_references.destroy_all && @material.workflow_state == 'chunked' @material.delete_chunks! - @material.save + @material.save! 
head :ok else render json: { errors: @material.errors.full_messages.to_sentence }, status: :bad_request @@ -95,7 +95,7 @@ def last_text_chunking_job end def delete_material_text_chunks - if @material.text_chunks.destroy_all + if @material.text_chunk_references.destroy_all @material.delete_chunks! @material.save else diff --git a/app/jobs/course/material/text_chunk_job.rb b/app/jobs/course/material/text_chunk_job.rb index eef6b3b0819..e6a4b6aa7b8 100644 --- a/app/jobs/course/material/text_chunk_job.rb +++ b/app/jobs/course/material/text_chunk_job.rb @@ -6,6 +6,8 @@ class Course::Material::TextChunkJob < ApplicationJob protected def perform_tracked(material, current_user) + material.start_chunking! + material.save! material.build_text_chunks(current_user) material.finish_chunking! material.save! diff --git a/app/models/course/material.rb b/app/models/course/material.rb index 39610b2ac8c..c9d78dfd2f9 100644 --- a/app/models/course/material.rb +++ b/app/models/course/material.rb @@ -20,8 +20,8 @@ class Course::Material < ApplicationRecord end belongs_to :folder, inverse_of: :materials, class_name: 'Course::Material::Folder' - has_many :text_chunks, inverse_of: :material, class_name: 'Course::Material::TextChunk', - dependent: :destroy, foreign_key: :course_material_id, autosave: true + has_many :text_chunk_references, inverse_of: :material, class_name: 'Course::Material::TextChunkReference', + dependent: :destroy, autosave: true has_one :text_chunking, class_name: 'Course::Material::TextChunking', dependent: :destroy, inverse_of: :material, autosave: true @@ -66,6 +66,8 @@ def next_valid_name def initialize_duplicate(duplicator, other) self.attachment = duplicator.duplicate(other.attachment) + self.text_chunk_references = other.text_chunk_references. 
+      map { |text_chunk_reference| duplicator.duplicate(text_chunk_reference) }
     self.folder = if duplicator.duplicated?(other.folder)
                     duplicator.duplicate(other.folder)
                   else
@@ -95,17 +97,14 @@ def text_chunking!(current_user)
   end
 
   def build_text_chunks(current_user)
-    start_chunking!
-    save!
-    course_id = folder.course_id
     File.open(attachment.path, 'r:ASCII-8BIT') do |file|
-      llm_service = Rag::LlmService.new
-      chunking_service = Rag::ChunkingService.new(file: file)
-      chunks = chunking_service.file_chunking
-      embeddings = llm_service.generate_embeddings_from_chunks(chunks)
-      chunks.each_with_index do |chunk, index|
-        text_chunks.build(embedding: embeddings[index], content: chunk, creator: current_user,
-                          course_id: course_id)
+      # Text chunks are looked up by the digest of the file's content, so a file that has
+      # already been chunked gets references to the stored chunks instead of new embeddings.
+      existing_text_chunks = Course::Material::TextChunk.existing_chunks(file: file)
+      if existing_text_chunks.exists?
+        create_references_for_existing_chunks(existing_text_chunks, current_user)
+      else
+        create_new_chunks_and_references(current_user, file)
       end
     end
     save!
@@ -133,4 +132,45 @@ def ensure_text_chunking!
     association(:text_chunking).reload
     text_chunking
   end
+
+  # Builds a reference from this material to every chunk in +existing_chunks+.
+  #
+  # @param existing_chunks the stored text chunks to link to; iterated with +find_each+
+  # @param current_user the user recorded as creator and updater of each reference
+  def create_references_for_existing_chunks(existing_chunks, current_user)
+    existing_chunks.find_each do |chunk|
+      text_chunk_references.build(
+        text_chunk: chunk,
+        creator: current_user,
+        updater: current_user
+      )
+    end
+  end
+
+  # Splits +file+ into chunks, generates an embedding for each of them and builds a new
+  # text chunk plus a reference to it per chunk. Records are only built here, not saved.
+  #
+  # @param current_user the user recorded as creator and updater of each reference
+  # @param file the file to chunk
+  def create_new_chunks_and_references(current_user, file)
+    llm_service = Rag::LlmService.new
+    chunking_service = Rag::ChunkingService.new(file: file)
+
+    # Name the chunks with the digest that +existing_chunks+ looks up, so that they are
+    # found again when a file with the same content is chunked.
+    file_digest = Course::Material::TextChunk.file_digest(file)
+    chunks = chunking_service.file_chunking
+    embeddings = llm_service.generate_embeddings_from_chunks(chunks)
+    chunks.each_with_index do |chunk, index|
+      text_chunk_references.build(
+        text_chunk: Course::Material::TextChunk.new(
+          name: file_digest,
+          embedding: embeddings[index],
+          content: chunk
+        ),
+        creator: current_user,
+        updater: current_user
+      )
+    end
+  end
 end
diff --git a/app/models/course/material/text_chunk.rb b/app/models/course/material/text_chunk.rb
index 3975512082e..149a0cc91c4 100644
--- a/app/models/course/material/text_chunk.rb
+++ b/app/models/course/material/text_chunk.rb
@@ -1,10 +1,27 @@
 # frozen_string_literal: true
 class Course::Material::TextChunk < ApplicationRecord
   has_neighbors :embedding
-  belongs_to :material, inverse_of: :text_chunks, class_name: 'Course::Material',
-             foreign_key: :course_material_id, autosave: true
-  validates :creator, presence: true
   validates :content, presence: true
   validates :embedding, presence: true
-  validates :course_id, presence: true
+  validates :name, presence: true
+  has_many :text_chunk_references, class_name: 'Course::Material::TextChunkReference',
+           dependent: :destroy
+
+  class << self
+    # Finds the text chunks generated from a file with the same content as the given file.
+    #
+    # @param attributes [Hash] query conditions; the +:file+ entry is replaced by a +name+
+    #   condition holding the digest of that file. The passed hash is not modified.
+    # @return the text chunks matching the conditions
+    def existing_chunks(attributes)
+      conditions = attributes.except(:file).merge(name: file_digest(attributes[:file]))
+      where(conditions)
+    end
+
+    # Computes the SHA-256 hex digest of +file+, which is stored as the +name+ of its chunks.
+    def file_digest(file)
+      # Get the actual file by #tempfile if the file is an `ActionDispatch::Http::UploadedFile`.
+      Digest::SHA256.file(file.try(:tempfile) || file).hexdigest
+    end
+  end
 end
diff --git a/app/models/course/material/text_chunk_reference.rb b/app/models/course/material/text_chunk_reference.rb
new file mode 100644
index 00000000000..95e4d0f55d9
--- /dev/null
+++ b/app/models/course/material/text_chunk_reference.rb
@@ -0,0 +1,32 @@
+# frozen_string_literal: true
+class Course::Material::TextChunkReference < ApplicationRecord
+  include DuplicationStateTrackingConcern
+
+  validates :creator, presence: true
+  validates :updater, presence: true
+  validates :text_chunk, presence: true
+  belongs_to :text_chunk, inverse_of: :text_chunk_references,
+             class_name: 'Course::Material::TextChunk'
+  belongs_to :material, inverse_of: :text_chunk_references, class_name: 'Course::Material'
+  after_destroy :destroy_text_chunk_if_no_references_left
+
+  # Sets this reference up as a duplicate of +other+: it points to the duplicated material
+  # but keeps the timestamps of +other+ and shares its text chunk instead of copying it.
+  def initialize_duplicate(duplicator, other)
+    self.material = duplicator.duplicate(other.material)
+    self.updated_at = other.updated_at
+    self.created_at = other.created_at
+    self.text_chunk = other.text_chunk
+    set_duplication_flag
+  end
+
+  private
+
+  # Destroys the text chunk once the last reference to it has been destroyed.
+  def destroy_text_chunk_if_no_references_left
+    # Only whether any reference remains matters, so an existence check replaces the count.
+    return if text_chunk.text_chunk_references.exists?
+
+    text_chunk.destroy
+  end
+end

From 0063c3a21305cc99cfcbbbda25ffa9a4539594fe Mon Sep 17 00:00:00 2001
From: Jonaspng
Date: Sat, 11 Jan 2025 21:22:14 +0800
Subject: [PATCH 18/18] chore(circle_ci): update circle_ci psql image

- update psql image to allow for pgvector
---
 .circleci/config.yml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index ea811c6a429..a9ee0ecee64 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -27,10 +27,11 @@ executors:
           DATABASE_URL: 'postgres://ubuntu@localhost:5432/coursemology_test'
           COLLECT_COVERAGE: << parameters.collects_rails_coverage >>
 
-      - image: cimg/postgres:16.1
+      - image: pgvector/pgvector:pg16
         environment:
           POSTGRES_USER: ubuntu
           POSTGRES_DB: coursemology_test
+          POSTGRES_PASSWORD: Testing1234
 
       - image: cimg/redis:7.2.3
 
@@ -129,7 +130,7 @@ commands:
       - run:
           name: Create coursemology_keycloak db
           command: |
-            DB_CONTAINER_ID=$(docker ps -q --filter ancestor=cimg/postgres:16.1)
+            DB_CONTAINER_ID=$(docker ps -q --filter ancestor=pgvector/pgvector:pg16)
             docker exec $DB_CONTAINER_ID psql -c "CREATE DATABASE coursemology_keycloak OWNER ubuntu;" -U ubuntu -d postgres
             docker exec $DB_CONTAINER_ID psql -c "CREATE DATABASE coursemology OWNER ubuntu;" -U ubuntu -d postgres
       - run:
@@ -148,7 +149,7 @@ commands:
           working_directory: authentication
           command: |
             touch .env
-            echo KC_NETWORK_MODE="container:$(docker ps -q --filter ancestor=cimg/postgres:16.1)" >> .env
+            echo KC_NETWORK_MODE="container:$(docker ps -q --filter ancestor=pgvector/pgvector:pg16)" >> .env
             echo KC_DB="postgres" >> .env
             echo KC_DB_URL="jdbc:postgresql://localhost:5432/coursemology_keycloak" >> .env
             echo KC_DB_USERNAME="ubuntu" >> .env