Skip to content

Commit

Permalink
feat(chunk_references): add course_material_text_chunk_references
Browse files Browse the repository at this point in the history
  • Loading branch information
Jonaspng committed Jan 11, 2025
1 parent 132fe59 commit 72307db
Show file tree
Hide file tree
Showing 5 changed files with 91 additions and 19 deletions.
6 changes: 3 additions & 3 deletions app/controllers/course/material/materials_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,9 @@ def create_text_chunks
end

def destroy_text_chunks
if @material.text_chunks.destroy_all && @material.workflow_state == 'chunked'
if @material.text_chunk_references.destroy_all && @material.workflow_state == 'chunked'
@material.delete_chunks!
@material.save
@material.save!
head :ok
else
render json: { errors: @material.errors.full_messages.to_sentence }, status: :bad_request
Expand Down Expand Up @@ -95,7 +95,7 @@ def last_text_chunking_job
end

def delete_material_text_chunks
if @material.text_chunks.destroy_all
if @material.text_chunk_references.destroy_all
@material.delete_chunks!
@material.save
else
Expand Down
2 changes: 2 additions & 0 deletions app/jobs/course/material/text_chunk_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ class Course::Material::TextChunkJob < ApplicationJob
protected

def perform_tracked(material, current_user)
material.start_chunking!
material.save!
material.build_text_chunks(current_user)
material.finish_chunking!
material.save!
Expand Down
51 changes: 39 additions & 12 deletions app/models/course/material.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ class Course::Material < ApplicationRecord
end

belongs_to :folder, inverse_of: :materials, class_name: 'Course::Material::Folder'
has_many :text_chunks, inverse_of: :material, class_name: 'Course::Material::TextChunk',
dependent: :destroy, foreign_key: :course_material_id, autosave: true
has_many :text_chunk_references, inverse_of: :material, class_name: 'Course::Material::TextChunkReference',
dependent: :destroy, autosave: true
has_one :text_chunking, class_name: 'Course::Material::TextChunking',
dependent: :destroy, inverse_of: :material, autosave: true

Expand Down Expand Up @@ -66,6 +66,8 @@ def next_valid_name

def initialize_duplicate(duplicator, other)
self.attachment = duplicator.duplicate(other.attachment)
self.text_chunk_references = other.text_chunk_references.
map { |text_chunk_reference| duplicator.duplicate(text_chunk_reference) }
self.folder = if duplicator.duplicated?(other.folder)
duplicator.duplicate(other.folder)
else
Expand Down Expand Up @@ -95,17 +97,12 @@ def text_chunking!(current_user)
end

def build_text_chunks(current_user)
start_chunking!
save!
course_id = folder.course_id
File.open(attachment.path, 'r:ASCII-8BIT') do |file|
llm_service = Rag::LlmService.new
chunking_service = Rag::ChunkingService.new(file: file)
chunks = chunking_service.file_chunking
embeddings = llm_service.generate_embeddings_from_chunks(chunks)
chunks.each_with_index do |chunk, index|
text_chunks.build(embedding: embeddings[index], content: chunk, creator: current_user,
course_id: course_id)
existing_text_chunks = Course::Material::TextChunk.existing_chunks(file: file)
if existing_text_chunks.exists?
create_references_for_existing_chunks(existing_text_chunks, current_user)
else
create_new_chunks_and_references(current_user, file)
end
end
save!
Expand Down Expand Up @@ -133,4 +130,34 @@ def ensure_text_chunking!
association(:text_chunking).reload
text_chunking
end

# Builds (without saving) one text chunk reference on this material for every
# chunk yielded by +existing_chunks+, so already-stored chunks are linked
# instead of being recreated.
#
# @param existing_chunks [#find_each] the chunks to link to this material.
# @param current_user [Object] recorded as both creator and updater of each reference.
def create_references_for_existing_chunks(existing_chunks, current_user)
  audit_attributes = { creator: current_user, updater: current_user }

  existing_chunks.find_each do |existing_chunk|
    text_chunk_references.build(text_chunk: existing_chunk, **audit_attributes)
  end
end

# Chunks +file+, generates an embedding for every chunk, and builds (without
# saving) a new text chunk plus a reference to it on this material for each one.
# Every new chunk is named with the SHA256 hex digest of the file.
#
# @param current_user [Object] recorded as both creator and updater of each reference.
# @param file [Object] the file to chunk; if it responds to +tempfile+, the
#   tempfile is what gets digested.
def create_new_chunks_and_references(current_user, file)
  llm_service = Rag::LlmService.new
  chunking_service = Rag::ChunkingService.new(file: file)

  digest_source = file.try(:tempfile) || file
  file_digest = Digest::SHA256.file(digest_source).hexdigest
  chunks = chunking_service.file_chunking
  embeddings = llm_service.generate_embeddings_from_chunks(chunks)
  audit_attributes = { creator: current_user, updater: current_user }

  chunks.each_with_index do |content, position|
    new_chunk = Course::Material::TextChunk.new(
      name: file_digest,
      embedding: embeddings[position],
      content: content
    )
    text_chunk_references.build(text_chunk: new_chunk, **audit_attributes)
  end
end
end
22 changes: 18 additions & 4 deletions app/models/course/material/text_chunk.rb
Original file line number Diff line number Diff line change
@@ -1,10 +1,24 @@
# frozen_string_literal: true
class Course::Material::TextChunk < ApplicationRecord
has_neighbors :embedding
belongs_to :material, inverse_of: :text_chunks, class_name: 'Course::Material',
foreign_key: :course_material_id, autosave: true
validates :creator, presence: true
validates :content, presence: true
validates :embedding, presence: true
validates :course_id, presence: true
validates :name, presence: true
has_many :text_chunk_references, class_name: 'Course::Material::TextChunkReference',
dependent: :destroy

class << self
  # Finds the chunks whose +name+ equals the SHA256 hex digest of the given file,
  # narrowed further by any other conditions supplied.
  #
  # The caller's hash is left untouched: the conditions are built on a copy, so
  # the +:file+ entry is not removed from, and +:name+ is not added to, the
  # argument that was passed in.
  #
  # @param attributes [Hash] query conditions plus a +:file+ entry holding the
  #   file to digest.
  # @return the result of +where+ for the computed conditions.
  def existing_chunks(attributes)
    conditions = attributes.dup
    file = conditions.delete(:file)
    conditions[:name] = file_digest(file)
    where(conditions)
  end

  private

  # Computes the SHA256 hex digest used as a chunk's +name+.
  def file_digest(file)
    # Get the actual file by #tempfile if the file is an `ActionDispatch::Http::UploadedFile`.
    Digest::SHA256.file(file.try(:tempfile) || file).hexdigest
  end
end
end
29 changes: 29 additions & 0 deletions app/models/course/material/text_chunk_reference.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# frozen_string_literal: true
# Links a course material to a text chunk. Several references may point at the
# same chunk; when a reference is destroyed and no reference to its chunk
# remains, the chunk is destroyed as well.
class Course::Material::TextChunkReference < ApplicationRecord
  include DuplicationStateTrackingConcern

  validates :creator, presence: true
  validates :updater, presence: true
  validates :text_chunk, presence: true
  belongs_to :text_chunk, inverse_of: :text_chunk_references,
                          class_name: 'Course::Material::TextChunk'
  belongs_to :material, inverse_of: :text_chunk_references, class_name: 'Course::Material'
  after_destroy :destroy_text_chunk_if_no_references_left

  # Populates this record as a duplicate of +other+: the material is obtained
  # through the duplicator, while the timestamps and the text chunk are taken
  # from +other+ directly, so the duplicate points at the same chunk.
  #
  # @param duplicator [Object] used to duplicate the associated material.
  # @param other [Course::Material::TextChunkReference] the reference being duplicated.
  def initialize_duplicate(duplicator, other)
    self.material = duplicator.duplicate(other.material)
    self.updated_at = other.updated_at
    self.created_at = other.created_at
    self.text_chunk = other.text_chunk
    # NOTE(review): presumably provided by DuplicationStateTrackingConcern - confirm.
    set_duplication_flag
  end

  private

  # Destroys the associated text chunk once nothing references it any more.
  def destroy_text_chunk_if_no_references_left
    # A missing chunk leaves nothing to clean up; returning here keeps the
    # callback from raising on nil and aborting the reference's own destroy.
    return if text_chunk.nil?
    # `exists?` asks the database whether any row remains instead of counting them all.
    return if text_chunk.text_chunk_references.exists?

    text_chunk.destroy
  end
end

0 comments on commit 72307db

Please sign in to comment.