diff --git a/app/evals/test_sentiment_accuracy_eval.rb b/app/evals/test_sentiment_accuracy_eval.rb new file mode 100644 index 0000000..b79ec7d --- /dev/null +++ b/app/evals/test_sentiment_accuracy_eval.rb @@ -0,0 +1,6 @@ +class TestSentimentAccuracyEval < Leva::BaseEval + def evaluate(prediction, expected) + score = prediction == expected ? 1.0 : 0.0 + Leva::Result.new(label: 'sentiment_accuracy', score: score) + end +end \ No newline at end of file diff --git a/app/models/leva/dataset.rb b/app/models/leva/dataset.rb index 7fb84dc..32c6f29 100644 --- a/app/models/leva/dataset.rb +++ b/app/models/leva/dataset.rb @@ -10,7 +10,10 @@ module Leva class Dataset < ApplicationRecord has_many :dataset_records, dependent: :destroy - has_many :records, through: :dataset_records, source: :recordable has_many :experiments, dependent: :destroy + + def add_record(record) + dataset_records.create(recordable: record) + end end end diff --git a/app/models/leva/experiment.rb b/app/models/leva/experiment.rb index 1ed7287..aef133a 100644 --- a/app/models/leva/experiment.rb +++ b/app/models/leva/experiment.rb @@ -9,7 +9,7 @@ # created_at :datetime not null # updated_at :datetime not null # dataset_id :integer not null -# prompt_id :integer not null +# prompt_id :integer # # Indexes # diff --git a/app/runners/test_sentiment_run.rb b/app/runners/test_sentiment_run.rb new file mode 100644 index 0000000..8be7fa5 --- /dev/null +++ b/app/runners/test_sentiment_run.rb @@ -0,0 +1,13 @@ +class TestSentimentRun < Leva::BaseRun + def execute(record) + # Simple sentiment analysis logic for testing + case record.content.downcase + when /love|great|excellent/ + "Positive" + when /terrible|bad|awful/ + "Negative" + else + "Neutral" + end + end +end \ No newline at end of file diff --git a/db/migrate/20240813173222_create_leva_experiments.rb b/db/migrate/20240813173222_create_leva_experiments.rb index 4645c6e..be8e7b6 100644 --- a/db/migrate/20240813173222_create_leva_experiments.rb +++ b/db/migrate/20240813173222_create_leva_experiments.rb @@ -3,7 +3,7 @@ def change create_table :leva_experiments do |t| t.string :name t.references :dataset, null: false, foreign_key: true - t.references :prompt, null: false, foreign_key: true + t.references :prompt, null: true, foreign_key: true t.integer :status t.text :metadata diff --git a/test/dummy/db/schema.rb b/test/dummy/db/schema.rb index ac9555f..09ca119 100644 --- a/test/dummy/db/schema.rb +++ b/test/dummy/db/schema.rb @@ -42,7 +42,7 @@ create_table "leva_experiments", force: :cascade do |t| t.string "name" t.integer "dataset_id", null: false - t.integer "prompt_id", null: false + t.integer "prompt_id" t.integer "status" t.text "metadata" t.datetime "created_at", null: false diff --git a/test/fixtures/leva/dataset_records.yml b/test/fixtures/leva/dataset_records.yml deleted file mode 100644 index 1efe9c6..0000000 --- a/test/fixtures/leva/dataset_records.yml +++ /dev/null @@ -1,30 +0,0 @@ -# == Schema Information -# -# Table name: leva_dataset_records -# -# id :integer not null, primary key -# recordable_type :string not null -# created_at :datetime not null -# updated_at :datetime not null -# dataset_id :integer not null -# recordable_id :integer not null -# -# Indexes -# -# index_leva_dataset_records_on_dataset_id (dataset_id) -# index_leva_dataset_records_on_recordable (recordable_type,recordable_id) -# -# Foreign Keys -# -# dataset_id (dataset_id => leva_datasets.id) -# - -one: - dataset: one - recordable: one - recordable_type: Recordable - -two: - dataset: two - recordable: two - recordable_type: Recordable diff --git a/test/fixtures/leva/datasets.yml b/test/fixtures/leva/datasets.yml deleted file mode 100644 index 3c42756..0000000 --- a/test/fixtures/leva/datasets.yml +++ /dev/null @@ -1,15 +0,0 @@ -# == Schema Information -# -# Table name: leva_datasets -# -# id :integer not null, primary key -# name :string -# created_at :datetime not null -# updated_at :datetime not null -# - -one: - name: MyString - -two: - name: MyString diff --git a/test/fixtures/leva/evaluation_results.yml b/test/fixtures/leva/evaluation_results.yml deleted file mode 100644 index a688061..0000000 --- a/test/fixtures/leva/evaluation_results.yml +++ /dev/null @@ -1,37 +0,0 @@ -# == Schema Information -# -# Table name: leva_evaluation_results -# -# id :integer not null, primary key -# label :string -# prediction :string -# score :float -# created_at :datetime not null -# updated_at :datetime not null -# dataset_record_id :integer not null -# experiment_id :integer not null -# -# Indexes -# -# index_leva_evaluation_results_on_dataset_record_id (dataset_record_id) -# index_leva_evaluation_results_on_experiment_id (experiment_id) -# -# Foreign Keys -# -# dataset_record_id (dataset_record_id => leva_dataset_records.id) -# experiment_id (experiment_id => leva_experiments.id) -# - -one: - experiment: one - dataset_record: one - prediction: MyString - score: 1.5 - label: MyString - -two: - experiment: two - dataset_record: two - prediction: MyString - score: 1.5 - label: MyString diff --git a/test/fixtures/leva/experiments.yml b/test/fixtures/leva/experiments.yml deleted file mode 100644 index 6211a4a..0000000 --- a/test/fixtures/leva/experiments.yml +++ /dev/null @@ -1,37 +0,0 @@ -# == Schema Information -# -# Table name: leva_experiments -# -# id :integer not null, primary key -# metadata :text -# name :string -# status :integer -# created_at :datetime not null -# updated_at :datetime not null -# dataset_id :integer not null -# prompt_id :integer not null -# -# Indexes -# -# index_leva_experiments_on_dataset_id (dataset_id) -# index_leva_experiments_on_prompt_id (prompt_id) -# -# Foreign Keys -# -# dataset_id (dataset_id => leva_datasets.id) -# prompt_id (prompt_id => leva_prompts.id) -# - -one: - name: MyString - dataset: one - prompt: one - status: 1 - metadata: MyText - -two: - name: MyString - dataset: two - prompt: two - status: 1 - metadata: MyText diff --git a/test/fixtures/leva/prompts.yml b/test/fixtures/leva/prompts.yml deleted file mode 100644 index 10616c9..0000000 --- a/test/fixtures/leva/prompts.yml +++ /dev/null @@ -1,27 +0,0 @@ -# == Schema Information -# -# Table name: leva_prompts -# -# id :integer not null, primary key -# metadata :text -# name :string -# system_prompt :text -# user_prompt :text -# version :integer -# created_at :datetime not null -# updated_at :datetime not null -# - -one: - name: MyString - version: 1 - system_prompt: MyText - user_prompt: MyText - metadata: MyText - -two: - name: MyString - version: 1 - system_prompt: MyText - user_prompt: MyText - metadata: MyText diff --git a/test/models/leva/experiment_test.rb b/test/models/leva/experiment_test.rb index a9e7ce8..5537a3f 100644 --- a/test/models/leva/experiment_test.rb +++ b/test/models/leva/experiment_test.rb @@ -9,7 +9,7 @@ # created_at :datetime not null # updated_at :datetime not null # dataset_id :integer not null -# prompt_id :integer not null +# prompt_id :integer # # Indexes # @@ -25,8 +25,33 @@ module Leva class ExperimentTest < ActiveSupport::TestCase - # test "the truth" do - # assert true - # end + def setup + dataset = Leva::Dataset.create(name: "Sentiment Analysis Dataset") + dataset.add_record TextContent.create(text: "I love this product!", expected_label: "Positive") + dataset.add_record TextContent.create(text: "Terrible experience", expected_label: "Negative") + dataset.add_record TextContent.create(text: "I's ok", expected_label: "Neutral") + @experiment = Leva::Experiment.create!(name: "Sentiment Analysis", dataset: dataset) + + @run = TestSentimentRun.new + @evals = [TestSentimentAccuracyEval.new, TestSentimentF1Eval.new] + end + + test "run evaluation with two evals and one runner" do + Leva.run_evaluation(experiment: @experiment, run: @run, evals: @evals) + + assert_equal 6, @experiment.evaluation_results.count, "Should have 6 evaluation results (1 run * 3 records * 2 evals)" + + accuracy_results = @experiment.evaluation_results.where(label: 'sentiment_accuracy') + f1_results = @experiment.evaluation_results.where(label: 'sentiment_f1') + + assert_equal 3, accuracy_results.count, "Should have 3 accuracy results" + assert_equal 3, f1_results.count, "Should have 3 F1 results" + + average_accuracy = accuracy_results.average(:score) + average_f1 = f1_results.average(:score) + + assert_in_delta 0.67, average_accuracy, 0.01, "Average accuracy should be about 0.67 (2 out of 3 correct)" + assert_in_delta 0.67, average_f1, 0.01, "Average F1 score should be about 0.67 (2 out of 3 correct)" + end end end