Skip to content

Commit

Permalink
Merge pull request #21 from forcedotcom/mdonnalley/update-hro
Browse files Browse the repository at this point in the history
fix: update HRO
  • Loading branch information
mdonnalley authored Dec 18, 2024
2 parents ca0b196 + f0a4d12 commit 61d8240
Show file tree
Hide file tree
Showing 5 changed files with 202 additions and 75 deletions.
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
"@salesforce/kit": "^3.2.3",
"@salesforce/sf-plugins-core": "^12.1.0",
"@salesforce/source-deploy-retrieve": "^12.10.3",
"ansis": "^3.4.0",
"fast-xml-parser": "^4",
"nock": "^13.5.6"
},
Expand Down
127 changes: 116 additions & 11 deletions src/agentTester.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
*/
import { Connection, Lifecycle, PollingClient, StatusResult } from '@salesforce/core';
import { Duration } from '@salesforce/kit';
import ansis from 'ansis';
import { MaybeMock } from './maybe-mock';

export type TestStatus = 'NEW' | 'IN_PROGRESS' | 'COMPLETED' | 'ERROR';
Expand All @@ -25,6 +26,7 @@ export type AgentTestStatusResponse = {
export type TestCaseResult = {
status: TestStatus;
number: string;
utterance: string;
startTime: string;
endTime?: string;
generatedData: {
Expand Down Expand Up @@ -180,28 +182,131 @@ export class AgentTester {
}
}

/**
 * Translates an expectation's API name into the short label used in the
 * human-readable results table.
 *
 * @param name expectation name as it appears on an expectation result
 * @returns the short label, or `name` unchanged when no label is defined for it
 */
function humanFriendlyName(name: string): string {
  // A Map (rather than a plain object) keeps lookups limited to the keys listed here.
  const labels = new Map<string, string>([
    ['topic_sequence_match', 'Topic'],
    ['action_sequence_match', 'Action'],
    ['bot_response_rating', 'Outcome'],
  ]);
  return labels.get(name) ?? name;
}

/**
 * Formats a number with a fixed count of fractional digits, truncating (not
 * rounding) any digits beyond that count.
 *
 * The fractional part is always padded with zeros to exactly `decimals`
 * digits, so `1.5` and `2` both render with the same width (`1.50`, `2.00`).
 *
 * @param value number to format
 * @param decimals number of fractional digits to keep; `0` yields the whole part only
 * @returns the formatted number, prefixed with `-` for negative input
 */
function truncate(value: number, decimals = 2): string {
  // Work on the magnitude so the whole and fractional parts are split the same
  // way for negative input (Math.floor alone would turn -1.5 into "-2.5").
  const sign = value < 0 ? '-' : '';
  const magnitude = Math.abs(value);
  const wholeNumberPart = Math.floor(magnitude).toString();
  if (!decimals) {
    return `${sign}${wholeNumberPart}`;
  }

  const remainder = magnitude % 1;
  const text = remainder.toString();
  // Very small remainders stringify in exponent notation ("1e-7"), which has no
  // usable "." section; fall back to fixed notation for those.
  const digits = (text.includes('e') ? remainder.toFixed(20) : text).split('.')[1] ?? '';
  const fractionalPart = digits.slice(0, decimals).padEnd(decimals, '0');
  return `${sign}${wholeNumberPart}.${fractionalPart}`;
}

/**
 * Converts a duration in milliseconds into a short human-readable string.
 *
 * - under one second: `< 1s`
 * - under one minute: seconds, e.g. `12.34s`
 * - under one hour: minutes and seconds, e.g. `3m 12.34s`
 * - otherwise: hours and minutes, e.g. `2h 5m`
 *
 * @param time duration in milliseconds
 * @param decimalPlaces fractional digits to keep on the seconds value
 * @returns the formatted duration
 */
function readableTime(time: number, decimalPlaces = 2): string {
  // Sub-second durations are not broken down any further.
  if (time < 1000) {
    return '< 1s';
  }

  // if time < 60s, return time in seconds
  if (time < 60_000) {
    return `${truncate(time / 1000, decimalPlaces)}s`;
  }

  // if time < 60m, return time in minutes and seconds
  if (time < 3_600_000) {
    const minutes = Math.floor(time / 60_000);
    const seconds = truncate((time % 60_000) / 1000, decimalPlaces);
    return `${minutes}m ${seconds}s`;
  }

  // if time >= 60m, return time in hours and minutes
  const hours = Math.floor(time / 3_600_000);
  const minutes = Math.floor((time % 3_600_000) / 60_000);
  return `${hours}h ${minutes}m`;
}

/**
 * Builds a plain two-column text table from key/value pairs, preceded by a title line.
 *
 * Both columns are right-padded with spaces to the width of their longest
 * entry and separated by a single space.
 *
 * @param data rows to render; each key is the first column and its value the second
 * @param title text placed on its own line above the rows
 * @returns the title followed by one line per row, or an empty string when `data` has no entries
 */
function makeSimpleTable(data: Record<string, string>, title: string): string {
  const rows = Object.entries(data);
  if (rows.length === 0) {
    return '';
  }

  // One pass over the rows to find both column widths.
  let keyWidth = 0;
  let valueWidth = 0;
  for (const [key, value] of rows) {
    keyWidth = Math.max(keyWidth, key.length);
    valueWidth = Math.max(valueWidth, value.length);
  }

  const lines = rows.map(([key, value]) => `${key.padEnd(keyWidth)} ${value.padEnd(valueWidth)}`);
  return [title, ...lines].join('\n');
}

/**
 * Renders agent test results as human-readable text.
 *
 * The output contains, in order:
 * 1. one table per test case (titled with the test case number and utterance)
 *    listing each expectation's label, pass/fail result, expected value and actual value;
 * 2. a "Test Results" summary with the run status, duration and the pass
 *    percentage for the topic, action and outcome expectations;
 * 3. a "Failed Test Cases" list naming the failed expectations of every test
 *    case whose status is `ERROR` (omitted when there are none).
 *
 * @param details test run details to format
 * @returns the formatted text
 */
export async function humanFormat(details: AgentTestDetailsResponse): Promise<string> {
  const { Ux } = await import('@salesforce/sf-plugins-core');
  const ux = new Ux();
  const { testCases } = details.testSet;

  const tables = testCases.map((testCase) =>
    ux.makeTable({
      title: `${ansis.bold(`Test Case #${testCase.number}`)}\n${ansis.dim('Utterance')}: ${testCase.utterance}`,
      overflow: 'wrap',
      data: testCase.expectationResults.map((r) => ({
        test: humanFriendlyName(r.name),
        result: r.result === 'Passed' ? ansis.green('Pass') : ansis.red('Fail'),
        expected: r.expectedValue,
        actual: r.actualValue,
      })),
    })
  );

  // Percentage of test cases whose first expectation with the given name passed.
  const passPercent = (expectationName: string): string => {
    // Avoid 0 / 0 => "NaN%" when the run contains no test cases.
    if (testCases.length === 0) {
      return '0.00%';
    }
    const passCount = testCases.filter(
      (tc) => tc.expectationResults.find((r) => r.name === expectationName)?.result === 'Passed'
    ).length;
    return `${((passCount / testCases.length) * 100).toFixed(2)}%`;
  };

  const results = {
    Status: details.status,
    Duration: details.endTime
      ? readableTime(new Date(details.endTime).getTime() - new Date(details.startTime).getTime())
      : 'Unknown',
    'Topic Pass %': passPercent('topic_sequence_match'),
    'Action Pass %': passPercent('action_sequence_match'),
    'Outcome Pass %': passPercent('bot_response_rating'),
  };
  const resultsTable = makeSimpleTable(results, ansis.bold.blue('Test Results'));

  const failedTestCasesObj = Object.fromEntries(
    testCases
      .filter((tc) => tc.status === 'ERROR')
      .map((tc) => [
        `Test Case #${tc.number}`,
        tc.expectationResults
          .filter((r) => r.result === 'Failed')
          .map((r) => r.name)
          .join(', '),
      ])
  );
  // makeSimpleTable returns '' when nothing failed, so the section disappears.
  const failedTestCasesTable = makeSimpleTable(failedTestCasesObj, ansis.red.bold('Failed Test Cases'));

  return tables.join('\n') + `\n${resultsTable}\n\n${failedTestCasesTable}\n`;
}

export async function jsonFormat(details: AgentTestDetailsResponse): Promise<string> {
Expand Down Expand Up @@ -249,7 +354,7 @@ export async function junitFormat(details: AgentTestDetailsResponse): Promise<st
failure: testCase.expectationResults
.map((r) => {
if (r.result === 'Failed') {
return { $message: r.errorMessage ?? 'Unknown error' };
return { $message: r.errorMessage ?? 'Unknown error', $name: r.name };
}
})
.filter((f) => f),
Expand Down
45 changes: 28 additions & 17 deletions test/agentTester.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import { readFile } from 'node:fs/promises';
import { expect } from 'chai';
import { MockTestOrgData, TestContext } from '@salesforce/core/testSetup';
import { Connection } from '@salesforce/core';
import { AgentTestDetailsResponse, AgentTester, junitFormat, tapFormat } from '../src/agentTester';
import { AgentTestDetailsResponse, AgentTester, humanFormat, junitFormat, tapFormat } from '../src/agentTester';

describe('AgentTester', () => {
const $$ = new TestContext();
Expand Down Expand Up @@ -82,6 +82,15 @@ describe('AgentTester', () => {
});
});

// Smoke test: formats the mocked run details and checks that some output is produced.
describe('humanFormat', () => {
  it('should transform test results to human readable format', async () => {
    const mockPath = './test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_details.json';
    const details = JSON.parse(await readFile(mockPath, 'utf8')) as AgentTestDetailsResponse;
    const formatted = await humanFormat(details);
    expect(formatted).to.be.ok;
  });
});

describe('junitFormatter', () => {
it('should transform test results to JUnit format', async () => {
const raw = await readFile('./test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_details.json', 'utf8');
Expand All @@ -91,11 +100,11 @@ describe('junitFormatter', () => {
<testsuites name="Copilot_for_Salesforce" tests="2" failures="1" time="20000">
<property name="status" value="COMPLETED"></property>
<property name="start-time" value="2024-11-28T12:00:00Z"></property>
<property name="end-time" value="2024-11-28T12:05:00Z"></property>
<testsuite name="CRM_Sanity_v1.1" time="10000" assertions="2"></testsuite>
<testsuite name="CRM_Sanity_v1.2" time="10000" assertions="2">
<failure message="Expected &quot;Result D&quot; but got &quot;Result C&quot;."></failure>
<failure message="Expected &quot;Result D&quot; but got &quot;Result C&quot;."></failure>
<property name="end-time" value="2024-11-28T12:00:48.56Z"></property>
<testsuite name="CRM_Sanity_v1.1" time="10000" assertions="3"></testsuite>
<testsuite name="CRM_Sanity_v1.2" time="10000" assertions="3">
<failure message="Actual response does not match the expected response" name="action_sequence_match"></failure>
<failure message="Actual response does not match the expected response" name="bot_response_rating"></failure>
</testsuite>
</testsuites>`);
});
Expand All @@ -107,22 +116,24 @@ describe('tapFormatter', () => {
const input = JSON.parse(raw) as AgentTestDetailsResponse;
const output = await tapFormat(input);
expect(output).to.deep.equal(`Tap Version 14
1..4
1..6
ok 1 CRM_Sanity_v1.1
ok 2 CRM_Sanity_v1.1
not ok 3 CRM_Sanity_v1.2
ok 3 CRM_Sanity_v1.1
ok 4 CRM_Sanity_v1.2
not ok 5 CRM_Sanity_v1.2
---
message: Expected "Result D" but got "Result C".
expectation: topic_sequence_match
actual: Result C
expected: Result D
message: Actual response does not match the expected response
expectation: action_sequence_match
actual: ["IdentifyRecordByName","QueryRecords"]
expected: ["IdentifyRecordByName","QueryRecords","GetActivitiesTimeline"]
...
not ok 4 CRM_Sanity_v1.2
not ok 6 CRM_Sanity_v1.2
---
message: Expected "Result D" but got "Result C".
expectation: topic_sequence_match
actual: Result C
expected: Result D
message: Actual response does not match the expected response
expectation: bot_response_rating
actual: It looks like I am unable to find the information you are looking for due to access restrictions. How else can I assist you?
expected: Summary of open cases and activities associated with timeline
...`);
});
});
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"status": "COMPLETED",
"startTime": "2024-11-28T12:00:00Z",
"endTime": "2024-11-28T12:05:00Z",
"endTime": "2024-11-28T12:00:48.56Z",
"errorMessage": null,
"subjectName": "Copilot_for_Salesforce",
"testSet": {
Expand All @@ -10,6 +10,7 @@
{
"status": "COMPLETED",
"number": 1,
"utterance": "Summarize account Acme",
"startTime": "2024-11-28T12:00:10Z",
"endTime": "2024-11-28T12:00:20Z",
"generatedData": {
Expand All @@ -23,8 +24,8 @@
"expectationResults": [
{
"name": "topic_sequence_match",
"actualValue": "Result A",
"expectedValue": "Result A",
"actualValue": "GeneralCRM",
"expectedValue": "GeneralCRM",
"score": 1.0,
"result": "Passed",
"metricLabel": "Accuracy",
Expand All @@ -37,8 +38,22 @@
},
{
"name": "action_sequence_match",
"actualValue": "Result B",
"expectedValue": "Result B",
"actualValue": "[\"IdentifyRecordByName\",\"SummarizeRecord\"]",
"expectedValue": "[\"IdentifyRecordByName\",\"SummarizeRecord\"]",
"score": 1.0,
"result": "Passed",
"metricLabel": "Precision",
"metricExplainability": "Measures the precision of the result.",
"status": "Completed",
"startTime": "2024-11-28T12:00:14Z",
"endTime": "2024-11-28T12:00:15Z",
"errorCode": null,
"errorMessage": null
},
{
"name": "bot_response_rating",
"actualValue": "Here is the summary of the account Acme. How else can I assist you? Acme is a customer since 2019. They have 3 open opportunities and 2 open cases.",
"expectedValue": "Summary of account details are shown",
"score": 0.9,
"result": "Passed",
"metricLabel": "Precision",
Expand All @@ -55,6 +70,7 @@
"status": "ERROR",
"number": 2,
"startTime": "2024-11-28T12:00:30Z",
"utterance": "Summarize the open cases and Activities of acme from sep to nov 2024",
"endTime": "2024-11-28T12:00:40Z",
"generatedData": {
"type": "AGENT",
Expand All @@ -67,31 +83,45 @@
"expectationResults": [
{
"name": "topic_sequence_match",
"actualValue": "Result C",
"expectedValue": "Result D",
"score": 0.5,
"result": "Failed",
"actualValue": "GeneralCRM",
"expectedValue": "GeneralCRM",
"score": 1,
"result": "Passed",
"metricLabel": "Accuracy",
"metricExplainability": "Measures the correctness of the result.",
"status": "Completed",
"startTime": "2024-11-28T12:00:32Z",
"endTime": "2024-11-28T12:00:33Z",
"errorCode": null,
"errorMessage": "Expected \"Result D\" but got \"Result C\"."
"errorMessage": null
},
{
"name": "topic_sequence_match",
"actualValue": "Result C",
"expectedValue": "Result D",
"name": "action_sequence_match",
"actualValue": "[\"IdentifyRecordByName\",\"QueryRecords\"]",
"expectedValue": "[\"IdentifyRecordByName\",\"QueryRecords\",\"GetActivitiesTimeline\"]",
"score": 0.5,
"result": "Failed",
"metricLabel": "Accuracy",
"metricExplainability": "Measures the correctness of the result.",
"metricLabel": "Precision",
"metricExplainability": "Measures the precision of the result.",
"status": "Completed",
"startTime": "2024-11-28T12:00:32Z",
"endTime": "2024-11-28T12:00:33Z",
"errorCode": null,
"errorMessage": "Expected \"Result D\" but got \"Result C\"."
"startTime": "2024-11-28T12:00:14Z",
"endTime": "2024-11-28T12:00:15Z",
"errorCode": 1,
"errorMessage": "Actual response does not match the expected response"
},
{
"name": "bot_response_rating",
"actualValue": "It looks like I am unable to find the information you are looking for due to access restrictions. How else can I assist you?",
"expectedValue": "Summary of open cases and activities associated with timeline",
"score": 0.1,
"result": "Failed",
"metricLabel": "Precision",
"metricExplainability": "Measures the precision of the result.",
"status": "Completed",
"startTime": "2024-11-28T12:00:14Z",
"endTime": "2024-11-28T12:00:15Z",
"errorCode": 1,
"errorMessage": "Actual response does not match the expected response"
}
]
}
Expand Down
Loading

0 comments on commit 61d8240

Please sign in to comment.