Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[OPIK-622] Remove deprecated dataset item fields #1003

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,6 @@
public record DatasetItem(
@JsonView( {
DatasetItem.View.Public.class, DatasetItem.View.Write.class}) UUID id,
@JsonView({DatasetItem.View.Public.class,
DatasetItem.View.Write.class}) @Schema(deprecated = true, description = "to be deprecated soon, please use data field") JsonNode input,
@JsonView({DatasetItem.View.Public.class,
DatasetItem.View.Write.class}) @Schema(deprecated = true, description = "to be deprecated soon, please use data field") JsonNode expectedOutput,
@JsonView({DatasetItem.View.Public.class,
DatasetItem.View.Write.class}) @Schema(deprecated = true, description = "to be deprecated soon, please use data field") JsonNode metadata,
@JsonView({DatasetItem.View.Public.class, DatasetItem.View.Write.class}) UUID traceId,
@JsonView({DatasetItem.View.Public.class, DatasetItem.View.Write.class}) UUID spanId,
@JsonView({DatasetItem.View.Public.class, DatasetItem.View.Write.class}) @NotNull DatasetItemSource source,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@ public class DatasetItemInputValidator implements ConstraintValidator<DatasetIte

@Override
public boolean isValid(DatasetItem datasetItem, ConstraintValidatorContext context) {
boolean result = datasetItem.input() != null || MapUtils.isNotEmpty(datasetItem.data());
boolean result = MapUtils.isNotEmpty(datasetItem.data());
Copy link
Collaborator

@andrescrz andrescrz Jan 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe this whole class doesn't make sense anymore. You should completely remove it, along with the related annotation, and then simply add a @NotEmpty annotation to the data field of DatasetItem, which works with collections (it requires the map to be both non-null and non-empty).

Please make sure there's a test covering that this works fine.


if (!result) {
context.disableDefaultConstraintViolation();
context.buildConstraintViolationWithTemplate("must provide either input or data field")
.addPropertyNode("input")
context.buildConstraintViolationWithTemplate("must provide data field")
.addPropertyNode("data")
.addConstraintViolation();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,10 +74,7 @@ INSERT INTO dataset_items (
source,
trace_id,
span_id,
input,
data,
expected_output,
metadata,
created_at,
workspace_id,
created_by,
Expand All @@ -91,10 +88,7 @@ INSERT INTO dataset_items (
:source<item.index>,
:traceId<item.index>,
:spanId<item.index>,
:input<item.index>,
:data<item.index>,
:expectedOutput<item.index>,
:metadata<item.index>,
now64(9),
:workspace_id,
:createdBy<item.index>,
Expand Down Expand Up @@ -144,10 +138,7 @@ INSERT INTO dataset_items (
SELECT
id,
dataset_id,
input,
<if(truncate)> mapApply((k, v) -> (k, replaceRegexpAll(v, '<truncate>', '"[image]"')), data) as data <else> data <endif>,
expected_output,
metadata,
trace_id,
span_id,
source,
Expand Down Expand Up @@ -410,10 +401,7 @@ AND id IN (
SELECT
di.id AS id,
di.dataset_id AS dataset_id,
di.input AS input,
<if(truncate)> mapApply((k, v) -> (k, replaceRegexpAll(v, '<truncate>', '"[image]"')), di.data) as data <else> di.data <endif>,
di.expected_output AS expected_output,
di.metadata AS metadata,
di.trace_id AS trace_id,
di.span_id AS span_id,
di.source AS source,
Expand Down Expand Up @@ -462,10 +450,7 @@ AND id IN (SELECT trace_id FROM experiment_items_final)
GROUP BY
di.id,
di.dataset_id,
di.input,
di.data,
di.expected_output,
di.metadata,
di.trace_id,
di.span_id,
di.source,
Expand Down Expand Up @@ -571,28 +556,12 @@ private Mono<Long> mapAndInsert(
for (DatasetItem item : items) {
Map<String, JsonNode> data = new HashMap<>(Optional.ofNullable(item.data()).orElse(Map.of()));

if (!data.containsKey("input") && item.input() != null) {
data.put("input", item.input());
}

if (!data.containsKey("expected_output") && item.expectedOutput() != null) {
data.put("expected_output", item.expectedOutput());
}

if (!data.containsKey("metadata") && item.metadata() != null) {
data.put("metadata", item.metadata());
}

statement.bind("id" + i, item.id());
statement.bind("datasetId" + i, datasetId);
statement.bind("source" + i, item.source().getValue());
statement.bind("traceId" + i, DatasetItemResultMapper.getOrDefault(item.traceId()));
statement.bind("spanId" + i, DatasetItemResultMapper.getOrDefault(item.spanId()));
statement.bind("input" + i, DatasetItemResultMapper.getOrDefault(item.input()));
statement.bind("data" + i, DatasetItemResultMapper.getOrDefault(data));
statement.bind("expectedOutput" + i,
DatasetItemResultMapper.getOrDefault(item.expectedOutput()));
statement.bind("metadata" + i, DatasetItemResultMapper.getOrDefault(item.metadata()));
statement.bind("createdBy" + i, userName);
statement.bind("lastUpdatedBy" + i, userName);
i++;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,28 +105,9 @@ static Publisher<DatasetItem> mapItem(Result results) {

Map<String, JsonNode> data = getData(row);

JsonNode input = getJsonNode(row, data, "input");
JsonNode expectedOutput = getJsonNode(row, data, "expected_output");
JsonNode metadata = getJsonNode(row, data, "metadata");
Comment on lines -108 to -110
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The getJsonNode method is no longer used. Please clean up all unused methods in this class.


if (!data.containsKey("input")) {
data.put("input", input);
}

if (!data.containsKey("expected_output")) {
data.put("expected_output", expectedOutput);
}

if (!data.containsKey("metadata")) {
data.put("metadata", metadata);
}

return DatasetItem.builder()
.id(row.get("id", UUID.class))
.input(input)
.data(data)
.expectedOutput(expectedOutput)
.metadata(metadata)
.source(DatasetItemSource.fromString(row.get("source", String.class)))
.traceId(Optional.ofNullable(row.get("trace_id", String.class))
.filter(s -> !s.isBlank())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,6 @@
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
Expand Down Expand Up @@ -2770,18 +2769,16 @@ public Stream<Arguments> invalidDatasetItems() {
return Stream.of(
arguments(factory.manufacturePojo(DatasetItemBatch.class).toBuilder()
.items(List.of(factory.manufacturePojo(DatasetItem.class).toBuilder()
.input(null)
.data(null)
.build()))
.build(),
"items[0].input must provide either input or data field"),
"items[0].data must provide data field"),
arguments(factory.manufacturePojo(DatasetItemBatch.class).toBuilder()
.items(List.of(factory.manufacturePojo(DatasetItem.class).toBuilder()
.input(null)
.data(Map.of())
.build()))
.build(),
"items[0].input must provide either input or data field"),
"items[0].data must provide data field"),
arguments(factory.manufacturePojo(DatasetItemBatch.class).toBuilder()
.items(List.of(factory.manufacturePojo(DatasetItem.class).toBuilder()
.source(null)
Expand Down Expand Up @@ -2973,28 +2970,10 @@ void createDatasetItem__whenDatasetItemWorkspaceAndSpanWorkspaceDoesNotMatch__th
}
}

@Test
@DisplayName("when data is null, the accept the request")
void create__whenDataIsNull__thenAcceptTheRequest() {
var item = factory.manufacturePojo(DatasetItem.class).toBuilder()
.data(null)
.build();

var batch = factory.manufacturePojo(DatasetItemBatch.class).toBuilder()
.items(List.of(item))
.datasetId(null)
.build();

putAndAssert(batch, TEST_WORKSPACE, API_KEY);

getItemAndAssert(item, TEST_WORKSPACE, API_KEY);
}

@Test
@DisplayName("when input is null but data is present, the accept the request")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please review the display name of the test and everything else.

void create__whenInputIsNullButDataIsPresent__thenAcceptTheRequest() {
var item = factory.manufacturePojo(DatasetItem.class).toBuilder()
.input(null)
.build();

var batch = factory.manufacturePojo(DatasetItemBatch.class).toBuilder()
Expand Down Expand Up @@ -3160,7 +3139,6 @@ void streamDataItems__whenStreamingHasMaxSize__thenReturnItemsSortedByCreatedDat
var items = IntStream.range(0, 1000)
.mapToObj(i -> factory.manufacturePojo(DatasetItem.class).toBuilder()
.experimentItems(null)
.metadata(null)
.createdAt(null)
.lastUpdatedAt(null)
.build())
Expand Down Expand Up @@ -3232,8 +3210,6 @@ private void getItemAndAssert(DatasetItem expectedDatasetItem, String workspaceN
Map<String, JsonNode> data = Optional.ofNullable(expectedDatasetItem.data())
.orElse(Map.of());

expectedDatasetItem = mergeInputMap(expectedDatasetItem, data);

assertThat(actualEntity.id()).isEqualTo(expectedDatasetItem.id());
assertThat(actualEntity).usingRecursiveComparison()
.ignoringFields(IGNORED_FIELDS_DATA_ITEM)
Expand All @@ -3243,37 +3219,6 @@ private void getItemAndAssert(DatasetItem expectedDatasetItem, String workspaceN
assertThat(actualEntity.lastUpdatedAt()).isInThePast();
}

private DatasetItem mergeInputMap(DatasetItem expectedDatasetItem, Map<String, JsonNode> data) {

Map<String, JsonNode> newMap = new HashMap<>();

if (expectedDatasetItem.expectedOutput() != null) {
newMap.put("expected_output", expectedDatasetItem.expectedOutput());
}

if (expectedDatasetItem.input() != null) {
newMap.put("input", expectedDatasetItem.input());
}

if (expectedDatasetItem.metadata() != null) {
newMap.put("metadata", expectedDatasetItem.metadata());
}

Map<String, JsonNode> mergedMap = Stream
.concat(data.entrySet().stream(), newMap.entrySet().stream())
.collect(toMap(
Map.Entry::getKey,
Map.Entry::getValue,
(v1, v2) -> v2 // In case of conflict, use the value from map2
));

expectedDatasetItem = expectedDatasetItem.toBuilder()
.data(new HashMap<>(mergedMap))
.build();

return expectedDatasetItem;
}

@Nested
@DisplayName("Delete items:")
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
Expand Down Expand Up @@ -3474,7 +3419,10 @@ void getDatasetItemsByDatasetId__whenItemsWereUpdated__thenReturnCorrectItemsCou

var updatedItems = items
.stream()
.map(item -> item.toBuilder().input(factory.manufacturePojo(JsonNode.class)).build())
.map(item -> item.toBuilder()
.data(Map.of(factory.manufacturePojo(String.class),
factory.manufacturePojo(JsonNode.class)))
.build())
.toList();

var updatedBatch = batch.toBuilder()
Expand Down Expand Up @@ -3646,8 +3594,6 @@ private void assertPage(List<DatasetItem> expectedItems, List<DatasetItem> actua
Map<String, JsonNode> data = Optional.ofNullable(expectedDatasetItem.data())
.orElse(Map.of());

expectedDatasetItem = mergeInputMap(expectedDatasetItem, data);

assertThat(actualDatasetItem.data()).isEqualTo(expectedDatasetItem.data());
}
}
Expand Down Expand Up @@ -4137,12 +4083,6 @@ Stream<Arguments> find__whenFilteringBySupportedFields__thenReturnMatchingRows()
FieldType.FEEDBACK_SCORES_NUMBER, Operator.EQUAL, "sql_cost", "10")),
arguments(new ExperimentsComparisonFilter("feedback_scores",
FieldType.FEEDBACK_SCORES_NUMBER, Operator.NOT_EQUAL, "sql_cost", "10")),
arguments(new ExperimentsComparisonFilter("output",
FieldType.STRING, Operator.CONTAINS, null, "sql_cost")),
arguments(new ExperimentsComparisonFilter("expected_output",
FieldType.DICTIONARY, Operator.CONTAINS, "output", "sql_cost")),
arguments(new ExperimentsComparisonFilter("metadata",
FieldType.DICTIONARY, Operator.EQUAL, "sql_cost", "10")),
Comment on lines -4140 to -4145
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe you shouldn't remove these tests, as it's still legitimate to filter on those fields, but within the data field.

arguments(new ExperimentsComparisonFilter("meta_field",
FieldType.DICTIONARY, Operator.CONTAINS, "version[*]", "10")),
arguments(new ExperimentsComparisonFilter("releases",
Expand Down Expand Up @@ -4197,8 +4137,6 @@ private void createTraces(List<DatasetItem> items, String projectName, String wo
var item = items.get(i);
var trace = Trace.builder()
.id(GENERATOR.generate())
.input(item.input())
.output(item.expectedOutput())
Comment on lines -4200 to -4201
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Related: legitimate payload in the trace, but should come from data.

.projectName(projectName)
.startTime(Instant.now())
.name("trace-" + i)
Expand All @@ -4214,12 +4152,6 @@ private void createDatasetItems(List<DatasetItem> items) {
if (i == 0) {
DatasetItem item = factory.manufacturePojo(DatasetItem.class)
.toBuilder()
.input(JsonUtils
.getJsonNodeFromString(JsonUtils.writeValueAsString(Map.of("input", "sql_cost"))))
.expectedOutput(JsonUtils
.getJsonNodeFromString(JsonUtils.writeValueAsString(Map.of("output", "sql_cost"))))
.metadata(JsonUtils
.getJsonNodeFromString(JsonUtils.writeValueAsString(Map.of("sql_cost", 10))))
Comment on lines -4217 to -4222
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems legitimate, but these values should now be provided within the data field instead of the deprecated ones.

.source(DatasetItemSource.SDK)
.data(Map.of(
"sql_tag", JsonUtils.readTree("sql_test"),
Expand Down Expand Up @@ -4748,18 +4680,12 @@ void setUp() {
.flatMap(connection -> Mono.from(connection.createStatement("""
INSERT INTO %s.%s (
id,
input,
expected_output,
metadata,
source,
dataset_id,
workspace_id
) VALUES (:id, :input, :expected_output, :metadata, :source, :dataset_id, :workspace_id)
) VALUES (:id, :source, :dataset_id, :workspace_id)
""".formatted(DATABASE_NAME, "dataset_items"))
.bind("id", datasetItem.id())
.bind("input", datasetItem.input().toString())
.bind("expected_output", datasetItem.expectedOutput().toString())
.bind("metadata", datasetItem.metadata().toString())
Comment on lines -4760 to -4762
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably the whole TestNoMigratedDatasetItemRetrieval doesn't make sense anymore, as we're removing the deprecated fields.

.bind("source", DatasetItemSource.SDK.getValue())
.bind("dataset_id", datasetId)
.bind("workspace_id", WORKSPACE_ID)
Expand All @@ -4768,22 +4694,6 @@ void setUp() {

this.datasetItem = datasetItem;
}

@Test
void findById__whenDatasetItemNotMigrated__thenReturnDatasetItemWithData() {

var item = datasetItem.toBuilder()
.spanId(null)
.traceId(null)
.experimentItems(null)
.source(DatasetItemSource.SDK)
.data(Map.of("input", datasetItem.input(),
"expected_output", datasetItem.expectedOutput(),
"metadata", datasetItem.metadata()))
.build();

getItemAndAssert(item, TEST_WORKSPACE, API_KEY);
}
}

private Set<Column> addDeprecatedFields(List<Map<String, JsonNode>> data) {
Expand All @@ -4799,12 +4709,6 @@ private Set<Column> addDeprecatedFields(List<Map<String, JsonNode>> data) {
.build())
.collect(Collectors.toCollection(HashSet::new));

columns.add(Column.builder().name("input").types(Set.of(ColumnType.OBJECT)).filterFieldPrefix("data").build());
columns.add(Column.builder().name("expected_output").types(Set.of(ColumnType.OBJECT)).filterFieldPrefix("data")
.build());
columns.add(
Column.builder().name("metadata").types(Set.of(ColumnType.OBJECT)).filterFieldPrefix("data").build());
Comment on lines -4802 to -4806
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably the whole addDeprecatedFields doesn't make sense anymore, as we're removing the deprecated fields entirely.


Map<String, Set<ColumnType>> results = columns.stream()
.collect(groupingBy(Column::name, mapping(Column::types, flatMapping(Set::stream, toSet()))));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,6 @@ public DatasetItem getType(DataProviderStrategy strategy, AttributeMetadata meta
.traceId(traceId)
.spanId(spanId)
.id(strategy.getTypeValue(metadata, context, UUID.class))
.input(strategy.getTypeValue(metadata, context, JsonNode.class))
.expectedOutput(strategy.getTypeValue(metadata, context, JsonNode.class))
.metadata(strategy.getTypeValue(metadata, context, JsonNode.class))
.data(data)
.createdAt(Instant.now())
.lastUpdatedAt(Instant.now())
Expand Down
Loading