From ccb000b2247e403ed724ea59bec388176848740e Mon Sep 17 00:00:00 2001 From: Xinli Shang Date: Mon, 29 Dec 2025 10:20:52 -0800 Subject: [PATCH] fix: add identifier-field-ids JSON serialization/deserialization - Add serialization of identifier-field-ids in ToJson(Schema) - Add deserialization of identifier-field-ids in SchemaFromJson() - Add comprehensive test coverage for: * Schema with single identifier field * Schema without identifier fields * Schema with multiple identifier fields * Round-trip serialization/deserialization Fixes the TODO in json_internal.cc for identifier-field-ids support. --- src/iceberg/json_internal.cc | 22 +++++++++-- src/iceberg/test/metadata_serde_test.cc | 3 +- src/iceberg/test/schema_json_test.cc | 49 +++++++++++++++++++++++++ 3 files changed, 69 insertions(+), 5 deletions(-) diff --git a/src/iceberg/json_internal.cc b/src/iceberg/json_internal.cc index 4a36ebf26..e35e36cfd 100644 --- a/src/iceberg/json_internal.cc +++ b/src/iceberg/json_internal.cc @@ -42,6 +42,7 @@ #include "iceberg/table_properties.h" #include "iceberg/transform.h" #include "iceberg/type.h" +#include "iceberg/util/checked_cast.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep #include "iceberg/util/json_util_internal.h" #include "iceberg/util/macros.h" @@ -309,7 +310,9 @@ nlohmann::json ToJson(const Type& type) { nlohmann::json ToJson(const Schema& schema) { nlohmann::json json = ToJson(static_cast(schema)); json[kSchemaId] = schema.schema_id(); - // TODO(gangwu): add identifier-field-ids. + if (!schema.IdentifierFieldIds().empty()) { + json[kIdentifierFieldIds] = schema.IdentifierFieldIds(); + } return json; } @@ -473,10 +476,21 @@ Result> SchemaFromJson(const nlohmann::json& json) { if (type->type_id() != TypeId::kStruct) [[unlikely]] { return JsonParseError("Schema must be a struct type, but got {}", SafeDumpJson(json)); } + auto& struct_type = internal::checked_cast(*type); + + std::vector fields; + fields.reserve(struct_type.fields().size()); + for (auto& field : struct_type.fields()) { + fields.emplace_back(std::move(field)); + } + + ICEBERG_ASSIGN_OR_RAISE( + auto identifier_field_ids, + GetJsonValueOrDefault>(json, kIdentifierFieldIds)); - auto& struct_type = static_cast(*type); - auto schema_id = schema_id_opt.value_or(Schema::kInitialSchemaId); - return FromStructType(std::move(struct_type), schema_id); + return std::make_unique(std::move(fields), + schema_id_opt.value_or(Schema::kInitialSchemaId), + std::move(identifier_field_ids)); } nlohmann::json ToJson(const PartitionField& partition_field) { diff --git a/src/iceberg/test/metadata_serde_test.cc b/src/iceberg/test/metadata_serde_test.cc index 62682b801..6b8e37ad6 100644 --- a/src/iceberg/test/metadata_serde_test.cc +++ b/src/iceberg/test/metadata_serde_test.cc @@ -143,7 +143,8 @@ TEST(MetadataSerdeTest, DeserializeV2Valid) { std::vector{SchemaField::MakeRequired(1, "x", int64()), SchemaField::MakeRequired(2, "y", int64()), SchemaField::MakeRequired(3, "z", int64())}, - /*schema_id=*/1); + /*schema_id=*/1, + /*identifier_field_ids=*/std::vector{1, 2}); auto expected_spec_result = PartitionSpec::Make( /*spec_id=*/0, diff --git a/src/iceberg/test/schema_json_test.cc b/src/iceberg/test/schema_json_test.cc index 1e934cce3..da4c779aa 100644 --- a/src/iceberg/test/schema_json_test.cc +++ b/src/iceberg/test/schema_json_test.cc @@ -27,6 +27,7 @@ #include "iceberg/json_internal.h" #include "iceberg/schema.h" #include "iceberg/schema_field.h" +#include "iceberg/test/matchers.h" #include "iceberg/type.h" namespace iceberg { @@ -133,4 +134,52 @@ TEST(SchemaJsonTest, RoundTrip) { ASSERT_EQ(dumped_json, json); } +TEST(SchemaJsonTest, IdentifierFieldIds) { + // Test schema with identifier-field-ids + constexpr std::string_view json_with_identifier_str = + R"({"fields":[{"id":1,"name":"id","required":true,"type":"long"}, + {"id":2,"name":"data","required":false,"type":"string"}], + "identifier-field-ids":[1], + "schema-id":1, + "type":"struct"})"; + + auto json_with_identifiers = nlohmann::json::parse(json_with_identifier_str); + ICEBERG_UNWRAP_OR_FAIL(auto schema_with_identifers, + SchemaFromJson(json_with_identifiers)); + ASSERT_EQ(schema_with_identifers->fields().size(), 2); + ASSERT_EQ(schema_with_identifers->schema_id(), 1); + ASSERT_EQ(schema_with_identifers->IdentifierFieldIds().size(), 1); + ASSERT_EQ(schema_with_identifers->IdentifierFieldIds()[0], 1); + ASSERT_EQ(ToJson(*schema_with_identifers), json_with_identifiers); + + // Test schema without identifier-field-ids + constexpr std::string_view json_without_identifiers_str = + R"({"fields":[{"id":1,"name":"id","required":true,"type":"int"}, + {"id":2,"name":"name","required":false,"type":"string"}], + "schema-id":1, + "type":"struct"})"; + + auto json_without_identifiers = nlohmann::json::parse(json_without_identifiers_str); + ICEBERG_UNWRAP_OR_FAIL(auto schema_without_identifiers, + SchemaFromJson(json_without_identifiers)); + ASSERT_TRUE(schema_without_identifiers->IdentifierFieldIds().empty()); + ASSERT_EQ(ToJson(*schema_without_identifiers), json_without_identifiers); + + // Test schema with multiple identifier fields + constexpr std::string_view json_multi_identifiers_str = + R"({"fields":[{"id":1,"name":"user_id","required":true,"type":"long"}, + {"id":2,"name":"org_id","required":true,"type":"long"}, + {"id":3,"name":"data","required":false,"type":"string"}], + "identifier-field-ids":[1,2], + "schema-id":2, + "type":"struct"})"; + auto json_multi_identifiers = nlohmann::json::parse(json_multi_identifiers_str); + ICEBERG_UNWRAP_OR_FAIL(auto schema_multi_identifiers, + SchemaFromJson(json_multi_identifiers)); + ASSERT_EQ(schema_multi_identifiers->IdentifierFieldIds().size(), 2); + ASSERT_EQ(schema_multi_identifiers->IdentifierFieldIds()[0], 1); + ASSERT_EQ(schema_multi_identifiers->IdentifierFieldIds()[1], 2); + ASSERT_EQ(ToJson(*schema_multi_identifiers), json_multi_identifiers); +} + } // namespace iceberg