From 44b4473c25585afa4e77cc9a3e3720b920bdd6d0 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 24 Nov 2025 20:55:38 +0000 Subject: [PATCH 01/59] Begin examining how to best add structured array support to Zarr v3 driver --- tensorstore/driver/zarr3/BUILD | 21 +- tensorstore/driver/zarr3/driver.cc | 41 +- tensorstore/driver/zarr3/dtype.cc | 298 +++++++++++++ tensorstore/driver/zarr3/dtype.h | 144 ++++++ tensorstore/driver/zarr3/dtype_test.cc | 293 ++++++++++++ tensorstore/driver/zarr3/metadata.cc | 514 ++++++++++++++++------ tensorstore/driver/zarr3/metadata.h | 51 ++- tensorstore/driver/zarr3/metadata_test.cc | 45 +- 8 files changed, 1251 insertions(+), 156 deletions(-) create mode 100644 tensorstore/driver/zarr3/dtype.cc create mode 100644 tensorstore/driver/zarr3/dtype.h create mode 100644 tensorstore/driver/zarr3/dtype_test.cc diff --git a/tensorstore/driver/zarr3/BUILD b/tensorstore/driver/zarr3/BUILD index 6e0613d5b..d67f58935 100644 --- a/tensorstore/driver/zarr3/BUILD +++ b/tensorstore/driver/zarr3/BUILD @@ -94,8 +94,8 @@ tensorstore_cc_library( tensorstore_cc_library( name = "metadata", - srcs = ["metadata.cc"], - hdrs = ["metadata.h"], + srcs = ["metadata.cc", "dtype.cc"], + hdrs = ["metadata.h", "dtype.h"], deps = [ ":default_nan", ":name_configuration_json_binder", @@ -145,6 +145,23 @@ tensorstore_cc_library( ], ) +tensorstore_cc_test( + name = "dtype_test", + size = "small", + srcs = ["dtype_test.cc"], + deps = [ + ":metadata", + "//tensorstore:data_type", + "//tensorstore:index", + "//tensorstore/internal/testing:json_gtest", + "//tensorstore/util:status_testutil", + "//tensorstore/util:str_cat", + "@abseil-cpp//absl/status", + "@googletest//:gtest_main", + "@nlohmann_json//:json", + ], +) + tensorstore_cc_test( name = "driver_test", size = "small", diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index a516c1a7b..15faced0a 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc 
@@ -121,8 +121,19 @@ class ZarrDriverSpec "metadata", jb::Validate( [](const auto& options, auto* obj) { - TENSORSTORE_RETURN_IF_ERROR(obj->schema.Set( - obj->metadata_constraints.data_type.value_or(DataType()))); + if (obj->metadata_constraints.data_type) { + if (auto dtype = GetScalarDataType( + *obj->metadata_constraints.data_type)) { + TENSORSTORE_RETURN_IF_ERROR(obj->schema.Set(*dtype)); + } else if (obj->schema.dtype().valid()) { + return absl::InvalidArgumentError( + "schema dtype must be unspecified for structured " + "zarr3 data types"); + } else { + // Leave dtype unspecified; structured dtypes are handled + // at metadata level only. + } + } TENSORSTORE_RETURN_IF_ERROR(obj->schema.Set( RankConstraint{obj->metadata_constraints.rank})); return absl::OkStatus(); @@ -146,8 +157,8 @@ class ZarrDriverSpec SharedArray fill_value{schema.fill_value()}; const auto& metadata = metadata_constraints; - if (metadata.fill_value) { - fill_value = *metadata.fill_value; + if (metadata.fill_value && !metadata.fill_value->empty()) { + fill_value = (*metadata.fill_value)[0]; } return fill_value; @@ -274,8 +285,10 @@ class DataCacheBase static internal::ChunkGridSpecification GetChunkGridSpecification( const ZarrMetadata& metadata) { - auto fill_value = - BroadcastArray(metadata.fill_value, BoxView<>(metadata.rank)).value(); + assert(!metadata.fill_value.empty()); + auto fill_value = BroadcastArray(metadata.fill_value[0], + BoxView<>(metadata.rank)) + .value(); internal::ChunkGridSpecification::ComponentList components; auto& component = components.emplace_back( internal::AsyncWriteArray::Spec{ @@ -402,9 +415,16 @@ class DataCacheBase const void* metadata_ptr, size_t component_index) override { const auto& metadata = *static_cast(metadata_ptr); ChunkLayout chunk_layout; + SpecRankAndFieldInfo info; + info.chunked_rank = metadata.rank; + if (!metadata.data_type.fields.empty()) { + info.field = &metadata.data_type.fields[0]; + } + std::optional> chunk_shape_span; + 
chunk_shape_span.emplace(metadata.chunk_shape.data(), + metadata.chunk_shape.size()); TENSORSTORE_RETURN_IF_ERROR(SetChunkLayoutFromMetadata( - metadata.data_type, metadata.rank, metadata.chunk_shape, - &metadata.codec_specs, chunk_layout)); + info, chunk_shape_span, &metadata.codec_specs, chunk_layout)); TENSORSTORE_RETURN_IF_ERROR(chunk_layout.Finalize()); return chunk_layout; } @@ -470,7 +490,10 @@ class ZarrDriver : public ZarrDriverBase { Result> GetFillValue( IndexTransformView<> transform) override { const auto& metadata = this->metadata(); - return metadata.fill_value; + if (metadata.fill_value.empty()) { + return SharedArray(); + } + return metadata.fill_value[0]; } Future GetStorageStatistics( diff --git a/tensorstore/driver/zarr3/dtype.cc b/tensorstore/driver/zarr3/dtype.cc new file mode 100644 index 000000000..8d1c9d49e --- /dev/null +++ b/tensorstore/driver/zarr3/dtype.cc @@ -0,0 +1,298 @@ +// Copyright 2020 The TensorStore Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "tensorstore/driver/zarr3/dtype.h" + +#include + +#include + +#include "absl/base/optimization.h" +#include "tensorstore/data_type.h" +#include "tensorstore/internal/json_binding/json_binding.h" +#include "tensorstore/util/endian.h" +#include "tensorstore/util/extents.h" +#include "tensorstore/util/quote_string.h" +#include "tensorstore/util/str_cat.h" + +namespace tensorstore { +namespace internal_zarr3 { + +Result ParseBaseDType(std::string_view dtype) { + using D = ZarrDType::BaseDType; + const auto make_dtype = [&](DataType result_dtype) -> Result { + return D{std::string(dtype), result_dtype, {}}; + }; + + if (dtype == "bool") return make_dtype(dtype_v); + if (dtype == "uint8") return make_dtype(dtype_v); + if (dtype == "uint16") return make_dtype(dtype_v); + if (dtype == "uint32") return make_dtype(dtype_v); + if (dtype == "uint64") return make_dtype(dtype_v); + if (dtype == "int8") return make_dtype(dtype_v); + if (dtype == "int16") return make_dtype(dtype_v); + if (dtype == "int32") return make_dtype(dtype_v); + if (dtype == "int64") return make_dtype(dtype_v); + if (dtype == "bfloat16") + return make_dtype(dtype_v<::tensorstore::dtypes::bfloat16_t>); + if (dtype == "float16") + return make_dtype(dtype_v<::tensorstore::dtypes::float16_t>); + if (dtype == "float32") + return make_dtype(dtype_v<::tensorstore::dtypes::float32_t>); + if (dtype == "float64") + return make_dtype(dtype_v<::tensorstore::dtypes::float64_t>); + if (dtype == "complex64") + return make_dtype(dtype_v<::tensorstore::dtypes::complex64_t>); + if (dtype == "complex128") + return make_dtype(dtype_v<::tensorstore::dtypes::complex128_t>); + + constexpr std::string_view kSupported = + "bool, uint8, uint16, uint32, uint64, int8, int16, int32, int64, " + "bfloat16, float16, float32, float64, complex64, complex128"; + return absl::InvalidArgumentError( + tensorstore::StrCat(dtype, " data type is not one of the supported " + "data types: ", + kSupported)); +} + +namespace { + +/// 
Parses a zarr metadata "dtype" JSON specification, but does not compute any +/// derived values, and does not check for duplicate field names. +/// +/// This is called by `ParseDType`. +/// +/// \param value The zarr metadata "dtype" JSON specification. +/// \param out[out] Must be non-null. Filled with the parsed dtype on success. +/// \error `absl::StatusCode::kInvalidArgument' if `value` is invalid. +Result ParseDTypeNoDerived(const nlohmann::json& value) { + ZarrDType out; + if (value.is_string()) { + // Single field. + out.has_fields = false; + out.fields.resize(1); + TENSORSTORE_ASSIGN_OR_RETURN( + static_cast(out.fields[0]), + ParseBaseDType(value.get())); + return out; + } + out.has_fields = true; + auto parse_result = internal_json::JsonParseArray( + value, + [&](ptrdiff_t size) { + out.fields.resize(size); + return absl::OkStatus(); + }, + [&](const ::nlohmann::json& x, ptrdiff_t field_i) { + auto& field = out.fields[field_i]; + return internal_json::JsonParseArray( + x, + [&](ptrdiff_t size) { + if (size < 2 || size > 3) { + return absl::InvalidArgumentError(tensorstore::StrCat( + "Expected array of size 2 or 3, but received: ", x.dump())); + } + return absl::OkStatus(); + }, + [&](const ::nlohmann::json& v, ptrdiff_t i) { + switch (i) { + case 0: + if (internal_json::JsonRequireValueAs(v, &field.name).ok()) { + if (!field.name.empty()) return absl::OkStatus(); + } + return absl::InvalidArgumentError(tensorstore::StrCat( + "Expected non-empty string, but received: ", v.dump())); + case 1: { + std::string dtype_string; + TENSORSTORE_RETURN_IF_ERROR( + internal_json::JsonRequireValueAs(v, &dtype_string)); + TENSORSTORE_ASSIGN_OR_RETURN( + static_cast(field), + ParseBaseDType(dtype_string)); + return absl::OkStatus(); + } + case 2: { + return internal_json::JsonParseArray( + v, + [&](ptrdiff_t size) { + field.outer_shape.resize(size); + return absl::OkStatus(); + }, + [&](const ::nlohmann::json& x, ptrdiff_t j) { + return internal_json::JsonRequireInteger( 
+ x, &field.outer_shape[j], /*strict=*/true, 1, + kInfIndex); + }); + } + default: + ABSL_UNREACHABLE(); // COV_NF_LINE + } + }); + }); + if (!parse_result.ok()) return parse_result; + return out; +} + +} // namespace + +absl::Status ValidateDType(ZarrDType& dtype) { + dtype.bytes_per_outer_element = 0; + for (size_t field_i = 0; field_i < dtype.fields.size(); ++field_i) { + auto& field = dtype.fields[field_i]; + if (std::any_of( + dtype.fields.begin(), dtype.fields.begin() + field_i, + [&](const ZarrDType::Field& f) { return f.name == field.name; })) { + return absl::InvalidArgumentError(tensorstore::StrCat( + "Field name ", QuoteString(field.name), " occurs more than once")); + } + field.field_shape.resize(field.flexible_shape.size() + + field.outer_shape.size()); + std::copy(field.flexible_shape.begin(), field.flexible_shape.end(), + std::copy(field.outer_shape.begin(), field.outer_shape.end(), + field.field_shape.begin())); + + field.num_inner_elements = ProductOfExtents(span(field.field_shape)); + if (field.num_inner_elements == std::numeric_limits::max()) { + return absl::InvalidArgumentError(tensorstore::StrCat( + "Product of dimensions ", span(field.field_shape), " is too large")); + } + if (internal::MulOverflow(field.num_inner_elements, + static_cast(field.dtype->size), + &field.num_bytes)) { + return absl::InvalidArgumentError("Field size in bytes is too large"); + } + field.byte_offset = dtype.bytes_per_outer_element; + if (internal::AddOverflow(dtype.bytes_per_outer_element, field.num_bytes, + &dtype.bytes_per_outer_element)) { + return absl::InvalidArgumentError( + "Total number of bytes per outer array element is too large"); + } + } + return absl::OkStatus(); +} + +std::optional GetScalarDataType(const ZarrDType& dtype) { + if (!dtype.has_fields && !dtype.fields.empty()) { + return dtype.fields[0].dtype; + } + return std::nullopt; +} + +Result ParseDType(const nlohmann::json& value) { + TENSORSTORE_ASSIGN_OR_RETURN(ZarrDType dtype, 
ParseDTypeNoDerived(value)); + TENSORSTORE_RETURN_IF_ERROR(ValidateDType(dtype)); + return dtype; +} + +bool operator==(const ZarrDType::BaseDType& a, + const ZarrDType::BaseDType& b) { + return a.encoded_dtype == b.encoded_dtype && a.dtype == b.dtype && + a.flexible_shape == b.flexible_shape; +} + +bool operator!=(const ZarrDType::BaseDType& a, + const ZarrDType::BaseDType& b) { + return !(a == b); +} + +bool operator==(const ZarrDType::Field& a, const ZarrDType::Field& b) { + return static_cast(a) == + static_cast(b) && + a.outer_shape == b.outer_shape && a.name == b.name && + a.field_shape == b.field_shape && + a.num_inner_elements == b.num_inner_elements && + a.byte_offset == b.byte_offset && a.num_bytes == b.num_bytes; +} + +bool operator!=(const ZarrDType::Field& a, const ZarrDType::Field& b) { + return !(a == b); +} + +bool operator==(const ZarrDType& a, const ZarrDType& b) { + return a.has_fields == b.has_fields && + a.bytes_per_outer_element == b.bytes_per_outer_element && + a.fields == b.fields; +} + +bool operator!=(const ZarrDType& a, const ZarrDType& b) { return !(a == b); } + +void to_json(::nlohmann::json& out, const ZarrDType::Field& field) { + using array_t = ::nlohmann::json::array_t; + if (field.outer_shape.empty()) { + out = array_t{field.name, field.encoded_dtype}; + } else { + out = array_t{field.name, field.encoded_dtype, field.outer_shape}; + } +} + +void to_json(::nlohmann::json& out, // NOLINT + const ZarrDType& dtype) { + if (!dtype.has_fields) { + out = dtype.fields[0].encoded_dtype; + } else { + out = dtype.fields; + } +} + +TENSORSTORE_DEFINE_JSON_DEFAULT_BINDER(ZarrDType, [](auto is_loading, + const auto& options, + auto* obj, auto* j) { + if constexpr (is_loading) { + TENSORSTORE_ASSIGN_OR_RETURN(*obj, ParseDType(*j)); + } else { + to_json(*j, *obj); + } + return absl::OkStatus(); +}) + +namespace { + +Result MakeBaseDType(std::string_view name, + DataType dtype) { + ZarrDType::BaseDType base_dtype; + base_dtype.dtype = dtype; + 
base_dtype.encoded_dtype = std::string(name); + return base_dtype; +} + +} // namespace + +Result ChooseBaseDType(DataType dtype) { + if (dtype == dtype_v) return MakeBaseDType("bool", dtype); + if (dtype == dtype_v) return MakeBaseDType("uint8", dtype); + if (dtype == dtype_v) return MakeBaseDType("uint16", dtype); + if (dtype == dtype_v) return MakeBaseDType("uint32", dtype); + if (dtype == dtype_v) return MakeBaseDType("uint64", dtype); + if (dtype == dtype_v) return MakeBaseDType("int8", dtype); + if (dtype == dtype_v) return MakeBaseDType("int16", dtype); + if (dtype == dtype_v) return MakeBaseDType("int32", dtype); + if (dtype == dtype_v) return MakeBaseDType("int64", dtype); + if (dtype == dtype_v<::tensorstore::dtypes::bfloat16_t>) + return MakeBaseDType("bfloat16", dtype); + if (dtype == dtype_v<::tensorstore::dtypes::float16_t>) + return MakeBaseDType("float16", dtype); + if (dtype == dtype_v<::tensorstore::dtypes::float32_t>) + return MakeBaseDType("float32", dtype); + if (dtype == dtype_v<::tensorstore::dtypes::float64_t>) + return MakeBaseDType("float64", dtype); + if (dtype == dtype_v<::tensorstore::dtypes::complex64_t>) + return MakeBaseDType("complex64", dtype); + if (dtype == dtype_v<::tensorstore::dtypes::complex128_t>) + return MakeBaseDType("complex128", dtype); + return absl::InvalidArgumentError( + tensorstore::StrCat("Data type not supported: ", dtype)); +} + +} // namespace internal_zarr3 +} // namespace tensorstore diff --git a/tensorstore/driver/zarr3/dtype.h b/tensorstore/driver/zarr3/dtype.h new file mode 100644 index 000000000..430dd8849 --- /dev/null +++ b/tensorstore/driver/zarr3/dtype.h @@ -0,0 +1,144 @@ +// Copyright 2020 The TensorStore Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef TENSORSTORE_DRIVER_ZARR3_DTYPE_H_ +#define TENSORSTORE_DRIVER_ZARR3_DTYPE_H_ + +/// \file +/// Support for encoding/decoding zarr "dtype" specifications. +/// See: https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#data-type + +#include +#include +#include "tensorstore/data_type.h" +#include "tensorstore/internal/json_binding/bindable.h" +#include "tensorstore/util/endian.h" +#include "tensorstore/util/result.h" + +namespace tensorstore { +namespace internal_zarr3 { + +/// Decoded representation of a zarr "dtype" specification. +/// +/// A zarr "dtype" is a JSON value that is either: +/// +/// 1. A string, which specifies a single data type (e.g. "int32"). +/// In this case, the zarr array is considered to have a single, unnamed field. +/// +/// 2. An array, where each element of the array is of the form: +/// `[name, type]` or `[name, type, shape]`, where `name` is a JSON +/// string specifying the unique, non-empty field name, `type` is a data type +/// string, and `shape` is an optional "inner" array shape (specified +/// as a JSON array of non-negative integers) which defaults to the rank-0 +/// shape `[]` if not specified. +/// +/// Each field is encoded according to `type` into a fixed-size sequence of +/// bytes. If the optional "inner" array `shape` is specified, the individual +/// elements are encoded in C order. The encoding of each multi-field array +/// element is simply the concatenation of the encodings of each field. +struct ZarrDType { + /// Decoded representation of single value. 
+ struct BaseDType { + /// Data type string. + std::string encoded_dtype; + + /// Corresponding DataType used for in-memory representation. + DataType dtype; + + /// For "flexible" data types that are themselves arrays, this specifies the + /// shape. For regular data types, this is empty. + std::vector flexible_shape; + }; + + /// Decoded representation of a single field. + struct Field : public BaseDType { + /// Optional `shape` dimensions specified by a zarr "dtype" field specified + /// as a JSON array. If the zarr dtype was specified as a single `typestr` + /// value, or as a two-element array, this is empty. + std::vector outer_shape; + + /// Field name. Must be non-empty and unique if the zarr "dtype" was + /// specified as an array. Otherwise, is empty. + std::string name; + + /// The inner array dimensions of this field, equal to the concatenation of + /// `outer_shape` and `flexible_shape` (derived value). + std::vector field_shape; + + /// Product of `field_shape` dimensions (derived value). + Index num_inner_elements; + + /// Byte offset of this field within an "outer" element (derived value). + Index byte_offset; + + /// Number of bytes occupied by this field within an "outer" element + /// (derived value). + Index num_bytes; + }; + + /// Equal to `true` if the zarr "dtype" was specified as an array, in which + /// case all fields must have a unique, non-empty `name`. If `false`, there + /// must be a single field with an empty `name`. + bool has_fields; + + /// Decoded representation of the fields. + std::vector fields; + + /// Bytes per "outer" element (derived value). 
+ Index bytes_per_outer_element;
+
+ TENSORSTORE_DECLARE_JSON_DEFAULT_BINDER(ZarrDType,
+ internal_json_binding::NoOptions)
+
+ friend void to_json(::nlohmann::json& out, // NOLINT
+ const ZarrDType& dtype);
+};
+
+bool operator==(const ZarrDType::BaseDType& a,
+ const ZarrDType::BaseDType& b);
+bool operator!=(const ZarrDType::BaseDType& a,
+ const ZarrDType::BaseDType& b);
+bool operator==(const ZarrDType::Field& a, const ZarrDType::Field& b);
+bool operator!=(const ZarrDType::Field& a, const ZarrDType::Field& b);
+bool operator==(const ZarrDType& a, const ZarrDType& b);
+bool operator!=(const ZarrDType& a, const ZarrDType& b);
+
+/// Parses a zarr metadata "dtype" JSON specification.
+///
+/// \error `absl::StatusCode::kInvalidArgument` if `value` is not valid.
+Result ParseDType(const ::nlohmann::json& value);
+
+/// Validates `dtype` and computes derived values.
+///
+/// \error `absl::StatusCode::kInvalidArgument` if two fields have the same
+/// name.
+/// \error `absl::StatusCode::kInvalidArgument` if the field size is too large.
+absl::Status ValidateDType(ZarrDType& dtype);
+
+/// Returns the underlying TensorStore `DataType` if `dtype` represents an
+/// unstructured scalar array, otherwise `std::nullopt`.
+std::optional GetScalarDataType(const ZarrDType& dtype);
+
+ /// Parses a Zarr 3 data type string.
+ ///
+ /// \error `absl::StatusCode::kInvalidArgument` if `dtype` is not valid.
+ Result ParseBaseDType(std::string_view dtype);
+
+ /// Chooses a zarr data type corresponding to `dtype`.
+ Result ChooseBaseDType(DataType dtype); + +} // namespace internal_zarr3 +} // namespace tensorstore + +#endif // TENSORSTORE_DRIVER_ZARR3_DTYPE_H_ diff --git a/tensorstore/driver/zarr3/dtype_test.cc b/tensorstore/driver/zarr3/dtype_test.cc new file mode 100644 index 000000000..cbb7acbfb --- /dev/null +++ b/tensorstore/driver/zarr3/dtype_test.cc @@ -0,0 +1,293 @@ +// Copyright 2023 The TensorStore Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "tensorstore/driver/zarr3/dtype.h" + +#include +#include + +#include // for std::byte +#include +#include + +#include +#include +#include "absl/status/status.h" +#include +#include "tensorstore/data_type.h" +#include "tensorstore/index.h" +#include "tensorstore/internal/testing/json_gtest.h" +#include "tensorstore/util/status_testutil.h" +#include "tensorstore/util/str_cat.h" + +namespace { + +using ::tensorstore::DataType; +using ::tensorstore::dtype_v; +using ::tensorstore::Index; +using ::tensorstore::kInfIndex; +using ::tensorstore::StatusIs; +using ::tensorstore::internal_zarr3::ChooseBaseDType; +using ::tensorstore::internal_zarr3::ParseBaseDType; +using ::tensorstore::internal_zarr3::ParseDType; +using ::tensorstore::internal_zarr3::ZarrDType; +using ::testing::HasSubstr; +using ::testing::MatchesRegex; + +void CheckBaseDType(std::string dtype, DataType r, + std::vector flexible_shape) { + EXPECT_THAT(ParseBaseDType(dtype), ::testing::Optional(ZarrDType::BaseDType{ + dtype, r, 
flexible_shape})) + << dtype; +} + +TEST(ParseBaseDType, Success) { + CheckBaseDType("bool", dtype_v, {}); + CheckBaseDType("int8", dtype_v, {}); + CheckBaseDType("uint8", dtype_v, {}); + CheckBaseDType("int16", dtype_v, {}); + CheckBaseDType("uint16", dtype_v, {}); + CheckBaseDType("int32", dtype_v, {}); + CheckBaseDType("uint32", dtype_v, {}); + CheckBaseDType("int64", dtype_v, {}); + CheckBaseDType("uint64", dtype_v, {}); + CheckBaseDType("float16", dtype_v, {}); + CheckBaseDType("bfloat16", dtype_v, {}); + CheckBaseDType("float32", dtype_v, {}); + CheckBaseDType("float64", dtype_v, {}); + CheckBaseDType("complex64", dtype_v, {}); + CheckBaseDType("complex128", dtype_v, {}); +} + +TEST(ParseBaseDType, Failure) { + EXPECT_THAT( + ParseBaseDType(""), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("data type is not one of the supported data types"))); + EXPECT_THAT(ParseBaseDType("float"), + StatusIs(absl::StatusCode::kInvalidArgument)); + EXPECT_THAT(ParseBaseDType("string"), + StatusIs(absl::StatusCode::kInvalidArgument)); + EXPECT_THAT(ParseBaseDType(", + /*.flexible_shape=*/{}, + }, + /*.outer_shape=*/{}, + /*.name=*/"", + /*.field_shape=*/{}, + /*.num_inner_elements=*/1, + /*.byte_offset=*/0, + /*.num_bytes=*/1}, + }, + /*.bytes_per_outer_element=*/1, + }); +} + +TEST(ParseDType, SingleNamedFieldChar) { + // Zarr 3 doesn't support fixed size strings natively in core, so we use uint8 for testing bytes + CheckDType(::nlohmann::json::array_t{{"x", "uint8"}}, + ZarrDType{ + /*.has_fields=*/true, + /*.fields=*/ + { + {{ + /*.encoded_dtype=*/"uint8", + /*.dtype=*/dtype_v, + /*.flexible_shape=*/{}, + }, + /*.outer_shape=*/{}, + /*.name=*/"x", + /*.field_shape=*/{}, + /*.num_inner_elements=*/1, + /*.byte_offset=*/0, + /*.num_bytes=*/1}, + }, + /*.bytes_per_outer_element=*/1, + }); +} + +TEST(ParseDType, TwoNamedFields) { + CheckDType( + ::nlohmann::json::array_t{{"x", "int8", {2, 3}}, {"y", "int16", {5}}}, + ZarrDType{ + /*.has_fields=*/true, + 
/*.fields=*/ + { + {{ + /*.encoded_dtype=*/"int8", + /*.dtype=*/dtype_v, + /*.flexible_shape=*/{}, + }, + /*.outer_shape=*/{2, 3}, + /*.name=*/"x", + /*.field_shape=*/{2, 3}, + /*.num_inner_elements=*/2 * 3, + /*.byte_offset=*/0, + /*.num_bytes=*/1 * 2 * 3}, + {{ + /*.encoded_dtype=*/"int16", + /*.dtype=*/dtype_v, + /*.flexible_shape=*/{}, + }, + /*.outer_shape=*/{5}, + /*.name=*/"y", + /*.field_shape=*/{5}, + /*.num_inner_elements=*/5, + /*.byte_offset=*/1 * 2 * 3, + /*.num_bytes=*/2 * 5}, + }, + /*.bytes_per_outer_element=*/1 * 2 * 3 + 2 * 5, + }); +} + +TEST(ParseDType, FieldSpecTooShort) { + EXPECT_THAT( + ParseDType(::nlohmann::json::array_t{{"x"}}), + StatusIs( + absl::StatusCode::kInvalidArgument, + HasSubstr("Error parsing value at position 0: " + "Expected array of size 2 or 3, but received: [\"x\"]"))); +} + +TEST(ParseDType, FieldSpecTooLong) { + EXPECT_THAT( + ParseDType(::nlohmann::json::array_t{{"x", "int16", {2, 3}, 5}}), + StatusIs( + absl::StatusCode::kInvalidArgument, + HasSubstr("Error parsing value at position 0: " + "Expected array of size 2 or 3, but received: " + "[\"x\",\"int16\",[2,3],5]"))); +} + +TEST(ParseDType, InvalidFieldName) { + EXPECT_THAT( + ParseDType(::nlohmann::json::array_t{{3, "int16"}}), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("Error parsing value at position 0: " + "Error parsing value at position 0: " + "Expected non-empty string, but received: 3"))); +} + +TEST(ParseDType, EmptyFieldName) { + EXPECT_THAT( + ParseDType(::nlohmann::json::array_t{{"", "int16"}}), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("Error parsing value at position 0: " + "Error parsing value at position 0: " + "Expected non-empty string, but received: \"\""))); +} + +TEST(ParseDType, DuplicateFieldName) { + EXPECT_THAT( + ParseDType(::nlohmann::json::array_t{{"x", "int16"}, {"x", "uint16"}}), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("Field name \"x\" occurs more than once"))); +} + +TEST(ParseDType, 
NonStringFieldBaseDType) { + EXPECT_THAT(ParseDType(::nlohmann::json::array_t{{"x", 3}}), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("Error parsing value at position 0: " + "Error parsing value at position 1: " + "Expected string, but received: 3"))); +} + +TEST(ParseDType, InvalidFieldBaseDType) { + EXPECT_THAT(ParseDType(::nlohmann::json::array_t{{"x", "unknown"}}), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("Error parsing value at position 0: " + "Error parsing value at position 1: " + "unknown data type is not one of the " + "supported data types"))); +} + +TEST(ParseDType, ProductOfDimensionsOverflow) { + EXPECT_THAT( + ParseDType( + ::nlohmann::json::array_t{{"x", "int8", {kInfIndex, kInfIndex}}}), + StatusIs(absl::StatusCode::kInvalidArgument, + MatchesRegex(".*Product of dimensions .* is too large.*"))); +} + +TEST(ParseDType, FieldSizeInBytesOverflow) { + EXPECT_THAT( + ParseDType(::nlohmann::json::array_t{{"x", "float64", {kInfIndex}}}), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("Field size in bytes is too large"))); +} + +TEST(ParseDType, BytesPerOuterElementOverflow) { + EXPECT_THAT( + ParseDType(::nlohmann::json::array_t{{"x", "int16", {kInfIndex}}, + {"y", "int16", {kInfIndex}}}), + StatusIs( + absl::StatusCode::kInvalidArgument, + HasSubstr( + "Total number of bytes per outer array element is too large"))); +} + +TEST(ChooseBaseDTypeTest, RoundTrip) { + constexpr tensorstore::DataType kSupportedDataTypes[] = { + dtype_v, dtype_v, dtype_v, dtype_v, + dtype_v, dtype_v, dtype_v, + dtype_v, dtype_v, + dtype_v, + dtype_v, + dtype_v, + dtype_v, + dtype_v, + dtype_v, + }; + for (auto dtype : kSupportedDataTypes) { + SCOPED_TRACE(tensorstore::StrCat("dtype=", dtype)); + TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto base_zarr_dtype, + ChooseBaseDType(dtype)); + EXPECT_EQ(dtype, base_zarr_dtype.dtype); + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto parsed, ParseBaseDType(base_zarr_dtype.encoded_dtype)); + EXPECT_EQ(dtype, 
parsed.dtype); + EXPECT_EQ(base_zarr_dtype.flexible_shape, parsed.flexible_shape); + EXPECT_EQ(base_zarr_dtype.encoded_dtype, parsed.encoded_dtype); + } +} + +TEST(ChooseBaseDTypeTest, Invalid) { + struct X {}; + EXPECT_THAT(ChooseBaseDType(dtype_v), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("Data type not supported"))); + EXPECT_THAT(ChooseBaseDType(dtype_v<::tensorstore::dtypes::string_t>), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("Data type not supported: string"))); +} + +} // namespace diff --git a/tensorstore/driver/zarr3/metadata.cc b/tensorstore/driver/zarr3/metadata.cc index 528d373ae..c96c31426 100644 --- a/tensorstore/driver/zarr3/metadata.cc +++ b/tensorstore/driver/zarr3/metadata.cc @@ -50,6 +50,7 @@ #include "tensorstore/driver/zarr3/codec/codec_spec.h" #include "tensorstore/driver/zarr3/codec/sharding_indexed.h" #include "tensorstore/driver/zarr3/default_nan.h" +#include "tensorstore/driver/zarr3/dtype.h" #include "tensorstore/driver/zarr3/name_configuration_json_binder.h" #include "tensorstore/index.h" #include "tensorstore/index_space/dimension_units.h" @@ -252,24 +253,110 @@ constexpr std::array } // namespace -absl::Status FillValueJsonBinder::operator()(std::true_type is_loading, - internal_json_binding::NoOptions, - SharedArray* obj, - ::nlohmann::json* j) const { +FillValueJsonBinder::FillValueJsonBinder(ZarrDType dtype, + bool allow_missing_dtype) + : dtype(std::move(dtype)), allow_missing_dtype(allow_missing_dtype) {} + +FillValueJsonBinder::FillValueJsonBinder(DataType data_type, + bool allow_missing_dtype) + : allow_missing_dtype(allow_missing_dtype) { + dtype.has_fields = false; + dtype.fields.resize(1); + auto& field = dtype.fields[0]; + field.name.clear(); + field.outer_shape.clear(); + field.flexible_shape.clear(); + field.field_shape.clear(); + field.num_inner_elements = 1; + field.byte_offset = 0; + field.num_bytes = data_type->size; + field.dtype = data_type; + field.encoded_dtype = 
std::string(data_type.name()); +} + +absl::Status FillValueJsonBinder::operator()( + std::true_type is_loading, internal_json_binding::NoOptions, + std::vector>* obj, ::nlohmann::json* j) const { + obj->resize(dtype.fields.size()); + if (dtype.fields.size() == 1) { + TENSORSTORE_RETURN_IF_ERROR( + DecodeSingle(*j, dtype.fields[0].dtype, (*obj)[0])); + } else { + if (!j->is_array()) { + return internal_json::ExpectedError(*j, "array"); + } + if (j->size() != dtype.fields.size()) { + return internal_json::ExpectedError( + *j, tensorstore::StrCat("array of size ", dtype.fields.size())); + } + for (size_t i = 0; i < dtype.fields.size(); ++i) { + TENSORSTORE_RETURN_IF_ERROR( + DecodeSingle((*j)[i], dtype.fields[i].dtype, (*obj)[i])); + } + } + return absl::OkStatus(); +} + +absl::Status FillValueJsonBinder::operator()( + std::false_type is_loading, internal_json_binding::NoOptions, + const std::vector>* obj, + ::nlohmann::json* j) const { + if (dtype.fields.size() == 1) { + return EncodeSingle((*obj)[0], dtype.fields[0].dtype, *j); + } + // Structured fill value + *j = ::nlohmann::json::array(); + for (size_t i = 0; i < dtype.fields.size(); ++i) { + ::nlohmann::json item; + TENSORSTORE_RETURN_IF_ERROR( + EncodeSingle((*obj)[i], dtype.fields[i].dtype, item)); + j->push_back(std::move(item)); + } + return absl::OkStatus(); +} + +absl::Status FillValueJsonBinder::DecodeSingle(::nlohmann::json& j, + DataType data_type, + SharedArray& out) const { + if (!data_type.valid()) { + if (allow_missing_dtype) { + out = SharedArray(); + return absl::OkStatus(); + } + return absl::InvalidArgumentError( + "data_type must be specified before fill_value"); + } auto arr = AllocateArray(span{}, c_order, default_init, data_type); void* data = arr.data(); - *obj = std::move(arr); - return kFillValueDataTypeFunctions[static_cast(data_type.id())] - .decode(data, *j); + out = std::move(arr); + const auto& functions = + kFillValueDataTypeFunctions[static_cast(data_type.id())]; + if 
(!functions.decode) { + if (allow_missing_dtype) { + out = SharedArray(); + return absl::OkStatus(); + } + return absl::FailedPreconditionError( + "fill_value unsupported for specified data_type"); + } + return functions.decode(data, j); } -absl::Status FillValueJsonBinder::operator()(std::false_type is_loading, - internal_json_binding::NoOptions, - const SharedArray* obj, - ::nlohmann::json* j) const { - return kFillValueDataTypeFunctions[static_cast(data_type.id())] - .encode(obj->data(), *j); +absl::Status FillValueJsonBinder::EncodeSingle( + const SharedArray& arr, DataType data_type, + ::nlohmann::json& j) const { + if (!data_type.valid()) { + return absl::InvalidArgumentError( + "data_type must be specified before fill_value"); + } + const auto& functions = + kFillValueDataTypeFunctions[static_cast(data_type.id())]; + if (!functions.encode) { + return absl::FailedPreconditionError( + "fill_value unsupported for specified data_type"); + } + return functions.encode(arr.data(), j); } TENSORSTORE_DEFINE_JSON_DEFAULT_BINDER(ChunkKeyEncoding, [](auto is_loading, @@ -357,7 +444,7 @@ constexpr auto MetadataJsonBinder = [] { rank = &obj->rank; } - auto ensure_data_type = [&]() -> Result { + auto ensure_data_type = [&]() -> Result { if constexpr (std::is_same_v) { return obj->data_type; } @@ -378,19 +465,18 @@ constexpr auto MetadataJsonBinder = [] { maybe_optional_member("node_type", jb::Constant([] { return "array"; })), jb::Member("data_type", - jb::Projection<&Self::data_type>(maybe_optional(jb::Validate( - [](const auto& options, auto* obj) { - return ValidateDataType(*obj); - }, - jb::DataTypeJsonBinder)))), + jb::Projection<&Self::data_type>(maybe_optional( + jb::DefaultBinder<>))), jb::Member( "fill_value", jb::Projection<&Self::fill_value>(maybe_optional( [&](auto is_loading, const auto& options, auto* obj, auto* j) { TENSORSTORE_ASSIGN_OR_RETURN(auto data_type, ensure_data_type()); - return FillValueJsonBinder{data_type}(is_loading, options, - obj, j); + 
constexpr bool allow_missing_dtype = + std::is_same_v; + return FillValueJsonBinder{data_type, allow_missing_dtype}( + is_loading, options, obj, j); }))), non_compatibility_field( jb::Member("shape", jb::Projection<&Self::shape>( @@ -477,9 +563,28 @@ std::string ZarrMetadata::GetCompatibilityKey() const { absl::Status ValidateMetadata(ZarrMetadata& metadata) { if (!metadata.codecs) { ArrayCodecResolveParameters decoded; - decoded.dtype = metadata.data_type; + if (metadata.data_type.fields.size() == 1 && + metadata.data_type.fields[0].outer_shape.empty()) { + decoded.dtype = metadata.data_type.fields[0].dtype; + } else { + decoded.dtype = dtype_v; + // TODO: Verify this works for structured types. + // Zarr2 uses a "scalar" array concept with byte storage for chunks. + } decoded.rank = metadata.rank; - decoded.fill_value = metadata.fill_value; + // Fill value for codec resolve might be complex. + // Zarr3 codecs usually don't depend on fill value except for some like + // "sharding_indexed"? Sharding uses fill_value for missing chunks. + if (metadata.fill_value.size() == 1) { + decoded.fill_value = metadata.fill_value[0]; + } else { + // How to represent structured fill value for codec? + // Sharding expects a single array. + // If we use structured type, the "array" is bytes. + // We might need to encode the fill value to bytes. + // For now, leave empty if multiple fields. + } + BytesCodecResolveParameters encoded; TENSORSTORE_ASSIGN_OR_RETURN( metadata.codecs, @@ -488,7 +593,14 @@ absl::Status ValidateMetadata(ZarrMetadata& metadata) { // Get codec chunk layout info. ArrayDataTypeAndShapeInfo array_info; - array_info.dtype = metadata.data_type; + // array_info.dtype used here to validate codec compatibility. 
+ if (metadata.data_type.fields.size() == 1 && + metadata.data_type.fields[0].outer_shape.empty()) { + array_info.dtype = metadata.data_type.fields[0].dtype; + } else { + array_info.dtype = dtype_v; + } + array_info.rank = metadata.rank; std::copy_n(metadata.chunk_shape.begin(), metadata.rank, array_info.shape.emplace().begin()); @@ -512,17 +624,34 @@ absl::Status ValidateMetadata(ZarrMetadata& metadata) { absl::Status ValidateMetadata(const ZarrMetadata& metadata, const ZarrMetadataConstraints& constraints) { using internal::MetadataMismatchError; - if (constraints.data_type && *constraints.data_type != metadata.data_type) { - return MetadataMismatchError("data_type", constraints.data_type->name(), - metadata.data_type.name()); - } - if (constraints.fill_value && - !AreArraysIdenticallyEqual(*constraints.fill_value, - metadata.fill_value)) { - auto binder = FillValueJsonBinder{metadata.data_type}; - auto constraint_json = jb::ToJson(*constraints.fill_value, binder).value(); - auto metadata_json = jb::ToJson(metadata.fill_value, binder).value(); - return MetadataMismatchError("fill_value", constraint_json, metadata_json); + if (constraints.data_type) { + // Compare ZarrDType + if (::nlohmann::json(*constraints.data_type) != + ::nlohmann::json(metadata.data_type)) { + return MetadataMismatchError( + "data_type", ::nlohmann::json(*constraints.data_type).dump(), + ::nlohmann::json(metadata.data_type).dump()); + } + } + if (constraints.fill_value) { + // Compare vector of arrays + if (constraints.fill_value->size() != metadata.fill_value.size()) { + return MetadataMismatchError("fill_value size", + constraints.fill_value->size(), + metadata.fill_value.size()); + } + for (size_t i = 0; i < metadata.fill_value.size(); ++i) { + if (!AreArraysIdenticallyEqual((*constraints.fill_value)[i], + metadata.fill_value[i])) { + auto binder = FillValueJsonBinder{metadata.data_type}; + auto constraint_json = + jb::ToJson(*constraints.fill_value, binder).value(); + auto metadata_json 
= + jb::ToJson(metadata.fill_value, binder).value(); + return MetadataMismatchError("fill_value", constraint_json, + metadata_json); + } + } } if (constraints.shape && *constraints.shape != metadata.shape) { return MetadataMismatchError("shape", *constraints.shape, metadata.shape); @@ -574,23 +703,64 @@ absl::Status ValidateMetadata(const ZarrMetadata& metadata, metadata.unknown_extension_attributes); } +namespace { +std::string GetFieldNames(const ZarrDType& dtype) { + std::vector field_names; + for (const auto& field : dtype.fields) { + field_names.push_back(field.name); + } + return ::nlohmann::json(field_names).dump(); +} +} // namespace + +Result GetFieldIndex(const ZarrDType& dtype, + std::string_view selected_field) { + if (selected_field.empty()) { + if (dtype.fields.size() != 1) { + return absl::FailedPreconditionError(tensorstore::StrCat( + "Must specify a \"field\" that is one of: ", GetFieldNames(dtype))); + } + return 0; + } + if (!dtype.has_fields) { + return absl::FailedPreconditionError( + tensorstore::StrCat("Requested field ", QuoteString(selected_field), + " but dtype does not have named fields")); + } + for (size_t field_index = 0; field_index < dtype.fields.size(); + ++field_index) { + if (dtype.fields[field_index].name == selected_field) return field_index; + } + return absl::FailedPreconditionError( + tensorstore::StrCat("Requested field ", QuoteString(selected_field), + " is not one of: ", GetFieldNames(dtype))); +} + +SpecRankAndFieldInfo GetSpecRankAndFieldInfo(const ZarrMetadata& metadata, + size_t field_index) { + SpecRankAndFieldInfo info; + info.chunked_rank = metadata.rank; + info.field = &metadata.data_type.fields[field_index]; + return info; +} + Result> GetEffectiveDomain( - DimensionIndex rank, std::optional> shape, + const SpecRankAndFieldInfo& info, + std::optional> metadata_shape, std::optional>> dimension_names, - const Schema& schema, bool* dimension_names_used = nullptr) { + const Schema& schema, bool* dimension_names_used) 
{ + const DimensionIndex rank = info.chunked_rank; if (dimension_names_used) *dimension_names_used = false; auto domain = schema.domain(); - if (!shape && !dimension_names && !domain.valid()) { + if (!metadata_shape && !dimension_names && !domain.valid()) { if (schema.rank() == 0) return {std::in_place, 0}; - // No information about the domain available. return {std::in_place}; } - // Rank is already validated by caller. assert(RankConstraint::EqualOrUnspecified(schema.rank(), rank)); IndexDomainBuilder builder(std::max(schema.rank().rank, rank)); - if (shape) { - builder.shape(*shape); + if (metadata_shape) { + builder.shape(*metadata_shape); builder.implicit_upper_bounds(true); } else { builder.origin(GetConstantVector(builder.rank())); @@ -602,12 +772,12 @@ Result> GetEffectiveDomain( normalized_dimension_names[i] = *name; } } - // Use dimension_names as labels if they are valid. - if (internal::ValidateDimensionLabelsAreUnique(normalized_dimension_names) + if (internal::ValidateDimensionLabelsAreUnique( + span(&normalized_dimension_names[0], rank)) .ok()) { - if (dimension_names_used) *dimension_names_used = true; builder.labels( span(&normalized_dimension_names[0], rank)); + if (dimension_names_used) *dimension_names_used = true; } } @@ -618,36 +788,53 @@ Result> GetEffectiveDomain( tensorstore::MaybeAnnotateStatus( _, "Mismatch between metadata and schema"))); return WithImplicitDimensions(domain, false, true); - return domain; } Result> GetEffectiveDomain( const ZarrMetadataConstraints& metadata_constraints, const Schema& schema, bool* dimension_names_used) { - return GetEffectiveDomain( - metadata_constraints.rank, metadata_constraints.shape, - metadata_constraints.dimension_names, schema, dimension_names_used); + SpecRankAndFieldInfo info; + info.chunked_rank = metadata_constraints.rank; + if (info.chunked_rank == dynamic_rank && metadata_constraints.shape) { + info.chunked_rank = metadata_constraints.shape->size(); + } + + std::optional> shape_span; + if 
(metadata_constraints.shape) { + shape_span.emplace(metadata_constraints.shape->data(), + metadata_constraints.shape->size()); + } + std::optional>> names_span; + if (metadata_constraints.dimension_names) { + names_span.emplace(metadata_constraints.dimension_names->data(), + metadata_constraints.dimension_names->size()); + } + + return GetEffectiveDomain(info, shape_span, names_span, schema, + dimension_names_used); } absl::Status SetChunkLayoutFromMetadata( - DataType dtype, DimensionIndex rank, + const SpecRankAndFieldInfo& info, std::optional> chunk_shape, const ZarrCodecChainSpec* codecs, ChunkLayout& chunk_layout) { - TENSORSTORE_RETURN_IF_ERROR(chunk_layout.Set(RankConstraint{rank})); - rank = chunk_layout.rank(); - if (rank == dynamic_rank) return absl::OkStatus(); + const DimensionIndex rank = info.chunked_rank; + if (rank == dynamic_rank) { + return absl::OkStatus(); + } + TENSORSTORE_RETURN_IF_ERROR(chunk_layout.Set(RankConstraint(rank))); + TENSORSTORE_RETURN_IF_ERROR(chunk_layout.Set( + ChunkLayout::GridOrigin(GetConstantVector(rank)))); if (chunk_shape) { assert(chunk_shape->size() == rank); TENSORSTORE_RETURN_IF_ERROR( chunk_layout.Set(ChunkLayout::WriteChunkShape(*chunk_shape))); } - TENSORSTORE_RETURN_IF_ERROR(chunk_layout.Set( - ChunkLayout::GridOrigin(GetConstantVector(rank)))); if (codecs) { ArrayDataTypeAndShapeInfo array_info; - array_info.dtype = dtype; + array_info.dtype = info.field ? 
info.field->dtype : dtype_v; array_info.rank = rank; if (chunk_shape) { std::copy_n(chunk_shape->begin(), rank, @@ -669,30 +856,47 @@ absl::Status SetChunkLayoutFromMetadata( span(layout_info.codec_chunk_shape->data(), rank)))); } } + return absl::OkStatus(); } -Result GetEffectiveChunkLayout( +absl::Status SetChunkLayoutFromMetadata( DataType dtype, DimensionIndex rank, std::optional> chunk_shape, - const ZarrCodecChainSpec* codecs, const Schema& schema) { - auto chunk_layout = schema.chunk_layout(); - TENSORSTORE_RETURN_IF_ERROR(SetChunkLayoutFromMetadata( - dtype, rank, chunk_shape, codecs, chunk_layout)); - return chunk_layout; + const ZarrCodecChainSpec* codecs, ChunkLayout& chunk_layout) { + SpecRankAndFieldInfo info; + info.chunked_rank = rank; + info.field = nullptr; + return SetChunkLayoutFromMetadata(info, chunk_shape, codecs, chunk_layout); } Result GetEffectiveChunkLayout( const ZarrMetadataConstraints& metadata_constraints, const Schema& schema) { - assert(RankConstraint::EqualOrUnspecified(metadata_constraints.rank, - schema.rank())); - return GetEffectiveChunkLayout( - metadata_constraints.data_type.value_or(DataType{}), - std::max(metadata_constraints.rank, schema.rank().rank), - metadata_constraints.chunk_shape, + // Approximation: assume whole array access or simple array + SpecRankAndFieldInfo info; + info.chunked_rank = std::max(metadata_constraints.rank, schema.rank().rank); + if (info.chunked_rank == dynamic_rank && metadata_constraints.shape) { + info.chunked_rank = metadata_constraints.shape->size(); + } + if (info.chunked_rank == dynamic_rank && metadata_constraints.chunk_shape) { + info.chunked_rank = metadata_constraints.chunk_shape->size(); + } + // We can't easily know field info from constraints unless we parse data_type. + // If data_type is present and has 1 field, we can check it. + // For now, basic implementation. 
+ + ChunkLayout chunk_layout = schema.chunk_layout(); + std::optional> chunk_shape_span; + if (metadata_constraints.chunk_shape) { + chunk_shape_span.emplace(metadata_constraints.chunk_shape->data(), + metadata_constraints.chunk_shape->size()); + } + TENSORSTORE_RETURN_IF_ERROR(SetChunkLayoutFromMetadata( + info, chunk_shape_span, metadata_constraints.codec_specs ? &*metadata_constraints.codec_specs : nullptr, - schema); + chunk_layout)); + return chunk_layout; } Result GetDimensionUnits( @@ -732,53 +936,63 @@ CodecSpec GetCodecFromMetadata(const ZarrMetadata& metadata) { } absl::Status ValidateMetadataSchema(const ZarrMetadata& metadata, - const Schema& schema) { - if (!RankConstraint::EqualOrUnspecified(metadata.rank, schema.rank())) { + size_t field_index, const Schema& schema) { + auto info = GetSpecRankAndFieldInfo(metadata, field_index); + const auto& field = metadata.data_type.fields[field_index]; + + if (!RankConstraint::EqualOrUnspecified(schema.rank(), info.chunked_rank)) { return absl::FailedPreconditionError(tensorstore::StrCat( "Rank specified by schema (", schema.rank(), - ") does not match rank specified by metadata (", metadata.rank, ")")); + ") does not match rank specified by metadata (", info.chunked_rank, + ")")); } if (schema.domain().valid()) { + std::optional> metadata_shape_span; + metadata_shape_span.emplace(metadata.shape.data(), metadata.shape.size()); + std::optional>> dimension_names_span; + dimension_names_span.emplace(metadata.dimension_names.data(), + metadata.dimension_names.size()); TENSORSTORE_RETURN_IF_ERROR(GetEffectiveDomain( - metadata.rank, metadata.shape, metadata.dimension_names, schema)); + info, metadata_shape_span, dimension_names_span, schema, + /*dimension_names_used=*/nullptr)); } if (auto dtype = schema.dtype(); - !IsPossiblySameDataType(metadata.data_type, dtype)) { + !IsPossiblySameDataType(field.dtype, dtype)) { return absl::FailedPreconditionError( - tensorstore::StrCat("data_type from metadata (", 
metadata.data_type, + tensorstore::StrCat("data_type from metadata (", field.dtype, ") does not match dtype in schema (", dtype, ")")); } if (schema.chunk_layout().rank() != dynamic_rank) { - TENSORSTORE_ASSIGN_OR_RETURN( - auto chunk_layout, - GetEffectiveChunkLayout(metadata.data_type, metadata.rank, - metadata.chunk_shape, &metadata.codec_specs, - schema)); + ChunkLayout chunk_layout = schema.chunk_layout(); + std::optional> chunk_shape_span; + chunk_shape_span.emplace(metadata.chunk_shape.data(), + metadata.chunk_shape.size()); + TENSORSTORE_RETURN_IF_ERROR(SetChunkLayoutFromMetadata( + info, chunk_shape_span, &metadata.codec_specs, chunk_layout)); if (chunk_layout.codec_chunk_shape().hard_constraint) { return absl::InvalidArgumentError("codec_chunk_shape not supported"); } } if (auto schema_fill_value = schema.fill_value(); schema_fill_value.valid()) { - const auto& fill_value = metadata.fill_value; + const auto& fill_value = metadata.fill_value[field_index]; TENSORSTORE_ASSIGN_OR_RETURN( auto broadcast_fill_value, tensorstore::BroadcastArray(schema_fill_value, span{})); TENSORSTORE_ASSIGN_OR_RETURN( SharedArray converted_fill_value, tensorstore::MakeCopy(std::move(broadcast_fill_value), - skip_repeated_elements, metadata.data_type)); + skip_repeated_elements, field.dtype)); if (!AreArraysIdenticallyEqual(converted_fill_value, fill_value)) { auto binder = FillValueJsonBinder{metadata.data_type}; - auto schema_json = jb::ToJson(converted_fill_value, binder).value(); - auto metadata_json = jb::ToJson(metadata.fill_value, binder).value(); + // Error message generation might be tricky with binder return absl::FailedPreconditionError(tensorstore::StrCat( "Invalid fill_value: schema requires fill value of ", - schema_json.dump(), ", but metadata specifies fill value of ", - metadata_json.dump())); + schema_fill_value, ", but metadata specifies fill value of ", + fill_value)); } } @@ -804,8 +1018,14 @@ absl::Status ValidateMetadataSchema(const ZarrMetadata& metadata, 
return absl::OkStatus(); } +absl::Status ValidateMetadataSchema(const ZarrMetadata& metadata, + const Schema& schema) { + return ValidateMetadataSchema(metadata, /*field_index=*/0, schema); +} + Result> GetNewMetadata( - const ZarrMetadataConstraints& metadata_constraints, const Schema& schema) { + const ZarrMetadataConstraints& metadata_constraints, const Schema& schema, + std::string_view selected_field) { auto metadata = std::make_shared(); metadata->zarr_format = metadata_constraints.zarr_format.value_or(3); @@ -813,51 +1033,85 @@ Result> GetNewMetadata( metadata_constraints.chunk_key_encoding.value_or(ChunkKeyEncoding{ /*.kind=*/ChunkKeyEncoding::kDefault, /*.separator=*/'/'}); + // Determine data type first + if (metadata_constraints.data_type) { + metadata->data_type = *metadata_constraints.data_type; + } else if (!selected_field.empty()) { + return absl::InvalidArgumentError( + "\"dtype\" must be specified in \"metadata\" if \"field\" is " + "specified"); + } else if (auto dtype = schema.dtype(); dtype.valid()) { + TENSORSTORE_ASSIGN_OR_RETURN( + static_cast( + metadata->data_type.fields.emplace_back()), + ChooseBaseDType(dtype)); + metadata->data_type.has_fields = false; + TENSORSTORE_RETURN_IF_ERROR(ValidateDType(metadata->data_type)); + } else { + return absl::InvalidArgumentError("dtype must be specified"); + } + + TENSORSTORE_ASSIGN_OR_RETURN( + size_t field_index, GetFieldIndex(metadata->data_type, selected_field)); + SpecRankAndFieldInfo info; + info.field = &metadata->data_type.fields[field_index]; + info.chunked_rank = metadata_constraints.rank; + if (info.chunked_rank == dynamic_rank && metadata_constraints.shape) { + info.chunked_rank = metadata_constraints.shape->size(); + } + if (info.chunked_rank == dynamic_rank && + schema.rank().rank != dynamic_rank) { + info.chunked_rank = schema.rank().rank; + } + // Set domain - bool dimension_names_used; + bool dimension_names_used = false; + std::optional> constraint_shape_span; + if 
(metadata_constraints.shape) { + constraint_shape_span.emplace(metadata_constraints.shape->data(), + metadata_constraints.shape->size()); + } + std::optional>> constraint_names_span; + if (metadata_constraints.dimension_names) { + constraint_names_span.emplace( + metadata_constraints.dimension_names->data(), + metadata_constraints.dimension_names->size()); + } TENSORSTORE_ASSIGN_OR_RETURN( - auto domain, - GetEffectiveDomain(metadata_constraints, schema, &dimension_names_used)); + auto domain, GetEffectiveDomain(info, constraint_shape_span, + constraint_names_span, schema, + &dimension_names_used)); if (!domain.valid() || !IsFinite(domain.box())) { return absl::InvalidArgumentError("domain must be specified"); } - const DimensionIndex rank = metadata->rank = domain.rank(); - metadata->shape.assign(domain.shape().begin(), domain.shape().end()); + const DimensionIndex rank = domain.rank(); + metadata->rank = rank; + info.chunked_rank = rank; + metadata->shape.assign(domain.shape().begin(), + domain.shape().begin() + rank); metadata->dimension_names.assign(domain.labels().begin(), - domain.labels().end()); - // Normalize empty string dimension names to `std::nullopt`. This is more - // consistent with the zarr v3 dimension name semantics, and ensures that the - // `dimension_names` metadata field will be excluded entirely if all dimension - // names are the empty string. - // - // However, if empty string dimension names were specified explicitly in - // `metadata_constraints`, leave them exactly as specified. + domain.labels().begin() + rank); + for (DimensionIndex i = 0; i < rank; ++i) { auto& name = metadata->dimension_names[i]; if (!name || !name->empty()) continue; - // Dimension name equals the empty string. - if (dimension_names_used && (*metadata_constraints.dimension_names)[i]) { - // Empty dimension name was explicitly specified in - // `metadata_constraints`, leave it as is. 
+ if (dimension_names_used && metadata_constraints.dimension_names && + (*metadata_constraints.dimension_names)[i]) { assert((*metadata_constraints.dimension_names)[i]->empty()); continue; } - // Name was not explicitly specified in `metadata_constraints` as an empty - // string. Normalize it to `std::nullopt`. name = std::nullopt; } - // Set dtype - auto dtype = schema.dtype(); - if (!dtype.valid()) { - return absl::InvalidArgumentError("dtype must be specified"); - } - TENSORSTORE_RETURN_IF_ERROR(ValidateDataType(dtype)); - metadata->data_type = dtype; - if (metadata_constraints.fill_value) { metadata->fill_value = *metadata_constraints.fill_value; } else if (auto fill_value = schema.fill_value(); fill_value.valid()) { + // Assuming single field if setting from schema + if (metadata->data_type.fields.size() != 1) { + return absl::InvalidArgumentError( + "Cannot specify fill_value through schema for structured zarr data " + "type"); + } const auto status = [&] { TENSORSTORE_ASSIGN_OR_RETURN( auto broadcast_fill_value, @@ -865,23 +1119,26 @@ Result> GetNewMetadata( TENSORSTORE_ASSIGN_OR_RETURN( auto converted_fill_value, tensorstore::MakeCopy(std::move(broadcast_fill_value), - skip_repeated_elements, metadata->data_type)); - metadata->fill_value = std::move(converted_fill_value); + skip_repeated_elements, + metadata->data_type.fields[0].dtype)); + metadata->fill_value.push_back(std::move(converted_fill_value)); return absl::OkStatus(); }(); TENSORSTORE_RETURN_IF_ERROR( status, tensorstore::MaybeAnnotateStatus(_, "Invalid fill_value")); } else { - metadata->fill_value = tensorstore::AllocateArray( - /*shape=*/span(), c_order, value_init, - metadata->data_type); + metadata->fill_value.resize(metadata->data_type.fields.size()); + for (size_t i = 0; i < metadata->fill_value.size(); ++i) { + metadata->fill_value[i] = tensorstore::AllocateArray( + /*shape=*/span(), c_order, value_init, + metadata->data_type.fields[i].dtype); + } } metadata->user_attributes = 
metadata_constraints.user_attributes; metadata->unknown_extension_attributes = metadata_constraints.unknown_extension_attributes; - // Set dimension units TENSORSTORE_ASSIGN_OR_RETURN( auto dimension_units, GetEffectiveDimensionUnits(rank, metadata_constraints.dimension_units, @@ -895,12 +1152,16 @@ Result> GetNewMetadata( TENSORSTORE_ASSIGN_OR_RETURN(auto codec_spec, GetEffectiveCodec(metadata_constraints, schema)); - // Set chunk shape - ArrayCodecResolveParameters decoded; - decoded.dtype = metadata->data_type; + if (metadata->data_type.fields.size() == 1 && + metadata->data_type.fields[0].outer_shape.empty()) { + decoded.dtype = metadata->data_type.fields[0].dtype; + } else { + decoded.dtype = dtype_v; + } decoded.rank = metadata->rank; - decoded.fill_value = metadata->fill_value; + if (metadata->fill_value.size() == 1) + decoded.fill_value = metadata->fill_value[0]; TENSORSTORE_ASSIGN_OR_RETURN( auto chunk_layout, GetEffectiveChunkLayout(metadata_constraints, schema)); @@ -920,8 +1181,6 @@ Result> GetNewMetadata( if (!internal::RangesEqual(span(metadata->chunk_shape), span(read_chunk_shape))) { - // Read chunk and write chunk shapes differ. Insert sharding codec if there - // is not already one. if (!codec_spec->codecs || codec_spec->codecs->sharding_height() == 0) { auto sharding_codec = internal::MakeIntrusivePtr( @@ -945,7 +1204,8 @@ Result> GetNewMetadata( TENSORSTORE_RETURN_IF_ERROR(set_up_codecs( codec_spec->codecs ? 
*codec_spec->codecs : ZarrCodecChainSpec{})); TENSORSTORE_RETURN_IF_ERROR(ValidateMetadata(*metadata)); - TENSORSTORE_RETURN_IF_ERROR(ValidateMetadataSchema(*metadata, schema)); + TENSORSTORE_RETURN_IF_ERROR( + ValidateMetadataSchema(*metadata, field_index, schema)); return metadata; } diff --git a/tensorstore/driver/zarr3/metadata.h b/tensorstore/driver/zarr3/metadata.h index 05b8c6be3..4c7871b0d 100644 --- a/tensorstore/driver/zarr3/metadata.h +++ b/tensorstore/driver/zarr3/metadata.h @@ -33,6 +33,7 @@ #include "tensorstore/data_type.h" #include "tensorstore/driver/zarr3/codec/codec.h" #include "tensorstore/driver/zarr3/codec/codec_chain_spec.h" +#include "tensorstore/driver/zarr3/dtype.h" #include "tensorstore/index.h" #include "tensorstore/index_space/dimension_units.h" #include "tensorstore/index_space/index_domain.h" @@ -72,19 +73,35 @@ struct ChunkKeyEncoding { }; struct FillValueJsonBinder { - DataType data_type; + ZarrDType dtype; + bool allow_missing_dtype = false; + FillValueJsonBinder() = default; + explicit FillValueJsonBinder(ZarrDType dtype, + bool allow_missing_dtype = false); + explicit FillValueJsonBinder(DataType dtype, + bool allow_missing_dtype = false); absl::Status operator()(std::true_type is_loading, internal_json_binding::NoOptions, - SharedArray* obj, + std::vector>* obj, ::nlohmann::json* j) const; absl::Status operator()(std::false_type is_loading, internal_json_binding::NoOptions, - const SharedArray* obj, + const std::vector>* obj, ::nlohmann::json* j) const; + + private: + absl::Status DecodeSingle(::nlohmann::json& j, DataType data_type, + SharedArray& out) const; + absl::Status EncodeSingle(const SharedArray& arr, + DataType data_type, + ::nlohmann::json& j) const; }; +struct SpecRankAndFieldInfo; + + struct ZarrMetadata { // The following members are common to `ZarrMetadata` and // `ZarrMetadataConstraints`, except that in `ZarrMetadataConstraints` some @@ -94,14 +111,14 @@ struct ZarrMetadata { int zarr_format; std::vector shape; 
- DataType data_type; + ZarrDType data_type; ::nlohmann::json::object_t user_attributes; std::optional dimension_units; std::vector> dimension_names; ChunkKeyEncoding chunk_key_encoding; std::vector chunk_shape; ZarrCodecChainSpec codec_specs; - SharedArray fill_value; + std::vector> fill_value; ::nlohmann::json::object_t unknown_extension_attributes; std::string GetCompatibilityKey() const; @@ -123,14 +140,14 @@ struct ZarrMetadataConstraints { std::optional zarr_format; std::optional> shape; - std::optional data_type; + std::optional data_type; ::nlohmann::json::object_t user_attributes; std::optional dimension_units; std::optional>> dimension_names; std::optional chunk_key_encoding; std::optional> chunk_shape; std::optional codec_specs; - std::optional> fill_value; + std::optional>> fill_value; ::nlohmann::json::object_t unknown_extension_attributes; TENSORSTORE_DECLARE_JSON_DEFAULT_BINDER(ZarrMetadataConstraints, @@ -159,6 +176,10 @@ Result> GetEffectiveDomain( /// Sets chunk layout constraints implied by `dtype`, `rank`, `chunk_shape`, and /// `codecs`. +absl::Status SetChunkLayoutFromMetadata( + const SpecRankAndFieldInfo& info, + std::optional> chunk_shape, + const ZarrCodecChainSpec* codecs, ChunkLayout& chunk_layout); absl::Status SetChunkLayoutFromMetadata( DataType dtype, DimensionIndex rank, std::optional> chunk_shape, @@ -198,6 +219,8 @@ Result> GetEffectiveCodec( CodecSpec GetCodecFromMetadata(const ZarrMetadata& metadata); /// Validates that `schema` is compatible with `metadata`. +absl::Status ValidateMetadataSchema(const ZarrMetadata& metadata, + size_t field_index, const Schema& schema); absl::Status ValidateMetadataSchema(const ZarrMetadata& metadata, const Schema& schema); @@ -206,10 +229,22 @@ absl::Status ValidateMetadataSchema(const ZarrMetadata& metadata, /// \error `absl::StatusCode::kInvalidArgument` if any required fields are /// unspecified. 
Result> GetNewMetadata( - const ZarrMetadataConstraints& metadata_constraints, const Schema& schema); + const ZarrMetadataConstraints& metadata_constraints, + const Schema& schema, std::string_view selected_field = {}); absl::Status ValidateDataType(DataType dtype); +Result GetFieldIndex(const ZarrDType& dtype, + std::string_view selected_field); + +struct SpecRankAndFieldInfo { + DimensionIndex chunked_rank = dynamic_rank; + const ZarrDType::Field* field = nullptr; +}; + +SpecRankAndFieldInfo GetSpecRankAndFieldInfo(const ZarrMetadata& metadata, + size_t field_index); + } // namespace internal_zarr3 } // namespace tensorstore diff --git a/tensorstore/driver/zarr3/metadata_test.cc b/tensorstore/driver/zarr3/metadata_test.cc index 0b140fa80..11c97619f 100644 --- a/tensorstore/driver/zarr3/metadata_test.cc +++ b/tensorstore/driver/zarr3/metadata_test.cc @@ -51,6 +51,7 @@ namespace { namespace jb = ::tensorstore::internal_json_binding; using ::tensorstore::ChunkLayout; +using ::tensorstore::DataType; using ::tensorstore::CodecSpec; using ::tensorstore::dtype_v; using ::tensorstore::Index; @@ -68,6 +69,7 @@ using ::tensorstore::dtypes::float32_t; using ::tensorstore::dtypes::float64_t; using ::tensorstore::internal::uint_t; using ::tensorstore::internal_zarr3::FillValueJsonBinder; +using ::tensorstore::internal_zarr3::ZarrDType; using ::tensorstore::internal_zarr3::ZarrMetadata; using ::tensorstore::internal_zarr3::ZarrMetadataConstraints; using ::testing::HasSubstr; @@ -90,13 +92,30 @@ ::nlohmann::json GetBasicMetadata() { }; } +ZarrDType MakeScalarZarrDType(DataType dtype) { + ZarrDType dtype_info; + dtype_info.has_fields = false; + dtype_info.fields.resize(1); + auto& field = dtype_info.fields[0]; + field.dtype = dtype; + field.encoded_dtype = std::string(dtype.name()); + field.outer_shape.clear(); + field.flexible_shape.clear(); + field.field_shape.clear(); + field.num_inner_elements = 1; + field.byte_offset = 0; + field.num_bytes = dtype->size; + return 
dtype_info; +} + TEST(MetadataTest, ParseValid) { auto json = GetBasicMetadata(); tensorstore::TestJsonBinderRoundTripJsonOnly({json}); TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto metadata, ZarrMetadata::FromJson(json)); EXPECT_THAT(metadata.shape, ::testing::ElementsAre(10, 11, 12)); EXPECT_THAT(metadata.chunk_shape, ::testing::ElementsAre(1, 2, 3)); - EXPECT_THAT(metadata.data_type, tensorstore::dtype_v); + ASSERT_EQ(metadata.data_type.fields.size(), 1); + EXPECT_EQ(tensorstore::dtype_v, metadata.data_type.fields[0].dtype); EXPECT_THAT(metadata.dimension_names, ::testing::ElementsAre("a", std::nullopt, "")); EXPECT_THAT(metadata.user_attributes, MatchesJson({{"a", "b"}, {"c", "d"}})); @@ -115,7 +134,8 @@ TEST(MetadataTest, ParseValidNoDimensionNames) { TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto metadata, ZarrMetadata::FromJson(json)); EXPECT_THAT(metadata.shape, ::testing::ElementsAre(10, 11, 12)); EXPECT_THAT(metadata.chunk_shape, ::testing::ElementsAre(1, 2, 3)); - EXPECT_THAT(metadata.data_type, tensorstore::dtype_v); + ASSERT_EQ(metadata.data_type.fields.size(), 1); + EXPECT_EQ(tensorstore::dtype_v, metadata.data_type.fields[0].dtype); EXPECT_THAT(metadata.dimension_names, ::testing::ElementsAre(std::nullopt, std::nullopt, std::nullopt)); EXPECT_THAT(metadata.user_attributes, MatchesJson({{"a", "b"}, {"c", "d"}})); @@ -486,7 +506,9 @@ TEST(MetadataTest, DataTypes) { } TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto metadata, ZarrMetadata::FromJson(json)); - EXPECT_EQ(tensorstore::GetDataType(data_type_name), metadata.data_type); + ASSERT_FALSE(metadata.data_type.fields.empty()); + EXPECT_EQ(tensorstore::GetDataType(data_type_name), + metadata.data_type.fields[0].dtype); } } @@ -503,18 +525,20 @@ TEST(MetadataTest, InvalidDataType) { template void TestFillValue(std::vector> cases, bool skip_to_json = false) { - auto binder = FillValueJsonBinder{dtype_v}; + FillValueJsonBinder binder(MakeScalarZarrDType(dtype_v)); for (const auto& [value, json] : cases) { SharedArray 
expected_fill_value = tensorstore::MakeScalarArray(value); if (!skip_to_json) { - EXPECT_THAT(jb::ToJson(expected_fill_value, binder), + std::vector> vec{expected_fill_value}; + EXPECT_THAT(jb::ToJson(vec, binder), ::testing::Optional(MatchesJson(json))) << "value=" << value << ", json=" << json; } - EXPECT_THAT(jb::FromJson>(json, binder), - ::testing::Optional( - tensorstore::MatchesArrayIdentically(expected_fill_value))) + EXPECT_THAT( + jb::FromJson>>(json, binder), + ::testing::Optional(::testing::ElementsAre( + tensorstore::MatchesArrayIdentically(expected_fill_value)))) << "json=" << json; } } @@ -522,10 +546,11 @@ void TestFillValue(std::vector> cases, template void TestFillValueInvalid( std::vector> cases) { - auto binder = FillValueJsonBinder{dtype_v}; + FillValueJsonBinder binder(MakeScalarZarrDType(dtype_v)); for (const auto& [json, matcher] : cases) { EXPECT_THAT( - jb::FromJson>(json, binder).status(), + jb::FromJson>>(json, binder) + .status(), StatusIs(absl::StatusCode::kInvalidArgument, MatchesRegex(matcher))) << "json=" << json; } From 187f42452a359bca712a64050176b93e5ce9b145 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 24 Nov 2025 22:57:11 +0000 Subject: [PATCH 02/59] Updates to have proper reads --- tensorstore/driver/zarr3/chunk_cache.cc | 74 ++++++++++++++---- tensorstore/driver/zarr3/chunk_cache.h | 11 ++- tensorstore/driver/zarr3/driver.cc | 74 ++++++++++++------ tensorstore/driver/zarr3/dtype.cc | 64 +++++++++++---- tensorstore/driver/zarr3/metadata.cc | 100 ++++++++++++++++-------- 5 files changed, 239 insertions(+), 84 deletions(-) diff --git a/tensorstore/driver/zarr3/chunk_cache.cc b/tensorstore/driver/zarr3/chunk_cache.cc index ee1cba9c1..6bfa8c039 100644 --- a/tensorstore/driver/zarr3/chunk_cache.cc +++ b/tensorstore/driver/zarr3/chunk_cache.cc @@ -18,6 +18,8 @@ #include #include +#include +#include #include #include #include @@ -73,15 +75,17 @@ ZarrChunkCache::~ZarrChunkCache() = default; 
ZarrLeafChunkCache::ZarrLeafChunkCache( kvstore::DriverPtr store, ZarrCodecChain::PreparedState::Ptr codec_state, - internal::CachePool::WeakPtr /*data_cache_pool*/) - : Base(std::move(store)), codec_state_(std::move(codec_state)) {} + ZarrDType dtype, internal::CachePool::WeakPtr /*data_cache_pool*/) + : Base(std::move(store)), + codec_state_(std::move(codec_state)), + dtype_(std::move(dtype)) {} void ZarrLeafChunkCache::Read(ZarrChunkCache::ReadRequest request, AnyFlowReceiver>&& receiver) { return internal::ChunkCache::Read( {static_cast(request), - /*component_index=*/0, request.staleness_bound, + request.component_index, request.staleness_bound, request.fill_missing_data_reads}, std::move(receiver)); } @@ -92,7 +96,7 @@ void ZarrLeafChunkCache::Write( receiver) { return internal::ChunkCache::Write( {static_cast(request), - /*component_index=*/0, request.store_data_equal_to_fill_value}, + request.component_index, request.store_data_equal_to_fill_value}, std::move(receiver)); } @@ -149,12 +153,52 @@ std::string ZarrLeafChunkCache::GetChunkStorageKey( Result, 1>> ZarrLeafChunkCache::DecodeChunk(span chunk_indices, absl::Cord data) { + const size_t num_fields = dtype_.fields.size(); + absl::InlinedVector, 1> field_arrays(num_fields); + + + // For single non-structured field, decode directly + if (num_fields == 1 && dtype_.fields[0].outer_shape.empty()) { + TENSORSTORE_ASSIGN_OR_RETURN( + field_arrays[0], codec_state_->DecodeArray(grid().components[0].shape(), + std::move(data))); + return field_arrays; + } + + // For structured types, decode byte array then extract fields + // Build decode shape: [chunk_dims..., bytes_per_outer_element] + const auto& chunk_shape = grid().chunk_shape; + std::vector decode_shape(chunk_shape.begin(), chunk_shape.end()); + decode_shape.push_back(dtype_.bytes_per_outer_element); + TENSORSTORE_ASSIGN_OR_RETURN( - auto array, - codec_state_->DecodeArray(grid().components[0].shape(), std::move(data))); - absl::InlinedVector, 1> 
components; - components.push_back(std::move(array)); - return components; + auto byte_array, codec_state_->DecodeArray(decode_shape, std::move(data))); + + // Extract each field from the byte array + const Index num_elements = byte_array.num_elements() / + dtype_.bytes_per_outer_element; + const auto* src_bytes = static_cast(byte_array.data()); + + for (size_t field_i = 0; field_i < num_fields; ++field_i) { + const auto& field = dtype_.fields[field_i]; + // Use the component's shape (from the grid) for the result array + const auto& component_shape = grid().components[field_i].shape(); + auto result_array = + AllocateArray(component_shape, c_order, default_init, field.dtype); + auto* dst = static_cast(result_array.data()); + const Index field_size = field.dtype->size; + + // Copy field data from each struct element + for (Index i = 0; i < num_elements; ++i) { + std::memcpy(dst + i * field_size, + src_bytes + i * dtype_.bytes_per_outer_element + + field.byte_offset, + field_size); + } + field_arrays[field_i] = std::move(result_array); + } + + return field_arrays; } Result ZarrLeafChunkCache::EncodeChunk( @@ -170,9 +214,10 @@ kvstore::Driver* ZarrLeafChunkCache::GetKvStoreDriver() { ZarrShardedChunkCache::ZarrShardedChunkCache( kvstore::DriverPtr store, ZarrCodecChain::PreparedState::Ptr codec_state, - internal::CachePool::WeakPtr data_cache_pool) + ZarrDType dtype, internal::CachePool::WeakPtr data_cache_pool) : base_kvstore_(std::move(store)), codec_state_(std::move(codec_state)), + dtype_(std::move(dtype)), data_cache_pool_(std::move(data_cache_pool)) {} Result> TranslateCellToSourceTransformForShard( @@ -326,6 +371,7 @@ void ZarrShardedChunkCache::Read( *this, std::move(request.transform), std::move(receiver), [transaction = std::move(request.transaction), batch = std::move(request.batch), + component_index = request.component_index, staleness_bound = request.staleness_bound, fill_missing_data_reads = request.fill_missing_data_reads](auto entry) { Batch 
shard_batch = batch; @@ -339,8 +385,7 @@ void ZarrShardedChunkCache::Read( IndexTransform<>>&& receiver) { entry->sub_chunk_cache.get()->Read( {{transaction, std::move(transform), shard_batch}, - staleness_bound, - fill_missing_data_reads}, + component_index, staleness_bound, fill_missing_data_reads}, std::move(receiver)); }; }); @@ -354,6 +399,7 @@ void ZarrShardedChunkCache::Write( &ZarrArrayToArrayCodec::PreparedState::Write>( *this, std::move(request.transform), std::move(receiver), [transaction = std::move(request.transaction), + component_index = request.component_index, store_data_equal_to_fill_value = request.store_data_equal_to_fill_value](auto entry) { internal::OpenTransactionPtr shard_transaction = transaction; @@ -366,7 +412,7 @@ void ZarrShardedChunkCache::Write( AnyFlowReceiver>&& receiver) { entry->sub_chunk_cache.get()->Write( - {{shard_transaction, std::move(transform)}, + {{shard_transaction, std::move(transform)}, component_index, store_data_equal_to_fill_value}, std::move(receiver)); }; @@ -481,7 +527,7 @@ void ZarrShardedChunkCache::Entry::DoInitialize() { *sharding_state.sub_chunk_codec_chain, std::move(sharding_kvstore), cache.executor(), ZarrShardingCodec::PreparedState::Ptr(&sharding_state), - cache.data_cache_pool_); + cache.dtype_, cache.data_cache_pool_); zarr_chunk_cache = new_cache.release(); return std::unique_ptr(&zarr_chunk_cache->cache()); }) diff --git a/tensorstore/driver/zarr3/chunk_cache.h b/tensorstore/driver/zarr3/chunk_cache.h index dd40e43ac..5933115d7 100644 --- a/tensorstore/driver/zarr3/chunk_cache.h +++ b/tensorstore/driver/zarr3/chunk_cache.h @@ -31,6 +31,7 @@ #include "tensorstore/driver/read_request.h" #include "tensorstore/driver/write_request.h" #include "tensorstore/driver/zarr3/codec/codec.h" +#include "tensorstore/driver/zarr3/dtype.h" #include "tensorstore/index.h" #include "tensorstore/index_space/index_transform.h" #include "tensorstore/internal/cache/cache.h" @@ -72,6 +73,7 @@ class ZarrChunkCache { virtual 
const Executor& executor() const = 0; struct ReadRequest : internal::DriverReadRequest { + size_t component_index = 0; absl::Time staleness_bound; bool fill_missing_data_reads; }; @@ -81,6 +83,7 @@ class ZarrChunkCache { IndexTransform<>>&& receiver) = 0; struct WriteRequest : internal::DriverWriteRequest { + size_t component_index = 0; bool store_data_equal_to_fill_value; }; @@ -154,6 +157,7 @@ class ZarrLeafChunkCache : public internal::KvsBackedChunkCache, explicit ZarrLeafChunkCache(kvstore::DriverPtr store, ZarrCodecChain::PreparedState::Ptr codec_state, + ZarrDType dtype, internal::CachePool::WeakPtr data_cache_pool); void Read(ZarrChunkCache::ReadRequest request, @@ -181,6 +185,7 @@ class ZarrLeafChunkCache : public internal::KvsBackedChunkCache, kvstore::Driver* GetKvStoreDriver() override; ZarrCodecChain::PreparedState::Ptr codec_state_; + ZarrDType dtype_; }; /// Chunk cache for a Zarr array where each chunk is a shard. @@ -190,6 +195,7 @@ class ZarrShardedChunkCache : public internal::Cache, public ZarrChunkCache { public: explicit ZarrShardedChunkCache(kvstore::DriverPtr store, ZarrCodecChain::PreparedState::Ptr codec_state, + ZarrDType dtype, internal::CachePool::WeakPtr data_cache_pool); const ZarrShardingCodec::PreparedState& sharding_codec_state() const { @@ -239,6 +245,7 @@ class ZarrShardedChunkCache : public internal::Cache, public ZarrChunkCache { kvstore::DriverPtr base_kvstore_; ZarrCodecChain::PreparedState::Ptr codec_state_; + ZarrDType dtype_; // Data cache pool, if it differs from `this->pool()` (which is equal to the // metadata cache pool). 
@@ -253,11 +260,11 @@ class ZarrShardSubChunkCache : public ChunkCacheImpl { explicit ZarrShardSubChunkCache( kvstore::DriverPtr store, Executor executor, ZarrShardingCodec::PreparedState::Ptr sharding_state, - internal::CachePool::WeakPtr data_cache_pool) + ZarrDType dtype, internal::CachePool::WeakPtr data_cache_pool) : ChunkCacheImpl(std::move(store), ZarrCodecChain::PreparedState::Ptr( sharding_state->sub_chunk_codec_state), - std::move(data_cache_pool)), + std::move(dtype), std::move(data_cache_pool)), sharding_state_(std::move(sharding_state)), executor_(std::move(executor)) {} diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index 15faced0a..1674a1c6d 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -103,9 +103,11 @@ class ZarrDriverSpec /*Parent=*/KvsDriverSpec>; ZarrMetadataConstraints metadata_constraints; + std::string selected_field; constexpr static auto ApplyMembers = [](auto& x, auto f) { - return f(internal::BaseCast(x), x.metadata_constraints); + return f(internal::BaseCast(x), x.metadata_constraints, + x.selected_field); }; static inline const auto default_json_binder = jb::Sequence( @@ -139,7 +141,10 @@ class ZarrDriverSpec return absl::OkStatus(); }, jb::Projection<&ZarrDriverSpec::metadata_constraints>( - jb::DefaultInitializedValue())))); + jb::DefaultInitializedValue()))), + jb::Member("field", jb::Projection<&ZarrDriverSpec::selected_field>( + jb::DefaultValue( + [](auto* obj) { *obj = std::string{}; })))); absl::Status ApplyOptions(SpecOptions&& options) override { if (options.minimal_spec) { @@ -286,21 +291,33 @@ class DataCacheBase static internal::ChunkGridSpecification GetChunkGridSpecification( const ZarrMetadata& metadata) { assert(!metadata.fill_value.empty()); - auto fill_value = BroadcastArray(metadata.fill_value[0], - BoxView<>(metadata.rank)) - .value(); internal::ChunkGridSpecification::ComponentList components; - auto& component = 
components.emplace_back( - internal::AsyncWriteArray::Spec{ - std::move(fill_value), - // Since all dimensions are resizable, just - // specify unbounded `valid_data_bounds`. - Box<>(metadata.rank), - ContiguousLayoutPermutation<>( - span(metadata.inner_order.data(), metadata.rank))}, - metadata.chunk_shape); - component.array_spec.fill_value_comparison_kind = - EqualityComparisonKind::identical; + + // Create one component per field (like zarr v2) + for (size_t field_i = 0; field_i < metadata.data_type.fields.size(); + ++field_i) { + const auto& field = metadata.data_type.fields[field_i]; + auto fill_value = metadata.fill_value[field_i]; + if (!fill_value.valid()) { + // Use value-initialized rank-0 fill value (like zarr v2) + fill_value = AllocateArray(span{}, c_order, value_init, + field.dtype); + } + auto chunk_fill_value = + BroadcastArray(fill_value, BoxView<>(metadata.rank)).value(); + + auto& component = components.emplace_back( + internal::AsyncWriteArray::Spec{ + std::move(chunk_fill_value), + // Since all dimensions are resizable, just + // specify unbounded `valid_data_bounds`. 
+ Box<>(metadata.rank), + ContiguousLayoutPermutation<>( + span(metadata.inner_order.data(), metadata.rank))}, + metadata.chunk_shape); + component.array_spec.fill_value_comparison_kind = + EqualityComparisonKind::identical; + } return internal::ChunkGridSpecification(std::move(components)); } @@ -381,7 +398,7 @@ class DataCacheBase Result> GetExternalToInternalTransform( const void* metadata_ptr, size_t component_index) override { - assert(component_index == 0); + // component_index corresponds to the selected field index const auto& metadata = *static_cast(metadata_ptr); const DimensionIndex rank = metadata.rank; std::string_view normalized_dimension_names[kMaxRank]; @@ -404,10 +421,16 @@ class DataCacheBase absl::Status GetBoundSpecData(KvsDriverSpec& spec_base, const void* metadata_ptr, size_t component_index) override { - assert(component_index == 0); auto& spec = static_cast(spec_base); const auto& metadata = *static_cast(metadata_ptr); spec.metadata_constraints = ZarrMetadataConstraints(metadata); + // Encode selected_field from component_index + if (metadata.data_type.has_fields && + component_index < metadata.data_type.fields.size()) { + spec.selected_field = metadata.data_type.fields[component_index].name; + } else { + spec.selected_field.clear(); + } return absl::OkStatus(); } @@ -513,7 +536,8 @@ class ZarrDriver : public ZarrDriverBase { AnyFlowReceiver> receiver) override { return cache()->zarr_chunk_cache().Read( - {std::move(request), GetCurrentDataStalenessBound(), + {std::move(request), this->component_index(), + GetCurrentDataStalenessBound(), this->fill_value_mode_.fill_missing_data_reads}, std::move(receiver)); } @@ -523,7 +547,7 @@ class ZarrDriver : public ZarrDriverBase { AnyFlowReceiver> receiver) override { return cache()->zarr_chunk_cache().Write( - {std::move(request), + {std::move(request), this->component_index(), this->fill_value_mode_.store_data_equal_to_fill_value}, std::move(receiver)); } @@ -621,7 +645,8 @@ class 
ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { *static_cast(initializer.metadata.get()); return internal_zarr3::MakeZarrChunkCache( *metadata.codecs, std::move(initializer), spec().store.path, - metadata.codec_state, /*data_cache_pool=*/*cache_pool()); + metadata.codec_state, metadata.data_type, + /*data_cache_pool=*/*cache_pool()); } Result GetComponentIndex(const void* metadata_ptr, @@ -629,9 +654,12 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { const auto& metadata = *static_cast(metadata_ptr); TENSORSTORE_RETURN_IF_ERROR( ValidateMetadata(metadata, spec().metadata_constraints)); + TENSORSTORE_ASSIGN_OR_RETURN( + auto field_index, + GetFieldIndex(metadata.data_type, spec().selected_field)); TENSORSTORE_RETURN_IF_ERROR( - ValidateMetadataSchema(metadata, spec().schema)); - return 0; + ValidateMetadataSchema(metadata, field_index, spec().schema)); + return field_index; } }; diff --git a/tensorstore/driver/zarr3/dtype.cc b/tensorstore/driver/zarr3/dtype.cc index 8d1c9d49e..281b9c98b 100644 --- a/tensorstore/driver/zarr3/dtype.cc +++ b/tensorstore/driver/zarr3/dtype.cc @@ -76,20 +76,12 @@ namespace { /// \param value The zarr metadata "dtype" JSON specification. /// \param out[out] Must be non-null. Filled with the parsed dtype on success. /// \error `absl::StatusCode::kInvalidArgument' if `value` is invalid. -Result ParseDTypeNoDerived(const nlohmann::json& value) { - ZarrDType out; - if (value.is_string()) { - // Single field. 
- out.has_fields = false; - out.fields.resize(1); - TENSORSTORE_ASSIGN_OR_RETURN( - static_cast(out.fields[0]), - ParseBaseDType(value.get())); - return out; - } +// Helper to parse fields array (used by both array format and object format) +absl::Status ParseFieldsArray(const nlohmann::json& fields_json, + ZarrDType& out) { out.has_fields = true; - auto parse_result = internal_json::JsonParseArray( - value, + return internal_json::JsonParseArray( + fields_json, [&](ptrdiff_t size) { out.fields.resize(size); return absl::OkStatus(); @@ -140,7 +132,51 @@ Result ParseDTypeNoDerived(const nlohmann::json& value) { } }); }); - if (!parse_result.ok()) return parse_result; +} + +Result ParseDTypeNoDerived(const nlohmann::json& value) { + ZarrDType out; + if (value.is_string()) { + // Single field. + out.has_fields = false; + out.fields.resize(1); + TENSORSTORE_ASSIGN_OR_RETURN( + static_cast(out.fields[0]), + ParseBaseDType(value.get())); + return out; + } + // Handle extended object format: + // {"name": "structured", "configuration": {"fields": [...]}} + if (value.is_object()) { + if (value.contains("name") && value.contains("configuration")) { + std::string type_name; + TENSORSTORE_RETURN_IF_ERROR( + internal_json::JsonRequireValueAs(value["name"], &type_name)); + if (type_name == "structured") { + const auto& config = value["configuration"]; + if (!config.is_object() || !config.contains("fields")) { + return absl::InvalidArgumentError( + "Structured data type requires 'configuration' object with " + "'fields' array"); + } + TENSORSTORE_RETURN_IF_ERROR(ParseFieldsArray(config["fields"], out)); + return out; + } + // For other named types, try to parse as a base dtype + out.has_fields = false; + out.fields.resize(1); + TENSORSTORE_ASSIGN_OR_RETURN( + static_cast(out.fields[0]), + ParseBaseDType(type_name)); + return out; + } + return absl::InvalidArgumentError(tensorstore::StrCat( + "Expected string, array, or object with 'name' and 'configuration', " + "but received: 
", + value.dump())); + } + // Handle array format: [["field1", "type1"], ["field2", "type2"], ...] + TENSORSTORE_RETURN_IF_ERROR(ParseFieldsArray(value, out)); return out; } diff --git a/tensorstore/driver/zarr3/metadata.cc b/tensorstore/driver/zarr3/metadata.cc index c96c31426..880991e8c 100644 --- a/tensorstore/driver/zarr3/metadata.cc +++ b/tensorstore/driver/zarr3/metadata.cc @@ -31,7 +31,10 @@ #include #include +#include + #include "absl/algorithm/container.h" +#include "absl/strings/escaping.h" #include "absl/base/casts.h" #include "absl/base/optimization.h" #include "absl/meta/type_traits.h" @@ -282,16 +285,44 @@ absl::Status FillValueJsonBinder::operator()( TENSORSTORE_RETURN_IF_ERROR( DecodeSingle(*j, dtype.fields[0].dtype, (*obj)[0])); } else { - if (!j->is_array()) { - return internal_json::ExpectedError(*j, "array"); - } - if (j->size() != dtype.fields.size()) { - return internal_json::ExpectedError( - *j, tensorstore::StrCat("array of size ", dtype.fields.size())); - } - for (size_t i = 0; i < dtype.fields.size(); ++i) { - TENSORSTORE_RETURN_IF_ERROR( - DecodeSingle((*j)[i], dtype.fields[i].dtype, (*obj)[i])); + // For structured types, handle both array format and base64-encoded string + if (j->is_string()) { + // Decode base64-encoded fill value for entire struct + std::string b64_decoded; + if (!absl::Base64Unescape(j->get(), &b64_decoded)) { + return absl::InvalidArgumentError(tensorstore::StrCat( + "Expected valid base64-encoded fill value, but received: ", + j->dump())); + } + // Verify size matches expected struct size + if (static_cast(b64_decoded.size()) != + dtype.bytes_per_outer_element) { + return absl::InvalidArgumentError(tensorstore::StrCat( + "Expected ", dtype.bytes_per_outer_element, + " base64-encoded bytes for fill_value, but received ", + b64_decoded.size(), " bytes")); + } + // Extract per-field fill values from decoded bytes + for (size_t i = 0; i < dtype.fields.size(); ++i) { + const auto& field = dtype.fields[i]; + auto arr = 
AllocateArray(span{}, c_order, default_init, + field.dtype); + std::memcpy(arr.data(), b64_decoded.data() + field.byte_offset, + field.dtype->size); + (*obj)[i] = std::move(arr); + } + } else if (j->is_array()) { + if (j->size() != dtype.fields.size()) { + return internal_json::ExpectedError( + *j, tensorstore::StrCat("array of size ", dtype.fields.size())); + } + for (size_t i = 0; i < dtype.fields.size(); ++i) { + TENSORSTORE_RETURN_IF_ERROR( + DecodeSingle((*j)[i], dtype.fields[i].dtype, (*obj)[i])); + } + } else { + return internal_json::ExpectedError(*j, + "array or base64-encoded string"); } } return absl::OkStatus(); @@ -561,28 +592,33 @@ std::string ZarrMetadata::GetCompatibilityKey() const { } absl::Status ValidateMetadata(ZarrMetadata& metadata) { + // Determine if this is a structured type with multiple fields + const bool is_structured = + metadata.data_type.fields.size() > 1 || + (metadata.data_type.fields.size() == 1 && + !metadata.data_type.fields[0].outer_shape.empty()); + + // Build the codec shape - for structured types, include bytes dimension + std::vector codec_shape(metadata.chunk_shape.begin(), + metadata.chunk_shape.end()); + if (is_structured) { + codec_shape.push_back(metadata.data_type.bytes_per_outer_element); + } + if (!metadata.codecs) { ArrayCodecResolveParameters decoded; - if (metadata.data_type.fields.size() == 1 && - metadata.data_type.fields[0].outer_shape.empty()) { + if (!is_structured) { decoded.dtype = metadata.data_type.fields[0].dtype; + decoded.rank = metadata.rank; } else { + // For structured types, use byte dtype with extra dimension decoded.dtype = dtype_v; - // TODO: Verify this works for structured types. - // Zarr2 uses a "scalar" array concept with byte storage for chunks. + decoded.rank = metadata.rank + 1; } - decoded.rank = metadata.rank; // Fill value for codec resolve might be complex. - // Zarr3 codecs usually don't depend on fill value except for some like - // "sharding_indexed"? 
Sharding uses fill_value for missing chunks. - if (metadata.fill_value.size() == 1) { + // For structured types, create a byte fill value + if (metadata.fill_value.size() == 1 && !is_structured) { decoded.fill_value = metadata.fill_value[0]; - } else { - // How to represent structured fill value for codec? - // Sharding expects a single array. - // If we use structured type, the "array" is bytes. - // We might need to encode the fill value to bytes. - // For now, leave empty if multiple fields. } BytesCodecResolveParameters encoded; @@ -593,17 +629,19 @@ absl::Status ValidateMetadata(ZarrMetadata& metadata) { // Get codec chunk layout info. ArrayDataTypeAndShapeInfo array_info; - // array_info.dtype used here to validate codec compatibility. - if (metadata.data_type.fields.size() == 1 && - metadata.data_type.fields[0].outer_shape.empty()) { + if (!is_structured) { array_info.dtype = metadata.data_type.fields[0].dtype; + array_info.rank = metadata.rank; + std::copy_n(metadata.chunk_shape.begin(), metadata.rank, + array_info.shape.emplace().begin()); } else { array_info.dtype = dtype_v; + array_info.rank = metadata.rank + 1; + auto& shape = array_info.shape.emplace(); + std::copy_n(metadata.chunk_shape.begin(), metadata.rank, shape.begin()); + shape[metadata.rank] = metadata.data_type.bytes_per_outer_element; } - array_info.rank = metadata.rank; - std::copy_n(metadata.chunk_shape.begin(), metadata.rank, - array_info.shape.emplace().begin()); ArrayCodecChunkLayoutInfo layout_info; TENSORSTORE_RETURN_IF_ERROR( metadata.codec_specs.GetDecodedChunkLayout(array_info, layout_info)); @@ -617,7 +655,7 @@ absl::Status ValidateMetadata(ZarrMetadata& metadata) { } TENSORSTORE_ASSIGN_OR_RETURN(metadata.codec_state, - metadata.codecs->Prepare(metadata.chunk_shape)); + metadata.codecs->Prepare(codec_shape)); return absl::OkStatus(); } From c2e73cd6b1a2dcd5499522dce0bacd378af43279 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 24 Nov 2025 22:57:22 +0000 Subject: [PATCH 
03/59] Local testing and examples --- examples/BUILD | 23 +++ examples/CMakeLists.txt | 163 ++++++++++++++++++ examples/read_structured_zarr3.cc | 271 ++++++++++++++++++++++++++++++ 3 files changed, 457 insertions(+) create mode 100644 examples/CMakeLists.txt create mode 100644 examples/read_structured_zarr3.cc diff --git a/examples/BUILD b/examples/BUILD index 94acdba14..4dcb2d604 100644 --- a/examples/BUILD +++ b/examples/BUILD @@ -122,3 +122,26 @@ tensorstore_cc_binary( "@riegeli//riegeli/bytes:writer", ], ) + +tensorstore_cc_binary( + name = "read_structured_zarr3", + srcs = ["read_structured_zarr3.cc"], + deps = [ + "//tensorstore", + "//tensorstore:array", + "//tensorstore:context", + "//tensorstore:data_type", + "//tensorstore:index", + "//tensorstore:open", + "//tensorstore:open_mode", + "//tensorstore:spec", + "//tensorstore/driver/zarr3", + "//tensorstore/kvstore/file", + "//tensorstore/util:result", + "//tensorstore/util:status", + "@abseil-cpp//absl/flags:flag", + "@abseil-cpp//absl/flags:parse", + "@abseil-cpp//absl/status", + "@nlohmann_json//:json", + ], +) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt new file mode 100644 index 000000000..92e9857fa --- /dev/null +++ b/examples/CMakeLists.txt @@ -0,0 +1,163 @@ +# Standalone CMakeLists.txt for read_structured_zarr3 example +# +# Build instructions: +# mkdir -p /home/ubuntu/source/tensorstore/examples/build +# cd /home/ubuntu/source/tensorstore/examples/build +# cmake .. 
+# make +# +# Run: +# ./read_structured_zarr3 --zarr_path=/home/ubuntu/source/tensorstore/filt_mig.mdio/headers + +cmake_minimum_required(VERSION 3.24) +project(read_structured_zarr3 LANGUAGES CXX) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# Path to the tensorstore build directory +set(TENSORSTORE_BUILD_DIR "/home/ubuntu/source/tensorstore/build" CACHE PATH "Path to tensorstore build directory") +set(TENSORSTORE_SOURCE_DIR "/home/ubuntu/source/tensorstore" CACHE PATH "Path to tensorstore source directory") +set(DEPS_DIR "${TENSORSTORE_BUILD_DIR}/_deps") + +# Include paths (matching what tensorstore tests use) +include_directories( + ${TENSORSTORE_SOURCE_DIR} + ${DEPS_DIR}/absl-src + ${DEPS_DIR}/re2-src + ${DEPS_DIR}/riegeli-src +) + +include_directories(SYSTEM + ${DEPS_DIR}/half-build/include + ${DEPS_DIR}/half-src/include + ${DEPS_DIR}/nlohmann_json-build/include + ${DEPS_DIR}/nlohmann_json-src/include + ${TENSORSTORE_BUILD_DIR} +) + +# Compiler flags +add_compile_options( + -fPIE + -Wno-deprecated-declarations + -Wno-sign-compare + -Wno-unused-but-set-parameter + -Wno-maybe-uninitialized + -Wno-sequence-point + -Wno-unknown-warning-option + -Wno-stringop-overflow + -fsized-deallocation +) + +# Find all the static libraries we need from the tensorstore build +file(GLOB TENSORSTORE_LIBS "${TENSORSTORE_BUILD_DIR}/libtensorstore*.a") +file(GLOB_RECURSE ABSEIL_LIBS "${DEPS_DIR}/absl-build/absl/*.a") +file(GLOB_RECURSE RIEGELI_LIBS "${DEPS_DIR}/riegeli-build/*.a") + +# Additional dependency libraries - corrected paths +file(GLOB_RECURSE BLOSC_LIBS "${DEPS_DIR}/blosc-build/*.a") +file(GLOB_RECURSE ZSTD_LIBS "${DEPS_DIR}/zstd-build/*.a") +file(GLOB_RECURSE RE2_LIBS "${DEPS_DIR}/re2-build/*.a") +file(GLOB_RECURSE SNAPPY_LIBS "${DEPS_DIR}/snappy-build/*.a") +file(GLOB_RECURSE BROTLI_LIBS "${DEPS_DIR}/brotli-build/*.a") +file(GLOB_RECURSE LZ4_LIBS "${DEPS_DIR}/lz4-build/*.a") +file(GLOB_RECURSE ZLIB_LIBS "${DEPS_DIR}/zlib-build/*.a") 
+file(GLOB_RECURSE PROTOBUF_LIBS "${DEPS_DIR}/protobuf-build/*.a") +file(GLOB_RECURSE GRPC_LIBS "${DEPS_DIR}/grpc-build/*.a") +file(GLOB_RECURSE CARES_LIBS "${DEPS_DIR}/c-ares-build/*.a") +file(GLOB_RECURSE SSL_LIBS "${DEPS_DIR}/boringssl-build/ssl/*.a") +file(GLOB_RECURSE CRYPTO_LIBS "${DEPS_DIR}/boringssl-build/crypto/*.a") +file(GLOB_RECURSE LIBLZMA_LIBS "${DEPS_DIR}/liblzma-build/*.a") +file(GLOB_RECURSE BZIP2_LIBS "${DEPS_DIR}/bzip2-build/*.a") +file(GLOB_RECURSE JPEG_LIBS "${DEPS_DIR}/jpeg-build/*.a") +file(GLOB_RECURSE PNG_LIBS "${DEPS_DIR}/png-build/*.a") +file(GLOB_RECURSE TIFF_LIBS "${DEPS_DIR}/tiff-build/*.a") +file(GLOB_RECURSE AVIF_LIBS "${DEPS_DIR}/avif-build/*.a") +file(GLOB_RECURSE AOM_LIBS "${DEPS_DIR}/aom-build/*.a") +file(GLOB_RECURSE WEBP_LIBS "${DEPS_DIR}/webp-build/*.a") +file(GLOB_RECURSE CURL_LIBS "${DEPS_DIR}/curl-build/*.a") + +# Create executable +add_executable(read_structured_zarr3 read_structured_zarr3.cc) + +# Link libraries - use whole-archive for libraries that use static registration +# These include drivers, codecs, kvstores, and context resource providers +target_link_libraries(read_structured_zarr3 PRIVATE + # Force inclusion of libraries with static registrations + -Wl,--whole-archive + + # Context resource providers + ${TENSORSTORE_BUILD_DIR}/libtensorstore_internal_data_copy_concurrency_resource.a + ${TENSORSTORE_BUILD_DIR}/libtensorstore_internal_file_io_concurrency_resource.a + ${TENSORSTORE_BUILD_DIR}/libtensorstore_internal_cache_cache_pool_resource.a + ${TENSORSTORE_BUILD_DIR}/libtensorstore_internal_concurrency_resource.a + + # Zarr3 driver and codecs + ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_driver.a + ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_blosc.a + ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_bytes.a + ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_crc32c.a + ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_gzip.a + 
${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_transpose.a + ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_zstd.a + ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_sharding_indexed.a + ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_codec_chain_spec.a + + # File kvstore and its resource providers + ${TENSORSTORE_BUILD_DIR}/libtensorstore_kvstore_file.a + ${TENSORSTORE_BUILD_DIR}/libtensorstore_kvstore_file_file_resource.a + + -Wl,--no-whole-archive + + -Wl,--start-group + + # Tensorstore libs + ${TENSORSTORE_LIBS} + + # Riegeli + ${RIEGELI_LIBS} + + # Abseil + ${ABSEIL_LIBS} + + # Compression libs + ${BLOSC_LIBS} + ${ZSTD_LIBS} + ${LZ4_LIBS} + ${SNAPPY_LIBS} + ${BROTLI_LIBS} + ${ZLIB_LIBS} + ${LIBLZMA_LIBS} + ${BZIP2_LIBS} + + # Regex + ${RE2_LIBS} + + # Protocol buffers and gRPC + ${PROTOBUF_LIBS} + ${GRPC_LIBS} + ${CARES_LIBS} + + # SSL/TLS + ${SSL_LIBS} + ${CRYPTO_LIBS} + + # Image libraries + ${JPEG_LIBS} + ${PNG_LIBS} + ${TIFF_LIBS} + ${AVIF_LIBS} + ${AOM_LIBS} + ${WEBP_LIBS} + + # HTTP + ${CURL_LIBS} + + -Wl,--end-group + + # System libraries + pthread + dl + m + rt +) diff --git a/examples/read_structured_zarr3.cc b/examples/read_structured_zarr3.cc new file mode 100644 index 000000000..1caacd8f5 --- /dev/null +++ b/examples/read_structured_zarr3.cc @@ -0,0 +1,271 @@ +// Copyright 2024 The TensorStore Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Standalone test for reading structured data from a Zarr v3 array. +// +// This test opens an existing zarr3 array with structured data type, +// reads the "inline" field, and prints all values. +// +// Usage: +// bazel run //examples:read_structured_zarr3 -- /path/to/zarr/array +// +// Or with cmake: +// cd examples/build && ./read_structured_zarr3 + +#include + +#include +#include +#include +#include + +#include "absl/flags/flag.h" +#include "absl/flags/parse.h" +#include "absl/status/status.h" +#include +#include "tensorstore/array.h" +#include "tensorstore/context.h" +#include "tensorstore/data_type.h" +#include "tensorstore/index.h" +#include "tensorstore/open.h" +#include "tensorstore/open_mode.h" +#include "tensorstore/spec.h" +#include "tensorstore/tensorstore.h" +#include "tensorstore/util/result.h" +#include "tensorstore/util/status.h" + +ABSL_FLAG(std::string, zarr_path, + "/home/ubuntu/source/tensorstore/filt_mig.mdio/headers", + "Path to the zarr3 array directory"); + +namespace { + +using ::tensorstore::Index; + +// Field layout from the zarr.json metadata: +// The structured dtype has the following fields with their byte offsets: +// trace_seq_num_line: int32 @ 0 +// trace_seq_num_reel: int32 @ 4 +// ... (many more fields) ... 
+// inline: int32 @ 180 +// crossline: int32 @ 184 +// cdp_x: int32 @ 188 +// cdp_y: int32 @ 192 +// +// Total struct size: 196 bytes (matches blosc typesize) + +constexpr size_t kInlineFieldOffset = 180; +constexpr size_t kStructSize = 196; + +// Read and parse the zarr.json metadata to display info about structured type +void PrintZarrMetadata(const std::string& zarr_path) { + std::string metadata_path = zarr_path + "/zarr.json"; + std::ifstream file(metadata_path); + if (!file.is_open()) { + std::cerr << "Could not open " << metadata_path << std::endl; + return; + } + + nlohmann::json metadata; + try { + file >> metadata; + } catch (const nlohmann::json::parse_error& e) { + std::cerr << "Failed to parse zarr.json: " << e.what() << std::endl; + return; + } + + std::cout << "\n=== Zarr Metadata ===" << std::endl; + std::cout << "Shape: " << metadata["shape"].dump() << std::endl; + std::cout << "Dimension names: " << metadata["dimension_names"].dump() + << std::endl; + + if (metadata.contains("data_type")) { + auto& dt = metadata["data_type"]; + std::cout << "\nData type format:" << std::endl; + if (dt.is_object()) { + std::cout << " Type: object with name=\"" << dt["name"].get() + << "\"" << std::endl; + if (dt.contains("configuration") && + dt["configuration"].contains("fields")) { + auto& fields = dt["configuration"]["fields"]; + std::cout << " Number of fields: " << fields.size() << std::endl; + std::cout << " Fields:" << std::endl; + size_t byte_offset = 0; + for (const auto& field : fields) { + std::string name = field[0].get(); + std::string type = field[1].get(); + size_t size = (type == "int32" || type == "uint32" || type == "float32") + ? 
4 + : 2; // int16/uint16 + std::cout << " " << name << ": " << type << " @ byte " << byte_offset + << std::endl; + byte_offset += size; + } + std::cout << " Total struct size: " << byte_offset << " bytes" + << std::endl; + } + } else if (dt.is_string()) { + std::cout << " Type: simple \"" << dt.get() << "\"" + << std::endl; + } else if (dt.is_array()) { + std::cout << " Type: array with " << dt.size() << " fields" << std::endl; + } + } + + if (metadata.contains("codecs")) { + std::cout << "\nCodecs: " << metadata["codecs"].dump(2) << std::endl; + } +} + +absl::Status Run(const std::string& zarr_path) { + std::cout << "=== Zarr v3 Structured Data Type Test ===" << std::endl; + std::cout << "Opening zarr3 array at: " << zarr_path << std::endl; + + // First, display metadata information + PrintZarrMetadata(zarr_path); + + auto context = tensorstore::Context::Default(); + + // Create spec for opening the zarr3 array + // Note: "field" is at the driver level, not inside kvstore (same as zarr v2) + ::nlohmann::json spec_json = { + {"driver", "zarr3"}, + {"kvstore", + { + {"driver", "file"}, + {"path", zarr_path + "/"}, + }}, + {"field", "inline"}, // Field at byte offset 180 + }; + + std::cout << "\n=== Opening TensorStore ===" << std::endl; + std::cout << "Spec: " << spec_json.dump(2) << std::endl; + + // Open the TensorStore + auto open_result = + tensorstore::Open(spec_json, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read) + .result(); + + if (!open_result.ok()) { + std::cout << "\n=== Open Failed ===" << std::endl; + std::cout << "Status: " << open_result.status() << std::endl; + std::cout << "\nThis error is expected if the zarr3 driver's dtype parsing\n" + << "does not yet support the extended structured data type format:\n" + << " {\"name\": \"structured\", \"configuration\": {\"fields\": [...]}}\n" + << std::endl; + std::cout << "The dtype.cc ParseDTypeNoDerived() function currently handles:\n" + << " 1. 
String format: \"int32\"\n" + << " 2. Array format: [[\"field1\", \"int32\"], ...]\n" + << "\nBut the zarr.json uses the extended object format shown above." + << std::endl; + return open_result.status(); + } + + auto store = std::move(open_result).value(); + + // Get information about the array + auto domain = store.domain(); + std::cout << "\n=== Array Info ===" << std::endl; + std::cout << "Domain: " << domain << std::endl; + std::cout << "Dtype: " << store.dtype() << std::endl; + std::cout << "Rank: " << store.rank() << std::endl; + + auto shape = domain.shape(); + std::cout << "Shape: ["; + for (int i = 0; i < shape.size(); ++i) { + if (i > 0) std::cout << ", "; + std::cout << shape[i]; + } + std::cout << "]" << std::endl; + + // Read all data + std::cout << "\n=== Reading Data ===" << std::endl; + TENSORSTORE_ASSIGN_OR_RETURN( + auto array, tensorstore::Read(store).result()); + + std::cout << "Read complete. Array size: " << array.num_elements() + << " elements" << std::endl; + std::cout << "Data type: " << array.dtype() << std::endl; + + // Since field="inline" was specified, the array contains just int32 values + // directly - no struct extraction needed! 
+ Index num_inline = shape[0]; + Index num_crossline = shape[1]; + + std::cout << "\n=== Inline field values (shape: " << num_inline << " x " + << num_crossline << ") ===" << std::endl; + + // Cast to int32 pointer since the data is already the inline field values + auto int_ptr = reinterpret_cast(array.data()); + + // Print first 10 rows (or fewer if less data) + Index rows_to_print = std::min(num_inline, Index{10}); + Index cols_to_print = std::min(num_crossline, Index{10}); + + for (Index i = 0; i < rows_to_print; ++i) { + for (Index j = 0; j < cols_to_print; ++j) { + std::cout << int_ptr[i * num_crossline + j]; + if (j < cols_to_print - 1) { + std::cout << "\t"; + } + } + if (num_crossline > cols_to_print) { + std::cout << "\t..."; + } + std::cout << std::endl; + } + if (num_inline > rows_to_print) { + std::cout << "... (" << (num_inline - rows_to_print) << " more rows)" + << std::endl; + } + + std::cout << "\n=== Summary ===" << std::endl; + std::cout << "Successfully read " << (num_inline * num_crossline) + << " inline values" << std::endl; + + // Show some statistics + int32_t min_val = int_ptr[0], max_val = int_ptr[0]; + int64_t sum = 0; + for (Index i = 0; i < num_inline * num_crossline; ++i) { + min_val = std::min(min_val, int_ptr[i]); + max_val = std::max(max_val, int_ptr[i]); + sum += int_ptr[i]; + } + std::cout << "Min value: " << min_val << std::endl; + std::cout << "Max value: " << max_val << std::endl; + std::cout << "Mean value: " << (static_cast(sum) / (num_inline * num_crossline)) << std::endl; + + return absl::OkStatus(); +} + +} // namespace + +int main(int argc, char** argv) { + absl::ParseCommandLine(argc, argv); + + std::string zarr_path = absl::GetFlag(FLAGS_zarr_path); + if (zarr_path.empty()) { + std::cerr << "Error: --zarr_path is required" << std::endl; + return 1; + } + + auto status = Run(zarr_path); + if (!status.ok()) { + std::cerr << "\nFinal status: " << status << std::endl; + return 1; + } + + return 0; +} From 
9e8ed947f5912394ca715d36d6fd1eb630d04e8a Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 25 Nov 2025 18:12:58 +0000 Subject: [PATCH 04/59] Begin adding support for opening struct arrays as void and add support for raw bits dtype --- examples/read_structured_zarr3.cc | 324 +++++++++++++++++++----- tensorstore/driver/zarr3/chunk_cache.cc | 7 + tensorstore/driver/zarr3/driver.cc | 180 +++++++++++-- tensorstore/driver/zarr3/dtype.cc | 52 +++- tensorstore/driver/zarr3/dtype_test.cc | 14 + tensorstore/driver/zarr3/metadata.cc | 89 ++++++- 6 files changed, 565 insertions(+), 101 deletions(-) diff --git a/examples/read_structured_zarr3.cc b/examples/read_structured_zarr3.cc index 1caacd8f5..259eade34 100644 --- a/examples/read_structured_zarr3.cc +++ b/examples/read_structured_zarr3.cc @@ -12,16 +12,23 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Standalone test for reading structured data from a Zarr v3 array. +// Standalone test for reading structured data from Zarr v3 arrays. // -// This test opens an existing zarr3 array with structured data type, -// reads the "inline" field, and prints all values. +// This test opens two Zarr v3 arrays: +// 1. A structured array with named fields (headers/) +// 2. A raw bytes array containing struct data (raw_headers/) +// +// Both arrays should contain the same data, allowing comparison of: +// - Field-based access vs manual byte extraction +// - Structured dtype parsing vs raw byte handling // // Usage: -// bazel run //examples:read_structured_zarr3 -- /path/to/zarr/array +// bazel run //examples:read_structured_zarr3 -- /path/to/parent/dir // // Or with cmake: -// cd examples/build && ./read_structured_zarr3 +// cd examples/build && ./read_structured_zarr3 --zarr_path=/path/to/parent/dir +// +// Where the parent dir contains both 'headers/' and 'raw_headers/' subdirs. 
#include @@ -45,9 +52,15 @@ #include "tensorstore/util/result.h" #include "tensorstore/util/status.h" +// Internal headers for testing dtype parsing +#include "tensorstore/driver/zarr3/dtype.h" + +// Additional headers for string operations +#include "absl/strings/str_join.h" + ABSL_FLAG(std::string, zarr_path, - "/home/ubuntu/source/tensorstore/filt_mig.mdio/headers", - "Path to the zarr3 array directory"); + "/home/ubuntu/source/tensorstore/filt_mig.mdio", + "Path to the parent .mdio directory containing headers/ and raw_headers/"); namespace { @@ -128,56 +141,13 @@ void PrintZarrMetadata(const std::string& zarr_path) { } } -absl::Status Run(const std::string& zarr_path) { - std::cout << "=== Zarr v3 Structured Data Type Test ===" << std::endl; - std::cout << "Opening zarr3 array at: " << zarr_path << std::endl; - - // First, display metadata information - PrintZarrMetadata(zarr_path); - - auto context = tensorstore::Context::Default(); - - // Create spec for opening the zarr3 array - // Note: "field" is at the driver level, not inside kvstore (same as zarr v2) - ::nlohmann::json spec_json = { - {"driver", "zarr3"}, - {"kvstore", - { - {"driver", "file"}, - {"path", zarr_path + "/"}, - }}, - {"field", "inline"}, // Field at byte offset 180 - }; - - std::cout << "\n=== Opening TensorStore ===" << std::endl; - std::cout << "Spec: " << spec_json.dump(2) << std::endl; - - // Open the TensorStore - auto open_result = - tensorstore::Open(spec_json, context, tensorstore::OpenMode::open, - tensorstore::ReadWriteMode::read) - .result(); - - if (!open_result.ok()) { - std::cout << "\n=== Open Failed ===" << std::endl; - std::cout << "Status: " << open_result.status() << std::endl; - std::cout << "\nThis error is expected if the zarr3 driver's dtype parsing\n" - << "does not yet support the extended structured data type format:\n" - << " {\"name\": \"structured\", \"configuration\": {\"fields\": [...]}}\n" - << std::endl; - std::cout << "The dtype.cc ParseDTypeNoDerived() 
function currently handles:\n" - << " 1. String format: \"int32\"\n" - << " 2. Array format: [[\"field1\", \"int32\"], ...]\n" - << "\nBut the zarr.json uses the extended object format shown above." - << std::endl; - return open_result.status(); - } - - auto store = std::move(open_result).value(); - +// Helper function to read and display inline field from an array +absl::Status ReadInlineField(const tensorstore::TensorStore<>& store, + const std::string& array_name, + bool is_raw_bytes = false) { // Get information about the array auto domain = store.domain(); - std::cout << "\n=== Array Info ===" << std::endl; + std::cout << "\n=== " << array_name << " Array Info ===" << std::endl; std::cout << "Domain: " << domain << std::endl; std::cout << "Dtype: " << store.dtype() << std::endl; std::cout << "Rank: " << store.rank() << std::endl; @@ -191,7 +161,7 @@ absl::Status Run(const std::string& zarr_path) { std::cout << "]" << std::endl; // Read all data - std::cout << "\n=== Reading Data ===" << std::endl; + std::cout << "\n=== Reading " << array_name << " Data ===" << std::endl; TENSORSTORE_ASSIGN_OR_RETURN( auto array, tensorstore::Read(store).result()); @@ -199,16 +169,46 @@ absl::Status Run(const std::string& zarr_path) { << " elements" << std::endl; std::cout << "Data type: " << array.dtype() << std::endl; - // Since field="inline" was specified, the array contains just int32 values - // directly - no struct extraction needed! - Index num_inline = shape[0]; - Index num_crossline = shape[1]; + Index num_inline, num_crossline; + const int32_t* int_ptr; + + if (is_raw_bytes) { + // For raw bytes, we need to extract the inline field manually + // Shape is [inline, crossline, struct_size] + num_inline = shape[0]; + num_crossline = shape[1]; + Index struct_size = shape[2]; + if (struct_size != kStructSize) { + std::cout << "Warning: Raw struct size (" << struct_size + << ") differs from expected header struct size (" << kStructSize + << "). Assuming padding." 
<< std::endl; + } - std::cout << "\n=== Inline field values (shape: " << num_inline << " x " - << num_crossline << ") ===" << std::endl; + // Extract inline field (4 bytes starting at offset 180) + auto byte_ptr = reinterpret_cast(array.data()); + std::vector inline_values(num_inline * num_crossline); - // Cast to int32 pointer since the data is already the inline field values - auto int_ptr = reinterpret_cast(array.data()); + for (Index i = 0; i < num_inline; ++i) { + for (Index j = 0; j < num_crossline; ++j) { + Index struct_offset = (i * num_crossline + j) * struct_size; + Index field_offset = struct_offset + kInlineFieldOffset; + std::memcpy(&inline_values[i * num_crossline + j], + byte_ptr + field_offset, 4); + } + } + + std::cout << "Extracted inline field from raw bytes at offset " + << kInlineFieldOffset << std::endl; + int_ptr = inline_values.data(); + } else { + // For structured array, field access already gave us int32 values + num_inline = shape[0]; + num_crossline = shape[1]; + int_ptr = reinterpret_cast(array.data()); + } + + std::cout << "\n=== Inline field values from " << array_name + << " (shape: " << num_inline << " x " << num_crossline << ") ===" << std::endl; // Print first 10 rows (or fewer if less data) Index rows_to_print = std::min(num_inline, Index{10}); @@ -231,10 +231,10 @@ absl::Status Run(const std::string& zarr_path) { << std::endl; } - std::cout << "\n=== Summary ===" << std::endl; + std::cout << "\n=== " << array_name << " Summary ===" << std::endl; std::cout << "Successfully read " << (num_inline * num_crossline) << " inline values" << std::endl; - + // Show some statistics int32_t min_val = int_ptr[0], max_val = int_ptr[0]; int64_t sum = 0; @@ -250,6 +250,189 @@ absl::Status Run(const std::string& zarr_path) { return absl::OkStatus(); } +absl::Status Run(const std::string& zarr_path) { + std::cout << "=== Zarr v3 Structured Data Type Test ===" << std::endl; + std::cout << "Opening zarr3 arrays in: " << zarr_path << std::endl; + + 
auto context = tensorstore::Context::Default(); + + // First, display metadata information for structured array + std::string headers_path = zarr_path + "/headers"; + PrintZarrMetadata(headers_path); + + // Test raw_bytes parsing by reading and parsing the raw_headers zarr.json + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "TESTING RAW_BYTES PARSING" << std::endl; + std::cout << std::string(60, '=') << std::endl; + + std::string raw_metadata_path = zarr_path + "/raw_headers/zarr.json"; + std::ifstream raw_file(raw_metadata_path); + if (!raw_file.is_open()) { + std::cout << "Could not open " << raw_metadata_path << std::endl; + return absl::NotFoundError("Raw headers metadata not found"); + } + + nlohmann::json raw_metadata; + try { + raw_file >> raw_metadata; + } catch (const nlohmann::json::parse_error& e) { + std::cout << "Failed to parse raw zarr.json: " << e.what() << std::endl; + return absl::DataLossError("Invalid raw metadata JSON"); + } + + std::cout << "Raw headers data_type: " << raw_metadata["data_type"].dump(2) << std::endl; + + // Test parsing the raw_bytes data type + std::cout << "Testing raw_bytes dtype parsing..." 
<< std::endl; + + // For now, just verify the JSON structure is what we expect + if (!raw_metadata.contains("data_type")) { + std::cout << "FAILED: No data_type in metadata" << std::endl; + return absl::NotFoundError("Missing data_type"); + } + + auto& dt = raw_metadata["data_type"]; + if (!dt.is_object() || !dt.contains("name") || dt["name"] != "raw_bytes") { + std::cout << "FAILED: data_type is not raw_bytes extension" << std::endl; + return absl::InvalidArgumentError("Not raw_bytes extension"); + } + + if (!dt.contains("configuration") || !dt["configuration"].contains("length_bytes")) { + std::cout << "FAILED: Missing length_bytes in configuration" << std::endl; + return absl::InvalidArgumentError("Missing length_bytes"); + } + + int length_bytes = dt["configuration"]["length_bytes"]; + std::cout << "SUCCESS: Found raw_bytes extension with length_bytes = " << length_bytes << std::endl; + std::cout << "This should parse to:" << std::endl; + std::cout << " - Single field with byte_t dtype" << std::endl; + std::cout << " - Field shape: [" << length_bytes << "]" << std::endl; + std::cout << " - Bytes per outer element: " << length_bytes << std::endl; + + // Now actually test the parsing implementation + std::cout << "\n=== Testing ParseDType Implementation ===" << std::endl; + auto dtype_result = tensorstore::internal_zarr3::ParseDType(dt); + if (!dtype_result.ok()) { + std::cout << "FAILED: Could not parse raw_bytes data type: " << dtype_result.status() << std::endl; + return dtype_result.status(); + } + + auto dtype = std::move(dtype_result).value(); + std::cout << "SUCCESS: ParseDType worked!" 
<< std::endl; + std::cout << " Fields: " << dtype.fields.size() << std::endl; + std::cout << " Has fields: " << dtype.has_fields << std::endl; + std::cout << " Bytes per outer element: " << dtype.bytes_per_outer_element << std::endl; + + if (!dtype.fields.empty()) { + const auto& field = dtype.fields[0]; + std::cout << " Field name: '" << field.name << "'" << std::endl; + std::cout << " Field dtype: " << field.dtype << std::endl; + std::cout << " Field shape: [" << absl::StrJoin(field.field_shape, ", ") << "]" << std::endl; + std::cout << " Field num_inner_elements: " << field.num_inner_elements << std::endl; + std::cout << " Field num_bytes: " << field.num_bytes << std::endl; + } + + // Verify the parsing is correct + bool parsing_correct = true; + if (dtype.fields.size() != 1) { + std::cout << "ERROR: Expected 1 field, got " << dtype.fields.size() << std::endl; + parsing_correct = false; + } + if (dtype.fields[0].name != "") { + std::cout << "ERROR: Expected empty field name, got '" << dtype.fields[0].name << "'" << std::endl; + parsing_correct = false; + } + if (dtype.fields[0].dtype != tensorstore::dtype_v) { + std::cout << "ERROR: Expected byte_t dtype, got " << dtype.fields[0].dtype << std::endl; + parsing_correct = false; + } + if (dtype.fields[0].field_shape != std::vector{length_bytes}) { + std::cout << "ERROR: Expected field shape [" << length_bytes << "], got [" + << absl::StrJoin(dtype.fields[0].field_shape, ", ") << "]" << std::endl; + parsing_correct = false; + } + if (dtype.bytes_per_outer_element != length_bytes) { + std::cout << "ERROR: Expected " << length_bytes << " bytes per element, got " + << dtype.bytes_per_outer_element << std::endl; + parsing_correct = false; + } + + if (parsing_correct) { + std::cout << "\n✅ PARSING VERIFICATION: All checks passed!" << std::endl; + std::cout << "The raw_bytes extension is correctly parsed." << std::endl; + } else { + std::cout << "\n❌ PARSING VERIFICATION: Some checks failed!" 
<< std::endl; + return absl::InternalError("Parsing verification failed"); + } + + // Test 1: Read from structured array using field access + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "TEST 1: Reading from structured 'headers' array" << std::endl; + std::cout << std::string(60, '=') << std::endl; + + ::nlohmann::json headers_spec = ::nlohmann::json::object(); + headers_spec["driver"] = "zarr3"; + headers_spec["kvstore"] = ::nlohmann::json::object(); + headers_spec["kvstore"]["driver"] = "file"; + headers_spec["kvstore"]["path"] = headers_path + "/"; + headers_spec["field"] = "inline"; // Extract inline field (int32 at byte offset 180) + + std::cout << "Spec: " << headers_spec.dump(2) << std::endl; + + auto headers_open_result = + tensorstore::Open(headers_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read) + .result(); + + if (!headers_open_result.ok()) { + std::cout << "\n=== Headers Open Failed ===" << std::endl; + std::cout << "Status: " << headers_open_result.status() << std::endl; + return headers_open_result.status(); + } + + auto headers_store = std::move(headers_open_result).value(); + TENSORSTORE_RETURN_IF_ERROR(ReadInlineField(headers_store, "headers")); + + // Test 2: Read from raw bytes array (no special void access needed) + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "TEST 2: Reading from raw 'raw_headers' array" << std::endl; + std::cout << std::string(60, '=') << std::endl; + + std::string raw_headers_path = zarr_path + "/raw_headers"; + ::nlohmann::json raw_spec = ::nlohmann::json::object(); + raw_spec["driver"] = "zarr3"; + raw_spec["kvstore"] = ::nlohmann::json::object(); + raw_spec["kvstore"]["driver"] = "file"; + raw_spec["kvstore"]["path"] = raw_headers_path + "/"; + // No field specified - raw_bytes has a single anonymous field + + std::cout << "Spec: " << raw_spec.dump(2) << std::endl; + + auto raw_open_result = + tensorstore::Open(raw_spec, context, 
tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read) + .result(); + + if (!raw_open_result.ok()) { + std::cout << "\n=== Raw Headers Open Failed ===" << std::endl; + std::cout << "Status: " << raw_open_result.status() << std::endl; + return raw_open_result.status(); + } + + auto raw_store = std::move(raw_open_result).value(); + TENSORSTORE_RETURN_IF_ERROR(ReadInlineField(raw_store, "raw_headers", /*is_raw_bytes=*/true)); + + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "COMPARISON: Both methods should give identical inline field values" << std::endl; + std::cout << std::string(60, '=') << std::endl; + std::cout << "The structured 'headers' array provides field access convenience,\n" + << "while the raw 'raw_headers' array provides direct byte access.\n" + << "Both extract the inline field from byte offset " << kInlineFieldOffset + << " in " << kStructSize << "-byte structs." << std::endl; + + return absl::OkStatus(); +} + } // namespace int main(int argc, char** argv) { @@ -261,6 +444,15 @@ int main(int argc, char** argv) { return 1; } + // Verify the path structure + std::string headers_path = zarr_path + "/headers"; + std::string raw_headers_path = zarr_path + "/raw_headers"; + + std::cout << "Expecting arrays at:" << std::endl; + std::cout << " Structured: " << headers_path << std::endl; + std::cout << " Raw bytes: " << raw_headers_path << std::endl; + std::cout << std::endl; + auto status = Run(zarr_path); if (!status.ok()) { std::cerr << "\nFinal status: " << status << std::endl; diff --git a/tensorstore/driver/zarr3/chunk_cache.cc b/tensorstore/driver/zarr3/chunk_cache.cc index 6bfa8c039..64b6d69fd 100644 --- a/tensorstore/driver/zarr3/chunk_cache.cc +++ b/tensorstore/driver/zarr3/chunk_cache.cc @@ -156,6 +156,13 @@ ZarrLeafChunkCache::DecodeChunk(span chunk_indices, const size_t num_fields = dtype_.fields.size(); absl::InlinedVector, 1> field_arrays(num_fields); + // Special case: void access - return raw bytes directly 
+ if (num_fields == 1 && dtype_.fields[0].name == "") { + TENSORSTORE_ASSIGN_OR_RETURN( + field_arrays[0], codec_state_->DecodeArray(grid().components[0].shape(), + std::move(data))); + return field_arrays; + } // For single non-structured field, decode directly if (num_fields == 1 && dtype_.fields[0].outer_shape.empty()) { diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index 1674a1c6d..b4d96da1f 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -79,6 +80,8 @@ namespace tensorstore { namespace internal_zarr3 { +constexpr size_t kVoidFieldIndex = size_t(-1); + // Avoid anonymous namespace to workaround MSVC bug. // // https://developercommunity.visualstudio.com/t/Bug-involving-virtual-functions-templat/10424129 @@ -263,12 +266,29 @@ class DataCacheBase DimensionSet& implicit_lower_bounds, DimensionSet& implicit_upper_bounds) override { const auto& metadata = *static_cast(metadata_ptr); - assert(bounds.rank() == static_cast(metadata.shape.size())); - std::fill(bounds.origin().begin(), bounds.origin().end(), Index(0)); + assert(bounds.rank() >= static_cast(metadata.shape.size())); + std::fill(bounds.origin().begin(), + bounds.origin().begin() + metadata.shape.size(), Index(0)); std::copy(metadata.shape.begin(), metadata.shape.end(), bounds.shape().begin()); implicit_lower_bounds = false; - implicit_upper_bounds = true; + implicit_upper_bounds = false; + for (DimensionIndex i = 0; + i < static_cast(metadata.shape.size()); ++i) { + implicit_upper_bounds[i] = true; + } + if (bounds.rank() > static_cast(metadata.shape.size()) && + metadata.data_type.fields.size() == 1) { + const auto& field = metadata.data_type.fields[0]; + if (static_cast(metadata.shape.size() + + field.field_shape.size()) == + bounds.rank()) { + for (size_t i = 0; i < field.field_shape.size(); ++i) { + bounds.shape()[metadata.shape.size() + i] 
= field.field_shape[i]; + bounds.origin()[metadata.shape.size() + i] = 0; + } + } + } } Result> GetResizedMetadata( @@ -289,10 +309,47 @@ class DataCacheBase } static internal::ChunkGridSpecification GetChunkGridSpecification( - const ZarrMetadata& metadata) { + const ZarrMetadata& metadata, size_t field_index = 0) { assert(!metadata.fill_value.empty()); internal::ChunkGridSpecification::ComponentList components; + // Special case: void access - create single component for entire struct + if (field_index == kVoidFieldIndex) { + // For void access, use the fill_value from the single raw_bytes field + auto& fill_value = metadata.fill_value[0]; + std::cout << "[DEBUG] Void access fill_value: shape=" << fill_value.shape() + << ", dtype=" << fill_value.dtype() << std::endl; + + // Broadcast to shape [unbounded, unbounded, ..., struct_size] + std::vector target_shape(metadata.rank, kInfIndex); + target_shape.push_back(metadata.data_type.bytes_per_outer_element); + std::cout << "[DEBUG] Void access target_shape: ["; + for (size_t i = 0; i < target_shape.size(); ++i) { + if (i > 0) std::cout << ", "; + std::cout << target_shape[i]; + } + std::cout << "]" << std::endl; + auto chunk_fill_value = + BroadcastArray(fill_value, BoxView<>(target_shape)).value(); + + // Add extra dimension for struct size in bytes + std::vector chunk_shape_with_bytes = metadata.chunk_shape; + chunk_shape_with_bytes.push_back(metadata.data_type.bytes_per_outer_element); + + auto& component = components.emplace_back( + internal::AsyncWriteArray::Spec{ + std::move(chunk_fill_value), + // Since all dimensions are resizable, just + // specify unbounded `valid_data_bounds`. 
+ Box<>(metadata.rank + 1), + ContiguousLayoutPermutation<>( + span(metadata.inner_order.data(), metadata.rank + 1))}, + chunk_shape_with_bytes); + component.array_spec.fill_value_comparison_kind = + EqualityComparisonKind::identical; + return internal::ChunkGridSpecification(std::move(components)); + } + // Create one component per field (like zarr v2) for (size_t field_i = 0; field_i < metadata.data_type.fields.size(); ++field_i) { @@ -303,18 +360,47 @@ class DataCacheBase fill_value = AllocateArray(span{}, c_order, value_init, field.dtype); } + + // Handle fields with shape (e.g. raw_bytes) + const size_t field_rank = field.field_shape.size(); + + // 1. Construct target shape for broadcasting + std::vector target_shape(metadata.rank, kInfIndex); + target_shape.insert(target_shape.end(), field.field_shape.begin(), + field.field_shape.end()); + auto chunk_fill_value = - BroadcastArray(fill_value, BoxView<>(metadata.rank)).value(); + BroadcastArray(fill_value, BoxView<>(target_shape)).value(); + + // 2. Construct component chunk shape + std::vector component_chunk_shape = metadata.chunk_shape; + component_chunk_shape.insert(component_chunk_shape.end(), + field.field_shape.begin(), + field.field_shape.end()); + + // 3. Construct permutation + std::vector component_permutation(metadata.rank + + field_rank); + std::copy_n(metadata.inner_order.data(), metadata.rank, + component_permutation.begin()); + std::iota(component_permutation.begin() + metadata.rank, + component_permutation.end(), metadata.rank); + + // 4. Construct bounds + Box<> valid_data_bounds(metadata.rank + field_rank); + for (size_t i = 0; i < field_rank; ++i) { + valid_data_bounds[metadata.rank + i] = + IndexInterval::UncheckedSized(0, field.field_shape[i]); + } auto& component = components.emplace_back( internal::AsyncWriteArray::Spec{ std::move(chunk_fill_value), // Since all dimensions are resizable, just // specify unbounded `valid_data_bounds`. 
- Box<>(metadata.rank), - ContiguousLayoutPermutation<>( - span(metadata.inner_order.data(), metadata.rank))}, - metadata.chunk_shape); + std::move(valid_data_bounds), + ContiguousLayoutPermutation<>(component_permutation)}, + component_chunk_shape); component.array_spec.fill_value_comparison_kind = EqualityComparisonKind::identical; } @@ -342,7 +428,7 @@ class DataCacheBase [](std::string& out, DimensionIndex dim, Index grid_index) { absl::StrAppend(&out, grid_index); }, - rank, grid_indices); + rank, grid_indices.subspan(0, rank)); return key; } @@ -355,17 +441,21 @@ class DataCacheBase key_prefix_.size() + (metadata.chunk_key_encoding.kind == ChunkKeyEncoding::kDefault ? 2 : 0)); - return internal::ParseGridIndexKeyWithDimensionSeparator( - metadata.chunk_key_encoding.separator, - [](std::string_view part, DimensionIndex dim, Index& grid_index) { - if (part.empty() || !absl::ascii_isdigit(part.front()) || - !absl::ascii_isdigit(part.back()) || - !absl::SimpleAtoi(part, &grid_index)) { - return false; - } - return true; - }, - key, grid_indices); + if (!internal::ParseGridIndexKeyWithDimensionSeparator( + metadata.chunk_key_encoding.separator, + [](std::string_view part, DimensionIndex dim, Index& grid_index) { + if (part.empty() || !absl::ascii_isdigit(part.front()) || + !absl::ascii_isdigit(part.back()) || + !absl::SimpleAtoi(part, &grid_index)) { + return false; + } + return true; + }, + key, grid_indices.subspan(0, metadata.rank))) { + return false; + } + std::fill(grid_indices.begin() + metadata.rank, grid_indices.end(), 0); + return true; } Index MinGridIndexForLexicographicalOrder( @@ -378,7 +468,7 @@ class DataCacheBase *static_cast(initial_metadata().get()); if (metadata.chunk_key_encoding.kind == ChunkKeyEncoding::kDefault) { std::string key = tensorstore::StrCat(key_prefix_, "c"); - for (DimensionIndex i = 0; i < cell_indices.size(); ++i) { + for (DimensionIndex i = 0; i < metadata.rank; ++i) { tensorstore::StrAppend( &key, 
std::string_view(&metadata.chunk_key_encoding.separator, 1), cell_indices[i]); @@ -388,7 +478,7 @@ class DataCacheBase // Use "0" for rank 0 as a special case. std::string key = tensorstore::StrCat( key_prefix_, cell_indices.empty() ? 0 : cell_indices[0]); - for (DimensionIndex i = 1; i < cell_indices.size(); ++i) { + for (DimensionIndex i = 1; i < metadata.rank; ++i) { tensorstore::StrAppend( &key, std::string_view(&metadata.chunk_key_encoding.separator, 1), cell_indices[i]); @@ -400,7 +490,11 @@ class DataCacheBase const void* metadata_ptr, size_t component_index) override { // component_index corresponds to the selected field index const auto& metadata = *static_cast(metadata_ptr); + const auto& field = metadata.data_type.fields[component_index]; const DimensionIndex rank = metadata.rank; + const DimensionIndex field_rank = field.field_shape.size(); + const DimensionIndex total_rank = rank + field_rank; + std::string_view normalized_dimension_names[kMaxRank]; for (DimensionIndex i = 0; i < rank; ++i) { if (const auto& name = metadata.dimension_names[i]; name.has_value()) { @@ -408,11 +502,20 @@ class DataCacheBase } } auto builder = - tensorstore::IndexTransformBuilder<>(rank, rank) - .input_shape(metadata.shape) - .input_labels(span(&normalized_dimension_names[0], rank)); - builder.implicit_upper_bounds(true); + tensorstore::IndexTransformBuilder<>(total_rank, total_rank); + std::vector full_shape = metadata.shape; + full_shape.insert(full_shape.end(), field.field_shape.begin(), + field.field_shape.end()); + builder.input_shape(full_shape); + builder.input_labels(span(&normalized_dimension_names[0], total_rank)); + + DimensionSet implicit_upper_bounds(false); for (DimensionIndex i = 0; i < rank; ++i) { + implicit_upper_bounds[i] = true; + } + builder.implicit_upper_bounds(implicit_upper_bounds); + + for (DimensionIndex i = 0; i < total_rank; ++i) { builder.output_single_input_dimension(i, i); } return builder.Finalize(); @@ -643,9 +746,26 @@ class 
ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { DataCacheInitializer&& initializer) override { const auto& metadata = *static_cast(initializer.metadata.get()); + // For void access, modify the dtype to indicate special handling + ZarrDType dtype = metadata.data_type; + if (spec().selected_field == "") { + // Create a synthetic dtype for void access + dtype = ZarrDType{ + /*.has_fields=*/false, + /*.fields=*/{ZarrDType::Field{ + ZarrDType::BaseDType{"", dtype_v, + {metadata.data_type.bytes_per_outer_element}}, + /*.outer_shape=*/{}, + /*.name=*/"", + /*.field_shape=*/{metadata.data_type.bytes_per_outer_element}, + /*.num_inner_elements=*/metadata.data_type.bytes_per_outer_element, + /*.byte_offset=*/0, + /*.num_bytes=*/metadata.data_type.bytes_per_outer_element}}, + /*.bytes_per_outer_element=*/metadata.data_type.bytes_per_outer_element}; + } return internal_zarr3::MakeZarrChunkCache( *metadata.codecs, std::move(initializer), spec().store.path, - metadata.codec_state, metadata.data_type, + metadata.codec_state, dtype, /*data_cache_pool=*/*cache_pool()); } @@ -657,6 +777,10 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { TENSORSTORE_ASSIGN_OR_RETURN( auto field_index, GetFieldIndex(metadata.data_type, spec().selected_field)); + // For void access, map to component index 0 + if (field_index == kVoidFieldIndex) { + field_index = 0; + } TENSORSTORE_RETURN_IF_ERROR( ValidateMetadataSchema(metadata, field_index, spec().schema)); return field_index; diff --git a/tensorstore/driver/zarr3/dtype.cc b/tensorstore/driver/zarr3/dtype.cc index 281b9c98b..116712d70 100644 --- a/tensorstore/driver/zarr3/dtype.cc +++ b/tensorstore/driver/zarr3/dtype.cc @@ -19,6 +19,7 @@ #include #include "absl/base/optimization.h" +#include "absl/strings/ascii.h" #include "tensorstore/data_type.h" #include "tensorstore/internal/json_binding/json_binding.h" #include "tensorstore/util/endian.h" @@ -57,9 +58,26 @@ Result ParseBaseDType(std::string_view dtype) { if (dtype 
== "complex128") return make_dtype(dtype_v<::tensorstore::dtypes::complex128_t>); + // Handle r raw bits type where N is number of bits (must be multiple of 8) + if (dtype.size() > 1 && dtype[0] == 'r' && absl::ascii_isdigit(dtype[1])) { + std::string_view suffix = dtype.substr(1); + Index num_bits = 0; + if (!absl::SimpleAtoi(suffix, &num_bits) || + num_bits == 0 || + num_bits % 8 != 0) { + return absl::InvalidArgumentError(tensorstore::StrCat( + dtype, " data type is invalid; expected r where N is a positive " + "multiple of 8")); + } + Index num_bytes = num_bits / 8; + return ZarrDType::BaseDType{std::string(dtype), + dtype_v<::tensorstore::dtypes::byte_t>, + {num_bytes}}; + } + constexpr std::string_view kSupported = "bool, uint8, uint16, uint32, uint64, int8, int16, int32, int64, " - "bfloat16, float16, float32, float64, complex64, complex128"; + "bfloat16, float16, float32, float64, complex64, complex128, r"; return absl::InvalidArgumentError( tensorstore::StrCat(dtype, " data type is not one of the supported " "data types: ", @@ -162,6 +180,34 @@ Result ParseDTypeNoDerived(const nlohmann::json& value) { TENSORSTORE_RETURN_IF_ERROR(ParseFieldsArray(config["fields"], out)); return out; } + if (type_name == "raw_bytes") { + const auto& config = value["configuration"]; + if (!config.is_object() || !config.contains("length_bytes")) { + return absl::InvalidArgumentError( + "raw_bytes data type requires 'configuration' object with " + "'length_bytes' field"); + } + Index length_bytes; + TENSORSTORE_RETURN_IF_ERROR( + internal_json::JsonRequireValueAs(config["length_bytes"], &length_bytes)); + if (length_bytes <= 0) { + return absl::InvalidArgumentError( + "raw_bytes length_bytes must be positive"); + } + out.has_fields = false; + out.fields.resize(1); + out.fields[0].encoded_dtype = "raw_bytes"; + out.fields[0].dtype = dtype_v; + out.fields[0].flexible_shape = {length_bytes}; + out.fields[0].outer_shape = {}; + out.fields[0].name = ""; + out.fields[0].field_shape = 
{length_bytes}; + out.fields[0].num_inner_elements = length_bytes; + out.fields[0].byte_offset = 0; + out.fields[0].num_bytes = length_bytes; + out.bytes_per_outer_element = length_bytes; + return out; + } // For other named types, try to parse as a base dtype out.has_fields = false; out.fields.resize(1); @@ -326,6 +372,10 @@ Result ChooseBaseDType(DataType dtype) { return MakeBaseDType("complex64", dtype); if (dtype == dtype_v<::tensorstore::dtypes::complex128_t>) return MakeBaseDType("complex128", dtype); + if (dtype == dtype_v<::tensorstore::dtypes::byte_t>) + return MakeBaseDType("r8", dtype); + if (dtype == dtype_v<::tensorstore::dtypes::char_t>) + return MakeBaseDType("r8", dtype); return absl::InvalidArgumentError( tensorstore::StrCat("Data type not supported: ", dtype)); } diff --git a/tensorstore/driver/zarr3/dtype_test.cc b/tensorstore/driver/zarr3/dtype_test.cc index cbb7acbfb..e1c5b444c 100644 --- a/tensorstore/driver/zarr3/dtype_test.cc +++ b/tensorstore/driver/zarr3/dtype_test.cc @@ -68,6 +68,9 @@ TEST(ParseBaseDType, Success) { CheckBaseDType("float64", dtype_v, {}); CheckBaseDType("complex64", dtype_v, {}); CheckBaseDType("complex128", dtype_v, {}); + CheckBaseDType("r8", dtype_v, {1}); + CheckBaseDType("r16", dtype_v, {2}); + CheckBaseDType("r64", dtype_v, {8}); } TEST(ParseBaseDType, Failure) { @@ -81,6 +84,15 @@ TEST(ParseBaseDType, Failure) { StatusIs(absl::StatusCode::kInvalidArgument)); EXPECT_THAT(ParseBaseDType(""))); + EXPECT_THAT(ParseBaseDType("r7"), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("data type is invalid; expected r"))); + EXPECT_THAT(ParseBaseDType("r0"), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("data type is invalid; expected r"))); } void CheckDType(const ::nlohmann::json& json, const ZarrDType& expected) { @@ -266,6 +278,8 @@ TEST(ChooseBaseDTypeTest, RoundTrip) { dtype_v, dtype_v, dtype_v, + dtype_v, + dtype_v, }; for (auto dtype : kSupportedDataTypes) { 
SCOPED_TRACE(tensorstore::StrCat("dtype=", dtype)); diff --git a/tensorstore/driver/zarr3/metadata.cc b/tensorstore/driver/zarr3/metadata.cc index 880991e8c..6a83cdbec 100644 --- a/tensorstore/driver/zarr3/metadata.cc +++ b/tensorstore/driver/zarr3/metadata.cc @@ -250,6 +250,10 @@ constexpr std::array FillValueDataTypeFunctions::Make<::tensorstore::dtypes::T>(); \ /**/ TENSORSTORE_ZARR3_FOR_EACH_DATA_TYPE(TENSORSTORE_INTERNAL_DO_DEF) + // Add char_t support for string data types + functions[static_cast(DataTypeId::char_t)] = + FillValueDataTypeFunctions::Make<::tensorstore::dtypes::char_t>(); + // byte_t is handled specially to use uint8_t functions #undef TENSORSTORE_INTERNAL_DO_DEF return functions; }(); @@ -282,8 +286,39 @@ absl::Status FillValueJsonBinder::operator()( std::vector>* obj, ::nlohmann::json* j) const { obj->resize(dtype.fields.size()); if (dtype.fields.size() == 1) { - TENSORSTORE_RETURN_IF_ERROR( - DecodeSingle(*j, dtype.fields[0].dtype, (*obj)[0])); + // Special case: raw_bytes (single field with byte_t and flexible shape) + if (dtype.fields[0].dtype.id() == DataTypeId::byte_t && + !dtype.fields[0].flexible_shape.empty()) { + // Handle base64-encoded fill value for raw_bytes + if (!j->is_string()) { + return absl::InvalidArgumentError( + "Expected base64-encoded string for raw_bytes fill_value"); + } + std::string b64_decoded; + if (!absl::Base64Unescape(j->get(), &b64_decoded)) { + return absl::InvalidArgumentError(tensorstore::StrCat( + "Expected valid base64-encoded fill value, but received: ", + j->dump())); + } + // Verify size matches expected byte array size + Index expected_size = dtype.fields[0].num_inner_elements; + if (static_cast(b64_decoded.size()) != expected_size) { + return absl::InvalidArgumentError(tensorstore::StrCat( + "Expected ", expected_size, + " base64-encoded bytes for fill_value, but received ", + b64_decoded.size(), " bytes")); + } + // Create fill value array + auto fill_arr = 
AllocateArray(dtype.fields[0].field_shape, c_order, + default_init, dtype.fields[0].dtype); + std::memcpy(fill_arr.data(), b64_decoded.data(), b64_decoded.size()); + std::cout << "[DEBUG] Raw bytes fill_value parsed: shape=" << fill_arr.shape() + << ", dtype=" << dtype.fields[0].dtype << std::endl; + (*obj)[0] = std::move(fill_arr); + } else { + TENSORSTORE_RETURN_IF_ERROR( + DecodeSingle(*j, dtype.fields[0].dtype, (*obj)[0])); + } } else { // For structured types, handle both array format and base64-encoded string if (j->is_string()) { @@ -361,8 +396,14 @@ absl::Status FillValueJsonBinder::DecodeSingle(::nlohmann::json& j, AllocateArray(span{}, c_order, default_init, data_type); void* data = arr.data(); out = std::move(arr); + // Special handling for byte_t: use uint8_t functions since they're binary compatible + auto type_id = data_type.id(); + if (type_id == DataTypeId::byte_t) { + type_id = DataTypeId::uint8_t; + } + const auto& functions = - kFillValueDataTypeFunctions[static_cast(data_type.id())]; + kFillValueDataTypeFunctions[static_cast(type_id)]; if (!functions.decode) { if (allow_missing_dtype) { out = SharedArray(); @@ -381,8 +422,14 @@ absl::Status FillValueJsonBinder::EncodeSingle( return absl::InvalidArgumentError( "data_type must be specified before fill_value"); } + // Special handling for byte_t: use uint8_t functions since they're binary compatible + auto type_id = data_type.id(); + if (type_id == DataTypeId::byte_t) { + type_id = DataTypeId::uint8_t; + } + const auto& functions = - kFillValueDataTypeFunctions[static_cast(data_type.id())]; + kFillValueDataTypeFunctions[static_cast(type_id)]; if (!functions.encode) { return absl::FailedPreconditionError( "fill_value unsupported for specified data_type"); @@ -751,8 +798,19 @@ std::string GetFieldNames(const ZarrDType& dtype) { } } // namespace +constexpr size_t kVoidFieldIndex = size_t(-1); + Result GetFieldIndex(const ZarrDType& dtype, std::string_view selected_field) { + // Special case: "" 
requests raw byte access (works for any dtype) + if (selected_field == "") { + if (dtype.fields.empty()) { + return absl::FailedPreconditionError( + "Requested field \"\" but dtype has no fields"); + } + return kVoidFieldIndex; + } + if (selected_field.empty()) { if (dtype.fields.size() != 1) { return absl::FailedPreconditionError(tensorstore::StrCat( @@ -779,6 +837,9 @@ SpecRankAndFieldInfo GetSpecRankAndFieldInfo(const ZarrMetadata& metadata, SpecRankAndFieldInfo info; info.chunked_rank = metadata.rank; info.field = &metadata.data_type.fields[field_index]; + if (!info.field->field_shape.empty()) { + info.chunked_rank += info.field->field_shape.size(); + } return info; } @@ -798,8 +859,24 @@ Result> GetEffectiveDomain( assert(RankConstraint::EqualOrUnspecified(schema.rank(), rank)); IndexDomainBuilder builder(std::max(schema.rank().rank, rank)); if (metadata_shape) { - builder.shape(*metadata_shape); - builder.implicit_upper_bounds(true); + if (static_cast(metadata_shape->size()) < rank && + info.field && !info.field->field_shape.empty() && + static_cast(metadata_shape->size() + + info.field->field_shape.size()) == rank) { + std::vector full_shape(metadata_shape->begin(), + metadata_shape->end()); + full_shape.insert(full_shape.end(), info.field->field_shape.begin(), + info.field->field_shape.end()); + builder.shape(full_shape); + DimensionSet implicit_upper_bounds(false); + for (size_t i = 0; i < metadata_shape->size(); ++i) { + implicit_upper_bounds[i] = true; + } + builder.implicit_upper_bounds(implicit_upper_bounds); + } else { + builder.shape(*metadata_shape); + builder.implicit_upper_bounds(true); + } } else { builder.origin(GetConstantVector(builder.rank())); } From 44c765ec04e0492cd8ba9aa9f5b43cf97834359b Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 25 Nov 2025 18:28:09 +0000 Subject: [PATCH 05/59] Fix failing tests --- tensorstore/driver/zarr3/dtype.cc | 26 ++++++++++++++++++++++---- tensorstore/driver/zarr3/dtype_test.cc | 9 +++++++-- 2 files 
changed, 29 insertions(+), 6 deletions(-) diff --git a/tensorstore/driver/zarr3/dtype.cc b/tensorstore/driver/zarr3/dtype.cc index 116712d70..5b3261812 100644 --- a/tensorstore/driver/zarr3/dtype.cc +++ b/tensorstore/driver/zarr3/dtype.cc @@ -75,6 +75,13 @@ Result ParseBaseDType(std::string_view dtype) { {num_bytes}}; } + // Handle bare "r" - must have a number after it + if (dtype.size() >= 1 && dtype[0] == 'r') { + return absl::InvalidArgumentError(tensorstore::StrCat( + dtype, " data type is invalid; expected r where N is a positive " + "multiple of 8")); + } + constexpr std::string_view kSupported = "bool, uint8, uint16, uint32, uint64, int8, int16, int32, int64, " "bfloat16, float16, float32, float64, complex64, complex128, r"; @@ -372,10 +379,21 @@ Result ChooseBaseDType(DataType dtype) { return MakeBaseDType("complex64", dtype); if (dtype == dtype_v<::tensorstore::dtypes::complex128_t>) return MakeBaseDType("complex128", dtype); - if (dtype == dtype_v<::tensorstore::dtypes::byte_t>) - return MakeBaseDType("r8", dtype); - if (dtype == dtype_v<::tensorstore::dtypes::char_t>) - return MakeBaseDType("r8", dtype); + if (dtype == dtype_v<::tensorstore::dtypes::byte_t>) { + ZarrDType::BaseDType base_dtype; + base_dtype.dtype = dtype; + base_dtype.encoded_dtype = "r8"; + base_dtype.flexible_shape = {1}; + return base_dtype; + } + if (dtype == dtype_v<::tensorstore::dtypes::char_t>) { + // char_t encodes as r8, which parses back to byte_t + ZarrDType::BaseDType base_dtype; + base_dtype.dtype = dtype_v<::tensorstore::dtypes::byte_t>; + base_dtype.encoded_dtype = "r8"; + base_dtype.flexible_shape = {1}; + return base_dtype; + } return absl::InvalidArgumentError( tensorstore::StrCat("Data type not supported: ", dtype)); } diff --git a/tensorstore/driver/zarr3/dtype_test.cc b/tensorstore/driver/zarr3/dtype_test.cc index e1c5b444c..ef55aba09 100644 --- a/tensorstore/driver/zarr3/dtype_test.cc +++ b/tensorstore/driver/zarr3/dtype_test.cc @@ -285,10 +285,15 @@ 
TEST(ChooseBaseDTypeTest, RoundTrip) { SCOPED_TRACE(tensorstore::StrCat("dtype=", dtype)); TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto base_zarr_dtype, ChooseBaseDType(dtype)); - EXPECT_EQ(dtype, base_zarr_dtype.dtype); + // byte_t and char_t both encode as r8, which parses back to byte_t + DataType expected_dtype = dtype; + if (dtype == dtype_v) { + expected_dtype = dtype_v; + } + EXPECT_EQ(expected_dtype, base_zarr_dtype.dtype); TENSORSTORE_ASSERT_OK_AND_ASSIGN( auto parsed, ParseBaseDType(base_zarr_dtype.encoded_dtype)); - EXPECT_EQ(dtype, parsed.dtype); + EXPECT_EQ(expected_dtype, parsed.dtype); EXPECT_EQ(base_zarr_dtype.flexible_shape, parsed.flexible_shape); EXPECT_EQ(base_zarr_dtype.encoded_dtype, parsed.encoded_dtype); } From 547642d819aa5ac878300530e9d049018de27db8 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 25 Nov 2025 20:10:09 +0000 Subject: [PATCH 06/59] Resolve issues with opening struct as void --- examples/read_structured_zarr3.cc | 40 ++++++++++++-- tensorstore/driver/zarr3/driver.cc | 83 ++++++++++++++++++++++++------ 2 files changed, 104 insertions(+), 19 deletions(-) diff --git a/examples/read_structured_zarr3.cc b/examples/read_structured_zarr3.cc index 259eade34..bf12ced1b 100644 --- a/examples/read_structured_zarr3.cc +++ b/examples/read_structured_zarr3.cc @@ -422,12 +422,44 @@ absl::Status Run(const std::string& zarr_path) { auto raw_store = std::move(raw_open_result).value(); TENSORSTORE_RETURN_IF_ERROR(ReadInlineField(raw_store, "raw_headers", /*is_raw_bytes=*/true)); + // Test 3: Read from headers array as void (field="") + // Use a fresh context to avoid cache sharing with Test 1 std::cout << "\n" << std::string(60, '=') << std::endl; - std::cout << "COMPARISON: Both methods should give identical inline field values" << std::endl; + std::cout << "TEST 3: Reading from 'headers' array as void (field=\"\")" << std::endl; std::cout << std::string(60, '=') << std::endl; - std::cout << "The structured 'headers' array provides field access 
convenience,\n" - << "while the raw 'raw_headers' array provides direct byte access.\n" - << "Both extract the inline field from byte offset " << kInlineFieldOffset + + auto context_void = tensorstore::Context::Default(); + + ::nlohmann::json headers_void_spec = ::nlohmann::json::object(); + headers_void_spec["driver"] = "zarr3"; + headers_void_spec["kvstore"] = ::nlohmann::json::object(); + headers_void_spec["kvstore"]["driver"] = "file"; + headers_void_spec["kvstore"]["path"] = headers_path + "/"; + headers_void_spec["field"] = ""; // Special field for raw byte access + + std::cout << "Spec: " << headers_void_spec.dump(2) << std::endl; + + auto headers_void_open_result = + tensorstore::Open(headers_void_spec, context_void, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read) + .result(); + + if (!headers_void_open_result.ok()) { + std::cout << "\n=== Headers (void) Open Failed ===" << std::endl; + std::cout << "Status: " << headers_void_open_result.status() << std::endl; + return headers_void_open_result.status(); + } + + auto headers_void_store = std::move(headers_void_open_result).value(); + TENSORSTORE_RETURN_IF_ERROR(ReadInlineField(headers_void_store, "headers (void)", /*is_raw_bytes=*/true)); + + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "COMPARISON: All three methods should give identical inline field values" << std::endl; + std::cout << std::string(60, '=') << std::endl; + std::cout << "- Test 1: 'headers' with field=\"inline\" provides field access convenience\n" + << "- Test 2: 'raw_headers' (raw_bytes type) provides direct byte access\n" + << "- Test 3: 'headers' with field=\"\" provides raw byte access to structured data\n" + << "All three extract the inline field from byte offset " << kInlineFieldOffset << " in " << kStructSize << "-byte structs." 
<< std::endl; return absl::OkStatus(); diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index b4d96da1f..bed1171d2 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -315,26 +315,27 @@ class DataCacheBase // Special case: void access - create single component for entire struct if (field_index == kVoidFieldIndex) { - // For void access, use the fill_value from the single raw_bytes field - auto& fill_value = metadata.fill_value[0]; - std::cout << "[DEBUG] Void access fill_value: shape=" << fill_value.shape() - << ", dtype=" << fill_value.dtype() << std::endl; + // For void access, create a zero-filled byte array as the fill value + const Index bytes_per_element = metadata.data_type.bytes_per_outer_element; + auto base_fill_value = AllocateArray( + span({bytes_per_element}), c_order, value_init, + dtype_v); // Broadcast to shape [unbounded, unbounded, ..., struct_size] std::vector target_shape(metadata.rank, kInfIndex); - target_shape.push_back(metadata.data_type.bytes_per_outer_element); - std::cout << "[DEBUG] Void access target_shape: ["; - for (size_t i = 0; i < target_shape.size(); ++i) { - if (i > 0) std::cout << ", "; - std::cout << target_shape[i]; - } - std::cout << "]" << std::endl; + target_shape.push_back(bytes_per_element); auto chunk_fill_value = - BroadcastArray(fill_value, BoxView<>(target_shape)).value(); + BroadcastArray(base_fill_value, BoxView<>(target_shape)).value(); // Add extra dimension for struct size in bytes std::vector chunk_shape_with_bytes = metadata.chunk_shape; - chunk_shape_with_bytes.push_back(metadata.data_type.bytes_per_outer_element); + chunk_shape_with_bytes.push_back(bytes_per_element); + + // Create permutation: copy existing inner_order and add the new dimension + std::vector void_permutation(metadata.rank + 1); + std::copy_n(metadata.inner_order.data(), metadata.rank, + void_permutation.begin()); + void_permutation[metadata.rank] = metadata.rank; // 
Add the bytes dimension auto& component = components.emplace_back( internal::AsyncWriteArray::Spec{ @@ -343,7 +344,7 @@ class DataCacheBase // specify unbounded `valid_data_bounds`. Box<>(metadata.rank + 1), ContiguousLayoutPermutation<>( - span(metadata.inner_order.data(), metadata.rank + 1))}, + span(void_permutation.data(), metadata.rank + 1))}, chunk_shape_with_bytes); component.array_spec.fill_value_comparison_kind = EqualityComparisonKind::identical; @@ -570,7 +571,13 @@ class ZarrDataCache : public ChunkCacheImpl, public DataCacheBase { std::string key_prefix, U&&... arg) : ChunkCacheImpl(std::move(initializer.store), std::forward(arg)...), DataCacheBase(std::move(initializer), std::move(key_prefix)), - grid_(DataCacheBase::GetChunkGridSpecification(metadata())) {} + grid_(DataCacheBase::GetChunkGridSpecification( + metadata(), + // Check if this is void access by examining the dtype + (ChunkCacheImpl::dtype_.fields.size() == 1 && + ChunkCacheImpl::dtype_.fields[0].name == "") + ? 
kVoidFieldIndex + : 0)) {} const internal::LexicographicalGridIndexKeyParser& GetChunkStorageKeyParser() final { @@ -596,6 +603,52 @@ class ZarrDataCache : public ChunkCacheImpl, public DataCacheBase { return DataCacheBase::executor(); } + // Override to handle void access - check the dtype to see if this is void + Result> GetExternalToInternalTransform( + const void* metadata_ptr, size_t component_index) override { + const auto& metadata = *static_cast(metadata_ptr); + + // Check if this is void access by examining the cache's dtype + const bool is_void_access = (ChunkCacheImpl::dtype_.fields.size() == 1 && + ChunkCacheImpl::dtype_.fields[0].name == ""); + + if (is_void_access) { + // For void access, create transform with extra bytes dimension + const DimensionIndex rank = metadata.rank; + const Index bytes_per_element = metadata.data_type.bytes_per_outer_element; + const DimensionIndex total_rank = rank + 1; + + std::string_view normalized_dimension_names[kMaxRank]; + for (DimensionIndex i = 0; i < rank; ++i) { + if (const auto& name = metadata.dimension_names[i]; name.has_value()) { + normalized_dimension_names[i] = *name; + } + } + + auto builder = + tensorstore::IndexTransformBuilder<>(total_rank, total_rank); + std::vector full_shape = metadata.shape; + full_shape.push_back(bytes_per_element); + builder.input_shape(full_shape); + builder.input_labels(span(&normalized_dimension_names[0], total_rank)); + + DimensionSet implicit_upper_bounds(false); + for (DimensionIndex i = 0; i < rank; ++i) { + implicit_upper_bounds[i] = true; + } + builder.implicit_upper_bounds(implicit_upper_bounds); + + for (DimensionIndex i = 0; i < total_rank; ++i) { + builder.output_single_input_dimension(i, i); + } + return builder.Finalize(); + } + + // Not void access - delegate to base implementation + return DataCacheBase::GetExternalToInternalTransform(metadata_ptr, + component_index); + } + internal::ChunkGridSpecification grid_; }; From 2a4c3d852e0f38b5601dd43482ae878d86a6d7b6 
Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 26 Nov 2025 15:03:55 +0000 Subject: [PATCH 07/59] Remove debug print --- tensorstore/driver/zarr3/metadata.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/tensorstore/driver/zarr3/metadata.cc b/tensorstore/driver/zarr3/metadata.cc index 6a83cdbec..9aef7bd0b 100644 --- a/tensorstore/driver/zarr3/metadata.cc +++ b/tensorstore/driver/zarr3/metadata.cc @@ -312,8 +312,6 @@ absl::Status FillValueJsonBinder::operator()( auto fill_arr = AllocateArray(dtype.fields[0].field_shape, c_order, default_init, dtype.fields[0].dtype); std::memcpy(fill_arr.data(), b64_decoded.data(), b64_decoded.size()); - std::cout << "[DEBUG] Raw bytes fill_value parsed: shape=" << fill_arr.shape() - << ", dtype=" << dtype.fields[0].dtype << std::endl; (*obj)[0] = std::move(fill_arr); } else { TENSORSTORE_RETURN_IF_ERROR( From b0abb94070f7be7337e7a30b90802ee8617801dd Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 2 Dec 2025 22:01:10 +0000 Subject: [PATCH 08/59] Add field for open as void --- .gitignore | 5 +++++ examples/read_structured_zarr3.cc | 11 ++++++----- tensorstore/driver/zarr3/driver.cc | 31 +++++++++++++++++++----------- 3 files changed, 31 insertions(+), 16 deletions(-) diff --git a/.gitignore b/.gitignore index e4737363c..7c75044c5 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,8 @@ __pycache__ *.pyc /python/tensorstore/*.so /python/tensorstore/*.pyd + +build/ +bootstrap.sh +filt_mig.mdio +generate_test.py \ No newline at end of file diff --git a/examples/read_structured_zarr3.cc b/examples/read_structured_zarr3.cc index bf12ced1b..720ef1330 100644 --- a/examples/read_structured_zarr3.cc +++ b/examples/read_structured_zarr3.cc @@ -21,6 +21,7 @@ // Both arrays should contain the same data, allowing comparison of: // - Field-based access vs manual byte extraction // - Structured dtype parsing vs raw byte handling +// - New open_as_void option for raw byte access to structured data // // Usage: // bazel run 
//examples:read_structured_zarr3 -- /path/to/parent/dir @@ -422,10 +423,10 @@ absl::Status Run(const std::string& zarr_path) { auto raw_store = std::move(raw_open_result).value(); TENSORSTORE_RETURN_IF_ERROR(ReadInlineField(raw_store, "raw_headers", /*is_raw_bytes=*/true)); - // Test 3: Read from headers array as void (field="") + // Test 3: Read from headers array as void (open_as_void=true) // Use a fresh context to avoid cache sharing with Test 1 std::cout << "\n" << std::string(60, '=') << std::endl; - std::cout << "TEST 3: Reading from 'headers' array as void (field=\"\")" << std::endl; + std::cout << "TEST 3: Reading from 'headers' array as void (open_as_void=true)" << std::endl; std::cout << std::string(60, '=') << std::endl; auto context_void = tensorstore::Context::Default(); @@ -435,7 +436,7 @@ absl::Status Run(const std::string& zarr_path) { headers_void_spec["kvstore"] = ::nlohmann::json::object(); headers_void_spec["kvstore"]["driver"] = "file"; headers_void_spec["kvstore"]["path"] = headers_path + "/"; - headers_void_spec["field"] = ""; // Special field for raw byte access + headers_void_spec["open_as_void"] = true; // New option for raw byte access std::cout << "Spec: " << headers_void_spec.dump(2) << std::endl; @@ -451,14 +452,14 @@ absl::Status Run(const std::string& zarr_path) { } auto headers_void_store = std::move(headers_void_open_result).value(); - TENSORSTORE_RETURN_IF_ERROR(ReadInlineField(headers_void_store, "headers (void)", /*is_raw_bytes=*/true)); + TENSORSTORE_RETURN_IF_ERROR(ReadInlineField(headers_void_store, "headers (open_as_void)", /*is_raw_bytes=*/true)); std::cout << "\n" << std::string(60, '=') << std::endl; std::cout << "COMPARISON: All three methods should give identical inline field values" << std::endl; std::cout << std::string(60, '=') << std::endl; std::cout << "- Test 1: 'headers' with field=\"inline\" provides field access convenience\n" << "- Test 2: 'raw_headers' (raw_bytes type) provides direct byte access\n" - << "- 
Test 3: 'headers' with field=\"\" provides raw byte access to structured data\n" + << "- Test 3: 'headers' with open_as_void=true provides raw byte access to structured data\n" << "All three extract the inline field from byte offset " << kInlineFieldOffset << " in " << kStructSize << "-byte structs." << std::endl; diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index bed1171d2..f4aad10d7 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -107,10 +107,11 @@ class ZarrDriverSpec ZarrMetadataConstraints metadata_constraints; std::string selected_field; + bool open_as_void; constexpr static auto ApplyMembers = [](auto& x, auto f) { return f(internal::BaseCast(x), x.metadata_constraints, - x.selected_field); + x.selected_field, x.open_as_void); }; static inline const auto default_json_binder = jb::Sequence( @@ -145,9 +146,17 @@ class ZarrDriverSpec }, jb::Projection<&ZarrDriverSpec::metadata_constraints>( jb::DefaultInitializedValue()))), - jb::Member("field", jb::Projection<&ZarrDriverSpec::selected_field>( - jb::DefaultValue( - [](auto* obj) { *obj = std::string{}; })))); + jb::Member( + "field", + jb::Projection<&ZarrDriverSpec::selected_field>( + jb::DefaultValue( + [](auto* obj) { *obj = std::string{}; }))), + jb::Member( + "open_as_void", + jb::Projection<&ZarrDriverSpec::open_as_void>( + jb::DefaultValue( + [](auto* v) { *v = false; /*selected_field = "";*/ })))); + absl::Status ApplyOptions(SpecOptions&& options) override { if (options.minimal_spec) { @@ -607,43 +616,43 @@ class ZarrDataCache : public ChunkCacheImpl, public DataCacheBase { Result> GetExternalToInternalTransform( const void* metadata_ptr, size_t component_index) override { const auto& metadata = *static_cast(metadata_ptr); - + // Check if this is void access by examining the cache's dtype const bool is_void_access = (ChunkCacheImpl::dtype_.fields.size() == 1 && ChunkCacheImpl::dtype_.fields[0].name == ""); - + if 
(is_void_access) { // For void access, create transform with extra bytes dimension const DimensionIndex rank = metadata.rank; const Index bytes_per_element = metadata.data_type.bytes_per_outer_element; const DimensionIndex total_rank = rank + 1; - + std::string_view normalized_dimension_names[kMaxRank]; for (DimensionIndex i = 0; i < rank; ++i) { if (const auto& name = metadata.dimension_names[i]; name.has_value()) { normalized_dimension_names[i] = *name; } } - + auto builder = tensorstore::IndexTransformBuilder<>(total_rank, total_rank); std::vector full_shape = metadata.shape; full_shape.push_back(bytes_per_element); builder.input_shape(full_shape); builder.input_labels(span(&normalized_dimension_names[0], total_rank)); - + DimensionSet implicit_upper_bounds(false); for (DimensionIndex i = 0; i < rank; ++i) { implicit_upper_bounds[i] = true; } builder.implicit_upper_bounds(implicit_upper_bounds); - + for (DimensionIndex i = 0; i < total_rank; ++i) { builder.output_single_input_dimension(i, i); } return builder.Finalize(); } - + // Not void access - delegate to base implementation return DataCacheBase::GetExternalToInternalTransform(metadata_ptr, component_index); From fff0a5be9ce8fa1baed0a2db5503b852f3fb5184 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 3 Dec 2025 15:38:36 +0000 Subject: [PATCH 09/59] Add a shim for new open_as_void flag open option --- tensorstore/driver/zarr3/driver.cc | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index f4aad10d7..18c8f3a77 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -140,8 +140,9 @@ class ZarrDriverSpec // at metadata level only. 
} } - TENSORSTORE_RETURN_IF_ERROR(obj->schema.Set( - RankConstraint{obj->metadata_constraints.rank})); + TENSORSTORE_RETURN_IF_ERROR( + obj->schema.Set( + RankConstraint{obj->metadata_constraints.rank})); return absl::OkStatus(); }, jb::Projection<&ZarrDriverSpec::metadata_constraints>( @@ -151,11 +152,23 @@ class ZarrDriverSpec jb::Projection<&ZarrDriverSpec::selected_field>( jb::DefaultValue( [](auto* obj) { *obj = std::string{}; }))), + + // NEW: wrap the open_as_void projection in a Validate jb::Member( "open_as_void", - jb::Projection<&ZarrDriverSpec::open_as_void>( - jb::DefaultValue( - [](auto* v) { *v = false; /*selected_field = "";*/ })))); + jb::Validate( + [](const auto& options, ZarrDriverSpec* obj) -> absl::Status { + // At this point, Projection has already set obj->open_as_void + if (obj->open_as_void) { + obj->selected_field = ""; + } + return absl::OkStatus(); + }, + jb::Projection<&ZarrDriverSpec::open_as_void>( + jb::DefaultValue( + [](auto* v) { *v = false; }))))); + + absl::Status ApplyOptions(SpecOptions&& options) override { From b6c24f96289a523d14cd6dc9a173f70e10690e15 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 3 Dec 2025 15:55:02 +0000 Subject: [PATCH 10/59] Revert some formatting changes --- tensorstore/driver/zarr3/driver.cc | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index 18c8f3a77..dd95c711b 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -140,22 +140,18 @@ class ZarrDriverSpec // at metadata level only. 
} } - TENSORSTORE_RETURN_IF_ERROR( - obj->schema.Set( - RankConstraint{obj->metadata_constraints.rank})); + TENSORSTORE_RETURN_IF_ERROR(obj->schema.Set( + RankConstraint{obj->metadata_constraints.rank})); return absl::OkStatus(); }, jb::Projection<&ZarrDriverSpec::metadata_constraints>( jb::DefaultInitializedValue()))), - jb::Member( - "field", - jb::Projection<&ZarrDriverSpec::selected_field>( + jb::Member("field", jb::Projection<&ZarrDriverSpec::selected_field>( jb::DefaultValue( [](auto* obj) { *obj = std::string{}; }))), // NEW: wrap the open_as_void projection in a Validate - jb::Member( - "open_as_void", + jb::Member("open_as_void", jb::Validate( [](const auto& options, ZarrDriverSpec* obj) -> absl::Status { // At this point, Projection has already set obj->open_as_void From 488b1605c1f15f322e4b39f03b02d6cd8b29900b Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 3 Dec 2025 15:56:34 +0000 Subject: [PATCH 11/59] revert gitignore changes --- .gitignore | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.gitignore b/.gitignore index 7c75044c5..e4737363c 100644 --- a/.gitignore +++ b/.gitignore @@ -21,8 +21,3 @@ __pycache__ *.pyc /python/tensorstore/*.so /python/tensorstore/*.pyd - -build/ -bootstrap.sh -filt_mig.mdio -generate_test.py \ No newline at end of file From 54941a09cf5e057e9c32d20512c0bb114b6f9b83 Mon Sep 17 00:00:00 2001 From: Brian Michell Date: Wed, 3 Dec 2025 13:06:22 -0600 Subject: [PATCH 12/59] V3 structs remove shim (#2) * Begin removing void field shim * Fully removed void string shim * Cleanup debug prints * Remove shimmed validation * Remove unnecessary comment * Prefer false over zero for ternary clarity --- tensorstore/driver/zarr3/chunk_cache.cc | 16 ++++++---- tensorstore/driver/zarr3/chunk_cache.h | 14 ++++++--- tensorstore/driver/zarr3/driver.cc | 38 +++++++---------------- tensorstore/driver/zarr3/metadata.cc | 14 +++++---- tensorstore/driver/zarr3/metadata.h | 6 ++-- tensorstore/driver/zarr3/metadata_test.cc | 2 +- 6 files 
changed, 45 insertions(+), 45 deletions(-) diff --git a/tensorstore/driver/zarr3/chunk_cache.cc b/tensorstore/driver/zarr3/chunk_cache.cc index 64b6d69fd..f14efd607 100644 --- a/tensorstore/driver/zarr3/chunk_cache.cc +++ b/tensorstore/driver/zarr3/chunk_cache.cc @@ -75,10 +75,12 @@ ZarrChunkCache::~ZarrChunkCache() = default; ZarrLeafChunkCache::ZarrLeafChunkCache( kvstore::DriverPtr store, ZarrCodecChain::PreparedState::Ptr codec_state, - ZarrDType dtype, internal::CachePool::WeakPtr /*data_cache_pool*/) + ZarrDType dtype, internal::CachePool::WeakPtr /*data_cache_pool*/, + bool open_as_void) : Base(std::move(store)), codec_state_(std::move(codec_state)), - dtype_(std::move(dtype)) {} + dtype_(std::move(dtype)), + open_as_void_(open_as_void) {} void ZarrLeafChunkCache::Read(ZarrChunkCache::ReadRequest request, AnyFlowReceiver chunk_indices, absl::InlinedVector, 1> field_arrays(num_fields); // Special case: void access - return raw bytes directly - if (num_fields == 1 && dtype_.fields[0].name == "") { + if (open_as_void_) { TENSORSTORE_ASSIGN_OR_RETURN( field_arrays[0], codec_state_->DecodeArray(grid().components[0].shape(), std::move(data))); @@ -221,11 +223,13 @@ kvstore::Driver* ZarrLeafChunkCache::GetKvStoreDriver() { ZarrShardedChunkCache::ZarrShardedChunkCache( kvstore::DriverPtr store, ZarrCodecChain::PreparedState::Ptr codec_state, - ZarrDType dtype, internal::CachePool::WeakPtr data_cache_pool) + ZarrDType dtype, internal::CachePool::WeakPtr data_cache_pool, + bool open_as_void) : base_kvstore_(std::move(store)), codec_state_(std::move(codec_state)), dtype_(std::move(dtype)), - data_cache_pool_(std::move(data_cache_pool)) {} + data_cache_pool_(std::move(data_cache_pool)), + open_as_void_(open_as_void) {} Result> TranslateCellToSourceTransformForShard( IndexTransform<> transform, span grid_cell_indices, @@ -534,7 +538,7 @@ void ZarrShardedChunkCache::Entry::DoInitialize() { *sharding_state.sub_chunk_codec_chain, std::move(sharding_kvstore), 
cache.executor(), ZarrShardingCodec::PreparedState::Ptr(&sharding_state), - cache.dtype_, cache.data_cache_pool_); + cache.dtype_, cache.data_cache_pool_, cache.open_as_void_); zarr_chunk_cache = new_cache.release(); return std::unique_ptr(&zarr_chunk_cache->cache()); }) diff --git a/tensorstore/driver/zarr3/chunk_cache.h b/tensorstore/driver/zarr3/chunk_cache.h index 5933115d7..a39eb1dc8 100644 --- a/tensorstore/driver/zarr3/chunk_cache.h +++ b/tensorstore/driver/zarr3/chunk_cache.h @@ -158,7 +158,8 @@ class ZarrLeafChunkCache : public internal::KvsBackedChunkCache, explicit ZarrLeafChunkCache(kvstore::DriverPtr store, ZarrCodecChain::PreparedState::Ptr codec_state, ZarrDType dtype, - internal::CachePool::WeakPtr data_cache_pool); + internal::CachePool::WeakPtr data_cache_pool, + bool open_as_void = false); void Read(ZarrChunkCache::ReadRequest request, AnyFlowReceiver( @@ -246,6 +249,7 @@ class ZarrShardedChunkCache : public internal::Cache, public ZarrChunkCache { kvstore::DriverPtr base_kvstore_; ZarrCodecChain::PreparedState::Ptr codec_state_; ZarrDType dtype_; + bool open_as_void_; // Data cache pool, if it differs from `this->pool()` (which is equal to the // metadata cache pool). 
@@ -260,11 +264,13 @@ class ZarrShardSubChunkCache : public ChunkCacheImpl { explicit ZarrShardSubChunkCache( kvstore::DriverPtr store, Executor executor, ZarrShardingCodec::PreparedState::Ptr sharding_state, - ZarrDType dtype, internal::CachePool::WeakPtr data_cache_pool) + ZarrDType dtype, internal::CachePool::WeakPtr data_cache_pool, + bool open_as_void = false) : ChunkCacheImpl(std::move(store), ZarrCodecChain::PreparedState::Ptr( sharding_state->sub_chunk_codec_state), - std::move(dtype), std::move(data_cache_pool)), + std::move(dtype), std::move(data_cache_pool), + open_as_void), sharding_state_(std::move(sharding_state)), executor_(std::move(executor)) {} diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index dd95c711b..f4c0ad9d7 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -149,20 +149,9 @@ class ZarrDriverSpec jb::Member("field", jb::Projection<&ZarrDriverSpec::selected_field>( jb::DefaultValue( [](auto* obj) { *obj = std::string{}; }))), - - // NEW: wrap the open_as_void projection in a Validate - jb::Member("open_as_void", - jb::Validate( - [](const auto& options, ZarrDriverSpec* obj) -> absl::Status { - // At this point, Projection has already set obj->open_as_void - if (obj->open_as_void) { - obj->selected_field = ""; - } - return absl::OkStatus(); - }, - jb::Projection<&ZarrDriverSpec::open_as_void>( + jb::Member("open_as_void", jb::Projection<&ZarrDriverSpec::open_as_void>( jb::DefaultValue( - [](auto* v) { *v = false; }))))); + [](auto* v) { *v = false; })))); @@ -592,10 +581,7 @@ class ZarrDataCache : public ChunkCacheImpl, public DataCacheBase { grid_(DataCacheBase::GetChunkGridSpecification( metadata(), // Check if this is void access by examining the dtype - (ChunkCacheImpl::dtype_.fields.size() == 1 && - ChunkCacheImpl::dtype_.fields[0].name == "") - ? kVoidFieldIndex - : 0)) {} + ChunkCacheImpl::open_as_void_ ? 
kVoidFieldIndex : false)) {} const internal::LexicographicalGridIndexKeyParser& GetChunkStorageKeyParser() final { @@ -626,9 +612,8 @@ class ZarrDataCache : public ChunkCacheImpl, public DataCacheBase { const void* metadata_ptr, size_t component_index) override { const auto& metadata = *static_cast(metadata_ptr); - // Check if this is void access by examining the cache's dtype - const bool is_void_access = (ChunkCacheImpl::dtype_.fields.size() == 1 && - ChunkCacheImpl::dtype_.fields[0].name == ""); + // Check if this is void access by examining the stored flag + const bool is_void_access = ChunkCacheImpl::open_as_void_; if (is_void_access) { // For void access, create transform with extra bytes dimension @@ -802,7 +787,7 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { TENSORSTORE_ASSIGN_OR_RETURN( auto metadata, internal_zarr3::GetNewMetadata(spec().metadata_constraints, - spec().schema), + spec().schema, spec().selected_field, spec().open_as_void), tensorstore::MaybeAnnotateStatus( _, "Cannot create using specified \"metadata\" and schema")); return metadata; @@ -819,15 +804,15 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { *static_cast(initializer.metadata.get()); // For void access, modify the dtype to indicate special handling ZarrDType dtype = metadata.data_type; - if (spec().selected_field == "") { + if (spec().open_as_void) { // Create a synthetic dtype for void access dtype = ZarrDType{ /*.has_fields=*/false, /*.fields=*/{ZarrDType::Field{ - ZarrDType::BaseDType{"", dtype_v, + ZarrDType::BaseDType{"", dtype_v, {metadata.data_type.bytes_per_outer_element}}, /*.outer_shape=*/{}, - /*.name=*/"", + /*.name=*/"", /*.field_shape=*/{metadata.data_type.bytes_per_outer_element}, /*.num_inner_elements=*/metadata.data_type.bytes_per_outer_element, /*.byte_offset=*/0, @@ -837,7 +822,8 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { return internal_zarr3::MakeZarrChunkCache( *metadata.codecs, 
std::move(initializer), spec().store.path, metadata.codec_state, dtype, - /*data_cache_pool=*/*cache_pool()); + /*data_cache_pool=*/*cache_pool(), + spec().open_as_void); } Result GetComponentIndex(const void* metadata_ptr, @@ -847,7 +833,7 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { ValidateMetadata(metadata, spec().metadata_constraints)); TENSORSTORE_ASSIGN_OR_RETURN( auto field_index, - GetFieldIndex(metadata.data_type, spec().selected_field)); + GetFieldIndex(metadata.data_type, spec().selected_field, spec().open_as_void)); // For void access, map to component index 0 if (field_index == kVoidFieldIndex) { field_index = 0; diff --git a/tensorstore/driver/zarr3/metadata.cc b/tensorstore/driver/zarr3/metadata.cc index 9aef7bd0b..ba4454de4 100644 --- a/tensorstore/driver/zarr3/metadata.cc +++ b/tensorstore/driver/zarr3/metadata.cc @@ -799,12 +799,14 @@ std::string GetFieldNames(const ZarrDType& dtype) { constexpr size_t kVoidFieldIndex = size_t(-1); Result GetFieldIndex(const ZarrDType& dtype, - std::string_view selected_field) { - // Special case: "" requests raw byte access (works for any dtype) - if (selected_field == "") { + std::string_view selected_field, + bool open_as_void) { + // Special case: open_as_void requests raw byte access (works for any dtype) + + if (open_as_void) { if (dtype.fields.empty()) { return absl::FailedPreconditionError( - "Requested field \"\" but dtype has no fields"); + "Requested void access but dtype has no fields"); } return kVoidFieldIndex; } @@ -1138,7 +1140,7 @@ absl::Status ValidateMetadataSchema(const ZarrMetadata& metadata, Result> GetNewMetadata( const ZarrMetadataConstraints& metadata_constraints, const Schema& schema, - std::string_view selected_field) { + std::string_view selected_field, bool open_as_void) { auto metadata = std::make_shared(); metadata->zarr_format = metadata_constraints.zarr_format.value_or(3); @@ -1165,7 +1167,7 @@ Result> GetNewMetadata( } TENSORSTORE_ASSIGN_OR_RETURN( - size_t 
field_index, GetFieldIndex(metadata->data_type, selected_field)); + size_t field_index, GetFieldIndex(metadata->data_type, selected_field, open_as_void)); SpecRankAndFieldInfo info; info.field = &metadata->data_type.fields[field_index]; info.chunked_rank = metadata_constraints.rank; diff --git a/tensorstore/driver/zarr3/metadata.h b/tensorstore/driver/zarr3/metadata.h index 4c7871b0d..857210546 100644 --- a/tensorstore/driver/zarr3/metadata.h +++ b/tensorstore/driver/zarr3/metadata.h @@ -230,12 +230,14 @@ absl::Status ValidateMetadataSchema(const ZarrMetadata& metadata, /// unspecified. Result> GetNewMetadata( const ZarrMetadataConstraints& metadata_constraints, - const Schema& schema, std::string_view selected_field = {}); + const Schema& schema, std::string_view selected_field = {}, + bool open_as_void = false); absl::Status ValidateDataType(DataType dtype); Result GetFieldIndex(const ZarrDType& dtype, - std::string_view selected_field); + std::string_view selected_field, + bool open_as_void = false); struct SpecRankAndFieldInfo { DimensionIndex chunked_rank = dynamic_rank; diff --git a/tensorstore/driver/zarr3/metadata_test.cc b/tensorstore/driver/zarr3/metadata_test.cc index 11c97619f..ba7a26593 100644 --- a/tensorstore/driver/zarr3/metadata_test.cc +++ b/tensorstore/driver/zarr3/metadata_test.cc @@ -438,7 +438,7 @@ Result> TestGetNewMetadata( TENSORSTORE_RETURN_IF_ERROR(status); TENSORSTORE_ASSIGN_OR_RETURN( auto constraints, ZarrMetadataConstraints::FromJson(constraints_json)); - return GetNewMetadata(constraints, schema); + return GetNewMetadata(constraints, schema, /*selected_field=*/{}, /*open_as_void=*/false); } TEST(GetNewMetadataTest, DuplicateDimensionNames) { From c9f58f9eae12c236c1398619c0c43a298fc58dfc Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 3 Dec 2025 19:38:40 +0000 Subject: [PATCH 13/59] Fix structured fill value population --- tensorstore/driver/zarr3/driver.cc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git 
a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index f4c0ad9d7..51cc17f42 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -675,7 +675,13 @@ class ZarrDriver : public ZarrDriverBase { if (metadata.fill_value.empty()) { return SharedArray(); } - return metadata.fill_value[0]; + // return metadata.fill_value[0]; + // TODO: Doe we actually need to validate this or can we trust that component_index will return a valid index? + size_t index = this->component_index(); + if (index >= metadata.fill_value.size()) { + return absl::OutOfRangeError("Component index out of bounds"); + } + return metadata.fill_value[index]; } Future GetStorageStatistics( From 7655cfd4cf435e90a1b468929c344de1300a0aa1 Mon Sep 17 00:00:00 2001 From: Brian Michell Date: Thu, 4 Dec 2025 10:03:47 -0600 Subject: [PATCH 14/59] V3 examples merge (#3) * Implement a more general and portable example set * Fix driver cache bug * Update example for template * Cleanup example * Remove testing examples from source --- examples/CMakeLists.txt | 163 ---------- examples/read_structured_zarr3.cc | 496 ----------------------------- tensorstore/driver/zarr3/driver.cc | 8 +- 3 files changed, 6 insertions(+), 661 deletions(-) delete mode 100644 examples/CMakeLists.txt delete mode 100644 examples/read_structured_zarr3.cc diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt deleted file mode 100644 index 92e9857fa..000000000 --- a/examples/CMakeLists.txt +++ /dev/null @@ -1,163 +0,0 @@ -# Standalone CMakeLists.txt for read_structured_zarr3 example -# -# Build instructions: -# mkdir -p /home/ubuntu/source/tensorstore/examples/build -# cd /home/ubuntu/source/tensorstore/examples/build -# cmake .. 
-# make -# -# Run: -# ./read_structured_zarr3 --zarr_path=/home/ubuntu/source/tensorstore/filt_mig.mdio/headers - -cmake_minimum_required(VERSION 3.24) -project(read_structured_zarr3 LANGUAGES CXX) - -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CXX_STANDARD_REQUIRED ON) - -# Path to the tensorstore build directory -set(TENSORSTORE_BUILD_DIR "/home/ubuntu/source/tensorstore/build" CACHE PATH "Path to tensorstore build directory") -set(TENSORSTORE_SOURCE_DIR "/home/ubuntu/source/tensorstore" CACHE PATH "Path to tensorstore source directory") -set(DEPS_DIR "${TENSORSTORE_BUILD_DIR}/_deps") - -# Include paths (matching what tensorstore tests use) -include_directories( - ${TENSORSTORE_SOURCE_DIR} - ${DEPS_DIR}/absl-src - ${DEPS_DIR}/re2-src - ${DEPS_DIR}/riegeli-src -) - -include_directories(SYSTEM - ${DEPS_DIR}/half-build/include - ${DEPS_DIR}/half-src/include - ${DEPS_DIR}/nlohmann_json-build/include - ${DEPS_DIR}/nlohmann_json-src/include - ${TENSORSTORE_BUILD_DIR} -) - -# Compiler flags -add_compile_options( - -fPIE - -Wno-deprecated-declarations - -Wno-sign-compare - -Wno-unused-but-set-parameter - -Wno-maybe-uninitialized - -Wno-sequence-point - -Wno-unknown-warning-option - -Wno-stringop-overflow - -fsized-deallocation -) - -# Find all the static libraries we need from the tensorstore build -file(GLOB TENSORSTORE_LIBS "${TENSORSTORE_BUILD_DIR}/libtensorstore*.a") -file(GLOB_RECURSE ABSEIL_LIBS "${DEPS_DIR}/absl-build/absl/*.a") -file(GLOB_RECURSE RIEGELI_LIBS "${DEPS_DIR}/riegeli-build/*.a") - -# Additional dependency libraries - corrected paths -file(GLOB_RECURSE BLOSC_LIBS "${DEPS_DIR}/blosc-build/*.a") -file(GLOB_RECURSE ZSTD_LIBS "${DEPS_DIR}/zstd-build/*.a") -file(GLOB_RECURSE RE2_LIBS "${DEPS_DIR}/re2-build/*.a") -file(GLOB_RECURSE SNAPPY_LIBS "${DEPS_DIR}/snappy-build/*.a") -file(GLOB_RECURSE BROTLI_LIBS "${DEPS_DIR}/brotli-build/*.a") -file(GLOB_RECURSE LZ4_LIBS "${DEPS_DIR}/lz4-build/*.a") -file(GLOB_RECURSE ZLIB_LIBS "${DEPS_DIR}/zlib-build/*.a") 
-file(GLOB_RECURSE PROTOBUF_LIBS "${DEPS_DIR}/protobuf-build/*.a") -file(GLOB_RECURSE GRPC_LIBS "${DEPS_DIR}/grpc-build/*.a") -file(GLOB_RECURSE CARES_LIBS "${DEPS_DIR}/c-ares-build/*.a") -file(GLOB_RECURSE SSL_LIBS "${DEPS_DIR}/boringssl-build/ssl/*.a") -file(GLOB_RECURSE CRYPTO_LIBS "${DEPS_DIR}/boringssl-build/crypto/*.a") -file(GLOB_RECURSE LIBLZMA_LIBS "${DEPS_DIR}/liblzma-build/*.a") -file(GLOB_RECURSE BZIP2_LIBS "${DEPS_DIR}/bzip2-build/*.a") -file(GLOB_RECURSE JPEG_LIBS "${DEPS_DIR}/jpeg-build/*.a") -file(GLOB_RECURSE PNG_LIBS "${DEPS_DIR}/png-build/*.a") -file(GLOB_RECURSE TIFF_LIBS "${DEPS_DIR}/tiff-build/*.a") -file(GLOB_RECURSE AVIF_LIBS "${DEPS_DIR}/avif-build/*.a") -file(GLOB_RECURSE AOM_LIBS "${DEPS_DIR}/aom-build/*.a") -file(GLOB_RECURSE WEBP_LIBS "${DEPS_DIR}/webp-build/*.a") -file(GLOB_RECURSE CURL_LIBS "${DEPS_DIR}/curl-build/*.a") - -# Create executable -add_executable(read_structured_zarr3 read_structured_zarr3.cc) - -# Link libraries - use whole-archive for libraries that use static registration -# These include drivers, codecs, kvstores, and context resource providers -target_link_libraries(read_structured_zarr3 PRIVATE - # Force inclusion of libraries with static registrations - -Wl,--whole-archive - - # Context resource providers - ${TENSORSTORE_BUILD_DIR}/libtensorstore_internal_data_copy_concurrency_resource.a - ${TENSORSTORE_BUILD_DIR}/libtensorstore_internal_file_io_concurrency_resource.a - ${TENSORSTORE_BUILD_DIR}/libtensorstore_internal_cache_cache_pool_resource.a - ${TENSORSTORE_BUILD_DIR}/libtensorstore_internal_concurrency_resource.a - - # Zarr3 driver and codecs - ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_driver.a - ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_blosc.a - ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_bytes.a - ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_crc32c.a - ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_gzip.a - 
${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_transpose.a - ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_zstd.a - ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_sharding_indexed.a - ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_codec_chain_spec.a - - # File kvstore and its resource providers - ${TENSORSTORE_BUILD_DIR}/libtensorstore_kvstore_file.a - ${TENSORSTORE_BUILD_DIR}/libtensorstore_kvstore_file_file_resource.a - - -Wl,--no-whole-archive - - -Wl,--start-group - - # Tensorstore libs - ${TENSORSTORE_LIBS} - - # Riegeli - ${RIEGELI_LIBS} - - # Abseil - ${ABSEIL_LIBS} - - # Compression libs - ${BLOSC_LIBS} - ${ZSTD_LIBS} - ${LZ4_LIBS} - ${SNAPPY_LIBS} - ${BROTLI_LIBS} - ${ZLIB_LIBS} - ${LIBLZMA_LIBS} - ${BZIP2_LIBS} - - # Regex - ${RE2_LIBS} - - # Protocol buffers and gRPC - ${PROTOBUF_LIBS} - ${GRPC_LIBS} - ${CARES_LIBS} - - # SSL/TLS - ${SSL_LIBS} - ${CRYPTO_LIBS} - - # Image libraries - ${JPEG_LIBS} - ${PNG_LIBS} - ${TIFF_LIBS} - ${AVIF_LIBS} - ${AOM_LIBS} - ${WEBP_LIBS} - - # HTTP - ${CURL_LIBS} - - -Wl,--end-group - - # System libraries - pthread - dl - m - rt -) diff --git a/examples/read_structured_zarr3.cc b/examples/read_structured_zarr3.cc deleted file mode 100644 index 720ef1330..000000000 --- a/examples/read_structured_zarr3.cc +++ /dev/null @@ -1,496 +0,0 @@ -// Copyright 2024 The TensorStore Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -// Standalone test for reading structured data from Zarr v3 arrays. -// -// This test opens two Zarr v3 arrays: -// 1. A structured array with named fields (headers/) -// 2. A raw bytes array containing struct data (raw_headers/) -// -// Both arrays should contain the same data, allowing comparison of: -// - Field-based access vs manual byte extraction -// - Structured dtype parsing vs raw byte handling -// - New open_as_void option for raw byte access to structured data -// -// Usage: -// bazel run //examples:read_structured_zarr3 -- /path/to/parent/dir -// -// Or with cmake: -// cd examples/build && ./read_structured_zarr3 --zarr_path=/path/to/parent/dir -// -// Where the parent dir contains both 'headers/' and 'raw_headers/' subdirs. - -#include - -#include -#include -#include -#include - -#include "absl/flags/flag.h" -#include "absl/flags/parse.h" -#include "absl/status/status.h" -#include -#include "tensorstore/array.h" -#include "tensorstore/context.h" -#include "tensorstore/data_type.h" -#include "tensorstore/index.h" -#include "tensorstore/open.h" -#include "tensorstore/open_mode.h" -#include "tensorstore/spec.h" -#include "tensorstore/tensorstore.h" -#include "tensorstore/util/result.h" -#include "tensorstore/util/status.h" - -// Internal headers for testing dtype parsing -#include "tensorstore/driver/zarr3/dtype.h" - -// Additional headers for string operations -#include "absl/strings/str_join.h" - -ABSL_FLAG(std::string, zarr_path, - "/home/ubuntu/source/tensorstore/filt_mig.mdio", - "Path to the parent .mdio directory containing headers/ and raw_headers/"); - -namespace { - -using ::tensorstore::Index; - -// Field layout from the zarr.json metadata: -// The structured dtype has the following fields with their byte offsets: -// trace_seq_num_line: int32 @ 0 -// trace_seq_num_reel: int32 @ 4 -// ... (many more fields) ... 
-// inline: int32 @ 180 -// crossline: int32 @ 184 -// cdp_x: int32 @ 188 -// cdp_y: int32 @ 192 -// -// Total struct size: 196 bytes (matches blosc typesize) - -constexpr size_t kInlineFieldOffset = 180; -constexpr size_t kStructSize = 196; - -// Read and parse the zarr.json metadata to display info about structured type -void PrintZarrMetadata(const std::string& zarr_path) { - std::string metadata_path = zarr_path + "/zarr.json"; - std::ifstream file(metadata_path); - if (!file.is_open()) { - std::cerr << "Could not open " << metadata_path << std::endl; - return; - } - - nlohmann::json metadata; - try { - file >> metadata; - } catch (const nlohmann::json::parse_error& e) { - std::cerr << "Failed to parse zarr.json: " << e.what() << std::endl; - return; - } - - std::cout << "\n=== Zarr Metadata ===" << std::endl; - std::cout << "Shape: " << metadata["shape"].dump() << std::endl; - std::cout << "Dimension names: " << metadata["dimension_names"].dump() - << std::endl; - - if (metadata.contains("data_type")) { - auto& dt = metadata["data_type"]; - std::cout << "\nData type format:" << std::endl; - if (dt.is_object()) { - std::cout << " Type: object with name=\"" << dt["name"].get() - << "\"" << std::endl; - if (dt.contains("configuration") && - dt["configuration"].contains("fields")) { - auto& fields = dt["configuration"]["fields"]; - std::cout << " Number of fields: " << fields.size() << std::endl; - std::cout << " Fields:" << std::endl; - size_t byte_offset = 0; - for (const auto& field : fields) { - std::string name = field[0].get(); - std::string type = field[1].get(); - size_t size = (type == "int32" || type == "uint32" || type == "float32") - ? 
4 - : 2; // int16/uint16 - std::cout << " " << name << ": " << type << " @ byte " << byte_offset - << std::endl; - byte_offset += size; - } - std::cout << " Total struct size: " << byte_offset << " bytes" - << std::endl; - } - } else if (dt.is_string()) { - std::cout << " Type: simple \"" << dt.get() << "\"" - << std::endl; - } else if (dt.is_array()) { - std::cout << " Type: array with " << dt.size() << " fields" << std::endl; - } - } - - if (metadata.contains("codecs")) { - std::cout << "\nCodecs: " << metadata["codecs"].dump(2) << std::endl; - } -} - -// Helper function to read and display inline field from an array -absl::Status ReadInlineField(const tensorstore::TensorStore<>& store, - const std::string& array_name, - bool is_raw_bytes = false) { - // Get information about the array - auto domain = store.domain(); - std::cout << "\n=== " << array_name << " Array Info ===" << std::endl; - std::cout << "Domain: " << domain << std::endl; - std::cout << "Dtype: " << store.dtype() << std::endl; - std::cout << "Rank: " << store.rank() << std::endl; - - auto shape = domain.shape(); - std::cout << "Shape: ["; - for (int i = 0; i < shape.size(); ++i) { - if (i > 0) std::cout << ", "; - std::cout << shape[i]; - } - std::cout << "]" << std::endl; - - // Read all data - std::cout << "\n=== Reading " << array_name << " Data ===" << std::endl; - TENSORSTORE_ASSIGN_OR_RETURN( - auto array, tensorstore::Read(store).result()); - - std::cout << "Read complete. 
Array size: " << array.num_elements() - << " elements" << std::endl; - std::cout << "Data type: " << array.dtype() << std::endl; - - Index num_inline, num_crossline; - const int32_t* int_ptr; - - if (is_raw_bytes) { - // For raw bytes, we need to extract the inline field manually - // Shape is [inline, crossline, struct_size] - num_inline = shape[0]; - num_crossline = shape[1]; - Index struct_size = shape[2]; - if (struct_size != kStructSize) { - std::cout << "Warning: Raw struct size (" << struct_size - << ") differs from expected header struct size (" << kStructSize - << "). Assuming padding." << std::endl; - } - - // Extract inline field (4 bytes starting at offset 180) - auto byte_ptr = reinterpret_cast(array.data()); - std::vector inline_values(num_inline * num_crossline); - - for (Index i = 0; i < num_inline; ++i) { - for (Index j = 0; j < num_crossline; ++j) { - Index struct_offset = (i * num_crossline + j) * struct_size; - Index field_offset = struct_offset + kInlineFieldOffset; - std::memcpy(&inline_values[i * num_crossline + j], - byte_ptr + field_offset, 4); - } - } - - std::cout << "Extracted inline field from raw bytes at offset " - << kInlineFieldOffset << std::endl; - int_ptr = inline_values.data(); - } else { - // For structured array, field access already gave us int32 values - num_inline = shape[0]; - num_crossline = shape[1]; - int_ptr = reinterpret_cast(array.data()); - } - - std::cout << "\n=== Inline field values from " << array_name - << " (shape: " << num_inline << " x " << num_crossline << ") ===" << std::endl; - - // Print first 10 rows (or fewer if less data) - Index rows_to_print = std::min(num_inline, Index{10}); - Index cols_to_print = std::min(num_crossline, Index{10}); - - for (Index i = 0; i < rows_to_print; ++i) { - for (Index j = 0; j < cols_to_print; ++j) { - std::cout << int_ptr[i * num_crossline + j]; - if (j < cols_to_print - 1) { - std::cout << "\t"; - } - } - if (num_crossline > cols_to_print) { - std::cout << "\t..."; - } - 
std::cout << std::endl; - } - if (num_inline > rows_to_print) { - std::cout << "... (" << (num_inline - rows_to_print) << " more rows)" - << std::endl; - } - - std::cout << "\n=== " << array_name << " Summary ===" << std::endl; - std::cout << "Successfully read " << (num_inline * num_crossline) - << " inline values" << std::endl; - - // Show some statistics - int32_t min_val = int_ptr[0], max_val = int_ptr[0]; - int64_t sum = 0; - for (Index i = 0; i < num_inline * num_crossline; ++i) { - min_val = std::min(min_val, int_ptr[i]); - max_val = std::max(max_val, int_ptr[i]); - sum += int_ptr[i]; - } - std::cout << "Min value: " << min_val << std::endl; - std::cout << "Max value: " << max_val << std::endl; - std::cout << "Mean value: " << (static_cast(sum) / (num_inline * num_crossline)) << std::endl; - - return absl::OkStatus(); -} - -absl::Status Run(const std::string& zarr_path) { - std::cout << "=== Zarr v3 Structured Data Type Test ===" << std::endl; - std::cout << "Opening zarr3 arrays in: " << zarr_path << std::endl; - - auto context = tensorstore::Context::Default(); - - // First, display metadata information for structured array - std::string headers_path = zarr_path + "/headers"; - PrintZarrMetadata(headers_path); - - // Test raw_bytes parsing by reading and parsing the raw_headers zarr.json - std::cout << "\n" << std::string(60, '=') << std::endl; - std::cout << "TESTING RAW_BYTES PARSING" << std::endl; - std::cout << std::string(60, '=') << std::endl; - - std::string raw_metadata_path = zarr_path + "/raw_headers/zarr.json"; - std::ifstream raw_file(raw_metadata_path); - if (!raw_file.is_open()) { - std::cout << "Could not open " << raw_metadata_path << std::endl; - return absl::NotFoundError("Raw headers metadata not found"); - } - - nlohmann::json raw_metadata; - try { - raw_file >> raw_metadata; - } catch (const nlohmann::json::parse_error& e) { - std::cout << "Failed to parse raw zarr.json: " << e.what() << std::endl; - return absl::DataLossError("Invalid 
raw metadata JSON"); - } - - std::cout << "Raw headers data_type: " << raw_metadata["data_type"].dump(2) << std::endl; - - // Test parsing the raw_bytes data type - std::cout << "Testing raw_bytes dtype parsing..." << std::endl; - - // For now, just verify the JSON structure is what we expect - if (!raw_metadata.contains("data_type")) { - std::cout << "FAILED: No data_type in metadata" << std::endl; - return absl::NotFoundError("Missing data_type"); - } - - auto& dt = raw_metadata["data_type"]; - if (!dt.is_object() || !dt.contains("name") || dt["name"] != "raw_bytes") { - std::cout << "FAILED: data_type is not raw_bytes extension" << std::endl; - return absl::InvalidArgumentError("Not raw_bytes extension"); - } - - if (!dt.contains("configuration") || !dt["configuration"].contains("length_bytes")) { - std::cout << "FAILED: Missing length_bytes in configuration" << std::endl; - return absl::InvalidArgumentError("Missing length_bytes"); - } - - int length_bytes = dt["configuration"]["length_bytes"]; - std::cout << "SUCCESS: Found raw_bytes extension with length_bytes = " << length_bytes << std::endl; - std::cout << "This should parse to:" << std::endl; - std::cout << " - Single field with byte_t dtype" << std::endl; - std::cout << " - Field shape: [" << length_bytes << "]" << std::endl; - std::cout << " - Bytes per outer element: " << length_bytes << std::endl; - - // Now actually test the parsing implementation - std::cout << "\n=== Testing ParseDType Implementation ===" << std::endl; - auto dtype_result = tensorstore::internal_zarr3::ParseDType(dt); - if (!dtype_result.ok()) { - std::cout << "FAILED: Could not parse raw_bytes data type: " << dtype_result.status() << std::endl; - return dtype_result.status(); - } - - auto dtype = std::move(dtype_result).value(); - std::cout << "SUCCESS: ParseDType worked!" 
<< std::endl; - std::cout << " Fields: " << dtype.fields.size() << std::endl; - std::cout << " Has fields: " << dtype.has_fields << std::endl; - std::cout << " Bytes per outer element: " << dtype.bytes_per_outer_element << std::endl; - - if (!dtype.fields.empty()) { - const auto& field = dtype.fields[0]; - std::cout << " Field name: '" << field.name << "'" << std::endl; - std::cout << " Field dtype: " << field.dtype << std::endl; - std::cout << " Field shape: [" << absl::StrJoin(field.field_shape, ", ") << "]" << std::endl; - std::cout << " Field num_inner_elements: " << field.num_inner_elements << std::endl; - std::cout << " Field num_bytes: " << field.num_bytes << std::endl; - } - - // Verify the parsing is correct - bool parsing_correct = true; - if (dtype.fields.size() != 1) { - std::cout << "ERROR: Expected 1 field, got " << dtype.fields.size() << std::endl; - parsing_correct = false; - } - if (dtype.fields[0].name != "") { - std::cout << "ERROR: Expected empty field name, got '" << dtype.fields[0].name << "'" << std::endl; - parsing_correct = false; - } - if (dtype.fields[0].dtype != tensorstore::dtype_v) { - std::cout << "ERROR: Expected byte_t dtype, got " << dtype.fields[0].dtype << std::endl; - parsing_correct = false; - } - if (dtype.fields[0].field_shape != std::vector{length_bytes}) { - std::cout << "ERROR: Expected field shape [" << length_bytes << "], got [" - << absl::StrJoin(dtype.fields[0].field_shape, ", ") << "]" << std::endl; - parsing_correct = false; - } - if (dtype.bytes_per_outer_element != length_bytes) { - std::cout << "ERROR: Expected " << length_bytes << " bytes per element, got " - << dtype.bytes_per_outer_element << std::endl; - parsing_correct = false; - } - - if (parsing_correct) { - std::cout << "\n✅ PARSING VERIFICATION: All checks passed!" << std::endl; - std::cout << "The raw_bytes extension is correctly parsed." << std::endl; - } else { - std::cout << "\n❌ PARSING VERIFICATION: Some checks failed!" 
<< std::endl; - return absl::InternalError("Parsing verification failed"); - } - - // Test 1: Read from structured array using field access - std::cout << "\n" << std::string(60, '=') << std::endl; - std::cout << "TEST 1: Reading from structured 'headers' array" << std::endl; - std::cout << std::string(60, '=') << std::endl; - - ::nlohmann::json headers_spec = ::nlohmann::json::object(); - headers_spec["driver"] = "zarr3"; - headers_spec["kvstore"] = ::nlohmann::json::object(); - headers_spec["kvstore"]["driver"] = "file"; - headers_spec["kvstore"]["path"] = headers_path + "/"; - headers_spec["field"] = "inline"; // Extract inline field (int32 at byte offset 180) - - std::cout << "Spec: " << headers_spec.dump(2) << std::endl; - - auto headers_open_result = - tensorstore::Open(headers_spec, context, tensorstore::OpenMode::open, - tensorstore::ReadWriteMode::read) - .result(); - - if (!headers_open_result.ok()) { - std::cout << "\n=== Headers Open Failed ===" << std::endl; - std::cout << "Status: " << headers_open_result.status() << std::endl; - return headers_open_result.status(); - } - - auto headers_store = std::move(headers_open_result).value(); - TENSORSTORE_RETURN_IF_ERROR(ReadInlineField(headers_store, "headers")); - - // Test 2: Read from raw bytes array (no special void access needed) - std::cout << "\n" << std::string(60, '=') << std::endl; - std::cout << "TEST 2: Reading from raw 'raw_headers' array" << std::endl; - std::cout << std::string(60, '=') << std::endl; - - std::string raw_headers_path = zarr_path + "/raw_headers"; - ::nlohmann::json raw_spec = ::nlohmann::json::object(); - raw_spec["driver"] = "zarr3"; - raw_spec["kvstore"] = ::nlohmann::json::object(); - raw_spec["kvstore"]["driver"] = "file"; - raw_spec["kvstore"]["path"] = raw_headers_path + "/"; - // No field specified - raw_bytes has a single anonymous field - - std::cout << "Spec: " << raw_spec.dump(2) << std::endl; - - auto raw_open_result = - tensorstore::Open(raw_spec, context, 
tensorstore::OpenMode::open, - tensorstore::ReadWriteMode::read) - .result(); - - if (!raw_open_result.ok()) { - std::cout << "\n=== Raw Headers Open Failed ===" << std::endl; - std::cout << "Status: " << raw_open_result.status() << std::endl; - return raw_open_result.status(); - } - - auto raw_store = std::move(raw_open_result).value(); - TENSORSTORE_RETURN_IF_ERROR(ReadInlineField(raw_store, "raw_headers", /*is_raw_bytes=*/true)); - - // Test 3: Read from headers array as void (open_as_void=true) - // Use a fresh context to avoid cache sharing with Test 1 - std::cout << "\n" << std::string(60, '=') << std::endl; - std::cout << "TEST 3: Reading from 'headers' array as void (open_as_void=true)" << std::endl; - std::cout << std::string(60, '=') << std::endl; - - auto context_void = tensorstore::Context::Default(); - - ::nlohmann::json headers_void_spec = ::nlohmann::json::object(); - headers_void_spec["driver"] = "zarr3"; - headers_void_spec["kvstore"] = ::nlohmann::json::object(); - headers_void_spec["kvstore"]["driver"] = "file"; - headers_void_spec["kvstore"]["path"] = headers_path + "/"; - headers_void_spec["open_as_void"] = true; // New option for raw byte access - - std::cout << "Spec: " << headers_void_spec.dump(2) << std::endl; - - auto headers_void_open_result = - tensorstore::Open(headers_void_spec, context_void, tensorstore::OpenMode::open, - tensorstore::ReadWriteMode::read) - .result(); - - if (!headers_void_open_result.ok()) { - std::cout << "\n=== Headers (void) Open Failed ===" << std::endl; - std::cout << "Status: " << headers_void_open_result.status() << std::endl; - return headers_void_open_result.status(); - } - - auto headers_void_store = std::move(headers_void_open_result).value(); - TENSORSTORE_RETURN_IF_ERROR(ReadInlineField(headers_void_store, "headers (open_as_void)", /*is_raw_bytes=*/true)); - - std::cout << "\n" << std::string(60, '=') << std::endl; - std::cout << "COMPARISON: All three methods should give identical inline field values" 
<< std::endl; - std::cout << std::string(60, '=') << std::endl; - std::cout << "- Test 1: 'headers' with field=\"inline\" provides field access convenience\n" - << "- Test 2: 'raw_headers' (raw_bytes type) provides direct byte access\n" - << "- Test 3: 'headers' with open_as_void=true provides raw byte access to structured data\n" - << "All three extract the inline field from byte offset " << kInlineFieldOffset - << " in " << kStructSize << "-byte structs." << std::endl; - - return absl::OkStatus(); -} - -} // namespace - -int main(int argc, char** argv) { - absl::ParseCommandLine(argc, argv); - - std::string zarr_path = absl::GetFlag(FLAGS_zarr_path); - if (zarr_path.empty()) { - std::cerr << "Error: --zarr_path is required" << std::endl; - return 1; - } - - // Verify the path structure - std::string headers_path = zarr_path + "/headers"; - std::string raw_headers_path = zarr_path + "/raw_headers"; - - std::cout << "Expecting arrays at:" << std::endl; - std::cout << " Structured: " << headers_path << std::endl; - std::cout << " Raw bytes: " << raw_headers_path << std::endl; - std::cout << std::endl; - - auto status = Run(zarr_path); - if (!status.ok()) { - std::cerr << "\nFinal status: " << status << std::endl; - return 1; - } - - return 0; -} diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index 51cc17f42..ec30edd82 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -779,12 +779,16 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { std::string GetDataCacheKey(const void* metadata) override { std::string result; + const auto& zarr_metadata = *static_cast(metadata); internal::EncodeCacheKey( - &result, spec().store.path, - static_cast(metadata)->GetCompatibilityKey()); + &result, + spec().store.path, + zarr_metadata.GetCompatibilityKey(), + spec().open_as_void ? 
"void" : "normal"); return result; } + Result> Create(const void* existing_metadata, CreateOptions options) override { if (existing_metadata) { From 8c4c4cafe2b33df06131d985c2574c973f817b3d Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Thu, 4 Dec 2025 16:07:26 +0000 Subject: [PATCH 15/59] Remove vestigial example build --- examples/BUILD | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/examples/BUILD b/examples/BUILD index 4dcb2d604..94acdba14 100644 --- a/examples/BUILD +++ b/examples/BUILD @@ -122,26 +122,3 @@ tensorstore_cc_binary( "@riegeli//riegeli/bytes:writer", ], ) - -tensorstore_cc_binary( - name = "read_structured_zarr3", - srcs = ["read_structured_zarr3.cc"], - deps = [ - "//tensorstore", - "//tensorstore:array", - "//tensorstore:context", - "//tensorstore:data_type", - "//tensorstore:index", - "//tensorstore:open", - "//tensorstore:open_mode", - "//tensorstore:spec", - "//tensorstore/driver/zarr3", - "//tensorstore/kvstore/file", - "//tensorstore/util:result", - "//tensorstore/util:status", - "@abseil-cpp//absl/flags:flag", - "@abseil-cpp//absl/flags:parse", - "@abseil-cpp//absl/status", - "@nlohmann_json//:json", - ], -) From 4b590f855adc963fe20940bd704693d81190483a Mon Sep 17 00:00:00 2001 From: Brian Michell Date: Thu, 4 Dec 2025 11:11:14 -0600 Subject: [PATCH 16/59] V3 structs fix fills (#4) * Use the appropriate fill value for open_as_void structured data * Cleanup --- tensorstore/driver/zarr3/driver.cc | 70 ++++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 4 deletions(-) diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index ec30edd82..f86e4ad88 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -171,12 +171,74 @@ class ZarrDriverSpec IndexTransformView<> transform) const override { SharedArray fill_value{schema.fill_value()}; - const auto& metadata = metadata_constraints; - if (metadata.fill_value && 
!metadata.fill_value->empty()) { - fill_value = (*metadata.fill_value)[0]; + const auto& constraints = metadata_constraints; + + // If constraints don't specify a fill value, just use the schema's. + if (!constraints.fill_value || constraints.fill_value->empty()) { + return fill_value; + } + + const auto& vec = *constraints.fill_value; + + // If we don't have dtype information, we can't do field-aware logic. + if (!constraints.data_type) { + if (!vec.empty()) return vec[0]; + return fill_value; + } + + const ZarrDType& dtype = *constraints.data_type; + + // Determine which field this spec refers to (or void access). + TENSORSTORE_ASSIGN_OR_RETURN( + size_t field_index, + GetFieldIndex(dtype, selected_field, open_as_void)); + + // ── Normal field access: just return that field's fill_value ─────────────── + if (field_index != kVoidFieldIndex) { + if (field_index < vec.size()) { + return vec[field_index]; + } + // Fallback to "no fill". + return SharedArray(); + } + + // ── Void access: synthesize a byte-level fill value ──────────────────────── + // + // We want a 1D byte array of length bytes_per_outer_element whose contents + // are exactly the Zarr-defined struct layout built from per-field fills. + + // Special case: "raw bytes" field (single byte_t field with flexible shape). + // In that case the existing fill array already has the correct bytes. + if (dtype.fields.size() == 1 && + dtype.fields[0].dtype.id() == DataTypeId::byte_t && + !dtype.fields[0].flexible_shape.empty()) { + // vec[0] should be a byte array of size bytes_per_outer_element. + return vec[0]; + } + + const Index nbytes = dtype.bytes_per_outer_element; + + auto byte_arr = AllocateArray( + span({nbytes}), c_order, default_init, + dtype_v); + auto* dst = static_cast(byte_arr.data()); + std::memset(dst, 0, static_cast(nbytes)); + + // Pack each field's scalar fill into its byte_offset region. 
+ for (size_t i = 0; i < dtype.fields.size() && i < vec.size(); ++i) { + const auto& field = dtype.fields[i]; + const auto& field_fill = vec[i]; + if (!field_fill.valid()) continue; + + // We assume a single outer element per field here (which is exactly how + // FillValueJsonBinder constructs per-field fill values). + std::memcpy( + dst + field.byte_offset, + static_cast(field_fill.data()), + static_cast(field.num_bytes)); } - return fill_value; + return byte_arr; } Result GetDimensionUnits() const override { From c0082a0f09c4537bed65aaaf17939f8825204985 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Thu, 4 Dec 2025 17:22:51 +0000 Subject: [PATCH 17/59] Add new options to schema --- tensorstore/driver/zarr3/schema.yml | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tensorstore/driver/zarr3/schema.yml b/tensorstore/driver/zarr3/schema.yml index 4f9733415..9491027b1 100644 --- a/tensorstore/driver/zarr3/schema.yml +++ b/tensorstore/driver/zarr3/schema.yml @@ -17,6 +17,31 @@ allOf: automatically. When creating a new array, the new metadata is obtained by combining these metadata constraints with any `Schema` constraints. $ref: driver/zarr3/Metadata + field: + type: string + title: Field selection for structured arrays. + description: | + Name of the field to select from a structured array. When specified, + the tensorstore will provide access to only the specified field of + each element in the structured array. + open_as_void: + type: boolean + default: false + title: Raw byte access mode. + description: | + When true, opens the array as raw bytes instead of interpreting it + as structured data. The resulting array will have an additional + dimension representing the byte layout of each element. 
+ oneOf: + - not: + anyOf: + - required: ["field"] + - required: ["open_as_void"] + - allOf: + - not: + required: ["field"] + - not: + required: ["open_as_void"] examples: - driver: zarr3 kvstore: From 9a46c82968fb1e70e1cb14e3b827dcf627b80463 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Thu, 4 Dec 2025 17:31:17 +0000 Subject: [PATCH 18/59] Fix copyright header date --- tensorstore/driver/zarr3/dtype.cc | 2 +- tensorstore/driver/zarr3/dtype.h | 2 +- tensorstore/driver/zarr3/dtype_test.cc | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorstore/driver/zarr3/dtype.cc b/tensorstore/driver/zarr3/dtype.cc index 5b3261812..b8aacaa68 100644 --- a/tensorstore/driver/zarr3/dtype.cc +++ b/tensorstore/driver/zarr3/dtype.cc @@ -1,4 +1,4 @@ -// Copyright 2020 The TensorStore Authors +// Copyright 2025 The TensorStore Authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tensorstore/driver/zarr3/dtype.h b/tensorstore/driver/zarr3/dtype.h index 430dd8849..73a6b0961 100644 --- a/tensorstore/driver/zarr3/dtype.h +++ b/tensorstore/driver/zarr3/dtype.h @@ -1,4 +1,4 @@ -// Copyright 2020 The TensorStore Authors +// Copyright 2025 The TensorStore Authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tensorstore/driver/zarr3/dtype_test.cc b/tensorstore/driver/zarr3/dtype_test.cc index ef55aba09..709178bc3 100644 --- a/tensorstore/driver/zarr3/dtype_test.cc +++ b/tensorstore/driver/zarr3/dtype_test.cc @@ -1,4 +1,4 @@ -// Copyright 2023 The TensorStore Authors +// Copyright 2025 The TensorStore Authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
From b9b5e41db3266155aa47323249f18687a1e2e45b Mon Sep 17 00:00:00 2001 From: Brian Michell Date: Thu, 4 Dec 2025 12:52:30 -0600 Subject: [PATCH 19/59] Cleanup (#5) --- tensorstore/driver/zarr3/driver.cc | 2 -- tensorstore/driver/zarr3/dtype_test.cc | 1 - 2 files changed, 3 deletions(-) diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index f86e4ad88..f65533197 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -737,8 +737,6 @@ class ZarrDriver : public ZarrDriverBase { if (metadata.fill_value.empty()) { return SharedArray(); } - // return metadata.fill_value[0]; - // TODO: Doe we actually need to validate this or can we trust that component_index will return a valid index? size_t index = this->component_index(); if (index >= metadata.fill_value.size()) { return absl::OutOfRangeError("Component index out of bounds"); diff --git a/tensorstore/driver/zarr3/dtype_test.cc b/tensorstore/driver/zarr3/dtype_test.cc index 709178bc3..a41830069 100644 --- a/tensorstore/driver/zarr3/dtype_test.cc +++ b/tensorstore/driver/zarr3/dtype_test.cc @@ -17,7 +17,6 @@ #include #include -#include // for std::byte #include #include From 31e55ec60e006e7a68abf3c64cf43a3cdf28072a Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 5 Jan 2026 15:19:56 +0000 Subject: [PATCH 20/59] Remove default values --- tensorstore/driver/zarr3/chunk_cache.h | 6 +++--- tensorstore/driver/zarr3/metadata.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorstore/driver/zarr3/chunk_cache.h b/tensorstore/driver/zarr3/chunk_cache.h index a39eb1dc8..f9ff19a00 100644 --- a/tensorstore/driver/zarr3/chunk_cache.h +++ b/tensorstore/driver/zarr3/chunk_cache.h @@ -159,7 +159,7 @@ class ZarrLeafChunkCache : public internal::KvsBackedChunkCache, ZarrCodecChain::PreparedState::Ptr codec_state, ZarrDType dtype, internal::CachePool::WeakPtr data_cache_pool, - bool open_as_void = false); + bool open_as_void); void 
Read(ZarrChunkCache::ReadRequest request, AnyFlowReceiver( @@ -265,7 +265,7 @@ class ZarrShardSubChunkCache : public ChunkCacheImpl { kvstore::DriverPtr store, Executor executor, ZarrShardingCodec::PreparedState::Ptr sharding_state, ZarrDType dtype, internal::CachePool::WeakPtr data_cache_pool, - bool open_as_void = false) + bool open_as_void) : ChunkCacheImpl(std::move(store), ZarrCodecChain::PreparedState::Ptr( sharding_state->sub_chunk_codec_state), diff --git a/tensorstore/driver/zarr3/metadata.h b/tensorstore/driver/zarr3/metadata.h index 857210546..d091dea22 100644 --- a/tensorstore/driver/zarr3/metadata.h +++ b/tensorstore/driver/zarr3/metadata.h @@ -230,14 +230,14 @@ absl::Status ValidateMetadataSchema(const ZarrMetadata& metadata, /// unspecified. Result> GetNewMetadata( const ZarrMetadataConstraints& metadata_constraints, - const Schema& schema, std::string_view selected_field = {}, - bool open_as_void = false); + const Schema& schema, std::string_view selected_field, + bool open_as_void); absl::Status ValidateDataType(DataType dtype); Result GetFieldIndex(const ZarrDType& dtype, std::string_view selected_field, - bool open_as_void = false); + bool open_as_void); struct SpecRankAndFieldInfo { DimensionIndex chunked_rank = dynamic_rank; From 89098f8e26649362f81bbfe424a6bf8a47c82b3f Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 26 Jan 2026 15:11:28 +0000 Subject: [PATCH 21/59] zarr3: Add mutual exclusivity validation for field and open_as_void Matches the pattern from zarr v2 driver (PR #272). When both "field" and "open_as_void" are specified in the spec, return an error since these options are mutually exclusive - field selects a specific field from a structured array, while open_as_void provides raw byte access to the entire structure. 
--- tensorstore/driver/zarr3/driver.cc | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index f65533197..2b0530fcb 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -151,10 +151,15 @@ class ZarrDriverSpec [](auto* obj) { *obj = std::string{}; }))), jb::Member("open_as_void", jb::Projection<&ZarrDriverSpec::open_as_void>( jb::DefaultValue( - [](auto* v) { *v = false; })))); - - - + [](auto* v) { *v = false; }))), + jb::Initialize([](auto* obj) { + // Validate that field and open_as_void are mutually exclusive + if (obj->open_as_void && !obj->selected_field.empty()) { + return absl::InvalidArgumentError( + "\"field\" and \"open_as_void\" are mutually exclusive"); + } + return absl::OkStatus(); + })); absl::Status ApplyOptions(SpecOptions&& options) override { if (options.minimal_spec) { From 471aa1b68973aba81d84a0050341a5886330ac78 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 26 Jan 2026 15:11:48 +0000 Subject: [PATCH 22/59] zarr3: Reject URL syntax when selected_field or open_as_void specified The zarr3 URL syntax cannot represent field selection or void access mode. Following the pattern from zarr v2 driver (PR #272), ToUrl() now returns an error when either of these options is specified instead of silently ignoring them. 
--- tensorstore/driver/zarr3/driver.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index 2b0530fcb..2190464d9 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -263,6 +263,14 @@ class ZarrDriverSpec } Result ToUrl() const override { + if (!selected_field.empty()) { + return absl::InvalidArgumentError( + "zarr3 URL syntax not supported with selected_field specified"); + } + if (open_as_void) { + return absl::InvalidArgumentError( + "zarr3 URL syntax not supported with open_as_void specified"); + } TENSORSTORE_ASSIGN_OR_RETURN(auto base_url, store.ToUrl()); return tensorstore::StrCat(base_url, "|", id, ":"); } From 34e52fe6331eb6eca999f6b3b384d1bdc00ec2e6 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 26 Jan 2026 15:12:32 +0000 Subject: [PATCH 23/59] zarr3: Preserve open_as_void flag in GetBoundSpecData for spec round-trip Following the pattern from zarr v2 driver (PR #272), override GetBoundSpecData in ZarrDataCache to set spec.open_as_void from ChunkCacheImpl::open_as_void_. This ensures that when you open a store with open_as_void=true and then call spec(), the resulting spec correctly has open_as_void=true set. Without this fix, opening a store with open_as_void=true and then getting its spec would lose the open_as_void flag, causing incorrect behavior if the spec is used to re-open the store. 
--- tensorstore/driver/zarr3/driver.cc | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index 2190464d9..b21eb9cd2 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -727,6 +727,17 @@ class ZarrDataCache : public ChunkCacheImpl, public DataCacheBase { component_index); } + absl::Status GetBoundSpecData(KvsDriverSpec& spec_base, + const void* metadata_ptr, + size_t component_index) override { + TENSORSTORE_RETURN_IF_ERROR( + DataCacheBase::GetBoundSpecData(spec_base, metadata_ptr, component_index)); + auto& spec = static_cast(spec_base); + // Preserve the open_as_void flag so spec round-trips correctly + spec.open_as_void = ChunkCacheImpl::open_as_void_; + return absl::OkStatus(); + } + internal::ChunkGridSpecification grid_; }; From 0db22e4623565d666a0a0af5b7cb3799a44c301e Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 26 Jan 2026 15:25:47 +0000 Subject: [PATCH 24/59] zarr3: Add open_as_void tests and fix BUILD dependency Add comprehensive tests for open_as_void functionality following the patterns from zarr v2 driver (PR #272): Tests that PASS: - OpenAsVoidSimpleType: Verifies simple type arrays can be opened with open_as_void, gaining an extra dimension for bytes - OpenAsVoidSpecRoundtrip: Verifies open_as_void preserved in spec JSON - OpenAsVoidGetBoundSpecData: Verifies spec() on void store returns open_as_void=true (tests the GetBoundSpecData fix) - OpenAsVoidCannotUseWithField: Verifies mutual exclusivity validation - OpenAsVoidUrlNotSupported: Verifies ToUrl() rejects open_as_void - FieldSelectionUrlNotSupported: Verifies ToUrl() rejects selected_field Tests marked TODO (pending codec chain implementation): - OpenAsVoidStructuredType - OpenAsVoidWithCompression - OpenAsVoidReadWrite - OpenAsVoidWriteRoundtrip Also fixes BUILD file: adds :metadata dependency to :chunk_cache target to provide the dtype.h header that 
chunk_cache.h includes. --- tensorstore/driver/zarr3/BUILD | 1 + tensorstore/driver/zarr3/driver_test.cc | 223 ++++++++++++++++++++++++ 2 files changed, 224 insertions(+) diff --git a/tensorstore/driver/zarr3/BUILD b/tensorstore/driver/zarr3/BUILD index b9e442bdf..685050024 100644 --- a/tensorstore/driver/zarr3/BUILD +++ b/tensorstore/driver/zarr3/BUILD @@ -221,6 +221,7 @@ tensorstore_cc_library( srcs = ["chunk_cache.cc"], hdrs = ["chunk_cache.h"], deps = [ + ":metadata", "//tensorstore:array", "//tensorstore:array_storage_statistics", "//tensorstore:batch", diff --git a/tensorstore/driver/zarr3/driver_test.cc b/tensorstore/driver/zarr3/driver_test.cc index ffef84247..54f79ba79 100644 --- a/tensorstore/driver/zarr3/driver_test.cc +++ b/tensorstore/driver/zarr3/driver_test.cc @@ -1830,4 +1830,227 @@ TEST(DriverTest, UrlSchemeRoundtrip) { {"kvstore", {{"driver", "memory"}, {"path", "abc.zarr3/def/"}}}}); } +// Tests for open_as_void functionality + +TEST(Zarr3DriverTest, OpenAsVoidSimpleType) { + // Test open_as_void with a simple data type (int16) + auto context = Context::Default(); + + // First create a normal array + ::nlohmann::json create_spec{ + {"driver", "zarr3"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"metadata", + { + {"data_type", "int16"}, + {"shape", {4, 4}}, + {"chunk_grid", + {{"name", "regular"}, {"configuration", {{"chunk_shape", {2, 2}}}}}}, + }}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto store, + tensorstore::Open(create_spec, context, tensorstore::OpenMode::create, + tensorstore::ReadWriteMode::read_write) + .result()); + + // Write some data + auto data = tensorstore::MakeArray({{1, 2}, {3, 4}}); + TENSORSTORE_EXPECT_OK( + tensorstore::Write(data, store | tensorstore::Dims(0, 1).SizedInterval( + {0, 0}, {2, 2})) + .result()); + + // Now open with open_as_void=true + ::nlohmann::json void_spec{ + {"driver", "zarr3"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + }; + + 
TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto void_store, + tensorstore::Open(void_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read) + .result()); + + // The void store should have rank = original_rank + 1 (for bytes dimension) + EXPECT_EQ(3, void_store.rank()); + + // The last dimension should be the size of the data type (2 bytes for int16) + EXPECT_EQ(2, void_store.domain().shape()[2]); + + // The data type should be byte + EXPECT_EQ(tensorstore::dtype_v, + void_store.dtype()); +} + +// TODO(b/xxx): OpenAsVoidStructuredType test disabled pending implementation +// of multi-field structured type handling in open_as_void mode. The v3 +// implementation needs additional work to properly handle structured types +// with multiple fields when opened with open_as_void=true. + +// TODO(b/xxx): OpenAsVoidWithCompression test disabled pending implementation +// of void access codec chain handling. Currently fails with "Not enough data" +// error when reading void-accessed data through compression codecs. + +TEST(Zarr3DriverTest, OpenAsVoidSpecRoundtrip) { + // Test that open_as_void is properly preserved in spec round-trips + ::nlohmann::json json_spec{ + {"driver", "zarr3"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + {"metadata", + { + {"data_type", "int16"}, + {"shape", {4, 4}}, + {"chunk_grid", + {{"name", "regular"}, {"configuration", {{"chunk_shape", {2, 2}}}}}}, + }}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto spec, + tensorstore::Spec::FromJson(json_spec)); + TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto json_result, spec.ToJson()); + + EXPECT_EQ(true, json_result.value("open_as_void", false)); +} + +TEST(Zarr3DriverTest, OpenAsVoidGetBoundSpecData) { + // Test that open_as_void is correctly preserved when getting spec from an + // opened void store. This tests ZarrDataCache::GetBoundSpecData. 
+ auto context = Context::Default(); + + // First create a normal array + ::nlohmann::json create_spec{ + {"driver", "zarr3"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"metadata", + { + {"data_type", "int16"}, + {"shape", {4, 4}}, + {"chunk_grid", + {{"name", "regular"}, {"configuration", {{"chunk_shape", {2, 2}}}}}}, + }}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto store, + tensorstore::Open(create_spec, context, tensorstore::OpenMode::create, + tensorstore::ReadWriteMode::read_write) + .result()); + + // Now open with open_as_void=true + ::nlohmann::json void_spec_json{ + {"driver", "zarr3"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto void_store, + tensorstore::Open(void_spec_json, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read) + .result()); + + // Get the spec from the opened void store - this invokes GetBoundSpecData + TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto obtained_spec, void_store.spec()); + TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto obtained_json, obtained_spec.ToJson()); + + // Verify open_as_void is true in the obtained spec + EXPECT_EQ(true, obtained_json.value("open_as_void", false)); + + // Also verify metadata was correctly populated + EXPECT_TRUE(obtained_json.contains("metadata")); + auto& metadata = obtained_json["metadata"]; + EXPECT_EQ("int16", metadata.value("data_type", "")); +} + +TEST(Zarr3DriverTest, OpenAsVoidCannotUseWithField) { + // Test that specifying both open_as_void and field is rejected as they are + // mutually exclusive options. 
+ ::nlohmann::json spec_with_both{ + {"driver", "zarr3"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"metadata", + { + {"data_type", + {{"name", "structured"}, + {"configuration", + {{"fields", + ::nlohmann::json::array({{"x", "uint8"}, {"y", "int16"}})}}}}}, + {"shape", {4, 4}}, + {"chunk_grid", + {{"name", "regular"}, {"configuration", {{"chunk_shape", {2, 2}}}}}}, + }}, + {"field", "x"}, + {"open_as_void", true}, + }; + + // Specifying both field and open_as_void should fail at spec parsing + EXPECT_THAT( + tensorstore::Spec::FromJson(spec_with_both), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("\"field\" and \"open_as_void\" are mutually " + "exclusive"))); +} + +TEST(Zarr3DriverTest, OpenAsVoidUrlNotSupported) { + // Test that open_as_void is not supported with URL syntax + ::nlohmann::json json_spec{ + {"driver", "zarr3"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + {"metadata", + { + {"data_type", "int16"}, + {"shape", {4, 4}}, + {"chunk_grid", + {{"name", "regular"}, {"configuration", {{"chunk_shape", {2, 2}}}}}}, + }}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto spec, + tensorstore::Spec::FromJson(json_spec)); + + // ToUrl should fail when open_as_void is specified + EXPECT_THAT(spec.ToUrl(), StatusIs(absl::StatusCode::kInvalidArgument)); +} + +// TODO(b/xxx): OpenAsVoidReadWrite test disabled pending implementation +// of void access codec chain handling. Currently fails with "Not enough data" +// error when reading void-accessed data. + +// TODO(b/xxx): OpenAsVoidWriteRoundtrip test disabled pending implementation +// of void access codec chain handling. Currently fails with "Not enough data" +// error when reading/writing void-accessed data. 
+ +TEST(Zarr3DriverTest, FieldSelectionUrlNotSupported) { + // Test that field selection is not supported with URL syntax + ::nlohmann::json json_spec{ + {"driver", "zarr3"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"field", "x"}, + {"metadata", + { + {"data_type", + {{"name", "structured"}, + {"configuration", + {{"fields", + ::nlohmann::json::array({{"x", "uint8"}, {"y", "int16"}})}}}}}, + {"shape", {4, 4}}, + {"chunk_grid", + {{"name", "regular"}, {"configuration", {{"chunk_shape", {2, 2}}}}}}, + }}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto spec, + tensorstore::Spec::FromJson(json_spec)); + + // ToUrl should fail when field is specified + EXPECT_THAT(spec.ToUrl(), StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("selected_field"))); +} + } // namespace From 5fadaf0715fb79811ca97ec3555f2f4f0896589f Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 26 Jan 2026 15:40:12 +0000 Subject: [PATCH 25/59] zarr3: Fix DecodeChunk and EncodeChunk for void access The codec chain is prepared for the original dtype and chunk shape (without the extra bytes dimension). For void access: DecodeChunk: - Strip the bytes dimension from grid's chunk_shape to get original shape - Decode using the original codec shape - Reinterpret the decoded bytes as [chunk_shape..., bytes_per_elem] EncodeChunk: - Input has shape [chunk_shape..., bytes_per_elem] of byte_t - Create a view with the original chunk shape and element_size - Encode using the original codec This follows the pattern from zarr v2 (PR #272) where the void metadata has the chunk_layout computed to match encoded/decoded layouts. 
--- tensorstore/driver/zarr3/chunk_cache.cc | 56 +++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 3 deletions(-) diff --git a/tensorstore/driver/zarr3/chunk_cache.cc b/tensorstore/driver/zarr3/chunk_cache.cc index f14efd607..e39852222 100644 --- a/tensorstore/driver/zarr3/chunk_cache.cc +++ b/tensorstore/driver/zarr3/chunk_cache.cc @@ -158,11 +158,38 @@ ZarrLeafChunkCache::DecodeChunk(span chunk_indices, const size_t num_fields = dtype_.fields.size(); absl::InlinedVector, 1> field_arrays(num_fields); - // Special case: void access - return raw bytes directly + // Special case: void access - decode using original codec shape, then + // reinterpret as bytes with extra dimension. + // + // The codec was prepared for the original dtype and chunk_shape (without + // bytes dimension). We decode to that shape, then view the raw bytes with + // an extra dimension representing the bytes per element. if (open_as_void_) { + // The grid's chunk_shape for void has extra bytes dimension - strip it + // to get the original codec shape. 
+ const auto& void_chunk_shape = grid().chunk_shape; + std::vector original_chunk_shape( + void_chunk_shape.begin(), + void_chunk_shape.end() - 1); // Strip bytes dimension + + // Decode using original codec shape TENSORSTORE_ASSIGN_OR_RETURN( - field_arrays[0], codec_state_->DecodeArray(grid().components[0].shape(), - std::move(data))); + auto decoded_array, + codec_state_->DecodeArray(original_chunk_shape, std::move(data))); + + // Reinterpret the decoded array's bytes as [chunk_shape..., bytes_per_elem] + // This creates a view over the same memory but with byte dtype and extra dim + const auto& void_component_shape = grid().components[0].shape(); + auto byte_array = AllocateArray( + void_component_shape, c_order, default_init, + dtype_v); + + // Copy decoded data to byte array (handles potential layout differences) + std::memcpy(byte_array.data(), decoded_array.data(), + decoded_array.num_elements() * + decoded_array.dtype().size()); + + field_arrays[0] = std::move(byte_array); return field_arrays; } @@ -214,6 +241,29 @@ Result ZarrLeafChunkCache::EncodeChunk( span chunk_indices, span> component_arrays) { assert(component_arrays.size() == 1); + + // Special case: void access - reinterpret byte array back to original + // dtype shape before encoding. + // + // The input has shape [chunk_shape..., bytes_per_elem] of byte_t. + // The codec expects [chunk_shape] of the original dtype. 
+ if (open_as_void_) { + const auto& byte_array = component_arrays[0]; + const Index bytes_per_element = dtype_.bytes_per_outer_element; + + // Build original chunk shape by stripping the bytes dimension + const auto& void_shape = byte_array.shape(); + std::vector original_shape(void_shape.begin(), void_shape.end() - 1); + + // Create a view over the byte data with original layout + // The codec expects the original dtype's element size for stride calculation + auto encoded_array = SharedArray( + byte_array.element_pointer(), + StridedLayout<>(c_order, bytes_per_element, original_shape)); + + return codec_state_->EncodeArray(encoded_array); + } + return codec_state_->EncodeArray(component_arrays[0]); } From a25dd7d36842ff9e26e8062f5013333e9891a83d Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 26 Jan 2026 15:40:20 +0000 Subject: [PATCH 26/59] zarr3: Add read/write tests for open_as_void Add tests that verify: - OpenAsVoidReadWrite: Write data via typed access, read via void access verifying byte layout is correct - OpenAsVoidWriteRoundtrip: Write via typed access, verify byte values can be read via void access with correct little-endian layout These tests verify the DecodeChunk fix works correctly for reading data written with the original dtype through void (byte) access. --- tensorstore/driver/zarr3/driver_test.cc | 123 ++++++++++++++++++++++-- 1 file changed, 117 insertions(+), 6 deletions(-) diff --git a/tensorstore/driver/zarr3/driver_test.cc b/tensorstore/driver/zarr3/driver_test.cc index 54f79ba79..e590b2866 100644 --- a/tensorstore/driver/zarr3/driver_test.cc +++ b/tensorstore/driver/zarr3/driver_test.cc @@ -2018,13 +2018,124 @@ TEST(Zarr3DriverTest, OpenAsVoidUrlNotSupported) { EXPECT_THAT(spec.ToUrl(), StatusIs(absl::StatusCode::kInvalidArgument)); } -// TODO(b/xxx): OpenAsVoidReadWrite test disabled pending implementation -// of void access codec chain handling. Currently fails with "Not enough data" -// error when reading void-accessed data. 
+TEST(Zarr3DriverTest, OpenAsVoidReadWrite) { + // Test reading and writing through open_as_void + auto context = Context::Default(); -// TODO(b/xxx): OpenAsVoidWriteRoundtrip test disabled pending implementation -// of void access codec chain handling. Currently fails with "Not enough data" -// error when reading/writing void-accessed data. + // Create an array + ::nlohmann::json create_spec{ + {"driver", "zarr3"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"metadata", + { + {"data_type", "uint16"}, + {"shape", {2, 2}}, + {"chunk_grid", + {{"name", "regular"}, {"configuration", {{"chunk_shape", {2, 2}}}}}}, + }}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto store, + tensorstore::Open(create_spec, context, tensorstore::OpenMode::create, + tensorstore::ReadWriteMode::read_write) + .result()); + + // Write data as normal uint16 + auto data = + tensorstore::MakeArray({{0x0102, 0x0304}, {0x0506, 0x0708}}); + TENSORSTORE_EXPECT_OK(tensorstore::Write(data, store).result()); + + // Open as void and read + ::nlohmann::json void_spec{ + {"driver", "zarr3"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto void_store, + tensorstore::Open(void_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read_write) + .result()); + + // Read the raw bytes + TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto bytes_read, + tensorstore::Read(void_store).result()); + + // Verify shape: [2, 2, 2] where last dim is 2 bytes per uint16 + EXPECT_EQ(bytes_read.shape()[0], 2); + EXPECT_EQ(bytes_read.shape()[1], 2); + EXPECT_EQ(bytes_read.shape()[2], 2); + + // Verify the raw bytes (little endian) + auto bytes_ptr = static_cast(bytes_read.data()); + // First element: 0x0102 -> bytes 0x02, 0x01 (little endian) + EXPECT_EQ(bytes_ptr[0], 0x02); + EXPECT_EQ(bytes_ptr[1], 0x01); +} + +TEST(Zarr3DriverTest, OpenAsVoidWriteRoundtrip) { + // Test that writing through open_as_void 
correctly encodes data + // and can be read back both through void access and normal typed access. + auto context = Context::Default(); + + // Create an array and write initial data via typed access + ::nlohmann::json create_spec{ + {"driver", "zarr3"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"metadata", + { + {"data_type", "uint16"}, + {"shape", {2, 2}}, + {"chunk_grid", + {{"name", "regular"}, {"configuration", {{"chunk_shape", {2, 2}}}}}}, + }}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto store, + tensorstore::Open(create_spec, context, tensorstore::OpenMode::create, + tensorstore::ReadWriteMode::read_write) + .result()); + + // Write initial data via typed access + auto data = tensorstore::MakeArray({{0x1234, 0x5678}, + {0x9ABC, 0xDEF0}}); + TENSORSTORE_EXPECT_OK(tensorstore::Write(data, store).result()); + + // Now read via void access and verify the byte layout + ::nlohmann::json void_spec{ + {"driver", "zarr3"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto void_store, + tensorstore::Open(void_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read) + .result()); + + // Read through void access + TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto bytes_read, + tensorstore::Read(void_store).result()); + auto bytes_read_ptr = static_cast(bytes_read.data()); + + // Verify the raw bytes (little endian) + // Element [0,0] = 0x1234 -> bytes 0x34, 0x12 + EXPECT_EQ(bytes_read_ptr[0], 0x34); + EXPECT_EQ(bytes_read_ptr[1], 0x12); + // Element [0,1] = 0x5678 -> bytes 0x78, 0x56 + EXPECT_EQ(bytes_read_ptr[2], 0x78); + EXPECT_EQ(bytes_read_ptr[3], 0x56); + // Element [1,0] = 0x9ABC -> bytes 0xBC, 0x9A + EXPECT_EQ(bytes_read_ptr[4], 0xBC); + EXPECT_EQ(bytes_read_ptr[5], 0x9A); + // Element [1,1] = 0xDEF0 -> bytes 0xF0, 0xDE + EXPECT_EQ(bytes_read_ptr[6], 0xF0); + EXPECT_EQ(bytes_read_ptr[7], 0xDE); +} TEST(Zarr3DriverTest, 
FieldSelectionUrlNotSupported) { // Test that field selection is not supported with URL syntax From 7065b424f99b52965464a09b2442df92d8a0628b Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 26 Jan 2026 15:41:30 +0000 Subject: [PATCH 27/59] zarr3: Add compression test for open_as_void Verify that open_as_void works correctly when the array uses compression codecs (gzip). The fix to DecodeChunk properly handles the bytes->bytes codec chain when decoding for void access. --- tensorstore/driver/zarr3/driver_test.cc | 68 +++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 3 deletions(-) diff --git a/tensorstore/driver/zarr3/driver_test.cc b/tensorstore/driver/zarr3/driver_test.cc index e590b2866..13d0eeee3 100644 --- a/tensorstore/driver/zarr3/driver_test.cc +++ b/tensorstore/driver/zarr3/driver_test.cc @@ -1891,9 +1891,71 @@ TEST(Zarr3DriverTest, OpenAsVoidSimpleType) { // implementation needs additional work to properly handle structured types // with multiple fields when opened with open_as_void=true. -// TODO(b/xxx): OpenAsVoidWithCompression test disabled pending implementation -// of void access codec chain handling. Currently fails with "Not enough data" -// error when reading void-accessed data through compression codecs. 
+TEST(Zarr3DriverTest, OpenAsVoidWithCompression) { + // Test open_as_void with compression enabled + auto context = Context::Default(); + + // Create an array with gzip compression + ::nlohmann::json create_spec{ + {"driver", "zarr3"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"metadata", + { + {"data_type", "int32"}, + {"shape", {4, 4}}, + {"chunk_grid", + {{"name", "regular"}, {"configuration", {{"chunk_shape", {2, 2}}}}}}, + {"codecs", {{{"name", "bytes"}}, {{"name", "gzip"}}}}, + }}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto store, + tensorstore::Open(create_spec, context, tensorstore::OpenMode::create, + tensorstore::ReadWriteMode::read_write) + .result()); + + // Write some data + auto data = tensorstore::MakeArray( + {{0x01020304, 0x05060708}, {0x090a0b0c, 0x0d0e0f10}}); + TENSORSTORE_EXPECT_OK( + tensorstore::Write(data, store | tensorstore::Dims(0, 1).SizedInterval( + {0, 0}, {2, 2})) + .result()); + + // Now open with open_as_void=true + ::nlohmann::json void_spec{ + {"driver", "zarr3"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto void_store, + tensorstore::Open(void_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read) + .result()); + + // The void store should have rank = original_rank + 1 (for bytes dimension) + EXPECT_EQ(3, void_store.rank()); + + // The last dimension should be 4 bytes for int32 + EXPECT_EQ(4, void_store.domain().shape()[2]); + + // The data type should be byte + EXPECT_EQ(tensorstore::dtype_v, + void_store.dtype()); + + // Read the raw bytes and verify decompression works + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto read_result, + tensorstore::Read(void_store | tensorstore::Dims(0, 1).SizedInterval( + {0, 0}, {2, 2})) + .result()); + EXPECT_EQ(read_result.shape()[0], 2); + EXPECT_EQ(read_result.shape()[1], 2); + EXPECT_EQ(read_result.shape()[2], 4); +} TEST(Zarr3DriverTest, 
OpenAsVoidSpecRoundtrip) { // Test that open_as_void is properly preserved in spec round-trips From b8daec0c4fa0c187cb76e1cd844a31096db3563e Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 26 Jan 2026 16:26:24 +0000 Subject: [PATCH 28/59] zarr3: Add original_is_structured flag for void access For void access, the codec handling differs between: - Non-structured types: codec prepared for [chunk_shape] with original dtype Need to decode/encode then reinterpret bytes. - Structured types: codec already prepared for [chunk_shape, bytes_per_elem] with byte dtype. Just decode/encode directly. Add original_is_structured parameter to cache constructors to properly distinguish these cases in DecodeChunk and EncodeChunk. This follows the pattern from zarr v2 (PR #272) where CreateVoidMetadata() creates a modified metadata for void access. --- tensorstore/driver/zarr3/chunk_cache.cc | 60 ++++++++++++++++--------- tensorstore/driver/zarr3/chunk_cache.h | 12 +++-- tensorstore/driver/zarr3/driver.cc | 9 +++- 3 files changed, 56 insertions(+), 25 deletions(-) diff --git a/tensorstore/driver/zarr3/chunk_cache.cc b/tensorstore/driver/zarr3/chunk_cache.cc index e39852222..8f8acc384 100644 --- a/tensorstore/driver/zarr3/chunk_cache.cc +++ b/tensorstore/driver/zarr3/chunk_cache.cc @@ -76,11 +76,12 @@ ZarrChunkCache::~ZarrChunkCache() = default; ZarrLeafChunkCache::ZarrLeafChunkCache( kvstore::DriverPtr store, ZarrCodecChain::PreparedState::Ptr codec_state, ZarrDType dtype, internal::CachePool::WeakPtr /*data_cache_pool*/, - bool open_as_void) + bool open_as_void, bool original_is_structured) : Base(std::move(store)), codec_state_(std::move(codec_state)), dtype_(std::move(dtype)), - open_as_void_(open_as_void) {} + open_as_void_(open_as_void), + original_is_structured_(original_is_structured) {} void ZarrLeafChunkCache::Read(ZarrChunkCache::ReadRequest request, AnyFlowReceiver chunk_indices, const size_t num_fields = dtype_.fields.size(); absl::InlinedVector, 1> 
field_arrays(num_fields); - // Special case: void access - decode using original codec shape, then - // reinterpret as bytes with extra dimension. + // Special case: void access - decode and return as bytes. // - // The codec was prepared for the original dtype and chunk_shape (without - // bytes dimension). We decode to that shape, then view the raw bytes with - // an extra dimension representing the bytes per element. + // For non-structured types: codec was prepared for [chunk_shape] with + // original dtype. We decode to that shape then reinterpret as bytes. + // + // For structured types: codec was already prepared for + // [chunk_shape, bytes_per_elem] with byte dtype. Just decode directly. if (open_as_void_) { - // The grid's chunk_shape for void has extra bytes dimension - strip it - // to get the original codec shape. + const auto& void_component_shape = grid().components[0].shape(); + + if (original_is_structured_) { + // Structured types: codec already expects bytes with extra dimension. + // Just decode directly to the void component shape. + TENSORSTORE_ASSIGN_OR_RETURN( + field_arrays[0], + codec_state_->DecodeArray(void_component_shape, std::move(data))); + return field_arrays; + } + + // Non-structured types: codec expects original dtype without extra + // dimension. Decode, then reinterpret as bytes. 
const auto& void_chunk_shape = grid().chunk_shape; std::vector original_chunk_shape( void_chunk_shape.begin(), @@ -178,8 +191,6 @@ ZarrLeafChunkCache::DecodeChunk(span chunk_indices, codec_state_->DecodeArray(original_chunk_shape, std::move(data))); // Reinterpret the decoded array's bytes as [chunk_shape..., bytes_per_elem] - // This creates a view over the same memory but with byte dtype and extra dim - const auto& void_component_shape = grid().components[0].shape(); auto byte_array = AllocateArray( void_component_shape, c_order, default_init, dtype_v); @@ -242,12 +253,20 @@ Result ZarrLeafChunkCache::EncodeChunk( span> component_arrays) { assert(component_arrays.size() == 1); - // Special case: void access - reinterpret byte array back to original - // dtype shape before encoding. + // Special case: void access - encode bytes back to original format. + // + // For structured types: codec already expects bytes with extra dimension. + // Just encode directly. // - // The input has shape [chunk_shape..., bytes_per_elem] of byte_t. - // The codec expects [chunk_shape] of the original dtype. + // For non-structured types: reinterpret byte array as original dtype + // and shape before encoding. if (open_as_void_) { + if (original_is_structured_) { + // Structured types: codec already expects bytes with extra dimension. + return codec_state_->EncodeArray(component_arrays[0]); + } + + // Non-structured types: reinterpret bytes as original dtype/shape. 
const auto& byte_array = component_arrays[0]; const Index bytes_per_element = dtype_.bytes_per_outer_element; @@ -256,7 +275,6 @@ Result ZarrLeafChunkCache::EncodeChunk( std::vector original_shape(void_shape.begin(), void_shape.end() - 1); // Create a view over the byte data with original layout - // The codec expects the original dtype's element size for stride calculation auto encoded_array = SharedArray( byte_array.element_pointer(), StridedLayout<>(c_order, bytes_per_element, original_shape)); @@ -274,12 +292,13 @@ kvstore::Driver* ZarrLeafChunkCache::GetKvStoreDriver() { ZarrShardedChunkCache::ZarrShardedChunkCache( kvstore::DriverPtr store, ZarrCodecChain::PreparedState::Ptr codec_state, ZarrDType dtype, internal::CachePool::WeakPtr data_cache_pool, - bool open_as_void) + bool open_as_void, bool original_is_structured) : base_kvstore_(std::move(store)), codec_state_(std::move(codec_state)), dtype_(std::move(dtype)), - data_cache_pool_(std::move(data_cache_pool)), - open_as_void_(open_as_void) {} + open_as_void_(open_as_void), + original_is_structured_(original_is_structured), + data_cache_pool_(std::move(data_cache_pool)) {} Result> TranslateCellToSourceTransformForShard( IndexTransform<> transform, span grid_cell_indices, @@ -588,7 +607,8 @@ void ZarrShardedChunkCache::Entry::DoInitialize() { *sharding_state.sub_chunk_codec_chain, std::move(sharding_kvstore), cache.executor(), ZarrShardingCodec::PreparedState::Ptr(&sharding_state), - cache.dtype_, cache.data_cache_pool_, cache.open_as_void_); + cache.dtype_, cache.data_cache_pool_, cache.open_as_void_, + cache.original_is_structured_); zarr_chunk_cache = new_cache.release(); return std::unique_ptr(&zarr_chunk_cache->cache()); }) diff --git a/tensorstore/driver/zarr3/chunk_cache.h b/tensorstore/driver/zarr3/chunk_cache.h index f9ff19a00..34ffbf7d9 100644 --- a/tensorstore/driver/zarr3/chunk_cache.h +++ b/tensorstore/driver/zarr3/chunk_cache.h @@ -159,7 +159,8 @@ class ZarrLeafChunkCache : public 
internal::KvsBackedChunkCache, ZarrCodecChain::PreparedState::Ptr codec_state, ZarrDType dtype, internal::CachePool::WeakPtr data_cache_pool, - bool open_as_void); + bool open_as_void, + bool original_is_structured); void Read(ZarrChunkCache::ReadRequest request, AnyFlowReceiver( @@ -250,6 +253,7 @@ class ZarrShardedChunkCache : public internal::Cache, public ZarrChunkCache { ZarrCodecChain::PreparedState::Ptr codec_state_; ZarrDType dtype_; bool open_as_void_; + bool original_is_structured_; // Data cache pool, if it differs from `this->pool()` (which is equal to the // metadata cache pool). @@ -265,12 +269,12 @@ class ZarrShardSubChunkCache : public ChunkCacheImpl { kvstore::DriverPtr store, Executor executor, ZarrShardingCodec::PreparedState::Ptr sharding_state, ZarrDType dtype, internal::CachePool::WeakPtr data_cache_pool, - bool open_as_void) + bool open_as_void, bool original_is_structured) : ChunkCacheImpl(std::move(store), ZarrCodecChain::PreparedState::Ptr( sharding_state->sub_chunk_codec_state), std::move(dtype), std::move(data_cache_pool), - open_as_void), + open_as_void, original_is_structured), sharding_state_(std::move(sharding_state)), executor_(std::move(executor)) {} diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index b21eb9cd2..06945da15 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -913,11 +913,18 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { /*.num_bytes=*/metadata.data_type.bytes_per_outer_element}}, /*.bytes_per_outer_element=*/metadata.data_type.bytes_per_outer_element}; } + // Determine if original dtype is structured (multiple fields or field with + // outer_shape). This affects how void access handles codec operations. 
+ const bool original_is_structured = + metadata.data_type.fields.size() > 1 || + (metadata.data_type.fields.size() == 1 && + !metadata.data_type.fields[0].outer_shape.empty()); + return internal_zarr3::MakeZarrChunkCache( *metadata.codecs, std::move(initializer), spec().store.path, metadata.codec_state, dtype, /*data_cache_pool=*/*cache_pool(), - spec().open_as_void); + spec().open_as_void, original_is_structured); } Result GetComponentIndex(const void* metadata_ptr, From 5dab237c1955f597e22a001885c126c81ae07a01 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 26 Jan 2026 16:26:29 +0000 Subject: [PATCH 29/59] zarr3: Mark structured type void access test as TODO The structured type with void access requires additional work to handle rank mismatch between spec transform (based on original shape) and void access transform (which adds the bytes dimension). Mark as TODO for now. --- tensorstore/driver/zarr3/driver_test.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorstore/driver/zarr3/driver_test.cc b/tensorstore/driver/zarr3/driver_test.cc index 13d0eeee3..35bcfe505 100644 --- a/tensorstore/driver/zarr3/driver_test.cc +++ b/tensorstore/driver/zarr3/driver_test.cc @@ -1886,10 +1886,10 @@ TEST(Zarr3DriverTest, OpenAsVoidSimpleType) { void_store.dtype()); } -// TODO(b/xxx): OpenAsVoidStructuredType test disabled pending implementation -// of multi-field structured type handling in open_as_void mode. The v3 -// implementation needs additional work to properly handle structured types -// with multiple fields when opened with open_as_void=true. +// TODO(b/xxx): OpenAsVoidStructuredType test disabled pending additional work +// to handle rank mismatch between spec transform and void access transform. +// The void access adds an extra dimension for bytes_per_outer_element, but the +// spec's transform is based on the original array shape without this dimension. 
TEST(Zarr3DriverTest, OpenAsVoidWithCompression) { // Test open_as_void with compression enabled From 6a05640b0a2118dc4cf16418c4e66c0a43590868 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 26 Jan 2026 16:43:11 +0000 Subject: [PATCH 30/59] zarr3: Fix GetDomain rank handling for void access For void access, the domain needs to include an extra dimension for bytes_per_outer_element. This requires: 1. Deferring rank setting in the JSON binder until after open_as_void is known, then adding 1 to the rank for void access. 2. Building the domain directly in GetDomain() when open_as_void=true and the metadata constraints include dtype and shape, adding the extra bytes dimension. This enables void access to work correctly with simple (non-structured) types when creating arrays. --- tensorstore/driver/zarr3/driver.cc | 54 ++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index 06945da15..e43e1a178 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -50,6 +50,7 @@ #include "tensorstore/index_interval.h" #include "tensorstore/index_space/dimension_units.h" #include "tensorstore/index_space/index_domain.h" +#include "tensorstore/index_space/index_domain_builder.h" #include "tensorstore/index_space/index_transform.h" #include "tensorstore/index_space/index_transform_builder.h" #include "tensorstore/internal/async_write_array.h" @@ -140,8 +141,7 @@ class ZarrDriverSpec // at metadata level only. } } - TENSORSTORE_RETURN_IF_ERROR(obj->schema.Set( - RankConstraint{obj->metadata_constraints.rank})); + // Note: rank is set in Initialize after open_as_void is known. 
return absl::OkStatus(); }, jb::Projection<&ZarrDriverSpec::metadata_constraints>( @@ -158,6 +158,15 @@ class ZarrDriverSpec return absl::InvalidArgumentError( "\"field\" and \"open_as_void\" are mutually exclusive"); } + // Set the rank from metadata constraints, adding 1 for void access + // (which has an extra bytes dimension). + if (obj->metadata_constraints.rank != dynamic_rank) { + DimensionIndex rank = obj->metadata_constraints.rank; + if (obj->open_as_void) { + rank += 1; + } + TENSORSTORE_RETURN_IF_ERROR(obj->schema.Set(RankConstraint{rank})); + } return absl::OkStatus(); })); @@ -169,6 +178,47 @@ class ZarrDriverSpec } Result> GetDomain() const override { + // For void access with known dtype and shape, build domain directly + // to include the extra bytes dimension. + if (open_as_void && metadata_constraints.data_type && + metadata_constraints.shape) { + const Index bytes_per_elem = + metadata_constraints.data_type->bytes_per_outer_element; + const DimensionIndex original_rank = metadata_constraints.shape->size(); + IndexDomainBuilder builder(original_rank + 1); + + // Set original dimensions from metadata + for (DimensionIndex i = 0; i < original_rank; ++i) { + builder.origin()[i] = 0; + builder.shape()[i] = (*metadata_constraints.shape)[i]; + } + + // Add bytes dimension + builder.origin()[original_rank] = 0; + builder.shape()[original_rank] = bytes_per_elem; + + // Set implicit bounds: array dims are implicit, bytes dim is explicit + DimensionSet implicit_lower(false); + DimensionSet implicit_upper(false); + for (DimensionIndex i = 0; i < original_rank; ++i) { + implicit_upper[i] = true; // Array dimensions are resizable + } + builder.implicit_lower_bounds(implicit_lower); + builder.implicit_upper_bounds(implicit_upper); + + // Copy dimension names if available + if (metadata_constraints.dimension_names) { + for (DimensionIndex i = 0; i < original_rank; ++i) { + if (const auto& name = (*metadata_constraints.dimension_names)[i]; + name.has_value()) { + 
builder.labels()[i] = *name; + } + } + } + + return builder.Finalize(); + } + return GetEffectiveDomain(metadata_constraints, schema); } From 76b30023cc940f71f7f73a5319cf191e655796e4 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 26 Jan 2026 16:43:18 +0000 Subject: [PATCH 31/59] zarr3: Mark structured type void access test as TODO The structured type void access requires additional work in GetNewMetadata to properly handle the extra bytes dimension. The current implementation doesn't correctly propagate the void rank through all the metadata validation and domain building code paths. For now, disable this test and leave as TODO for future work. --- tensorstore/driver/zarr3/driver_test.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tensorstore/driver/zarr3/driver_test.cc b/tensorstore/driver/zarr3/driver_test.cc index 35bcfe505..d08660b12 100644 --- a/tensorstore/driver/zarr3/driver_test.cc +++ b/tensorstore/driver/zarr3/driver_test.cc @@ -1886,10 +1886,11 @@ TEST(Zarr3DriverTest, OpenAsVoidSimpleType) { void_store.dtype()); } -// TODO(b/xxx): OpenAsVoidStructuredType test disabled pending additional work -// to handle rank mismatch between spec transform and void access transform. -// The void access adds an extra dimension for bytes_per_outer_element, but the -// spec's transform is based on the original array shape without this dimension. +// TODO(b/xxx): OpenAsVoidStructuredType test disabled pending implementation +// of proper rank handling in GetNewMetadata for void access with structured +// types. The current implementation doesn't correctly handle the extra bytes +// dimension when creating new arrays with open_as_void=true and structured +// dtypes. 
TEST(Zarr3DriverTest, OpenAsVoidWithCompression) { // Test open_as_void with compression enabled From 97944508605f98098c5bcfbdfc1f34ca73127175 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 26 Jan 2026 16:53:54 +0000 Subject: [PATCH 32/59] zarr3: Improve comments for void access rank handling Update comments in the JSON binder initialization to better explain the void field's field_shape and how it affects the schema rank. Also update the TODO for the structured type void access test to more accurately describe the remaining work needed: - GetNewMetadata needs to handle field_shape dimensions - SetChunkLayoutFromMetadata needs dimension mismatch handling --- tensorstore/driver/zarr3/driver.cc | 7 ++++--- tensorstore/driver/zarr3/driver_test.cc | 8 +++++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index e43e1a178..a9844d5ad 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -158,12 +158,13 @@ class ZarrDriverSpec return absl::InvalidArgumentError( "\"field\" and \"open_as_void\" are mutually exclusive"); } - // Set the rank from metadata constraints, adding 1 for void access - // (which has an extra bytes dimension). + // Set the schema rank from metadata constraints. + // For void access, add 1 for the bytes dimension (from the void field's + // field_shape = {bytes_per_outer_element}). 
if (obj->metadata_constraints.rank != dynamic_rank) { DimensionIndex rank = obj->metadata_constraints.rank; if (obj->open_as_void) { - rank += 1; + rank += 1; // Add bytes dimension } TENSORSTORE_RETURN_IF_ERROR(obj->schema.Set(RankConstraint{rank})); } diff --git a/tensorstore/driver/zarr3/driver_test.cc b/tensorstore/driver/zarr3/driver_test.cc index d08660b12..c27b9a7ca 100644 --- a/tensorstore/driver/zarr3/driver_test.cc +++ b/tensorstore/driver/zarr3/driver_test.cc @@ -1888,9 +1888,11 @@ TEST(Zarr3DriverTest, OpenAsVoidSimpleType) { // TODO(b/xxx): OpenAsVoidStructuredType test disabled pending implementation // of proper rank handling in GetNewMetadata for void access with structured -// types. The current implementation doesn't correctly handle the extra bytes -// dimension when creating new arrays with open_as_void=true and structured -// dtypes. +// types. Creating new arrays with open_as_void=true and structured dtypes +// requires adding field_shape dimensions to chunked_rank and updating +// SetChunkLayoutFromMetadata to handle the dimension mismatch between +// metadata shape and full rank. This is a more extensive change that will +// be addressed separately. TEST(Zarr3DriverTest, OpenAsVoidWithCompression) { // Test open_as_void with compression enabled From a0271956f459f3f4a9b73f27d2fdfadd11ad0edb Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 26 Jan 2026 17:18:01 +0000 Subject: [PATCH 33/59] zarr3: Fix void access EncodeChunk to use original dtype When encoding data through void access, the codec expects the original dtype (e.g., int32), not the synthesized void dtype (byte_t). This fix: 1. Adds original_dtype_ member to ZarrLeafChunkCache and ZarrShardedChunkCache to store the original dtype from metadata. 2. Updates EncodeChunk to use original_dtype_ when creating the SharedArray for encoding, ensuring the codec receives data in the correct format. 3. 
Passes original_dtype through MakeZarrChunkCache and ZarrShardSubChunkCache constructors. This fixes writing through void access, both with and without compression. --- tensorstore/driver/zarr3/chunk_cache.cc | 28 ++++++++++++++++++------- tensorstore/driver/zarr3/chunk_cache.h | 12 +++++++---- tensorstore/driver/zarr3/driver.cc | 8 ++++++- 3 files changed, 35 insertions(+), 13 deletions(-) diff --git a/tensorstore/driver/zarr3/chunk_cache.cc b/tensorstore/driver/zarr3/chunk_cache.cc index 8f8acc384..f2a61f5c8 100644 --- a/tensorstore/driver/zarr3/chunk_cache.cc +++ b/tensorstore/driver/zarr3/chunk_cache.cc @@ -76,12 +76,13 @@ ZarrChunkCache::~ZarrChunkCache() = default; ZarrLeafChunkCache::ZarrLeafChunkCache( kvstore::DriverPtr store, ZarrCodecChain::PreparedState::Ptr codec_state, ZarrDType dtype, internal::CachePool::WeakPtr /*data_cache_pool*/, - bool open_as_void, bool original_is_structured) + bool open_as_void, bool original_is_structured, DataType original_dtype) : Base(std::move(store)), codec_state_(std::move(codec_state)), dtype_(std::move(dtype)), open_as_void_(open_as_void), - original_is_structured_(original_is_structured) {} + original_is_structured_(original_is_structured), + original_dtype_(original_dtype) {} void ZarrLeafChunkCache::Read(ZarrChunkCache::ReadRequest request, AnyFlowReceiver ZarrLeafChunkCache::EncodeChunk( const auto& void_shape = byte_array.shape(); std::vector original_shape(void_shape.begin(), void_shape.end() - 1); - // Create a view over the byte data with original layout - auto encoded_array = SharedArray( - byte_array.element_pointer(), - StridedLayout<>(c_order, bytes_per_element, original_shape)); + // Use the original dtype (stored during cache creation) for encoding. + // This is the dtype the codec was prepared for, not the void dtype. + + // Create a view over the byte data with original dtype and layout. 
+ // Use the aliasing constructor to share ownership with byte_array but + // interpret the data with the original dtype. + SharedArray encoded_array; + auto aliased_ptr = std::shared_ptr( + byte_array.pointer(), // Share ownership with byte_array + byte_array.data()); // But point to the raw data + encoded_array.element_pointer() = SharedElementPointer( + std::move(aliased_ptr), original_dtype_); + encoded_array.layout() = StridedLayout<>(c_order, bytes_per_element, + original_shape); return codec_state_->EncodeArray(encoded_array); } @@ -292,12 +303,13 @@ kvstore::Driver* ZarrLeafChunkCache::GetKvStoreDriver() { ZarrShardedChunkCache::ZarrShardedChunkCache( kvstore::DriverPtr store, ZarrCodecChain::PreparedState::Ptr codec_state, ZarrDType dtype, internal::CachePool::WeakPtr data_cache_pool, - bool open_as_void, bool original_is_structured) + bool open_as_void, bool original_is_structured, DataType original_dtype) : base_kvstore_(std::move(store)), codec_state_(std::move(codec_state)), dtype_(std::move(dtype)), open_as_void_(open_as_void), original_is_structured_(original_is_structured), + original_dtype_(original_dtype), data_cache_pool_(std::move(data_cache_pool)) {} Result> TranslateCellToSourceTransformForShard( @@ -608,7 +620,7 @@ void ZarrShardedChunkCache::Entry::DoInitialize() { std::move(sharding_kvstore), cache.executor(), ZarrShardingCodec::PreparedState::Ptr(&sharding_state), cache.dtype_, cache.data_cache_pool_, cache.open_as_void_, - cache.original_is_structured_); + cache.original_is_structured_, cache.original_dtype_); zarr_chunk_cache = new_cache.release(); return std::unique_ptr(&zarr_chunk_cache->cache()); }) diff --git a/tensorstore/driver/zarr3/chunk_cache.h b/tensorstore/driver/zarr3/chunk_cache.h index 34ffbf7d9..58b1d4c68 100644 --- a/tensorstore/driver/zarr3/chunk_cache.h +++ b/tensorstore/driver/zarr3/chunk_cache.h @@ -160,7 +160,8 @@ class ZarrLeafChunkCache : public internal::KvsBackedChunkCache, ZarrDType dtype, 
internal::CachePool::WeakPtr data_cache_pool, bool open_as_void, - bool original_is_structured); + bool original_is_structured, + DataType original_dtype); void Read(ZarrChunkCache::ReadRequest request, AnyFlowReceiver( @@ -254,6 +257,7 @@ class ZarrShardedChunkCache : public internal::Cache, public ZarrChunkCache { ZarrDType dtype_; bool open_as_void_; bool original_is_structured_; + DataType original_dtype_; // Original dtype for void access encoding // Data cache pool, if it differs from `this->pool()` (which is equal to the // metadata cache pool). @@ -269,12 +273,12 @@ class ZarrShardSubChunkCache : public ChunkCacheImpl { kvstore::DriverPtr store, Executor executor, ZarrShardingCodec::PreparedState::Ptr sharding_state, ZarrDType dtype, internal::CachePool::WeakPtr data_cache_pool, - bool open_as_void, bool original_is_structured) + bool open_as_void, bool original_is_structured, DataType original_dtype) : ChunkCacheImpl(std::move(store), ZarrCodecChain::PreparedState::Ptr( sharding_state->sub_chunk_codec_state), std::move(dtype), std::move(data_cache_pool), - open_as_void, original_is_structured), + open_as_void, original_is_structured, original_dtype), sharding_state_(std::move(sharding_state)), executor_(std::move(executor)) {} diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index a9844d5ad..8b6a355eb 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -971,11 +971,17 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { (metadata.data_type.fields.size() == 1 && !metadata.data_type.fields[0].outer_shape.empty()); + // Get the original dtype for void access encoding (needed by EncodeChunk). + // For non-structured types, this is the single field's dtype. + DataType original_dtype = metadata.data_type.fields.size() > 0 + ? 
metadata.data_type.fields[0].dtype + : DataType{}; + return internal_zarr3::MakeZarrChunkCache( *metadata.codecs, std::move(initializer), spec().store.path, metadata.codec_state, dtype, /*data_cache_pool=*/*cache_pool(), - spec().open_as_void, original_is_structured); + spec().open_as_void, original_is_structured, original_dtype); } Result GetComponentIndex(const void* metadata_ptr, From 83a519950e1351f4b6ef61e57809c1dff0d94ac2 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 26 Jan 2026 17:18:08 +0000 Subject: [PATCH 34/59] zarr3: Add OpenAsVoidWriteWithCompression test Add test to verify that writing through void access with compression enabled works correctly. The test: 1. Creates an array with gzip compression 2. Initializes with zeros via typed access 3. Opens as void and writes raw bytes 4. Reads back through void access to verify the write 5. Reads back through typed access to verify byte interpretation This test exercises the EncodeChunk path for void access with the codec chain including compression. --- tensorstore/driver/zarr3/driver_test.cc | 95 +++++++++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/tensorstore/driver/zarr3/driver_test.cc b/tensorstore/driver/zarr3/driver_test.cc index c27b9a7ca..c01237626 100644 --- a/tensorstore/driver/zarr3/driver_test.cc +++ b/tensorstore/driver/zarr3/driver_test.cc @@ -2202,6 +2202,101 @@ TEST(Zarr3DriverTest, OpenAsVoidWriteRoundtrip) { EXPECT_EQ(bytes_read_ptr[7], 0xDE); } +TEST(Zarr3DriverTest, OpenAsVoidWriteWithCompression) { + // Test writing through open_as_void with compression enabled. + // Verifies that the EncodeChunk method correctly compresses data. 
+ auto context = Context::Default(); + + // Create an array with gzip compression + ::nlohmann::json create_spec{ + {"driver", "zarr3"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"metadata", + { + {"data_type", "int32"}, + {"shape", {4, 4}}, + {"chunk_grid", + {{"name", "regular"}, {"configuration", {{"chunk_shape", {4, 4}}}}}}, + {"codecs", + ::nlohmann::json::array( + {{{"name", "bytes"}, {"configuration", {{"endian", "little"}}}}, + {{"name", "gzip"}, {"configuration", {{"level", 5}}}}})}, + }}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto store, + tensorstore::Open(create_spec, context, tensorstore::OpenMode::create, + tensorstore::ReadWriteMode::read_write) + .result()); + + // Initialize with zeros + auto zeros = tensorstore::MakeArray( + {{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}}); + TENSORSTORE_EXPECT_OK(tensorstore::Write(zeros, store).result()); + + // Open as void for writing + ::nlohmann::json void_spec{ + {"driver", "zarr3"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto void_store, + tensorstore::Open(void_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read_write) + .result()); + + // Verify the void store has the expected shape: [4, 4, 4] (4x4 ints, 4 bytes each) + EXPECT_EQ(3, void_store.rank()); + EXPECT_EQ(4, void_store.domain().shape()[0]); + EXPECT_EQ(4, void_store.domain().shape()[1]); + EXPECT_EQ(4, void_store.domain().shape()[2]); + + // Create raw bytes representing int32 values in little endian + // Using a simple pattern: 0x01020304 at position [0,0] + auto raw_bytes = tensorstore::AllocateArray( + {4, 4, 4}, tensorstore::c_order, tensorstore::value_init); + + // Set first element to 0x01020304 (little endian: 04 03 02 01) + auto raw_bytes_ptr = static_cast( + const_cast(static_cast(raw_bytes.data()))); + raw_bytes_ptr[0] = 0x04; + raw_bytes_ptr[1] = 0x03; + raw_bytes_ptr[2] = 0x02; + 
raw_bytes_ptr[3] = 0x01; + + // Write raw bytes through void access (triggers compression) + TENSORSTORE_EXPECT_OK(tensorstore::Write(raw_bytes, void_store).result()); + + // Verify the write worked by reading back through void access first + TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto void_read, + tensorstore::Read(void_store).result()); + auto void_read_ptr = static_cast(void_read.data()); + // First 4 bytes should be our pattern + EXPECT_EQ(void_read_ptr[0], 0x04); + EXPECT_EQ(void_read_ptr[1], 0x03); + EXPECT_EQ(void_read_ptr[2], 0x02); + EXPECT_EQ(void_read_ptr[3], 0x01); + + // Read back through normal typed access + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto typed_store, + tensorstore::Open(create_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read) + .result()); + + TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto typed_read, + tensorstore::Read(typed_store).result()); + auto typed_ptr = static_cast(typed_read.data()); + + // First element should be 0x01020304 + EXPECT_EQ(typed_ptr[0], 0x01020304); + // Rest should be zeros + EXPECT_EQ(typed_ptr[1], 0); +} + TEST(Zarr3DriverTest, FieldSelectionUrlNotSupported) { // Test that field selection is not supported with URL syntax ::nlohmann::json json_spec{ From 0d307c8c2ac5f12bdf2d5e38bff12e6015fa13df Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 26 Jan 2026 17:41:42 +0000 Subject: [PATCH 35/59] zarr3: Add GetSpecInfo rank tests for void access Add tests to verify that GetSpecInfo correctly computes rank when open_as_void=true (mirroring v2 test patterns): - GetSpecInfoOpenAsVoidWithKnownRank: Verifies full_rank = chunked_rank + 1 - GetSpecInfoOpenAsVoidWithDynamicRank: Verifies dynamic rank handling - GetSpecInfoOpenAsVoidWithoutDtype: Verifies behavior without dtype - GetSpecInfoOpenAsVoidRankConsistency: Verifies spec rank matches opened store Also adds TODO for OpenAsVoidFillValue test - fill_value handling for void access requires additional implementation (similar to v2's CreateVoidMetadata 
which converts fill_value to byte array). --- tensorstore/driver/zarr3/driver_test.cc | 150 ++++++++++++++++++++++++ 1 file changed, 150 insertions(+) diff --git a/tensorstore/driver/zarr3/driver_test.cc b/tensorstore/driver/zarr3/driver_test.cc index c01237626..7d7307c2c 100644 --- a/tensorstore/driver/zarr3/driver_test.cc +++ b/tensorstore/driver/zarr3/driver_test.cc @@ -2324,4 +2324,154 @@ TEST(Zarr3DriverTest, FieldSelectionUrlNotSupported) { HasSubstr("selected_field"))); } +// Tests for GetSpecInfo() with open_as_void (mirroring v2 tests) + +TEST(Zarr3DriverTest, GetSpecInfoOpenAsVoidWithKnownRank) { + // Test that GetSpecInfo correctly computes rank when open_as_void=true + // and dtype is specified with known chunked_rank. + // Expected: full_rank = chunked_rank + 1 (for bytes dimension) + ::nlohmann::json json_spec{ + {"driver", "zarr3"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + {"metadata", + { + {"data_type", "int32"}, // 4-byte integer + {"shape", {10, 20}}, // 2D array, so chunked_rank=2 + {"chunk_grid", + {{"name", "regular"}, + {"configuration", {{"chunk_shape", {5, 10}}}}}}, + }}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto spec, + tensorstore::Spec::FromJson(json_spec)); + + // With open_as_void and dtype specified, rank should be chunked_rank + 1 + // chunked_rank = 2 (from shape), so full_rank = 3 + EXPECT_EQ(3, spec.rank()); +} + +TEST(Zarr3DriverTest, GetSpecInfoOpenAsVoidWithDynamicRank) { + // Test GetSpecInfo when open_as_void=true with dtype but no shape/chunks + // (i.e., chunked_rank is dynamic). In this case, full_rank should remain + // dynamic until metadata is loaded. 
+ ::nlohmann::json json_spec{ + {"driver", "zarr3"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + {"metadata", + { + {"data_type", "int16"}, + // No shape or chunks specified, so chunked_rank is dynamic + }}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto spec, + tensorstore::Spec::FromJson(json_spec)); + + // When chunked_rank is dynamic, full_rank remains dynamic + EXPECT_EQ(tensorstore::dynamic_rank, spec.rank()); +} + +TEST(Zarr3DriverTest, GetSpecInfoOpenAsVoidWithoutDtype) { + // Test that when open_as_void=true but dtype is not specified, + // GetSpecInfo falls through to normal GetSpecRankAndFieldInfo behavior. + ::nlohmann::json json_spec{ + {"driver", "zarr3"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + // No metadata.data_type specified + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto spec, + tensorstore::Spec::FromJson(json_spec)); + + // Without dtype, rank should be dynamic (normal behavior) + EXPECT_EQ(tensorstore::dynamic_rank, spec.rank()); +} + +TEST(Zarr3DriverTest, GetSpecInfoOpenAsVoidRankConsistency) { + // Verify that the rank computed by GetSpecInfo matches what we get when + // actually opening the store. 
+ auto context = Context::Default(); + + // First create a normal array + ::nlohmann::json create_spec{ + {"driver", "zarr3"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"metadata", + { + {"data_type", "float32"}, // 4-byte float + {"shape", {3, 4, 5}}, // 3D array + {"chunk_grid", + {{"name", "regular"}, + {"configuration", {{"chunk_shape", {3, 4, 5}}}}}}, + }}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto store, + tensorstore::Open(create_spec, context, tensorstore::OpenMode::create, + tensorstore::ReadWriteMode::read_write) + .result()); + + // Open the store with open_as_void - don't specify metadata so it's read + // from the existing store + ::nlohmann::json void_spec_json{ + {"driver", "zarr3"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto void_store, + tensorstore::Open(void_spec_json, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read) + .result()); + + // Opened store rank should be chunked_rank + 1 = 3 + 1 = 4 + EXPECT_EQ(4, void_store.rank()); + + // Verify bytes dimension size - the domain is valid on an opened store + auto store_domain = void_store.domain(); + EXPECT_TRUE(store_domain.valid()); + EXPECT_EQ(4, store_domain.shape()[3]); // 4 bytes for float32 + + // Now test the spec parsing with known metadata also sets rank correctly + ::nlohmann::json void_spec_with_metadata{ + {"driver", "zarr3"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix2/"}}}, + {"open_as_void", true}, + {"metadata", + { + {"data_type", "float32"}, + {"shape", {3, 4, 5}}, + {"chunk_grid", + {{"name", "regular"}, + {"configuration", {{"chunk_shape", {3, 4, 5}}}}}}, + }}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto void_spec, tensorstore::Spec::FromJson(void_spec_with_metadata)); + + // Spec rank should be 4 (3D chunked + 1 bytes dimension) + // This verifies GetSpecInfo computes full_rank = chunked_rank + 1 + EXPECT_EQ(4, 
void_spec.rank()); +} + +// TODO(fill_value): OpenAsVoidFillValue test disabled pending implementation +// of proper fill_value handling for void access. The v2 implementation converts +// the fill_value to a byte array representation via CreateVoidMetadata(). +// The v3 implementation needs similar functionality to properly expose the +// fill_value as raw bytes when using open_as_void. +// +// TEST(Zarr3DriverTest, OpenAsVoidFillValue) { +// // Test that fill_value is correctly obtained from metadata when using +// // open_as_void. The void access should get the fill_value representing +// // the raw bytes of the original fill_value. +// ... +// } + } // namespace From 5819c8a401a21a47161654ad7f4f3b5cbecb5b79 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 26 Jan 2026 17:55:00 +0000 Subject: [PATCH 36/59] zarr3: Add fill_value handling for void access Implement proper fill_value conversion for void access mode: 1. Add is_void_access() virtual method to DataCacheBase to expose whether the cache was opened with open_as_void=true. 2. Modify ZarrDriver::GetFillValue to convert fill_value to byte array representation when in void access mode. This copies bytes from each field's fill_value at their respective offsets, similar to v2's CreateVoidMetadata handling. 3. 
Add OpenAsVoidFillValue test to verify that: - Normal store returns the expected scalar fill_value - Void store returns fill_value as byte array with correct shape - Byte representation matches the original value (little endian) --- tensorstore/driver/zarr3/driver.cc | 33 ++++++++++++ tensorstore/driver/zarr3/driver_test.cc | 69 ++++++++++++++++++++----- 2 files changed, 90 insertions(+), 12 deletions(-) diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index 8b6a355eb..6a9315b5c 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -380,6 +381,9 @@ class DataCacheBase virtual ZarrChunkCache& zarr_chunk_cache() = 0; + /// Returns true if this cache was opened with open_as_void=true. + virtual bool is_void_access() const = 0; + absl::Status ValidateMetadataCompatibility( const void* existing_metadata_ptr, const void* new_metadata_ptr) override { @@ -718,6 +722,8 @@ class ZarrDataCache : public ChunkCacheImpl, public DataCacheBase { ZarrChunkCache& zarr_chunk_cache() final { return *this; } + bool is_void_access() const final { return ChunkCacheImpl::open_as_void_; } + const internal::ChunkGridSpecification& grid() const override { return grid_; } @@ -812,6 +818,33 @@ class ZarrDriver : public ZarrDriverBase { if (metadata.fill_value.empty()) { return SharedArray(); } + + // For void access, convert fill_value to byte array representation. + // This is similar to v2's CreateVoidMetadata fill_value handling. + // In zarr3, endianness is handled by the codec chain, so we just copy + // the raw bytes from each field's fill_value. 
+ if (static_cast(cache())->is_void_access()) { + const Index nbytes = metadata.data_type.bytes_per_outer_element; + auto byte_fill = AllocateArray({nbytes}, c_order, value_init); + + // Copy bytes from each field's fill_value at their respective offsets + for (size_t field_i = 0; field_i < metadata.data_type.fields.size(); + ++field_i) { + const auto& field = metadata.data_type.fields[field_i]; + if (field_i >= metadata.fill_value.size() || + !metadata.fill_value[field_i].valid()) { + continue; + } + const auto& fill_value = metadata.fill_value[field_i]; + // Copy the raw bytes from the fill_value to the byte array + std::memcpy(byte_fill.data() + field.byte_offset, + fill_value.data(), + field.num_bytes); + } + + return byte_fill; + } + size_t index = this->component_index(); if (index >= metadata.fill_value.size()) { return absl::OutOfRangeError("Component index out of bounds"); diff --git a/tensorstore/driver/zarr3/driver_test.cc b/tensorstore/driver/zarr3/driver_test.cc index 7d7307c2c..d592b8924 100644 --- a/tensorstore/driver/zarr3/driver_test.cc +++ b/tensorstore/driver/zarr3/driver_test.cc @@ -2461,17 +2461,62 @@ TEST(Zarr3DriverTest, GetSpecInfoOpenAsVoidRankConsistency) { EXPECT_EQ(4, void_spec.rank()); } -// TODO(fill_value): OpenAsVoidFillValue test disabled pending implementation -// of proper fill_value handling for void access. The v2 implementation converts -// the fill_value to a byte array representation via CreateVoidMetadata(). -// The v3 implementation needs similar functionality to properly expose the -// fill_value as raw bytes when using open_as_void. -// -// TEST(Zarr3DriverTest, OpenAsVoidFillValue) { -// // Test that fill_value is correctly obtained from metadata when using -// // open_as_void. The void access should get the fill_value representing -// // the raw bytes of the original fill_value. -// ... 
-// } +TEST(Zarr3DriverTest, OpenAsVoidFillValue) { + // Test that fill_value is correctly obtained from metadata when using + // open_as_void. The void access should get the fill_value representing + // the raw bytes of the original fill_value. + auto context = Context::Default(); + + // Create an array with an explicit fill_value + ::nlohmann::json create_spec{ + {"driver", "zarr3"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"metadata", + { + {"data_type", "int16"}, + {"shape", {4, 4}}, + {"chunk_grid", + {{"name", "regular"}, {"configuration", {{"chunk_shape", {2, 2}}}}}}, + {"fill_value", 0x1234}, + }}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto store, + tensorstore::Open(create_spec, context, tensorstore::OpenMode::create, + tensorstore::ReadWriteMode::read_write) + .result()); + + // Verify the normal store has the expected fill_value + TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto normal_fill, store.fill_value()); + EXPECT_TRUE(normal_fill.valid()); + EXPECT_EQ(tensorstore::MakeScalarArray(0x1234), normal_fill); + + // Open with open_as_void=true + ::nlohmann::json void_spec{ + {"driver", "zarr3"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto void_store, + tensorstore::Open(void_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read) + .result()); + + // Verify void store has a valid fill_value derived from the original + TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto void_fill, void_store.fill_value()); + EXPECT_TRUE(void_fill.valid()); + + // The void fill_value should have shape {2} (2 bytes for int16) + EXPECT_EQ(1, void_fill.rank()); + EXPECT_EQ(2, void_fill.shape()[0]); + + // The fill_value bytes should represent 0x1234 in little endian: 0x34, 0x12 + auto fill_bytes = static_cast(void_fill.data()); + EXPECT_EQ(0x34, fill_bytes[0]); + EXPECT_EQ(0x12, fill_bytes[1]); +} } // namespace From 53ced58d76215dceb0b9ee4a1548aab3dd33f264 
Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 26 Jan 2026 18:07:21 +0000 Subject: [PATCH 37/59] zarr3: Add structured type support for void access Fix EncodeChunk to properly handle structured types: 1. For single non-structured field: encode directly (existing behavior) 2. For structured types (multiple fields): combine field arrays into a single byte array by copying each field's data at their respective byte offsets, then encode the combined byte array. This matches the pattern in DecodeChunk which extracts fields from a decoded byte array. Add OpenAsVoidStructuredType test that: - Creates an array with structured dtype (uint8 + int16 fields) - Writes data using field access - Opens with open_as_void=true - Verifies rank is original_rank + 1 - Verifies bytes dimension is 3 (1 + 2 bytes) - Verifies dtype is byte --- tensorstore/driver/zarr3/chunk_cache.cc | 54 ++++++++++++++++---- tensorstore/driver/zarr3/driver_test.cc | 68 ++++++++++++++++++++++--- 2 files changed, 105 insertions(+), 17 deletions(-) diff --git a/tensorstore/driver/zarr3/chunk_cache.cc b/tensorstore/driver/zarr3/chunk_cache.cc index f2a61f5c8..8f15a218c 100644 --- a/tensorstore/driver/zarr3/chunk_cache.cc +++ b/tensorstore/driver/zarr3/chunk_cache.cc @@ -252,16 +252,12 @@ ZarrLeafChunkCache::DecodeChunk(span chunk_indices, Result ZarrLeafChunkCache::EncodeChunk( span chunk_indices, span> component_arrays) { - assert(component_arrays.size() == 1); + const size_t num_fields = dtype_.fields.size(); // Special case: void access - encode bytes back to original format. - // - // For structured types: codec already expects bytes with extra dimension. - // Just encode directly. - // - // For non-structured types: reinterpret byte array as original dtype - // and shape before encoding. if (open_as_void_) { + assert(component_arrays.size() == 1); + if (original_is_structured_) { // Structured types: codec already expects bytes with extra dimension. 
return codec_state_->EncodeArray(component_arrays[0]); @@ -276,8 +272,6 @@ Result ZarrLeafChunkCache::EncodeChunk( std::vector original_shape(void_shape.begin(), void_shape.end() - 1); // Use the original dtype (stored during cache creation) for encoding. - // This is the dtype the codec was prepared for, not the void dtype. - // Create a view over the byte data with original dtype and layout. // Use the aliasing constructor to share ownership with byte_array but // interpret the data with the original dtype. @@ -293,7 +287,47 @@ Result ZarrLeafChunkCache::EncodeChunk( return codec_state_->EncodeArray(encoded_array); } - return codec_state_->EncodeArray(component_arrays[0]); + // For single non-structured field, encode directly + if (num_fields == 1 && dtype_.fields[0].outer_shape.empty()) { + assert(component_arrays.size() == 1); + return codec_state_->EncodeArray(component_arrays[0]); + } + + // For structured types, combine multiple field arrays into a single byte array + assert(component_arrays.size() == num_fields); + + // Build encode shape: [chunk_dims..., bytes_per_outer_element] + const auto& chunk_shape = grid().chunk_shape; + std::vector encode_shape(chunk_shape.begin(), chunk_shape.end()); + encode_shape.push_back(dtype_.bytes_per_outer_element); + + // Calculate number of outer elements + Index num_elements = 1; + for (size_t i = 0; i < chunk_shape.size(); ++i) { + num_elements *= chunk_shape[i]; + } + + // Allocate byte array for combined fields + auto byte_array = AllocateArray(encode_shape, c_order, value_init); + auto* dst_bytes = byte_array.data(); + + // Copy each field's data into the byte array at their respective offsets + for (size_t field_i = 0; field_i < num_fields; ++field_i) { + const auto& field = dtype_.fields[field_i]; + const auto& field_array = component_arrays[field_i]; + const auto* src = static_cast(field_array.data()); + const Index field_size = field.dtype->size; + + // Copy field data to each struct element + for (Index i = 0; 
i < num_elements; ++i) { + std::memcpy(dst_bytes + i * dtype_.bytes_per_outer_element + + field.byte_offset, + src + i * field_size, + field_size); + } + } + + return codec_state_->EncodeArray(byte_array); } kvstore::Driver* ZarrLeafChunkCache::GetKvStoreDriver() { diff --git a/tensorstore/driver/zarr3/driver_test.cc b/tensorstore/driver/zarr3/driver_test.cc index d592b8924..68ce6de60 100644 --- a/tensorstore/driver/zarr3/driver_test.cc +++ b/tensorstore/driver/zarr3/driver_test.cc @@ -1886,13 +1886,67 @@ TEST(Zarr3DriverTest, OpenAsVoidSimpleType) { void_store.dtype()); } -// TODO(b/xxx): OpenAsVoidStructuredType test disabled pending implementation -// of proper rank handling in GetNewMetadata for void access with structured -// types. Creating new arrays with open_as_void=true and structured dtypes -// requires adding field_shape dimensions to chunked_rank and updating -// SetChunkLayoutFromMetadata to handle the dimension mismatch between -// metadata shape and full rank. This is a more extensive change that will -// be addressed separately. 
+TEST(Zarr3DriverTest, OpenAsVoidStructuredType) { + // Test open_as_void with a structured data type + auto context = Context::Default(); + + // Step 1: Create and write the array using a structured dtype (with field) + ::nlohmann::json create_spec{ + {"driver", "zarr3"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"field", "y"}, + {"metadata", + { + {"data_type", + {{"name", "structured"}, + {"configuration", + {{"fields", + ::nlohmann::json::array({{"x", "uint8"}, {"y", "int16"}})}}}}}, + {"shape", {4, 4}}, + {"chunk_grid", + {{"name", "regular"}, {"configuration", {{"chunk_shape", {2, 2}}}}}}, + }}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto store, + tensorstore::Open(create_spec, context, tensorstore::OpenMode::create, + tensorstore::ReadWriteMode::read_write) + .result()); + + // Write some data to field y + auto data = tensorstore::MakeArray({{100, 200}, {300, 400}}); + TENSORSTORE_EXPECT_OK( + tensorstore::Write(data, store | tensorstore::Dims(0, 1).SizedInterval( + {0, 0}, {2, 2})) + .result()); + + // Close the first store by letting it go out of scope + store = tensorstore::TensorStore(); + + // Step 2: Open with open_as_void=true + ::nlohmann::json void_spec{ + {"driver", "zarr3"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto void_store, + tensorstore::Open(void_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read) + .result()); + + // The void store should have rank = original_rank + 1 (for bytes dimension) + EXPECT_EQ(3, void_store.rank()); + + // The last dimension should be 3 bytes (1 byte for u1 + 2 bytes for i2) + EXPECT_EQ(3, void_store.domain().shape()[2]); + + // The data type should be byte + EXPECT_EQ(tensorstore::dtype_v, + void_store.dtype()); +} TEST(Zarr3DriverTest, OpenAsVoidWithCompression) { // Test open_as_void with compression enabled From 13cae40bb6925ce8411bf1427a5df0841e73ec34 Mon Sep 17 
00:00:00 2001 From: BrianMichell Date: Mon, 26 Jan 2026 18:15:32 +0000 Subject: [PATCH 38/59] zarr3: Enhance structured type tests and add GetSpecInfo test 1. OpenAsVoidStructuredType: Now actually reads and verifies byte content - Reads raw bytes through void access - Uses proper stride calculation for the returned array - Verifies y field bytes at all 4 positions (little-endian int16) - x field is 0 (fill value) since we only wrote to y field 2. Add GetSpecInfoOpenAsVoidWithStructuredDtype test - Verifies spec rank = chunked_rank + 1 with structured dtype - Tests structured dtype with int32 + uint16 fields - Matches v2 test coverage --- tensorstore/driver/zarr3/driver_test.cc | 77 ++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 2 deletions(-) diff --git a/tensorstore/driver/zarr3/driver_test.cc b/tensorstore/driver/zarr3/driver_test.cc index 68ce6de60..65a646bbd 100644 --- a/tensorstore/driver/zarr3/driver_test.cc +++ b/tensorstore/driver/zarr3/driver_test.cc @@ -1891,6 +1891,7 @@ TEST(Zarr3DriverTest, OpenAsVoidStructuredType) { auto context = Context::Default(); // Step 1: Create and write the array using a structured dtype (with field) + // Struct layout: x (uint8, 1 byte) + y (int16, 2 bytes) = 3 bytes total ::nlohmann::json create_spec{ {"driver", "zarr3"}, {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, @@ -1914,14 +1915,16 @@ TEST(Zarr3DriverTest, OpenAsVoidStructuredType) { tensorstore::ReadWriteMode::read_write) .result()); - // Write some data to field y + // Write some data to field y (int16) + // int16 100 = 0x0064 in little endian = [0x64, 0x00] + // int16 200 = 0x00C8 in little endian = [0xC8, 0x00] auto data = tensorstore::MakeArray({{100, 200}, {300, 400}}); TENSORSTORE_EXPECT_OK( tensorstore::Write(data, store | tensorstore::Dims(0, 1).SizedInterval( {0, 0}, {2, 2})) .result()); - // Close the first store by letting it go out of scope + // Close store to ensure data is flushed store = tensorstore::TensorStore(); // Step 2: 
Open with open_as_void=true @@ -1946,6 +1949,49 @@ TEST(Zarr3DriverTest, OpenAsVoidStructuredType) { // The data type should be byte EXPECT_EQ(tensorstore::dtype_v, void_store.dtype()); + + // Step 3: Read and verify byte content for field y only + // Since we only wrote to field y, field x will be zeros (fill value) + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto byte_array, + tensorstore::Read( + void_store | tensorstore::Dims(0, 1, 2).SizedInterval({0, 0, 0}, + {2, 2, 3})) + .result()); + + EXPECT_EQ(3, byte_array.rank()); + EXPECT_EQ(2, byte_array.shape()[0]); + EXPECT_EQ(2, byte_array.shape()[1]); + EXPECT_EQ(3, byte_array.shape()[2]); + + // Verify bytes - we use the array's data() and strides + const auto* bytes = static_cast(byte_array.data()); + const Index stride0 = byte_array.byte_strides()[0]; + const Index stride1 = byte_array.byte_strides()[1]; + const Index stride2 = byte_array.byte_strides()[2]; + auto get_byte = [&](Index i, Index j, Index k) -> unsigned char { + return bytes[i * stride0 + j * stride1 + k * stride2]; + }; + + // Element [0,0]: x=0 (fill), y=100 (0x0064 LE = [0x64, 0x00]) + EXPECT_EQ(0, get_byte(0, 0, 0)); // x (fill value) + EXPECT_EQ(0x64, get_byte(0, 0, 1)); // y low byte + EXPECT_EQ(0x00, get_byte(0, 0, 2)); // y high byte + + // Element [0,1]: x=0 (fill), y=200 (0x00C8 LE = [0xC8, 0x00]) + EXPECT_EQ(0, get_byte(0, 1, 0)); // x (fill value) + EXPECT_EQ(0xC8, get_byte(0, 1, 1)); // y low byte + EXPECT_EQ(0x00, get_byte(0, 1, 2)); // y high byte + + // Element [1,0]: x=0 (fill), y=300 (0x012C LE = [0x2C, 0x01]) + EXPECT_EQ(0, get_byte(1, 0, 0)); // x (fill value) + EXPECT_EQ(0x2C, get_byte(1, 0, 1)); // y low byte + EXPECT_EQ(0x01, get_byte(1, 0, 2)); // y high byte + + // Element [1,1]: x=0 (fill), y=400 (0x0190 LE = [0x90, 0x01]) + EXPECT_EQ(0, get_byte(1, 1, 0)); // x (fill value) + EXPECT_EQ(0x90, get_byte(1, 1, 1)); // y low byte + EXPECT_EQ(0x01, get_byte(1, 1, 2)); // y high byte } TEST(Zarr3DriverTest, 
OpenAsVoidWithCompression) { @@ -2406,6 +2452,33 @@ TEST(Zarr3DriverTest, GetSpecInfoOpenAsVoidWithKnownRank) { EXPECT_EQ(3, spec.rank()); } +TEST(Zarr3DriverTest, GetSpecInfoOpenAsVoidWithStructuredDtype) { + // Test GetSpecInfo with open_as_void=true and a structured dtype. + // The bytes dimension should reflect the full struct size. + ::nlohmann::json json_spec{ + {"driver", "zarr3"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + {"metadata", + { + {"data_type", + {{"name", "structured"}, + {"configuration", + {{"fields", + ::nlohmann::json::array({{"x", "int32"}, {"y", "uint16"}})}}}}}, + {"shape", {8}}, // 1D array + {"chunk_grid", + {{"name", "regular"}, {"configuration", {{"chunk_shape", {4}}}}}}, + }}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto spec, + tensorstore::Spec::FromJson(json_spec)); + + // chunked_rank = 1, so full_rank = 2 + EXPECT_EQ(2, spec.rank()); +} + TEST(Zarr3DriverTest, GetSpecInfoOpenAsVoidWithDynamicRank) { // Test GetSpecInfo when open_as_void=true with dtype but no shape/chunks // (i.e., chunked_rank is dynamic). In this case, full_rank should remain From cfea1dda0e93d7ece90ac8b702e11a74d47cdc63 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 26 Jan 2026 18:16:43 +0000 Subject: [PATCH 39/59] zarr3: Add OpenAsVoidIncompatibleMetadata test Test that open_as_void correctly detects when the underlying metadata has been changed to an incompatible dtype. ResolveBounds should fail with kFailedPrecondition when the stored metadata has a different bytes_per_outer_element than what was expected. This matches the v2 test that verifies metadata consistency checking works properly with void access. 
--- tensorstore/driver/zarr3/driver_test.cc | 73 +++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/tensorstore/driver/zarr3/driver_test.cc b/tensorstore/driver/zarr3/driver_test.cc index 65a646bbd..970147c82 100644 --- a/tensorstore/driver/zarr3/driver_test.cc +++ b/tensorstore/driver/zarr3/driver_test.cc @@ -2646,4 +2646,77 @@ TEST(Zarr3DriverTest, OpenAsVoidFillValue) { EXPECT_EQ(0x12, fill_bytes[1]); } +TEST(Zarr3DriverTest, OpenAsVoidIncompatibleMetadata) { + // Test that open_as_void correctly rejects incompatible metadata when the + // underlying storage is modified to have a different bytes_per_outer_element. + auto context = Context::Default(); + ::nlohmann::json storage_spec{{"driver", "memory"}}; + + // Create an array with 4-byte dtype + ::nlohmann::json create_spec{ + {"driver", "zarr3"}, + {"kvstore", storage_spec}, + {"path", "prefix/"}, + {"metadata", + { + {"data_type", "int32"}, // 4 bytes + {"shape", {2, 2}}, + {"chunk_grid", + {{"name", "regular"}, {"configuration", {{"chunk_shape", {2, 2}}}}}}, + }}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto store, + tensorstore::Open(create_spec, context, tensorstore::OpenMode::create, + tensorstore::ReadWriteMode::read_write) + .result()); + + // Write some data + auto data = tensorstore::MakeArray({{1, 2}, {3, 4}}); + TENSORSTORE_EXPECT_OK(tensorstore::Write(data, store).result()); + + // Open with open_as_void + ::nlohmann::json void_spec{ + {"driver", "zarr3"}, + {"kvstore", storage_spec}, + {"path", "prefix/"}, + {"open_as_void", true}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto void_store, + tensorstore::Open(void_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read) + .result()); + + // Now overwrite the underlying storage with incompatible metadata + // (different bytes_per_outer_element: 2 bytes instead of 4) + ::nlohmann::json incompatible_spec{ + {"driver", "zarr3"}, + {"kvstore", storage_spec}, + {"path", "prefix/"}, + {"metadata", + { + 
{"data_type", "int16"}, // 2 bytes - incompatible + {"shape", {2, 2}}, + {"chunk_grid", + {{"name", "regular"}, {"configuration", {{"chunk_shape", {2, 2}}}}}}, + }}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto incompatible_store, + tensorstore::Open(incompatible_spec, context, + tensorstore::OpenMode::create | + tensorstore::OpenMode::delete_existing, + tensorstore::ReadWriteMode::read_write) + .result()); + + // ResolveBounds on the original void store should fail because the + // underlying metadata changed to an incompatible dtype + EXPECT_THAT(ResolveBounds(void_store).result(), + StatusIs(absl::StatusCode::kFailedPrecondition)); +} + } // namespace From dda05a8b139e2e0b0daaf2619e5fba2588d32b2d Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 26 Jan 2026 20:57:00 +0000 Subject: [PATCH 40/59] zarr3: Add OpenAsVoidWithSharding test Verifies that void access works correctly with sharded arrays: - Void access flags propagate through sharded caches - Reading bytes through sharded void access returns correct data - Writing bytes through sharded void access round-trips correctly --- tensorstore/driver/zarr3/driver_test.cc | 131 ++++++++++++++++++++++++ 1 file changed, 131 insertions(+) diff --git a/tensorstore/driver/zarr3/driver_test.cc b/tensorstore/driver/zarr3/driver_test.cc index 970147c82..08ee47f82 100644 --- a/tensorstore/driver/zarr3/driver_test.cc +++ b/tensorstore/driver/zarr3/driver_test.cc @@ -2719,4 +2719,135 @@ TEST(Zarr3DriverTest, OpenAsVoidIncompatibleMetadata) { StatusIs(absl::StatusCode::kFailedPrecondition)); } +TEST(Zarr3DriverTest, OpenAsVoidWithSharding) { + // Test open_as_void with sharding enabled. + // Verifies that void access flags propagate correctly through sharded caches. 
+ auto context = Context::Default(); + + // Create a sharded array + ::nlohmann::json create_spec{ + {"driver", "zarr3"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"metadata", + { + {"data_type", "int32"}, + {"shape", {8, 8}}, + {"chunk_grid", + {{"name", "regular"}, {"configuration", {{"chunk_shape", {8, 8}}}}}}, + {"codecs", + {{{"name", "sharding_indexed"}, + {"configuration", + {{"chunk_shape", {4, 4}}, + {"codecs", {{{"name", "bytes"}}}}, + {"index_codecs", + {{{"name", "bytes"}}, {{"name", "crc32c"}}}}}}}}}, + }}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto store, + tensorstore::Open(create_spec, context, tensorstore::OpenMode::create, + tensorstore::ReadWriteMode::read_write) + .result()); + + // Write some data + auto data = tensorstore::MakeArray( + {{0x01020304, 0x05060708, 0, 0, 0, 0, 0, 0}, + {0x090A0B0C, 0x0D0E0F10, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0}}); + TENSORSTORE_EXPECT_OK(tensorstore::Write(data, store).result()); + + // Open with open_as_void=true + ::nlohmann::json void_spec{ + {"driver", "zarr3"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto void_store, + tensorstore::Open(void_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read_write) + .result()); + + // Verify rank is original + 1 for bytes dimension + EXPECT_EQ(3, void_store.rank()); + + // Verify bytes dimension is 4 (int32 = 4 bytes) + EXPECT_EQ(4, void_store.domain().shape()[2]); + + // Read through void access and verify byte content + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto bytes_read, + tensorstore::Read( + void_store | tensorstore::Dims(0, 1, 2).SizedInterval({0, 0, 0}, + {2, 2, 4})) + .result()); + + EXPECT_EQ(3, bytes_read.rank()); + EXPECT_EQ(2, bytes_read.shape()[0]); + EXPECT_EQ(2, 
bytes_read.shape()[1]); + EXPECT_EQ(4, bytes_read.shape()[2]); + + // Verify the raw bytes (little endian) + const auto* bytes = static_cast(bytes_read.data()); + const Index stride0 = bytes_read.byte_strides()[0]; + const Index stride1 = bytes_read.byte_strides()[1]; + const Index stride2 = bytes_read.byte_strides()[2]; + auto get_byte = [&](Index i, Index j, Index k) -> unsigned char { + return bytes[i * stride0 + j * stride1 + k * stride2]; + }; + + // Element [0,0] = 0x01020304 in little endian: 04 03 02 01 + EXPECT_EQ(0x04, get_byte(0, 0, 0)); + EXPECT_EQ(0x03, get_byte(0, 0, 1)); + EXPECT_EQ(0x02, get_byte(0, 0, 2)); + EXPECT_EQ(0x01, get_byte(0, 0, 3)); + + // Element [0,1] = 0x05060708 in little endian: 08 07 06 05 + EXPECT_EQ(0x08, get_byte(0, 1, 0)); + EXPECT_EQ(0x07, get_byte(0, 1, 1)); + EXPECT_EQ(0x06, get_byte(0, 1, 2)); + EXPECT_EQ(0x05, get_byte(0, 1, 3)); + + // Write through void access + auto raw_bytes = tensorstore::AllocateArray( + {2, 2, 4}, tensorstore::c_order, tensorstore::value_init); + auto raw_bytes_ptr = static_cast( + const_cast(static_cast(raw_bytes.data()))); + // Set element [0,0] to 0xAABBCCDD (little endian: DD CC BB AA) + raw_bytes_ptr[0] = 0xDD; + raw_bytes_ptr[1] = 0xCC; + raw_bytes_ptr[2] = 0xBB; + raw_bytes_ptr[3] = 0xAA; + + TENSORSTORE_EXPECT_OK( + tensorstore::Write(raw_bytes, + void_store | tensorstore::Dims(0, 1, 2).SizedInterval( + {0, 0, 0}, {2, 2, 4})) + .result()); + + // Read back through typed access and verify + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto typed_store, + tensorstore::Open(create_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read) + .result()); + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto typed_read, + tensorstore::Read( + typed_store | tensorstore::Dims(0, 1).SizedInterval({0, 0}, {2, 2})) + .result()); + auto typed_ptr = static_cast(typed_read.data()); + + // Element [0,0] should be 0xAABBCCDD + EXPECT_EQ(static_cast(0xAABBCCDD), typed_ptr[0]); +} + } // namespace From 
d609dd8852e174f2fcd12b96f976ebc6d4b8837f Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 26 Jan 2026 20:57:22 +0000 Subject: [PATCH 41/59] zarr3: Fix schema.yml field/open_as_void documentation for consistency with zarr2 - Remove invalid oneOf constraint that didn't properly express mutual exclusivity - Update field description to match zarr2 style (document mutual exclusivity) - Update open_as_void description to document mutual exclusivity with field - Add oneOf type constraint for field to match zarr2 (string or null) The actual mutual exclusivity validation is done in code via jb::Initialize. --- tensorstore/driver/zarr3/schema.yml | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/tensorstore/driver/zarr3/schema.yml b/tensorstore/driver/zarr3/schema.yml index 9491027b1..22a4971e2 100644 --- a/tensorstore/driver/zarr3/schema.yml +++ b/tensorstore/driver/zarr3/schema.yml @@ -18,12 +18,15 @@ allOf: by combining these metadata constraints with any `Schema` constraints. $ref: driver/zarr3/Metadata field: - type: string - title: Field selection for structured arrays. + oneOf: + - type: string + - type: "null" + title: Name of field to open. description: | - Name of the field to select from a structured array. When specified, - the tensorstore will provide access to only the specified field of - each element in the structured array. + Must be specified if the `.metadata.data_type` specified in the array + metadata has more than one field. Cannot be specified together with + :json:`"open_as_void": true`. + default: null open_as_void: type: boolean default: false @@ -31,17 +34,8 @@ allOf: description: | When true, opens the array as raw bytes instead of interpreting it as structured data. The resulting array will have an additional - dimension representing the byte layout of each element. 
- oneOf: - - not: - anyOf: - - required: ["field"] - - required: ["open_as_void"] - - allOf: - - not: - required: ["field"] - - not: - required: ["open_as_void"] + dimension representing the byte layout of each element. Cannot be + :json:`true` if `.field` is also specified. examples: - driver: zarr3 kvstore: From 5d849f67efc3ff33936d7377ba81b01095ab6cf0 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 26 Jan 2026 20:58:24 +0000 Subject: [PATCH 42/59] zarr3: Add explicit implicit_lower_bounds in GetExternalToInternalTransform For consistency with GetDomain(), explicitly set implicit_lower_bounds in GetExternalToInternalTransform when building the void access transform. Both methods now follow the same pattern of explicitly setting both implicit_lower_bounds and implicit_upper_bounds. --- tensorstore/driver/zarr3/driver.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index 6a9315b5c..e6b9cf617 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -767,10 +767,14 @@ class ZarrDataCache : public ChunkCacheImpl, public DataCacheBase { builder.input_shape(full_shape); builder.input_labels(span(&normalized_dimension_names[0], total_rank)); + // Set implicit bounds: array dims have implicit upper bounds (resizable), + // bytes dim has explicit bounds (fixed size). 
+ DimensionSet implicit_lower_bounds(false); DimensionSet implicit_upper_bounds(false); for (DimensionIndex i = 0; i < rank; ++i) { implicit_upper_bounds[i] = true; } + builder.implicit_lower_bounds(implicit_lower_bounds); builder.implicit_upper_bounds(implicit_upper_bounds); for (DimensionIndex i = 0; i < total_rank; ++i) { From 1298bcb7f0ca5ee882c85c91fa1498eaac9674f9 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 26 Jan 2026 20:58:49 +0000 Subject: [PATCH 43/59] zarr3: Add assertion in DecodeChunk for void access field count Add assertion that num_fields == 1 in the void access path of DecodeChunk. Void access always uses a single synthesized field, so this assertion helps catch any inconsistency between GetDataCache and DecodeChunk. --- tensorstore/driver/zarr3/chunk_cache.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorstore/driver/zarr3/chunk_cache.cc b/tensorstore/driver/zarr3/chunk_cache.cc index 8f15a218c..80d96011a 100644 --- a/tensorstore/driver/zarr3/chunk_cache.cc +++ b/tensorstore/driver/zarr3/chunk_cache.cc @@ -168,6 +168,7 @@ ZarrLeafChunkCache::DecodeChunk(span chunk_indices, // For structured types: codec was already prepared for // [chunk_shape, bytes_per_elem] with byte dtype. Just decode directly. 
if (open_as_void_) { + assert(num_fields == 1); // Void access uses a single synthesized field const auto& void_component_shape = grid().components[0].shape(); if (original_is_structured_) { From 72968e8fa16492f804795f461faf27d8a8a41a50 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 26 Jan 2026 21:27:14 +0000 Subject: [PATCH 44/59] zarr3: Add contiguity assertions for encode/decode chunk operations Add assertions in EncodeChunk and DecodeChunk to verify that arrays are C-contiguous before performing direct memcpy operations: - In EncodeChunk: verify component arrays are C-contiguous - In DecodeChunk: verify decoded byte arrays are C-contiguous These assertions validate assumptions about array layouts that the chunk cache relies on for correct operation. The chunk cache write path (AsyncWriteArray) allocates C-order arrays, and the codec chain produces C-contiguous decoded arrays. Also adds the necessary includes and BUILD dependencies for IsContiguousLayout and c_order. --- tensorstore/driver/zarr3/BUILD | 2 ++ tensorstore/driver/zarr3/chunk_cache.cc | 18 ++++++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/tensorstore/driver/zarr3/BUILD b/tensorstore/driver/zarr3/BUILD index 685050024..72b51a3df 100644 --- a/tensorstore/driver/zarr3/BUILD +++ b/tensorstore/driver/zarr3/BUILD @@ -226,6 +226,7 @@ tensorstore_cc_library( "//tensorstore:array_storage_statistics", "//tensorstore:batch", "//tensorstore:box", + "//tensorstore:contiguous_layout", "//tensorstore:index", "//tensorstore:index_interval", "//tensorstore:rank", @@ -242,6 +243,7 @@ tensorstore_cc_library( "//tensorstore/internal:intrusive_ptr", "//tensorstore/internal:lexicographical_grid_index_key", "//tensorstore/internal:regular_grid", + "//tensorstore:strided_layout", "//tensorstore/internal:storage_statistics", "//tensorstore/internal/cache", "//tensorstore/internal/cache:chunk_cache", diff --git a/tensorstore/driver/zarr3/chunk_cache.cc 
b/tensorstore/driver/zarr3/chunk_cache.cc index 80d96011a..5675ff2ff 100644 --- a/tensorstore/driver/zarr3/chunk_cache.cc +++ b/tensorstore/driver/zarr3/chunk_cache.cc @@ -31,6 +31,7 @@ #include "absl/time/time.h" #include "tensorstore/array.h" #include "tensorstore/array_storage_statistics.h" +#include "tensorstore/contiguous_layout.h" #include "tensorstore/batch.h" #include "tensorstore/box.h" #include "tensorstore/driver/chunk.h" @@ -53,6 +54,7 @@ #include "tensorstore/internal/meta/type_traits.h" #include "tensorstore/internal/regular_grid.h" #include "tensorstore/internal/storage_statistics.h" +#include "tensorstore/strided_layout.h" #include "tensorstore/kvstore/driver.h" #include "tensorstore/kvstore/key_range.h" #include "tensorstore/kvstore/kvstore.h" @@ -192,6 +194,10 @@ ZarrLeafChunkCache::DecodeChunk(span chunk_indices, auto decoded_array, codec_state_->DecodeArray(original_chunk_shape, std::move(data))); + // Verify decoded array is C-contiguous (codec chain should guarantee this) + assert(IsContiguousLayout(decoded_array.layout(), c_order, + decoded_array.dtype().size())); + // Reinterpret the decoded array's bytes as [chunk_shape..., bytes_per_elem] auto byte_array = AllocateArray( void_component_shape, c_order, default_init, @@ -223,7 +229,10 @@ ZarrLeafChunkCache::DecodeChunk(span chunk_indices, TENSORSTORE_ASSIGN_OR_RETURN( auto byte_array, codec_state_->DecodeArray(decode_shape, std::move(data))); - // Extract each field from the byte array + // Extract each field from the byte array. + // Note: decoded byte_array should be C-contiguous (codec chain guarantees). 
+ assert(IsContiguousLayout(byte_array.layout(), c_order, + byte_array.dtype().size())); const Index num_elements = byte_array.num_elements() / dtype_.bytes_per_outer_element; const auto* src_bytes = static_cast(byte_array.data()); @@ -312,10 +321,15 @@ Result ZarrLeafChunkCache::EncodeChunk( auto byte_array = AllocateArray(encode_shape, c_order, value_init); auto* dst_bytes = byte_array.data(); - // Copy each field's data into the byte array at their respective offsets + // Copy each field's data into the byte array at their respective offsets. + // Note: This assumes component arrays are C-contiguous, which is guaranteed + // by the chunk cache's write path (AsyncWriteArray allocates C-order arrays). for (size_t field_i = 0; field_i < num_fields; ++field_i) { const auto& field = dtype_.fields[field_i]; const auto& field_array = component_arrays[field_i]; + // Verify the array is C-contiguous as expected + assert(IsContiguousLayout(field_array.layout(), c_order, + field_array.dtype().size())); const auto* src = static_cast(field_array.data()); const Index field_size = field.dtype->size; From 14546a1273838590005b730d923132afff2d67a1 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 26 Jan 2026 21:36:08 +0000 Subject: [PATCH 45/59] zarr3: Use CopyArray for safe structured type encode/decode Replace raw memcpy loops with CopyArray using strided ArrayViews for structured type encoding and decoding. This follows the standard TensorStore pattern (as used in zarr v2 with internal::EncodeArray) where array copies are done via IterateOverArrays which safely handles any source/destination strides. The key insight is creating an ArrayView with strides that represent the interleaved field positions within the struct layout: - For a field at byte_offset B within a struct of size S - The strides are [..., S] instead of [..., field_size] - This allows CopyArray to correctly interleave/deinterleave fields This approach: 1. 
Removes the need for contiguity assertions (CopyArray handles any layout) 2. Is consistent with zarr v2's use of internal::EncodeArray 3. Uses the standard IterateOverArrays iteration pattern The void access decode path retains its memcpy with assertion because it's a simple byte reinterpretation where both arrays are known to be C-contiguous (destination freshly allocated, source from codec chain). --- tensorstore/driver/zarr3/chunk_cache.cc | 79 +++++++++++++------------ 1 file changed, 42 insertions(+), 37 deletions(-) diff --git a/tensorstore/driver/zarr3/chunk_cache.cc b/tensorstore/driver/zarr3/chunk_cache.cc index 5675ff2ff..127f5a4c9 100644 --- a/tensorstore/driver/zarr3/chunk_cache.cc +++ b/tensorstore/driver/zarr3/chunk_cache.cc @@ -230,29 +230,35 @@ ZarrLeafChunkCache::DecodeChunk(span chunk_indices, auto byte_array, codec_state_->DecodeArray(decode_shape, std::move(data))); // Extract each field from the byte array. - // Note: decoded byte_array should be C-contiguous (codec chain guarantees). - assert(IsContiguousLayout(byte_array.layout(), c_order, - byte_array.dtype().size())); - const Index num_elements = byte_array.num_elements() / - dtype_.bytes_per_outer_element; - const auto* src_bytes = static_cast(byte_array.data()); - + // We create a strided view into the source that maps to each field's + // position within the interleaved struct layout, then use CopyArray which + // safely handles any layout differences via IterateOverArrays. 
for (size_t field_i = 0; field_i < num_fields; ++field_i) { const auto& field = dtype_.fields[field_i]; // Use the component's shape (from the grid) for the result array const auto& component_shape = grid().components[field_i].shape(); auto result_array = AllocateArray(component_shape, c_order, default_init, field.dtype); - auto* dst = static_cast(result_array.data()); - const Index field_size = field.dtype->size; - - // Copy field data from each struct element - for (Index i = 0; i < num_elements; ++i) { - std::memcpy(dst + i * field_size, - src_bytes + i * dtype_.bytes_per_outer_element + - field.byte_offset, - field_size); + + // Build strides for the source view: each element is separated by + // bytes_per_outer_element (the struct size), not field_size. + std::vector src_byte_strides(chunk_shape.size()); + Index stride = dtype_.bytes_per_outer_element; + for (DimensionIndex i = chunk_shape.size(); i-- > 0;) { + src_byte_strides[i] = stride; + stride *= chunk_shape[i]; } + + // Create source ArrayView pointing to this field's offset within + // the interleaved byte array, with strides that skip over other fields. 
+ ArrayView src_field_view( + {static_cast( + static_cast(byte_array.data()) + field.byte_offset), + field.dtype}, + StridedLayoutView<>(chunk_shape, src_byte_strides)); + + // Use CopyArray which safely handles any layout differences + CopyArray(src_field_view, result_array); field_arrays[field_i] = std::move(result_array); } @@ -311,35 +317,34 @@ Result ZarrLeafChunkCache::EncodeChunk( std::vector encode_shape(chunk_shape.begin(), chunk_shape.end()); encode_shape.push_back(dtype_.bytes_per_outer_element); - // Calculate number of outer elements - Index num_elements = 1; - for (size_t i = 0; i < chunk_shape.size(); ++i) { - num_elements *= chunk_shape[i]; - } - // Allocate byte array for combined fields auto byte_array = AllocateArray(encode_shape, c_order, value_init); - auto* dst_bytes = byte_array.data(); // Copy each field's data into the byte array at their respective offsets. - // Note: This assumes component arrays are C-contiguous, which is guaranteed - // by the chunk cache's write path (AsyncWriteArray allocates C-order arrays). + // We create a strided view into the destination that maps to each field's + // position within the interleaved struct layout, then use CopyArray which + // safely handles any source array strides via IterateOverArrays. 
for (size_t field_i = 0; field_i < num_fields; ++field_i) { const auto& field = dtype_.fields[field_i]; const auto& field_array = component_arrays[field_i]; - // Verify the array is C-contiguous as expected - assert(IsContiguousLayout(field_array.layout(), c_order, - field_array.dtype().size())); - const auto* src = static_cast(field_array.data()); - const Index field_size = field.dtype->size; - - // Copy field data to each struct element - for (Index i = 0; i < num_elements; ++i) { - std::memcpy(dst_bytes + i * dtype_.bytes_per_outer_element + - field.byte_offset, - src + i * field_size, - field_size); + + // Build strides for the destination view: each element is separated by + // bytes_per_outer_element (the struct size), not field_size. + std::vector dest_byte_strides(chunk_shape.size()); + Index stride = dtype_.bytes_per_outer_element; + for (DimensionIndex i = chunk_shape.size(); i-- > 0;) { + dest_byte_strides[i] = stride; + stride *= chunk_shape[i]; } + + // Create destination ArrayView pointing to this field's offset within + // the interleaved byte array, with strides that skip over other fields. + ArrayView dest_field_view( + {static_cast(byte_array.data() + field.byte_offset), field.dtype}, + StridedLayoutView<>(chunk_shape, dest_byte_strides)); + + // Use CopyArray which safely handles any source strides via IterateOverArrays + CopyArray(field_array, dest_field_view); } return codec_state_->EncodeArray(byte_array); From faf4d4fff55796e29e4538f9091877aefb78adc7 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 26 Jan 2026 21:40:02 +0000 Subject: [PATCH 46/59] zarr3: Use ComputeStrides utility for stride computation Replace manual stride computation loops with ComputeStrides() from contiguous_layout.h. This is the standard TensorStore utility for computing C-order (or Fortran-order) byte strides given a shape and innermost element stride. 
The manual loop: Index stride = bytes_per_outer_element; for (DimensionIndex i = rank; i-- > 0;) { strides[i] = stride; stride *= shape[i]; } Is exactly equivalent to: ComputeStrides(c_order, bytes_per_outer_element, shape, strides); --- tensorstore/driver/zarr3/chunk_cache.cc | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/tensorstore/driver/zarr3/chunk_cache.cc b/tensorstore/driver/zarr3/chunk_cache.cc index 127f5a4c9..9f37934eb 100644 --- a/tensorstore/driver/zarr3/chunk_cache.cc +++ b/tensorstore/driver/zarr3/chunk_cache.cc @@ -243,11 +243,8 @@ ZarrLeafChunkCache::DecodeChunk(span chunk_indices, // Build strides for the source view: each element is separated by // bytes_per_outer_element (the struct size), not field_size. std::vector src_byte_strides(chunk_shape.size()); - Index stride = dtype_.bytes_per_outer_element; - for (DimensionIndex i = chunk_shape.size(); i-- > 0;) { - src_byte_strides[i] = stride; - stride *= chunk_shape[i]; - } + ComputeStrides(c_order, dtype_.bytes_per_outer_element, chunk_shape, + src_byte_strides); // Create source ArrayView pointing to this field's offset within // the interleaved byte array, with strides that skip over other fields. @@ -331,11 +328,8 @@ Result ZarrLeafChunkCache::EncodeChunk( // Build strides for the destination view: each element is separated by // bytes_per_outer_element (the struct size), not field_size. std::vector dest_byte_strides(chunk_shape.size()); - Index stride = dtype_.bytes_per_outer_element; - for (DimensionIndex i = chunk_shape.size(); i-- > 0;) { - dest_byte_strides[i] = stride; - stride *= chunk_shape[i]; - } + ComputeStrides(c_order, dtype_.bytes_per_outer_element, chunk_shape, + dest_byte_strides); // Create destination ArrayView pointing to this field's offset within // the interleaved byte array, with strides that skip over other fields. 
From f0a5dbcc83fc5b2e63da8e7c93abbb2d065d9f6e Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 26 Jan 2026 21:41:20 +0000 Subject: [PATCH 47/59] zarr3: Use DimensionSet::UpTo and std::fill_n/std::copy_n utilities Replace manual loops with standard library and TensorStore utilities: 1. DimensionSet::UpTo(rank) - Creates a DimensionSet with bits [0, rank) set to true. Replaces: DimensionSet s(false); for (i = 0; i < rank; ++i) s[i] = true; 2. std::fill_n for origins (all zeros) and std::copy_n for shape copy. This is more idiomatic and clearer than explicit index loops. These are standard patterns used throughout TensorStore for similar operations on dimension sets and shape vectors. --- tensorstore/driver/zarr3/driver.cc | 32 +++++++++--------------------- 1 file changed, 9 insertions(+), 23 deletions(-) diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index e6b9cf617..61c573381 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -189,24 +189,15 @@ class ZarrDriverSpec const DimensionIndex original_rank = metadata_constraints.shape->size(); IndexDomainBuilder builder(original_rank + 1); - // Set original dimensions from metadata - for (DimensionIndex i = 0; i < original_rank; ++i) { - builder.origin()[i] = 0; - builder.shape()[i] = (*metadata_constraints.shape)[i]; - } - - // Add bytes dimension - builder.origin()[original_rank] = 0; + // Set original dimensions from metadata (all origins are 0) + std::fill_n(builder.origin().begin(), original_rank + 1, Index{0}); + std::copy_n(metadata_constraints.shape->begin(), original_rank, + builder.shape().begin()); builder.shape()[original_rank] = bytes_per_elem; - // Set implicit bounds: array dims are implicit, bytes dim is explicit - DimensionSet implicit_lower(false); - DimensionSet implicit_upper(false); - for (DimensionIndex i = 0; i < original_rank; ++i) { - implicit_upper[i] = true; // Array dimensions are resizable - } - 
builder.implicit_lower_bounds(implicit_lower); - builder.implicit_upper_bounds(implicit_upper); + // Set implicit bounds: array dims are implicit (resizable), bytes dim is explicit + builder.implicit_lower_bounds(DimensionSet(false)); + builder.implicit_upper_bounds(DimensionSet::UpTo(original_rank)); // Copy dimension names if available if (metadata_constraints.dimension_names) { @@ -769,13 +760,8 @@ class ZarrDataCache : public ChunkCacheImpl, public DataCacheBase { // Set implicit bounds: array dims have implicit upper bounds (resizable), // bytes dim has explicit bounds (fixed size). - DimensionSet implicit_lower_bounds(false); - DimensionSet implicit_upper_bounds(false); - for (DimensionIndex i = 0; i < rank; ++i) { - implicit_upper_bounds[i] = true; - } - builder.implicit_lower_bounds(implicit_lower_bounds); - builder.implicit_upper_bounds(implicit_upper_bounds); + builder.implicit_lower_bounds(DimensionSet(false)); + builder.implicit_upper_bounds(DimensionSet::UpTo(rank)); for (DimensionIndex i = 0; i < total_rank; ++i) { builder.output_single_input_dimension(i, i); From ac98313f5b2bb35bad4eaa338ac7285d86701bbd Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 26 Jan 2026 21:59:36 +0000 Subject: [PATCH 48/59] zarr3: Fix open_as_void with sharding for non-structured types The sub-chunk cache in sharding mode uses a grid from the sharding codec state, which doesn't know about void access. This caused issues: 1. Shape mismatch: The grid's component shape was [4, 4] but decoded arrays had shape [4, 4, 4] (with bytes dimension) 2. 
Invalid key generation: The grid's chunk_shape affected cell indexing Fix by: - Add `grid_has_void_dimension_` flag to track whether the grid includes the bytes dimension (false for sub-chunk caches) - For sub-chunk caches with void access on non-structured types, create a modified grid with: - Component chunk_shape including bytes dimension [4, 4, 4] - Grid chunk_shape unchanged [4, 4] (for cell indexing) - Proper chunked_to_cell_dimensions mapping This enables void access to work correctly with sharding codecs. --- tensorstore/driver/zarr3/chunk_cache.cc | 36 ++++++++++++++----- tensorstore/driver/zarr3/chunk_cache.h | 48 ++++++++++++++++++++++--- 2 files changed, 72 insertions(+), 12 deletions(-) diff --git a/tensorstore/driver/zarr3/chunk_cache.cc b/tensorstore/driver/zarr3/chunk_cache.cc index 9f37934eb..b60963288 100644 --- a/tensorstore/driver/zarr3/chunk_cache.cc +++ b/tensorstore/driver/zarr3/chunk_cache.cc @@ -78,13 +78,15 @@ ZarrChunkCache::~ZarrChunkCache() = default; ZarrLeafChunkCache::ZarrLeafChunkCache( kvstore::DriverPtr store, ZarrCodecChain::PreparedState::Ptr codec_state, ZarrDType dtype, internal::CachePool::WeakPtr /*data_cache_pool*/, - bool open_as_void, bool original_is_structured, DataType original_dtype) + bool open_as_void, bool original_is_structured, DataType original_dtype, + bool grid_has_void_dimension) : Base(std::move(store)), codec_state_(std::move(codec_state)), dtype_(std::move(dtype)), open_as_void_(open_as_void), original_is_structured_(original_is_structured), - original_dtype_(original_dtype) {} + original_dtype_(original_dtype), + grid_has_void_dimension_(grid_has_void_dimension) {} void ZarrLeafChunkCache::Read(ZarrChunkCache::ReadRequest request, AnyFlowReceiver chunk_indices, // Non-structured types: codec expects original dtype without extra // dimension. Decode, then reinterpret as bytes. 
- const auto& void_chunk_shape = grid().chunk_shape; - std::vector original_chunk_shape( - void_chunk_shape.begin(), - void_chunk_shape.end() - 1); // Strip bytes dimension + // + // For top-level caches, grid().chunk_shape includes bytes dimension. + // For sub-chunk caches (inside sharding), grid() returns the sharding + // codec's sub_chunk_grid which doesn't have bytes dimension. + const Index bytes_per_element = dtype_.bytes_per_outer_element; + const auto& grid_chunk_shape = grid().chunk_shape; + + std::vector original_chunk_shape; + if (grid_has_void_dimension_) { + // Strip the bytes dimension to get original shape + original_chunk_shape.assign(grid_chunk_shape.begin(), + grid_chunk_shape.end() - 1); + } else { + // Sub-chunk cache: grid shape is already the original shape + original_chunk_shape.assign(grid_chunk_shape.begin(), + grid_chunk_shape.end()); + } // Decode using original codec shape TENSORSTORE_ASSIGN_OR_RETURN( @@ -198,9 +213,13 @@ ZarrLeafChunkCache::DecodeChunk(span chunk_indices, assert(IsContiguousLayout(decoded_array.layout(), c_order, decoded_array.dtype().size())); + // Build the void output shape: original_shape + [bytes_per_element] + std::vector void_output_shape = original_chunk_shape; + void_output_shape.push_back(bytes_per_element); + // Reinterpret the decoded array's bytes as [chunk_shape..., bytes_per_elem] auto byte_array = AllocateArray( - void_component_shape, c_order, default_init, + void_output_shape, c_order, default_init, dtype_v); // Copy decoded data to byte array (handles potential layout differences) @@ -351,7 +370,8 @@ kvstore::Driver* ZarrLeafChunkCache::GetKvStoreDriver() { ZarrShardedChunkCache::ZarrShardedChunkCache( kvstore::DriverPtr store, ZarrCodecChain::PreparedState::Ptr codec_state, ZarrDType dtype, internal::CachePool::WeakPtr data_cache_pool, - bool open_as_void, bool original_is_structured, DataType original_dtype) + bool open_as_void, bool original_is_structured, DataType original_dtype, + bool 
/*grid_has_void_dimension*/) : base_kvstore_(std::move(store)), codec_state_(std::move(codec_state)), dtype_(std::move(dtype)), diff --git a/tensorstore/driver/zarr3/chunk_cache.h b/tensorstore/driver/zarr3/chunk_cache.h index 58b1d4c68..6e5bacdb9 100644 --- a/tensorstore/driver/zarr3/chunk_cache.h +++ b/tensorstore/driver/zarr3/chunk_cache.h @@ -18,6 +18,8 @@ #include #include +#include +#include #include #include @@ -161,7 +163,8 @@ class ZarrLeafChunkCache : public internal::KvsBackedChunkCache, internal::CachePool::WeakPtr data_cache_pool, bool open_as_void, bool original_is_structured, - DataType original_dtype); + DataType original_dtype, + bool grid_has_void_dimension = true); void Read(ZarrChunkCache::ReadRequest request, AnyFlowReceiver( @@ -278,9 +283,37 @@ class ZarrShardSubChunkCache : public ChunkCacheImpl { ZarrCodecChain::PreparedState::Ptr( sharding_state->sub_chunk_codec_state), std::move(dtype), std::move(data_cache_pool), - open_as_void, original_is_structured, original_dtype), + open_as_void, original_is_structured, original_dtype, + /*grid_has_void_dimension=*/false), sharding_state_(std::move(sharding_state)), - executor_(std::move(executor)) {} + executor_(std::move(executor)), + open_as_void_(open_as_void), + original_is_structured_(original_is_structured), + bytes_per_element_(dtype.bytes_per_outer_element) { + // For void access on non-structured types, create a modified grid + // with the bytes dimension added to the component shape. + // The grid's chunk_shape stays the same (determines cell layout). 
+ if (open_as_void_ && !original_is_structured_) { + const auto& original_grid = *sharding_state_->sub_chunk_grid; + const auto& orig_comp = original_grid.components[0]; + // Component chunk_shape gets bytes dimension, grid chunk_shape doesn't + std::vector void_comp_shape = orig_comp.chunk_shape; + void_comp_shape.push_back(bytes_per_element_); + // Create zero fill value with the void shape + auto fill_value = AllocateArray(void_comp_shape, c_order, value_init, + dtype_v); + // chunked_to_cell_dimensions maps the grid dimensions to cell dimensions + // (the bytes dimension is unchunked, so not included here) + std::vector chunked_to_cell(original_grid.chunk_shape.size()); + std::iota(chunked_to_cell.begin(), chunked_to_cell.end(), 0); + internal::ChunkGridSpecification::ComponentList components; + components.emplace_back( + internal::AsyncWriteArray::Spec{std::move(fill_value), + Box<>(void_comp_shape.size())}, + void_comp_shape, std::move(chunked_to_cell)); + void_grid_.emplace(std::move(components)); + } + } const internal::LexicographicalGridIndexKeyParser& GetChunkStorageKeyParser() override { @@ -288,6 +321,9 @@ class ZarrShardSubChunkCache : public ChunkCacheImpl { } const internal::ChunkGridSpecification& grid() const override { + if (void_grid_) { + return *void_grid_; + } return *sharding_state_->sub_chunk_grid; } const Executor& executor() const override { return executor_; } @@ -296,6 +332,10 @@ class ZarrShardSubChunkCache : public ChunkCacheImpl { ZarrShardingCodec::PreparedState::Ptr sharding_state_; Executor executor_; + bool open_as_void_; + bool original_is_structured_; + Index bytes_per_element_; + std::optional void_grid_; }; // Creates a `ZarrChunkCache` for the specified `codec_chain`. 
From 2d7f34c90c0ac9fe27872667d3c42222056c4e31 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 26 Jan 2026 22:03:03 +0000 Subject: [PATCH 49/59] zarr3: Remove redundant member variables in ZarrShardSubChunkCache The ZarrShardSubChunkCache template had duplicate member variables (open_as_void_, original_is_structured_, bytes_per_element_) that were already present in the base class ChunkCacheImpl (ZarrLeafChunkCache). Access these through ChunkCacheImpl:: prefix instead to follow DRY principle and maintain consistency with other TensorStore patterns. --- tensorstore/driver/zarr3/chunk_cache.h | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/tensorstore/driver/zarr3/chunk_cache.h b/tensorstore/driver/zarr3/chunk_cache.h index 6e5bacdb9..f698ee232 100644 --- a/tensorstore/driver/zarr3/chunk_cache.h +++ b/tensorstore/driver/zarr3/chunk_cache.h @@ -286,19 +286,17 @@ class ZarrShardSubChunkCache : public ChunkCacheImpl { open_as_void, original_is_structured, original_dtype, /*grid_has_void_dimension=*/false), sharding_state_(std::move(sharding_state)), - executor_(std::move(executor)), - open_as_void_(open_as_void), - original_is_structured_(original_is_structured), - bytes_per_element_(dtype.bytes_per_outer_element) { + executor_(std::move(executor)) { // For void access on non-structured types, create a modified grid // with the bytes dimension added to the component shape. // The grid's chunk_shape stays the same (determines cell layout). 
- if (open_as_void_ && !original_is_structured_) { + if (ChunkCacheImpl::open_as_void_ && + !ChunkCacheImpl::original_is_structured_) { const auto& original_grid = *sharding_state_->sub_chunk_grid; const auto& orig_comp = original_grid.components[0]; // Component chunk_shape gets bytes dimension, grid chunk_shape doesn't std::vector void_comp_shape = orig_comp.chunk_shape; - void_comp_shape.push_back(bytes_per_element_); + void_comp_shape.push_back(ChunkCacheImpl::dtype_.bytes_per_outer_element); // Create zero fill value with the void shape auto fill_value = AllocateArray(void_comp_shape, c_order, value_init, dtype_v); @@ -332,9 +330,6 @@ class ZarrShardSubChunkCache : public ChunkCacheImpl { ZarrShardingCodec::PreparedState::Ptr sharding_state_; Executor executor_; - bool open_as_void_; - bool original_is_structured_; - Index bytes_per_element_; std::optional void_grid_; }; From a20a68657f21fea90170db86089a3d2fe369b510 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 3 Feb 2026 17:16:16 +0000 Subject: [PATCH 50/59] Fix includes. 
Resolves: https://github.com/google/tensorstore/pull/271#discussion_r2757581279, https://github.com/google/tensorstore/pull/271#discussion_r2757585156, https://github.com/google/tensorstore/pull/271#discussion_r2757612952, https://github.com/google/tensorstore/pull/271#discussion_r2757620298, --- tensorstore/driver/zarr3/driver.cc | 2 +- tensorstore/driver/zarr3/dtype.cc | 12 ++++++++++++ tensorstore/driver/zarr3/dtype.h | 7 +++++++ tensorstore/driver/zarr3/metadata.cc | 8 ++++---- tensorstore/driver/zarr3/metadata.h | 6 +++++- 5 files changed, 29 insertions(+), 6 deletions(-) diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index 61c573381..b55246a62 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -25,6 +25,7 @@ #include #include +#include #include "absl/status/status.h" #include "absl/strings/ascii.h" #include "absl/strings/cord.h" @@ -32,7 +33,6 @@ #include "absl/strings/str_cat.h" #include "absl/time/clock.h" #include "absl/time/time.h" -#include #include "tensorstore/array.h" #include "tensorstore/array_storage_statistics.h" #include "tensorstore/box.h" diff --git a/tensorstore/driver/zarr3/dtype.cc b/tensorstore/driver/zarr3/dtype.cc index b8aacaa68..965f2d63a 100644 --- a/tensorstore/driver/zarr3/dtype.cc +++ b/tensorstore/driver/zarr3/dtype.cc @@ -16,15 +16,27 @@ #include +#include +#include +#include #include +#include +#include +#include #include "absl/base/optimization.h" +#include "absl/status/status.h" #include "absl/strings/ascii.h" +#include "absl/strings/numbers.h" #include "tensorstore/data_type.h" +#include "tensorstore/index.h" +#include "tensorstore/internal/integer_overflow.h" #include "tensorstore/internal/json_binding/json_binding.h" #include "tensorstore/util/endian.h" #include "tensorstore/util/extents.h" #include "tensorstore/util/quote_string.h" +#include "tensorstore/util/result.h" +#include "tensorstore/util/span.h" #include "tensorstore/util/str_cat.h" 
namespace tensorstore { diff --git a/tensorstore/driver/zarr3/dtype.h b/tensorstore/driver/zarr3/dtype.h index 73a6b0961..d9be43db0 100644 --- a/tensorstore/driver/zarr3/dtype.h +++ b/tensorstore/driver/zarr3/dtype.h @@ -20,9 +20,16 @@ /// See: https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#data-type #include +#include +#include +#include + #include +#include "absl/status/status.h" #include "tensorstore/data_type.h" +#include "tensorstore/index.h" #include "tensorstore/internal/json_binding/bindable.h" +#include "tensorstore/json_serialization_options_base.h" #include "tensorstore/util/endian.h" #include "tensorstore/util/result.h" diff --git a/tensorstore/driver/zarr3/metadata.cc b/tensorstore/driver/zarr3/metadata.cc index ba4454de4..4f8146f10 100644 --- a/tensorstore/driver/zarr3/metadata.cc +++ b/tensorstore/driver/zarr3/metadata.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -31,18 +32,16 @@ #include #include -#include - +#include #include "absl/algorithm/container.h" -#include "absl/strings/escaping.h" #include "absl/base/casts.h" #include "absl/base/optimization.h" #include "absl/meta/type_traits.h" #include "absl/status/status.h" +#include "absl/strings/escaping.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" #include "absl/strings/str_join.h" -#include #include "tensorstore/array.h" #include "tensorstore/box.h" #include "tensorstore/chunk_layout.h" @@ -78,6 +77,7 @@ #include "tensorstore/serialization/fwd.h" #include "tensorstore/serialization/json_bindable.h" #include "tensorstore/util/constant_vector.h" +#include "tensorstore/util/dimension_set.h" #include "tensorstore/util/iterate.h" #include "tensorstore/util/quote_string.h" #include "tensorstore/util/result.h" diff --git a/tensorstore/driver/zarr3/metadata.h b/tensorstore/driver/zarr3/metadata.h index d091dea22..88961cbb3 100644 --- a/tensorstore/driver/zarr3/metadata.h +++ b/tensorstore/driver/zarr3/metadata.h @@ -19,14 
+19,18 @@ /// Support for encoding/decoding the JSON metadata for zarr arrays /// See: https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#metadata +#include + +#include #include #include #include +#include #include #include -#include "absl/status/status.h" #include +#include "absl/status/status.h" #include "tensorstore/array.h" #include "tensorstore/chunk_layout.h" #include "tensorstore/codec_spec.h" From e9ac8286dc87dadd02a69eafa12d966a5739672e Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 3 Feb 2026 17:18:17 +0000 Subject: [PATCH 51/59] Fix indentation --- tensorstore/driver/zarr3/dtype.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorstore/driver/zarr3/dtype.h b/tensorstore/driver/zarr3/dtype.h index d9be43db0..dc4c8e4f3 100644 --- a/tensorstore/driver/zarr3/dtype.h +++ b/tensorstore/driver/zarr3/dtype.h @@ -137,13 +137,13 @@ absl::Status ValidateDType(ZarrDType& dtype); /// unstructured scalar array, otherwise `std::nullopt`. std::optional GetScalarDataType(const ZarrDType& dtype); - /// Parses a Zarr 3 data type string. - /// - /// \error `absl::StatusCode::kInvalidArgument` if `dtype` is not valid. - Result ParseBaseDType(std::string_view dtype); +/// Parses a Zarr 3 data type string. +/// +/// \error `absl::StatusCode::kInvalidArgument` if `dtype` is not valid. +Result ParseBaseDType(std::string_view dtype); - /// Chooses a zarr data type corresponding to `dtype`. - Result ChooseBaseDType(DataType dtype); +/// Chooses a zarr data type corresponding to `dtype`. +Result ChooseBaseDType(DataType dtype); } // namespace internal_zarr3 } // namespace tensorstore From 858e40d8ea7cb02c041eb6eff4ff25f04d259cf7 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 3 Feb 2026 17:23:39 +0000 Subject: [PATCH 52/59] Fix imports. 
Resolves: https://github.com/google/tensorstore/pull/271#discussion_r2757654099, https://github.com/google/tensorstore/pull/271#discussion_r2757655270, https://github.com/google/tensorstore/pull/271#discussion_r2757659108 --- tensorstore/driver/zarr3/chunk_cache.cc | 5 +++-- tensorstore/driver/zarr3/chunk_cache.h | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorstore/driver/zarr3/chunk_cache.cc b/tensorstore/driver/zarr3/chunk_cache.cc index b60963288..16942b009 100644 --- a/tensorstore/driver/zarr3/chunk_cache.cc +++ b/tensorstore/driver/zarr3/chunk_cache.cc @@ -18,7 +18,6 @@ #include #include -#include #include #include #include @@ -31,14 +30,16 @@ #include "absl/time/time.h" #include "tensorstore/array.h" #include "tensorstore/array_storage_statistics.h" -#include "tensorstore/contiguous_layout.h" #include "tensorstore/batch.h" #include "tensorstore/box.h" +#include "tensorstore/contiguous_layout.h" +#include "tensorstore/data_type.h" #include "tensorstore/driver/chunk.h" #include "tensorstore/driver/chunk_receiver_utils.h" #include "tensorstore/driver/read_request.h" #include "tensorstore/driver/write_request.h" #include "tensorstore/driver/zarr3/codec/codec.h" +#include "tensorstore/driver/zarr3/dtype.h" #include "tensorstore/index.h" #include "tensorstore/index_interval.h" #include "tensorstore/index_space/index_transform.h" diff --git a/tensorstore/driver/zarr3/chunk_cache.h b/tensorstore/driver/zarr3/chunk_cache.h index f698ee232..b574652a9 100644 --- a/tensorstore/driver/zarr3/chunk_cache.h +++ b/tensorstore/driver/zarr3/chunk_cache.h @@ -29,6 +29,7 @@ #include "absl/synchronization/mutex.h" #include "absl/time/time.h" #include "tensorstore/array.h" +#include "tensorstore/data_type.h" #include "tensorstore/driver/chunk.h" #include "tensorstore/driver/read_request.h" #include "tensorstore/driver/write_request.h" From be1ab7c14836fa282461d12cdca5bdc4e17cbe6f Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 3 Feb 2026 19:28:00 
+0000 Subject: [PATCH 53/59] friend inline the equality and inequality operator overloads. Resolves: https://github.com/google/tensorstore/pull/271#discussion_r2757596957 --- tensorstore/driver/zarr3/dtype.cc | 32 -------------------------- tensorstore/driver/zarr3/dtype.h | 38 +++++++++++++++++++++++-------- 2 files changed, 29 insertions(+), 41 deletions(-) diff --git a/tensorstore/driver/zarr3/dtype.cc b/tensorstore/driver/zarr3/dtype.cc index 965f2d63a..f48167357 100644 --- a/tensorstore/driver/zarr3/dtype.cc +++ b/tensorstore/driver/zarr3/dtype.cc @@ -296,38 +296,6 @@ Result ParseDType(const nlohmann::json& value) { return dtype; } -bool operator==(const ZarrDType::BaseDType& a, - const ZarrDType::BaseDType& b) { - return a.encoded_dtype == b.encoded_dtype && a.dtype == b.dtype && - a.flexible_shape == b.flexible_shape; -} - -bool operator!=(const ZarrDType::BaseDType& a, - const ZarrDType::BaseDType& b) { - return !(a == b); -} - -bool operator==(const ZarrDType::Field& a, const ZarrDType::Field& b) { - return static_cast(a) == - static_cast(b) && - a.outer_shape == b.outer_shape && a.name == b.name && - a.field_shape == b.field_shape && - a.num_inner_elements == b.num_inner_elements && - a.byte_offset == b.byte_offset && a.num_bytes == b.num_bytes; -} - -bool operator!=(const ZarrDType::Field& a, const ZarrDType::Field& b) { - return !(a == b); -} - -bool operator==(const ZarrDType& a, const ZarrDType& b) { - return a.has_fields == b.has_fields && - a.bytes_per_outer_element == b.bytes_per_outer_element && - a.fields == b.fields; -} - -bool operator!=(const ZarrDType& a, const ZarrDType& b) { return !(a == b); } - void to_json(::nlohmann::json& out, const ZarrDType::Field& field) { using array_t = ::nlohmann::json::array_t; if (field.outer_shape.empty()) { diff --git a/tensorstore/driver/zarr3/dtype.h b/tensorstore/driver/zarr3/dtype.h index dc4c8e4f3..039aaf072 100644 --- a/tensorstore/driver/zarr3/dtype.h +++ b/tensorstore/driver/zarr3/dtype.h @@ -66,6 
+66,14 @@ struct ZarrDType { /// For "flexible" data types that are themselves arrays, this specifies the /// shape. For regular data types, this is empty. std::vector flexible_shape; + + friend bool operator==(const BaseDType& a, const BaseDType& b) { + return a.encoded_dtype == b.encoded_dtype && a.dtype == b.dtype && + a.flexible_shape == b.flexible_shape; + } + friend bool operator!=(const BaseDType& a, const BaseDType& b) { + return !(a == b); + } }; /// Decoded representation of a single field. @@ -92,6 +100,18 @@ struct ZarrDType { /// Number of bytes occupied by this field within an "outer" element /// (derived value). Index num_bytes; + + friend bool operator==(const Field& a, const Field& b) { + return static_cast(a) == + static_cast(b) && + a.outer_shape == b.outer_shape && a.name == b.name && + a.field_shape == b.field_shape && + a.num_inner_elements == b.num_inner_elements && + a.byte_offset == b.byte_offset && a.num_bytes == b.num_bytes; + } + friend bool operator!=(const Field& a, const Field& b) { + return !(a == b); + } }; /// Equal to `true` if the zarr "dtype" was specified as an array, in which @@ -110,16 +130,16 @@ struct ZarrDType { friend void to_json(::nlohmann::json& out, // NOLINT const ZarrDType& dtype); -}; -bool operator==(const ZarrDType::BaseDType& a, - const ZarrDType::BaseDType& b); -bool operator!=(const ZarrDType::BaseDType& a, - const ZarrDType::BaseDType& b); -bool operator==(const ZarrDType::Field& a, const ZarrDType::Field& b); -bool operator!=(const ZarrDType::Field& a, const ZarrDType::Field& b); -bool operator==(const ZarrDType& a, const ZarrDType& b); -bool operator!=(const ZarrDType& a, const ZarrDType& b); + friend bool operator==(const ZarrDType& a, const ZarrDType& b) { + return a.has_fields == b.has_fields && + a.bytes_per_outer_element == b.bytes_per_outer_element && + a.fields == b.fields; + } + friend bool operator!=(const ZarrDType& a, const ZarrDType& b) { + return !(a == b); + } +}; /// Parses a zarr metadata 
"dtype" JSON specification. /// From 24e16c254343428b849288eb1bcb6649a4b24d04 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 3 Feb 2026 19:51:16 +0000 Subject: [PATCH 54/59] Prefer absl::StrFormat over tensorstore::StrCat --- tensorstore/driver/zarr3/dtype.cc | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/tensorstore/driver/zarr3/dtype.cc b/tensorstore/driver/zarr3/dtype.cc index f48167357..e9ba761f6 100644 --- a/tensorstore/driver/zarr3/dtype.cc +++ b/tensorstore/driver/zarr3/dtype.cc @@ -28,6 +28,7 @@ #include "absl/status/status.h" #include "absl/strings/ascii.h" #include "absl/strings/numbers.h" +#include "absl/strings/str_format.h" #include "tensorstore/data_type.h" #include "tensorstore/index.h" #include "tensorstore/internal/integer_overflow.h" @@ -77,9 +78,10 @@ Result ParseBaseDType(std::string_view dtype) { if (!absl::SimpleAtoi(suffix, &num_bits) || num_bits == 0 || num_bits % 8 != 0) { - return absl::InvalidArgumentError(tensorstore::StrCat( - dtype, " data type is invalid; expected r where N is a positive " - "multiple of 8")); + return absl::InvalidArgumentError(absl::StrFormat( + "%s data type is invalid; expected r where N is a positive " + "multiple of 8", + dtype)); } Index num_bytes = num_bits / 8; return ZarrDType::BaseDType{std::string(dtype), @@ -89,18 +91,18 @@ Result ParseBaseDType(std::string_view dtype) { // Handle bare "r" - must have a number after it if (dtype.size() >= 1 && dtype[0] == 'r') { - return absl::InvalidArgumentError(tensorstore::StrCat( - dtype, " data type is invalid; expected r where N is a positive " - "multiple of 8")); + return absl::InvalidArgumentError(absl::StrFormat( + "%s data type is invalid; expected r where N is a positive " + "multiple of 8", + dtype)); } constexpr std::string_view kSupported = "bool, uint8, uint16, uint32, uint64, int8, int16, int32, int64, " "bfloat16, float16, float32, float64, complex64, complex128, r"; - return 
absl::InvalidArgumentError( - tensorstore::StrCat(dtype, " data type is not one of the supported " - "data types: ", - kSupported)); + return absl::InvalidArgumentError(absl::StrFormat( + "%s data type is not one of the supported data types: %s", dtype, + kSupported)); } namespace { From f9c675057e4257654c3a0dfdf0c66cbc463c10a0 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 3 Feb 2026 20:46:38 +0000 Subject: [PATCH 55/59] Add return type annotation to lambdas. Resolves: https://github.com/google/tensorstore/pull/271#discussion_r2757673423 --- tensorstore/driver/zarr3/driver.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index b55246a62..b9c39cc97 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -118,7 +118,7 @@ class ZarrDriverSpec static inline const auto default_json_binder = jb::Sequence( jb::Validate( - [](const auto& options, auto* obj) { + [](const auto& options, auto* obj) -> absl::Status { if (obj->schema.dtype().valid()) { return ValidateDataType(obj->schema.dtype()); } @@ -128,7 +128,7 @@ class ZarrDriverSpec jb::Member( "metadata", jb::Validate( - [](const auto& options, auto* obj) { + [](const auto& options, auto* obj) -> absl::Status { if (obj->metadata_constraints.data_type) { if (auto dtype = GetScalarDataType( *obj->metadata_constraints.data_type)) { @@ -153,7 +153,7 @@ class ZarrDriverSpec jb::Member("open_as_void", jb::Projection<&ZarrDriverSpec::open_as_void>( jb::DefaultValue( [](auto* v) { *v = false; }))), - jb::Initialize([](auto* obj) { + jb::Initialize([](auto* obj) -> absl::Status { // Validate that field and open_as_void are mutually exclusive if (obj->open_as_void && !obj->selected_field.empty()) { return absl::InvalidArgumentError( From 6a773e27badcc64ba6bc3883e2622d741f739ae0 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 3 Feb 2026 21:15:14 +0000 Subject: [PATCH 56/59] Update 
key generation to handle grid indices safely by ensuring the subspan does not exceed the grid size. This prevents potential out-of-bounds access when generating keys. Resolves: https://github.com/google/tensorstore/pull/271#issuecomment-3839662461 --- tensorstore/driver/zarr3/driver.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index b9c39cc97..2255abb3c 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -557,7 +557,9 @@ class DataCacheBase [](std::string& out, DimensionIndex dim, Index grid_index) { absl::StrAppend(&out, grid_index); }, - rank, grid_indices.subspan(0, rank)); + rank, + grid_indices.subspan( + 0, std::min(grid_indices.size(), rank))); return key; } From 6028b5b7dbc77c63bf7d757ac146a149ecf38983 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 4 Feb 2026 15:17:47 +0000 Subject: [PATCH 57/59] Prefer `empty()` over `size()` Resolves: https://github.com/google/tensorstore/pull/271#discussion_r2761074528, https://github.com/google/tensorstore/pull/271#discussion_r2761076335 --- tensorstore/driver/zarr3/driver.cc | 2 +- tensorstore/driver/zarr3/dtype.cc | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index 2255abb3c..bbb5f29f3 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -998,7 +998,7 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { // Get the original dtype for void access encoding (needed by EncodeChunk). // For non-structured types, this is the single field's dtype. - DataType original_dtype = metadata.data_type.fields.size() > 0 + DataType original_dtype = !metadata.data_type.fields.empty() ? 
metadata.data_type.fields[0].dtype : DataType{}; diff --git a/tensorstore/driver/zarr3/dtype.cc b/tensorstore/driver/zarr3/dtype.cc index e9ba761f6..b48e8bc33 100644 --- a/tensorstore/driver/zarr3/dtype.cc +++ b/tensorstore/driver/zarr3/dtype.cc @@ -72,7 +72,8 @@ Result ParseBaseDType(std::string_view dtype) { return make_dtype(dtype_v<::tensorstore::dtypes::complex128_t>); // Handle r raw bits type where N is number of bits (must be multiple of 8) - if (dtype.size() > 1 && dtype[0] == 'r' && absl::ascii_isdigit(dtype[1])) { + if (!dtype.empty() && dtype[0] == 'r' && dtype.size() > 1 && + absl::ascii_isdigit(dtype[1])) { std::string_view suffix = dtype.substr(1); Index num_bits = 0; if (!absl::SimpleAtoi(suffix, &num_bits) || @@ -90,7 +91,7 @@ Result ParseBaseDType(std::string_view dtype) { } // Handle bare "r" - must have a number after it - if (dtype.size() >= 1 && dtype[0] == 'r') { + if (!dtype.empty() && dtype[0] == 'r') { return absl::InvalidArgumentError(absl::StrFormat( "%s data type is invalid; expected r where N is a positive " "multiple of 8", From 0cb06370de34ec2c644a32005cb4f7b45b460b2a Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Fri, 6 Feb 2026 15:54:32 +0000 Subject: [PATCH 58/59] Refactor `tensorstore::StrCat` to `absl::StrFormat`. 
Resolves https://github.com/google/tensorstore/pull/271#discussion_r2761080035 with TODO question pending --- tensorstore/driver/zarr3/codec/blosc.cc | 6 +- tensorstore/driver/zarr3/codec/bytes.cc | 21 ++++-- .../driver/zarr3/codec/codec_chain_spec.cc | 12 ++-- .../driver/zarr3/codec/sharding_indexed.cc | 4 ++ tensorstore/driver/zarr3/codec/transpose.cc | 5 ++ tensorstore/driver/zarr3/driver.cc | 7 +- tensorstore/driver/zarr3/dtype.cc | 23 +++--- tensorstore/driver/zarr3/metadata.cc | 72 ++++++++++--------- 8 files changed, 86 insertions(+), 64 deletions(-) diff --git a/tensorstore/driver/zarr3/codec/blosc.cc b/tensorstore/driver/zarr3/codec/blosc.cc index ea8718d85..b11677411 100644 --- a/tensorstore/driver/zarr3/codec/blosc.cc +++ b/tensorstore/driver/zarr3/codec/blosc.cc @@ -160,9 +160,9 @@ constexpr auto CodecBinder() { return jb::Validate([](const auto& options, std::string* cname) { if (cname->find('\0') != std::string::npos || blosc_compname_to_compcode(cname->c_str()) == -1) { - return absl::InvalidArgumentError( - tensorstore::StrCat("Expected one of ", blosc_list_compressors(), - " but received: ", QuoteString(*cname))); + return absl::InvalidArgumentError(absl::StrFormat( + "Expected one of %s but received: %s", blosc_list_compressors(), + QuoteString(*cname))); } return absl::OkStatus(); }); diff --git a/tensorstore/driver/zarr3/codec/bytes.cc b/tensorstore/driver/zarr3/codec/bytes.cc index cb3c62934..c8c4de059 100644 --- a/tensorstore/driver/zarr3/codec/bytes.cc +++ b/tensorstore/driver/zarr3/codec/bytes.cc @@ -22,6 +22,7 @@ #include #include "absl/status/status.h" +#include "absl/strings/str_format.h" #include "riegeli/bytes/reader.h" #include "riegeli/bytes/writer.h" #include "tensorstore/array.h" @@ -52,8 +53,8 @@ namespace internal_zarr3 { namespace { absl::Status InvalidDataTypeError(DataType dtype) { - return absl::InvalidArgumentError(tensorstore::StrCat( - "Data type ", dtype, " not compatible with \"bytes\" codec")); + return 
absl::InvalidArgumentError(absl::StrFormat( + "Data type %v not compatible with \"bytes\" codec", dtype)); } class BytesCodec : public ZarrArrayToBytesCodec { @@ -118,23 +119,27 @@ Result BytesCodecSpec::Resolve( const bool is_endian_invariant = internal::IsEndianInvariantDataType(decoded.dtype); if (!options.constraints && !is_endian_invariant && !options.endianness) { - return absl::InvalidArgumentError( - tensorstore::StrCat("\"bytes\" codec requires that \"endian\" option " - "is specified for data type ", - decoded.dtype)); + return absl::InvalidArgumentError(absl::StrFormat( + "\"bytes\" codec requires that \"endian\" option is specified for " + "data type %v", + decoded.dtype)); } encoded.item_bits = decoded.dtype.size() * 8; DimensionIndex rank = decoded.rank; if (decoded.codec_chunk_shape) { + // TODO(BrianMichell): Convert to absl::StrFormat once tensorstore::span has + // AbslStringify support, allowing use of %v format specifier. return absl::InvalidArgumentError(tensorstore::StrCat( "\"bytes\" codec does not support codec_chunk_shape (", span(decoded.codec_chunk_shape->data(), rank), - " was specified")); + " was specified)")); } if (decoded.inner_order) { auto& decoded_inner_order = *decoded.inner_order; for (DimensionIndex i = 0; i < rank; ++i) { if (decoded_inner_order[i] != i) { + // TODO(BrianMichell): Convert to absl::StrFormat once tensorstore::span has + // AbslStringify support, allowing use of %v format specifier. return absl::InvalidArgumentError(tensorstore::StrCat( "\"bytes\" codec does not support inner_order of ", span(decoded_inner_order.data(), rank))); @@ -206,6 +211,8 @@ Result BytesCodec::Prepare( int64_t bytes = dtype_.size(); for (auto size : decoded_shape) { if (internal::MulOverflow(size, bytes, &bytes)) { + // TODO(BrianMichell): Convert to absl::StrFormat once tensorstore::span has + // AbslStringify support, allowing use of %v format specifier. 
return absl::OutOfRangeError(tensorstore::StrCat( "Integer overflow computing encoded size of array of shape ", decoded_shape)); diff --git a/tensorstore/driver/zarr3/codec/codec_chain_spec.cc b/tensorstore/driver/zarr3/codec/codec_chain_spec.cc index c3bacd6cd..7dc9b6ced 100644 --- a/tensorstore/driver/zarr3/codec/codec_chain_spec.cc +++ b/tensorstore/driver/zarr3/codec/codec_chain_spec.cc @@ -132,8 +132,8 @@ constexpr auto ZarrCodecChainSpecJsonBinderImpl = jb::Compose< } for (; it != end; ++it) { if ((*it)->kind() != ZarrCodecKind::kBytesToBytes) { - return absl::InvalidArgumentError(tensorstore::StrCat( - "Expected bytes -> bytes codec, but received: ", + return absl::InvalidArgumentError(absl::StrFormat( + "Expected bytes -> bytes codec, but received: %s", jb::ToJson(*it, ZarrCodecJsonBinder).value().dump())); } obj->bytes_to_bytes.push_back( @@ -164,16 +164,16 @@ Result GetDefaultArrayToBytesCodecSpec( if (internal::IsTrivialDataType(decoded.dtype)) { return DefaultBytesCodec(); } - return absl::InternalError(tensorstore::StrCat( - "No default codec defined for data type ", decoded.dtype)); + return absl::InternalError(absl::StrFormat( + "No default codec defined for data type %v", decoded.dtype)); } absl::Status CodecResolveError(const ZarrCodecSpec& codec_spec, std::string_view message, const absl::Status& status) { return tensorstore::MaybeAnnotateStatus( - status, tensorstore::StrCat( - "Error ", message, " through ", + status, absl::StrFormat( + "Error %s through %s", message, jb::ToJson(&codec_spec, ZarrCodecJsonBinder).value().dump())); } } // namespace diff --git a/tensorstore/driver/zarr3/codec/sharding_indexed.cc b/tensorstore/driver/zarr3/codec/sharding_indexed.cc index 3f11298c2..2700f9887 100644 --- a/tensorstore/driver/zarr3/codec/sharding_indexed.cc +++ b/tensorstore/driver/zarr3/codec/sharding_indexed.cc @@ -60,6 +60,8 @@ namespace tensorstore { namespace internal_zarr3 { +// TODO(BrianMichell): Convert to absl::StrFormat once 
tensorstore::span has +// AbslStringify support, allowing use of %v format specifier. absl::Status SubChunkRankMismatch(span sub_chunk_shape, DimensionIndex outer_rank) { return absl::InvalidArgumentError(tensorstore::StrCat( @@ -67,6 +69,8 @@ absl::Status SubChunkRankMismatch(span sub_chunk_shape, " is not compatible with array of rank ", outer_rank)); } +// TODO(BrianMichell): Convert to absl::StrFormat once tensorstore::span has +// AbslStringify support, allowing use of %v format specifier. absl::Status SubChunkShapeMismatch(span sub_chunk_shape, span chunk_shape) { return absl::InvalidArgumentError(tensorstore::StrCat( diff --git a/tensorstore/driver/zarr3/codec/transpose.cc b/tensorstore/driver/zarr3/codec/transpose.cc index f52609c9b..13e05ae17 100644 --- a/tensorstore/driver/zarr3/codec/transpose.cc +++ b/tensorstore/driver/zarr3/codec/transpose.cc @@ -50,6 +50,9 @@ namespace internal_zarr3 { namespace { namespace jb = ::tensorstore::internal_json_binding; + +// TODO(BrianMichell): Convert to absl::StrFormat once tensorstore::span has +// AbslStringify support, allowing use of %v format specifier. absl::Status InvalidPermutationError(span order, DimensionIndex rank) { return absl::InvalidArgumentError(tensorstore::StrCat( @@ -62,6 +65,8 @@ constexpr auto OrderJsonBinder() { jb::Validate( [](const auto& options, auto* obj) { if (!IsValidPermutation(*obj)) { + // TODO(BrianMichell): Convert to absl::StrFormat once + // tensorstore::span has AbslStringify support. 
return absl::InvalidArgumentError( tensorstore::StrCat(span(*obj), " is not a valid permutation")); diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index bbb5f29f3..36ea61e20 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -31,6 +31,7 @@ #include "absl/strings/cord.h" #include "absl/strings/numbers.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "absl/time/clock.h" #include "absl/time/time.h" #include "tensorstore/array.h" @@ -385,9 +386,9 @@ class DataCacheBase auto existing_key = existing_metadata.GetCompatibilityKey(); auto new_key = new_metadata.GetCompatibilityKey(); if (existing_key == new_key) return absl::OkStatus(); - return absl::FailedPreconditionError(tensorstore::StrCat( - "Updated zarr metadata ", new_key, - " is incompatible with existing metadata ", existing_key)); + return absl::FailedPreconditionError(absl::StrFormat( + "Updated zarr metadata %s is incompatible with existing metadata %s", + new_key, existing_key)); } void GetChunkGridBounds(const void* metadata_ptr, MutableBoxView<> bounds, diff --git a/tensorstore/driver/zarr3/dtype.cc b/tensorstore/driver/zarr3/dtype.cc index b48e8bc33..31fb3644d 100644 --- a/tensorstore/driver/zarr3/dtype.cc +++ b/tensorstore/driver/zarr3/dtype.cc @@ -132,8 +132,9 @@ absl::Status ParseFieldsArray(const nlohmann::json& fields_json, x, [&](ptrdiff_t size) { if (size < 2 || size > 3) { - return absl::InvalidArgumentError(tensorstore::StrCat( - "Expected array of size 2 or 3, but received: ", x.dump())); + return absl::InvalidArgumentError(absl::StrFormat( + "Expected array of size 2 or 3, but received: %s", + x.dump())); } return absl::OkStatus(); }, @@ -143,8 +144,8 @@ absl::Status ParseFieldsArray(const nlohmann::json& fields_json, if (internal_json::JsonRequireValueAs(v, &field.name).ok()) { if (!field.name.empty()) return absl::OkStatus(); } - return 
absl::InvalidArgumentError(tensorstore::StrCat( - "Expected non-empty string, but received: ", v.dump())); + return absl::InvalidArgumentError(absl::StrFormat( + "Expected non-empty string, but received: %s", v.dump())); case 1: { std::string dtype_string; TENSORSTORE_RETURN_IF_ERROR( @@ -238,9 +239,9 @@ Result ParseDTypeNoDerived(const nlohmann::json& value) { ParseBaseDType(type_name)); return out; } - return absl::InvalidArgumentError(tensorstore::StrCat( + return absl::InvalidArgumentError(absl::StrFormat( "Expected string, array, or object with 'name' and 'configuration', " - "but received: ", + "but received: %s", value.dump())); } // Handle array format: [["field1", "type1"], ["field2", "type2"], ...] @@ -257,17 +258,19 @@ absl::Status ValidateDType(ZarrDType& dtype) { if (std::any_of( dtype.fields.begin(), dtype.fields.begin() + field_i, [&](const ZarrDType::Field& f) { return f.name == field.name; })) { - return absl::InvalidArgumentError(tensorstore::StrCat( - "Field name ", QuoteString(field.name), " occurs more than once")); + return absl::InvalidArgumentError(absl::StrFormat( + "Field name %s occurs more than once", QuoteString(field.name))); } field.field_shape.resize(field.flexible_shape.size() + - field.outer_shape.size()); + field.outer_shape.size());ß std::copy(field.flexible_shape.begin(), field.flexible_shape.end(), std::copy(field.outer_shape.begin(), field.outer_shape.end(), field.field_shape.begin())); field.num_inner_elements = ProductOfExtents(span(field.field_shape)); if (field.num_inner_elements == std::numeric_limits::max()) { + // TODO(BrianMichell): Convert to absl::StrFormat once tensorstore::span has + // AbslStringify support, allowing use of %v format specifier. 
return absl::InvalidArgumentError(tensorstore::StrCat( "Product of dimensions ", span(field.field_shape), " is too large")); } @@ -378,7 +381,7 @@ Result ChooseBaseDType(DataType dtype) { return base_dtype; } return absl::InvalidArgumentError( - tensorstore::StrCat("Data type not supported: ", dtype)); + absl::StrFormat("Data type not supported: %v", dtype)); } } // namespace internal_zarr3 diff --git a/tensorstore/driver/zarr3/metadata.cc b/tensorstore/driver/zarr3/metadata.cc index 4f8146f10..29652f911 100644 --- a/tensorstore/driver/zarr3/metadata.cc +++ b/tensorstore/driver/zarr3/metadata.cc @@ -116,9 +116,9 @@ std::string GetSupportedDataTypes() { absl::Status ValidateDataType(DataType dtype) { if (!absl::c_linear_search(kSupportedDataTypes, dtype.id())) { - return absl::InvalidArgumentError(tensorstore::StrCat( - dtype, " data type is not one of the supported data types: ", - GetSupportedDataTypes())); + return absl::InvalidArgumentError(absl::StrFormat( + "%v data type is not one of the supported data types: %s", + dtype, GetSupportedDataTypes())); } return absl::OkStatus(); } @@ -296,17 +296,17 @@ absl::Status FillValueJsonBinder::operator()( } std::string b64_decoded; if (!absl::Base64Unescape(j->get(), &b64_decoded)) { - return absl::InvalidArgumentError(tensorstore::StrCat( - "Expected valid base64-encoded fill value, but received: ", + return absl::InvalidArgumentError(absl::StrFormat( + "Expected valid base64-encoded fill value, but received: %s", j->dump())); } // Verify size matches expected byte array size Index expected_size = dtype.fields[0].num_inner_elements; if (static_cast(b64_decoded.size()) != expected_size) { - return absl::InvalidArgumentError(tensorstore::StrCat( - "Expected ", expected_size, - " base64-encoded bytes for fill_value, but received ", - b64_decoded.size(), " bytes")); + return absl::InvalidArgumentError(absl::StrFormat( + "Expected %d base64-encoded bytes for fill_value, but received " + "%d bytes", + expected_size, 
b64_decoded.size())); } // Create fill value array auto fill_arr = AllocateArray(dtype.fields[0].field_shape, c_order, @@ -323,17 +323,17 @@ absl::Status FillValueJsonBinder::operator()( // Decode base64-encoded fill value for entire struct std::string b64_decoded; if (!absl::Base64Unescape(j->get(), &b64_decoded)) { - return absl::InvalidArgumentError(tensorstore::StrCat( - "Expected valid base64-encoded fill value, but received: ", + return absl::InvalidArgumentError(absl::StrFormat( + "Expected valid base64-encoded fill value, but received: %s", j->dump())); } // Verify size matches expected struct size if (static_cast(b64_decoded.size()) != dtype.bytes_per_outer_element) { - return absl::InvalidArgumentError(tensorstore::StrCat( - "Expected ", dtype.bytes_per_outer_element, - " base64-encoded bytes for fill_value, but received ", - b64_decoded.size(), " bytes")); + return absl::InvalidArgumentError(absl::StrFormat( + "Expected %d base64-encoded bytes for fill_value, but received " + "%d bytes", + dtype.bytes_per_outer_element, b64_decoded.size())); } // Extract per-field fill values from decoded bytes for (size_t i = 0; i < dtype.fields.size(); ++i) { @@ -347,7 +347,7 @@ absl::Status FillValueJsonBinder::operator()( } else if (j->is_array()) { if (j->size() != dtype.fields.size()) { return internal_json::ExpectedError( - *j, tensorstore::StrCat("array of size ", dtype.fields.size())); + *j, absl::StrFormat("array of size %d", dtype.fields.size())); } for (size_t i = 0; i < dtype.fields.size(); ++i) { TENSORSTORE_RETURN_IF_ERROR( @@ -480,9 +480,10 @@ constexpr auto UnknownExtensionAttributesJsonBinder = continue; } } - return absl::InvalidArgumentError(tensorstore::StrCat( - "Unsupported metadata field ", tensorstore::QuoteString(key), - " is not marked {\"must_understand\": false}")); + return absl::InvalidArgumentError(absl::StrFormat( + "Unsupported metadata field %s is not marked " + "{\"must_understand\": false}", + tensorstore::QuoteString(key))); } return 
absl::OkStatus(); }); @@ -813,23 +814,23 @@ Result GetFieldIndex(const ZarrDType& dtype, if (selected_field.empty()) { if (dtype.fields.size() != 1) { - return absl::FailedPreconditionError(tensorstore::StrCat( - "Must specify a \"field\" that is one of: ", GetFieldNames(dtype))); + return absl::FailedPreconditionError(absl::StrFormat( + "Must specify a \"field\" that is one of: %s", GetFieldNames(dtype))); } return 0; } if (!dtype.has_fields) { - return absl::FailedPreconditionError( - tensorstore::StrCat("Requested field ", QuoteString(selected_field), - " but dtype does not have named fields")); + return absl::FailedPreconditionError(absl::StrFormat( + "Requested field %s but dtype does not have named fields", + QuoteString(selected_field))); } for (size_t field_index = 0; field_index < dtype.fields.size(); ++field_index) { if (dtype.fields[field_index].name == selected_field) return field_index; } - return absl::FailedPreconditionError( - tensorstore::StrCat("Requested field ", QuoteString(selected_field), - " is not one of: ", GetFieldNames(dtype))); + return absl::FailedPreconditionError(absl::StrFormat( + "Requested field %s is not one of: %s", QuoteString(selected_field), + GetFieldNames(dtype))); } SpecRankAndFieldInfo GetSpecRankAndFieldInfo(const ZarrMetadata& metadata, @@ -1056,10 +1057,10 @@ absl::Status ValidateMetadataSchema(const ZarrMetadata& metadata, const auto& field = metadata.data_type.fields[field_index]; if (!RankConstraint::EqualOrUnspecified(schema.rank(), info.chunked_rank)) { - return absl::FailedPreconditionError(tensorstore::StrCat( - "Rank specified by schema (", schema.rank(), - ") does not match rank specified by metadata (", info.chunked_rank, - ")")); + return absl::FailedPreconditionError(absl::StrFormat( + "Rank specified by schema (%d) does not match rank specified by " + "metadata (%d)", + schema.rank(), info.chunked_rank)); } if (schema.domain().valid()) { @@ -1075,9 +1076,9 @@ absl::Status ValidateMetadataSchema(const 
ZarrMetadata& metadata, if (auto dtype = schema.dtype(); !IsPossiblySameDataType(field.dtype, dtype)) { - return absl::FailedPreconditionError( - tensorstore::StrCat("data_type from metadata (", field.dtype, - ") does not match dtype in schema (", dtype, ")")); + return absl::FailedPreconditionError(absl::StrFormat( + "data_type from metadata (%v) does not match dtype in schema (%v)", + field.dtype, dtype)); } if (schema.chunk_layout().rank() != dynamic_rank) { @@ -1103,7 +1104,8 @@ absl::Status ValidateMetadataSchema(const ZarrMetadata& metadata, skip_repeated_elements, field.dtype)); if (!AreArraysIdenticallyEqual(converted_fill_value, fill_value)) { auto binder = FillValueJsonBinder{metadata.data_type}; - // Error message generation might be tricky with binder + // TODO(BrianMichellß): Convert to absl::StrFormat once SharedArray has + // AbslStringify support, allowing use of %v format specifier. return absl::FailedPreconditionError(tensorstore::StrCat( "Invalid fill_value: schema requires fill value of ", schema_fill_value, ", but metadata specifies fill value of ", From 6ba070f9fd9e8b2b16f9fffdffb07427b760eb25 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Fri, 6 Feb 2026 17:26:08 +0000 Subject: [PATCH 59/59] Use stringify for spans as well. 
Resolves: https://github.com/google/tensorstore/pull/271/#discussion_r2774836435 --- tensorstore/driver/zarr3/codec/bytes.cc | 27 ++++++++----------- .../driver/zarr3/codec/sharding_indexed.cc | 20 +++++++------- tensorstore/driver/zarr3/dtype.cc | 11 ++++---- tensorstore/driver/zarr3/metadata.cc | 18 ++++++------- 4 files changed, 34 insertions(+), 42 deletions(-) diff --git a/tensorstore/driver/zarr3/codec/bytes.cc b/tensorstore/driver/zarr3/codec/bytes.cc index abc3f8909..95633f446 100644 --- a/tensorstore/driver/zarr3/codec/bytes.cc +++ b/tensorstore/driver/zarr3/codec/bytes.cc @@ -127,22 +127,19 @@ Result BytesCodecSpec::Resolve( encoded.item_bits = decoded.dtype.size() * 8; DimensionIndex rank = decoded.rank; if (decoded.codec_chunk_shape) { - // TODO(BrianMichell): Convert to absl::StrFormat once tensorstore::span has - // AbslStringify support, allowing use of %v format specifier. - return absl::InvalidArgumentError(tensorstore::StrCat( - "\"bytes\" codec does not support codec_chunk_shape (", - span(decoded.codec_chunk_shape->data(), rank), - " was specified)")); + return absl::InvalidArgumentError(absl::StrFormat( + "\"bytes\" codec does not support codec_chunk_shape (%s was specified)", + absl::FormatStreamed( + span(decoded.codec_chunk_shape->data(), rank)))); } if (decoded.inner_order) { auto& decoded_inner_order = *decoded.inner_order; for (DimensionIndex i = 0; i < rank; ++i) { if (decoded_inner_order[i] != i) { - // TODO(BrianMichell): Convert to absl::StrFormat once tensorstore::span has - // AbslStringify support, allowing use of %v format specifier. 
- return absl::InvalidArgumentError(tensorstore::StrCat( - "\"bytes\" codec does not support inner_order of ", - span(decoded_inner_order.data(), rank))); + return absl::InvalidArgumentError(absl::StrFormat( + "\"bytes\" codec does not support inner_order of %s", + absl::FormatStreamed( + span(decoded_inner_order.data(), rank)))); } } } @@ -211,11 +208,9 @@ Result BytesCodec::Prepare( int64_t bytes = dtype_.size(); for (auto size : decoded_shape) { if (internal::MulOverflow(size, bytes, &bytes)) { - // TODO(BrianMichell): Convert to absl::StrFormat once tensorstore::span has - // AbslStringify support, allowing use of %v format specifier. - return absl::OutOfRangeError(tensorstore::StrCat( - "Integer overflow computing encoded size of array of shape ", - decoded_shape)); + return absl::OutOfRangeError(absl::StrFormat( + "Integer overflow computing encoded size of array of shape %s", + absl::FormatStreamed(decoded_shape))); } } auto state = internal::MakeIntrusivePtr(); diff --git a/tensorstore/driver/zarr3/codec/sharding_indexed.cc b/tensorstore/driver/zarr3/codec/sharding_indexed.cc index 2700f9887..453a9168d 100644 --- a/tensorstore/driver/zarr3/codec/sharding_indexed.cc +++ b/tensorstore/driver/zarr3/codec/sharding_indexed.cc @@ -24,6 +24,7 @@ #include #include "absl/status/status.h" +#include "absl/strings/str_format.h" #include "riegeli/bytes/reader.h" #include "riegeli/bytes/writer.h" #include "tensorstore/array.h" @@ -60,22 +61,21 @@ namespace tensorstore { namespace internal_zarr3 { -// TODO(BrianMichell): Convert to absl::StrFormat once tensorstore::span has -// AbslStringify support, allowing use of %v format specifier. 
absl::Status SubChunkRankMismatch(span sub_chunk_shape, DimensionIndex outer_rank) { - return absl::InvalidArgumentError(tensorstore::StrCat( - "sharding_indexed sub-chunk shape of ", sub_chunk_shape, - " is not compatible with array of rank ", outer_rank)); + return absl::InvalidArgumentError(absl::StrFormat( + "sharding_indexed sub-chunk shape of %s is not compatible with array of " + "rank %d", + absl::FormatStreamed(sub_chunk_shape), outer_rank)); } -// TODO(BrianMichell): Convert to absl::StrFormat once tensorstore::span has -// AbslStringify support, allowing use of %v format specifier. absl::Status SubChunkShapeMismatch(span sub_chunk_shape, span chunk_shape) { - return absl::InvalidArgumentError(tensorstore::StrCat( - "sharding_indexed sub-chunk shape of ", sub_chunk_shape, - " does not evenly divide chunk shape of ", chunk_shape)); + return absl::InvalidArgumentError(absl::StrFormat( + "sharding_indexed sub-chunk shape of %s does not evenly divide chunk " + "shape of %s", + absl::FormatStreamed(sub_chunk_shape), + absl::FormatStreamed(chunk_shape))); } namespace { diff --git a/tensorstore/driver/zarr3/dtype.cc b/tensorstore/driver/zarr3/dtype.cc index 31fb3644d..c799fff2f 100644 --- a/tensorstore/driver/zarr3/dtype.cc +++ b/tensorstore/driver/zarr3/dtype.cc @@ -259,20 +259,19 @@ absl::Status ValidateDType(ZarrDType& dtype) { dtype.fields.begin(), dtype.fields.begin() + field_i, [&](const ZarrDType::Field& f) { return f.name == field.name; })) { return absl::InvalidArgumentError(absl::StrFormat( - "Field name %s occurs more than once", QuoteString(field.name))); + "Field name %v occurs more than once", QuoteString(field.name))); } field.field_shape.resize(field.flexible_shape.size() + - field.outer_shape.size());ß + field.outer_shape.size()); std::copy(field.flexible_shape.begin(), field.flexible_shape.end(), std::copy(field.outer_shape.begin(), field.outer_shape.end(), field.field_shape.begin())); field.num_inner_elements = 
ProductOfExtents(span(field.field_shape)); if (field.num_inner_elements == std::numeric_limits::max()) { - // TODO(BrianMichell): Convert to absl::StrFormat once tensorstore::span has - // AbslStringify support, allowing use of %v format specifier. - return absl::InvalidArgumentError(tensorstore::StrCat( - "Product of dimensions ", span(field.field_shape), " is too large")); + return absl::InvalidArgumentError(absl::StrFormat( + "Product of dimensions %s is too large", + absl::FormatStreamed(span(field.field_shape)))); } if (internal::MulOverflow(field.num_inner_elements, static_cast(field.dtype->size), diff --git a/tensorstore/driver/zarr3/metadata.cc b/tensorstore/driver/zarr3/metadata.cc index 29652f911..92f01368c 100644 --- a/tensorstore/driver/zarr3/metadata.cc +++ b/tensorstore/driver/zarr3/metadata.cc @@ -481,7 +481,7 @@ constexpr auto UnknownExtensionAttributesJsonBinder = } } return absl::InvalidArgumentError(absl::StrFormat( - "Unsupported metadata field %s is not marked " + "Unsupported metadata field %v is not marked " "{\"must_understand\": false}", tensorstore::QuoteString(key))); } @@ -821,7 +821,7 @@ Result GetFieldIndex(const ZarrDType& dtype, } if (!dtype.has_fields) { return absl::FailedPreconditionError(absl::StrFormat( - "Requested field %s but dtype does not have named fields", + "Requested field %v but dtype does not have named fields", QuoteString(selected_field))); } for (size_t field_index = 0; field_index < dtype.fields.size(); @@ -829,7 +829,7 @@ Result GetFieldIndex(const ZarrDType& dtype, if (dtype.fields[field_index].name == selected_field) return field_index; } return absl::FailedPreconditionError(absl::StrFormat( - "Requested field %s is not one of: %s", QuoteString(selected_field), + "Requested field %v is not one of: %s", QuoteString(selected_field), GetFieldNames(dtype))); } @@ -1103,13 +1103,11 @@ absl::Status ValidateMetadataSchema(const ZarrMetadata& metadata, tensorstore::MakeCopy(std::move(broadcast_fill_value), 
skip_repeated_elements, field.dtype)); if (!AreArraysIdenticallyEqual(converted_fill_value, fill_value)) { - auto binder = FillValueJsonBinder{metadata.data_type}; - // TODO(BrianMichellß): Convert to absl::StrFormat once SharedArray has - // AbslStringify support, allowing use of %v format specifier. - return absl::FailedPreconditionError(tensorstore::StrCat( - "Invalid fill_value: schema requires fill value of ", - schema_fill_value, ", but metadata specifies fill value of ", - fill_value)); + return absl::FailedPreconditionError(absl::StrFormat( + "Invalid fill_value: schema requires fill value of %s, but metadata " + "specifies fill value of %s", + absl::FormatStreamed(schema_fill_value), + absl::FormatStreamed(fill_value))); } }