From bab8d1eddccb396eaae424cdfced68957939efa9 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 1 Jan 2026 18:37:47 +0000 Subject: [PATCH 01/32] Add decimal(n,f) to core types (Tier 2) Add fixed-point decimal as a core DataJoint type, allowing it to be recorded in field comments using :type: syntax for reconstruction. This provides scientists with a standardized type for exact numeric precision use cases (financial data, coordinates, etc.). Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 1 + src/datajoint/declare.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 668fdfdf5..483a10cf7 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -66,6 +66,7 @@ MySQL and PostgreSQL backends. Users should prefer these over native database ty | `uint64` | 64-bit unsigned | `BIGINT UNSIGNED` | | `float32` | 32-bit float | `FLOAT` | | `float64` | 64-bit float | `DOUBLE` | +| `decimal(n,f)` | Fixed-point | `DECIMAL(n,f)` | ### String Types diff --git a/src/datajoint/declare.py b/src/datajoint/declare.py index 758c709e5..f9de07f4e 100644 --- a/src/datajoint/declare.py +++ b/src/datajoint/declare.py @@ -44,6 +44,8 @@ "varchar": (r"varchar\s*\(\d+\)$", None), # Enumeration "enum": (r"enum\s*\(.+\)$", None), + # Fixed-point decimal + "decimal": (r"decimal\s*\(\d+\s*,\s*\d+\)$", None), } # Compile core type patterns From 21dd7006b484a22f36420b37f890794b5fc13f10 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 1 Jan 2026 18:50:00 +0000 Subject: [PATCH 02/32] Rename core type 'blob' to 'bytes' for cross-database portability Change the core binary type from 'blob' to 'bytes' to: - Enable cross-database portability (LONGBLOB in MySQL, BYTEA in PostgreSQL) - Free up native blob types (tinyblob, blob, mediumblob, longblob) - Use Pythonic naming that matches the stored/returned type Update all documentation to include PostgreSQL type mappings alongside MySQL mappings, making the cross-database support explicit. Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/attributes.md | 36 +++++----- docs/src/design/tables/customtype.md | 2 +- .../storage-types-implementation-plan.md | 4 +- docs/src/design/tables/storage-types-spec.md | 70 +++++++++---------- src/datajoint/declare.py | 6 +- 5 files changed, 60 insertions(+), 58 deletions(-) diff --git a/docs/src/design/tables/attributes.md b/docs/src/design/tables/attributes.md index 2e8105e7c..9111e5b03 100644 --- a/docs/src/design/tables/attributes.md +++ b/docs/src/design/tables/attributes.md @@ -89,25 +89,26 @@ into a DataJoint pipeline. conversion between Python objects and database storage formats. Use this to store complex data types like graphs, domain-specific objects, or custom data structures. -## Numeric type aliases +## Core type aliases -DataJoint provides convenient type aliases that map to standard MySQL numeric types. +DataJoint provides convenient type aliases that map to standard database types. These aliases use familiar naming conventions from NumPy and other numerical computing -libraries, making table definitions more readable and explicit about data precision. 
- -| Alias | MySQL Type | Description | -|-------|------------|-------------| -| `bool` | `tinyint` | Boolean value (0 or 1) | -| `int8` | `tinyint` | 8-bit signed integer (-128 to 127) | -| `uint8` | `tinyint unsigned` | 8-bit unsigned integer (0 to 255) | -| `int16` | `smallint` | 16-bit signed integer (-32,768 to 32,767) | -| `uint16` | `smallint unsigned` | 16-bit unsigned integer (0 to 65,535) | -| `int32` | `int` | 32-bit signed integer | -| `uint32` | `int unsigned` | 32-bit unsigned integer | -| `int64` | `bigint` | 64-bit signed integer | -| `uint64` | `bigint unsigned` | 64-bit unsigned integer | -| `float32` | `float` | 32-bit single-precision floating point | -| `float64` | `double` | 64-bit double-precision floating point | +libraries, making table definitions more readable and portable across database backends. + +| Alias | MySQL | PostgreSQL | Description | +|-------|-------|------------|-------------| +| `bool` | `TINYINT` | `BOOLEAN` | Boolean value (0 or 1) | +| `int8` | `TINYINT` | `SMALLINT` | 8-bit signed integer (-128 to 127) | +| `uint8` | `TINYINT UNSIGNED` | `SMALLINT` | 8-bit unsigned integer (0 to 255) | +| `int16` | `SMALLINT` | `SMALLINT` | 16-bit signed integer | +| `uint16` | `SMALLINT UNSIGNED` | `INTEGER` | 16-bit unsigned integer | +| `int32` | `INT` | `INTEGER` | 32-bit signed integer | +| `uint32` | `INT UNSIGNED` | `BIGINT` | 32-bit unsigned integer | +| `int64` | `BIGINT` | `BIGINT` | 64-bit signed integer | +| `uint64` | `BIGINT UNSIGNED` | `NUMERIC(20)` | 64-bit unsigned integer | +| `float32` | `FLOAT` | `REAL` | 32-bit single-precision float | +| `float64` | `DOUBLE` | `DOUBLE PRECISION` | 64-bit double-precision float | +| `bytes` | `LONGBLOB` | `BYTEA` | Raw binary data | Example usage: @@ -122,6 +123,7 @@ class Measurement(dj.Manual): sample_count : uint32 # unsigned 32-bit counter sensor_flags : uint8 # 8-bit status flags is_valid : bool # boolean flag + raw_data : bytes # raw binary data """ ``` diff --git a/docs/src/design/tables/customtype.md b/docs/src/design/tables/customtype.md index 267e0420b..4a73224ec 100644 --- a/docs/src/design/tables/customtype.md +++ b/docs/src/design/tables/customtype.md @@ -161,8 +161,8 @@ The `dtype` property specifies how data is stored in the database: | dtype | Use Case | Stored Format | |-------|----------|---------------| +| `"bytes"` | Raw binary data (core type) | Binary (LONGBLOB/BYTEA) | | `"longblob"` | Complex Python objects, arrays | Serialized binary | -| `"blob"` | Smaller objects | Serialized binary | | `"json"` | JSON-serializable data | JSON string | | `"varchar(N)"` | String representations | Text | | `"int"` | Integer identifiers | Integer | diff --git a/docs/src/design/tables/storage-types-implementation-plan.md b/docs/src/design/tables/storage-types-implementation-plan.md index c15a2292c..f3e4debcc 100644 --- a/docs/src/design/tables/storage-types-implementation-plan.md +++ b/docs/src/design/tables/storage-types-implementation-plan.md @@ -50,8 +50,8 @@ CORE_TYPES = { "uuid": (r"uuid$", "binary(16)"), # JSON "json": (r"json$", None), - # Binary (blob maps to longblob) - "blob": (r"blob$", "longblob"), + # Binary (bytes maps to longblob in MySQL, bytea in PostgreSQL) + "bytes": (r"bytes$", "longblob"), # Temporal "date": (r"date$", None), "datetime": (r"datetime$", None), diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 483a10cf7..65879a4f9 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ 
b/docs/src/design/tables/storage-types-spec.md @@ -18,8 +18,8 @@ This document defines a three-layer type architecture: │ Core DataJoint Types (Layer 2) │ │ │ │ float32 float64 int64 uint64 int32 uint32 int16 uint16 │ -│ int8 uint8 bool uuid json blob date datetime │ -│ char(n) varchar(n) enum(...) │ +│ int8 uint8 bool uuid json bytes date datetime │ +│ char(n) varchar(n) enum(...) decimal(n,f) │ ├───────────────────────────────────────────────────────────────────┤ │ Native Database Types (Layer 1) │ │ │ @@ -54,56 +54,56 @@ MySQL and PostgreSQL backends. Users should prefer these over native database ty ### Numeric Types -| Core Type | Description | MySQL | -|-----------|-------------|-------| -| `int8` | 8-bit signed | `TINYINT` | -| `int16` | 16-bit signed | `SMALLINT` | -| `int32` | 32-bit signed | `INT` | -| `int64` | 64-bit signed | `BIGINT` | -| `uint8` | 8-bit unsigned | `TINYINT UNSIGNED` | -| `uint16` | 16-bit unsigned | `SMALLINT UNSIGNED` | -| `uint32` | 32-bit unsigned | `INT UNSIGNED` | -| `uint64` | 64-bit unsigned | `BIGINT UNSIGNED` | -| `float32` | 32-bit float | `FLOAT` | -| `float64` | 64-bit float | `DOUBLE` | -| `decimal(n,f)` | Fixed-point | `DECIMAL(n,f)` | +| Core Type | Description | MySQL | PostgreSQL | +|-----------|-------------|-------|------------| +| `int8` | 8-bit signed | `TINYINT` | `SMALLINT` | +| `int16` | 16-bit signed | `SMALLINT` | `SMALLINT` | +| `int32` | 32-bit signed | `INT` | `INTEGER` | +| `int64` | 64-bit signed | `BIGINT` | `BIGINT` | +| `uint8` | 8-bit unsigned | `TINYINT UNSIGNED` | `SMALLINT` | +| `uint16` | 16-bit unsigned | `SMALLINT UNSIGNED` | `INTEGER` | +| `uint32` | 32-bit unsigned | `INT UNSIGNED` | `BIGINT` | +| `uint64` | 64-bit unsigned | `BIGINT UNSIGNED` | `NUMERIC(20)` | +| `float32` | 32-bit float | `FLOAT` | `REAL` | +| `float64` | 64-bit float | `DOUBLE` | `DOUBLE PRECISION` | +| `decimal(n,f)` | Fixed-point | `DECIMAL(n,f)` | `NUMERIC(n,f)` | ### String Types -| Core Type | Description | MySQL | -|-----------|-------------|-------| -| `char(n)` | Fixed-length | `CHAR(n)` | -| `varchar(n)` | Variable-length | `VARCHAR(n)` | +| Core Type | Description | MySQL | PostgreSQL | +|-----------|-------------|-------|------------| +| `char(n)` | Fixed-length | `CHAR(n)` | `CHAR(n)` | +| `varchar(n)` | Variable-length | `VARCHAR(n)` | `VARCHAR(n)` | ### Boolean -| Core Type | Description | MySQL | -|-----------|-------------|-------| -| `bool` | True/False | `TINYINT` | +| Core Type | Description | MySQL | PostgreSQL | +|-----------|-------------|-------|------------| +| `bool` | True/False | `TINYINT` | `BOOLEAN` | ### Date/Time Types -| Core Type | Description | MySQL | -|-----------|-------------|-------| -| `date` | Date only | `DATE` | -| `datetime` | Date and time | `DATETIME` | +| Core Type | Description | MySQL | PostgreSQL | +|-----------|-------------|-------|------------| +| `date` | Date only | `DATE` | `DATE` | +| `datetime` | Date and time | `DATETIME` | `TIMESTAMP` | ### Binary Types -The core `blob` type stores raw bytes without any serialization. Use `` AttributeType +The core `bytes` type stores raw bytes without any serialization. Use `` AttributeType for serialized Python objects. 
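For illustration only — a minimal sketch of the behavioral difference, assuming the proposed `bytes` core type and the serialized-blob attribute type (`<djblob>`) described in this spec are available; the schema, table, and attribute names below are hypothetical:

```python
import datajoint as dj
import numpy as np

schema = dj.Schema("demo_binary")  # hypothetical schema name

@schema
class Packet(dj.Manual):
    definition = """
    packet_id : uint32
    ---
    payload   : bytes     # raw bytes in, raw bytes out (no serialization)
    features  : <djblob>  # Python object in, Python object out (packed/unpacked)
    """

Packet.insert1({"packet_id": 1, "payload": b"\x00\x01\x02", "features": np.arange(10)})
row = (Packet & "packet_id = 1").fetch1()
assert row["payload"] == b"\x00\x01\x02"        # exact bytes round-trip, no unpacking
assert isinstance(row["features"], np.ndarray)  # deserialized back into an ndarray
```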
-| Core Type | Description | MySQL | -|-----------|-------------|-------| -| `blob` | Raw bytes | `LONGBLOB` | +| Core Type | Description | MySQL | PostgreSQL | +|-----------|-------------|-------|------------| +| `bytes` | Raw bytes | `LONGBLOB` | `BYTEA` | ### Other Types -| Core Type | Description | MySQL | -|-----------|-------------|-------| -| `json` | JSON document | `JSON` | -| `uuid` | UUID | `BINARY(16)` | -| `enum(...)` | Enumeration | `ENUM(...)` | +| Core Type | Description | MySQL | PostgreSQL | +|-----------|-------------|-------|------------| +| `json` | JSON document | `JSON` | `JSONB` | +| `uuid` | UUID | `BINARY(16)` | `UUID` | +| `enum(...)` | Enumeration | `ENUM(...)` | `VARCHAR` + check | ### Native Passthrough Types diff --git a/src/datajoint/declare.py b/src/datajoint/declare.py index f9de07f4e..69dc10743 100644 --- a/src/datajoint/declare.py +++ b/src/datajoint/declare.py @@ -34,8 +34,8 @@ "uuid": (r"uuid$", "binary(16)"), # JSON "json": (r"json$", None), # json passes through as-is - # Binary (blob maps to longblob) - "blob": (r"blob$", "longblob"), + # Binary (bytes maps to longblob in MySQL, bytea in PostgreSQL) + "bytes": (r"bytes$", "longblob"), # Temporal "date": (r"date$", None), "datetime": (r"datetime$", None), @@ -456,7 +456,7 @@ def substitute_special_type(match, category, foreign_key_sql, context): Substitute special types with their native SQL equivalents. Special types are: - - Core DataJoint types (float32 → float, uuid → binary(16), blob → longblob, etc.) + - Core DataJoint types (float32 → float, uuid → binary(16), bytes → longblob, etc.) - ADAPTED types (AttributeTypes in angle brackets) :param match: dict containing with keys "type" and "comment" -- will be modified in place From 2f38734e6bd3d59230195b1f3d51da48b360fa6d Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 1 Jan 2026 18:51:35 +0000 Subject: [PATCH 03/32] Fix dtype table: longblob is native type for raw bytes Correct the dtype documentation to clarify: - longblob is a native MySQL type for raw binary data (not serialized) - should be used as dtype for serialized Python objects Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/customtype.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/src/design/tables/customtype.md b/docs/src/design/tables/customtype.md index 4a73224ec..c1844fb99 100644 --- a/docs/src/design/tables/customtype.md +++ b/docs/src/design/tables/customtype.md @@ -162,12 +162,13 @@ The `dtype` property specifies how data is stored in the database: | dtype | Use Case | Stored Format | |-------|----------|---------------| | `"bytes"` | Raw binary data (core type) | Binary (LONGBLOB/BYTEA) | -| `"longblob"` | Complex Python objects, arrays | Serialized binary | +| `"longblob"` | Raw binary data (native type, MySQL-specific) | Binary | | `"json"` | JSON-serializable data | JSON string | | `"varchar(N)"` | String representations | Text | | `"int"` | Integer identifiers | Integer | | `"blob@store"` | Large objects in external storage | UUID reference | | `"object"` | Files/folders in object storage | JSON metadata | +| `""` | Serialized Python objects | DJ blob format | | `""` | Chain to another custom type | Varies | ### External Storage From 4399f51e745305e0796932aa7bd2e5120b54e79f Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 1 Jan 2026 19:01:07 +0000 Subject: [PATCH 04/32] Update enum PostgreSQL mapping to use native ENUM type PostgreSQL supports native ENUM via CREATE TYPE ... 
AS ENUM, which provides similar semantics to MySQL ENUM (efficient storage, value enforcement, definition-order ordering). DataJoint will handle the separate type creation automatically. Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 65879a4f9..c645c9eb3 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -103,7 +103,7 @@ for serialized Python objects. |-----------|-------------|-------|------------| | `json` | JSON document | `JSON` | `JSONB` | | `uuid` | UUID | `BINARY(16)` | `UUID` | -| `enum(...)` | Enumeration | `ENUM(...)` | `VARCHAR` + check | +| `enum(...)` | Enumeration | `ENUM(...)` | `CREATE TYPE ... AS ENUM` | ### Native Passthrough Types From 7e32089eed5a9360494d2536af23884ef5a90f20 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 1 Jan 2026 19:09:36 +0000 Subject: [PATCH 05/32] Document UTC timezone policy for datetime type - Rewrite attributes.md to prioritize core types over native types - Add timezone policy: all datetime values stored as UTC - Timezone conversion is a presentation concern, not database concern - Update storage-types-spec.md with UTC policy and CURRENT_TIMESTAMP example Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/attributes.md | 122 ++++++++++--------- docs/src/design/tables/storage-types-spec.md | 11 ++ 2 files changed, 74 insertions(+), 59 deletions(-) diff --git a/docs/src/design/tables/attributes.md b/docs/src/design/tables/attributes.md index 9111e5b03..3bdd44d42 100644 --- a/docs/src/design/tables/attributes.md +++ b/docs/src/design/tables/attributes.md @@ -5,67 +5,71 @@ To conserve database resources, use the smallest and most restrictive datatype sufficient for your data. This also ensures that only valid data are entered into the pipeline. -## Most common datatypes - -- `tinyint`: an 8-bit integer number, ranging from -128 to 127. -- `tinyint unsigned`: an 8-bit positive integer number, ranging from 0 to 255. -- `smallint`: a 16-bit integer number, ranging from -32,768 to 32,767. -- `smallint unsigned`: a 16-bit positive integer, ranging from 0 to 65,535. -- `int`: a 32-bit integer number, ranging from -2,147,483,648 to 2,147,483,647. -- `int unsigned`: a 32-bit positive integer, ranging from 0 to 4,294,967,295. -- `enum`: one of several explicitly enumerated values specified as strings. - Use this datatype instead of text strings to avoid spelling variations and to save - storage space. - For example, the datatype for an anesthesia attribute could be - `enum("urethane", "isoflurane", "fentanyl")`. - Do not use enums in primary keys due to the difficulty of changing their definitions - consistently in multiple tables. +## Core datatypes (recommended) + +Use these portable, scientist-friendly types for cross-database compatibility. + +### Integers + +- `int8`: 8-bit signed integer (-128 to 127) +- `uint8`: 8-bit unsigned integer (0 to 255) +- `int16`: 16-bit signed integer (-32,768 to 32,767) +- `uint16`: 16-bit unsigned integer (0 to 65,535) +- `int32`: 32-bit signed integer +- `uint32`: 32-bit unsigned integer +- `int64`: 64-bit signed integer +- `uint64`: 64-bit unsigned integer +- `bool`: boolean value (True/False, stored as 0/1) + +### Floating-point + +- `float32`: 32-bit single-precision floating-point. Sufficient for many measurements. 
+- `float64`: 64-bit double-precision floating-point. + Avoid using floating-point types in primary keys due to equality comparison issues. +- `decimal(n,f)`: fixed-point number with *n* total digits and *f* fractional digits. + Use for exact decimal representation (e.g., currency, coordinates). + Safe for primary keys due to well-defined precision. + +### Strings + +- `char(n)`: fixed-length string of exactly *n* characters. +- `varchar(n)`: variable-length string up to *n* characters. +- `enum(...)`: one of several enumerated values, e.g., `enum("low", "medium", "high")`. + Do not use enums in primary keys due to difficulty changing definitions. + +### Date/Time - `date`: date as `'YYYY-MM-DD'`. -- `time`: time as `'HH:MM:SS'`. -- `datetime`: Date and time to the second as `'YYYY-MM-DD HH:MM:SS'` -- `timestamp`: Date and time to the second as `'YYYY-MM-DD HH:MM:SS'`. - The default value may be set to `CURRENT_TIMESTAMP`. - Unlike `datetime`, a `timestamp` value will be adjusted to the local time zone. - -- `char(N)`: a character string up to *N* characters (but always takes the entire *N* -bytes to store). -- `varchar(N)`: a text string of arbitrary length up to *N* characters that takes -*M+1* or *M+2* bytes of storage, where *M* is the actual length of each stored string. -- `float`: a single-precision floating-point number. - Takes 4 bytes. - Single precision is sufficient for many measurements. - -- `double`: a double-precision floating-point number. - Takes 8 bytes. - Because equality comparisons are error-prone, neither `float` nor `double` should be - used in primary keys. -- `decimal(N,F)`: a fixed-point number with *N* total decimal digits and *F* -fractional digits. - This datatype is well suited to represent numbers whose magnitude is well defined - and does not warrant the use of floating-point representation or requires precise - decimal representations (e.g. dollars and cents). - Because of its well-defined precision, `decimal` values can be used in equality - comparison and be included in primary keys. - -- `longblob`: raw binary data, up to 4 -[GiB](http://en.wikipedia.org/wiki/Gibibyte) in size. - Stores and returns raw bytes without serialization. - For serialized Python objects (arrays, dicts, etc.), use `` instead. - The `longblob` and other `blob` datatypes can be configured to store data - [externally](../../sysadmin/external-store.md) by using the `blob@store` syntax. - -## Less common (but supported) datatypes - -- `decimal(N,F) unsigned`: same as `decimal`, but limited to nonnegative values. -- `mediumint` a 24-bit integer number, ranging from -8,388,608 to 8,388,607. -- `mediumint unsigned`: a 24-bit positive integer, ranging from 0 to 16,777,216. -- `mediumblob`: arbitrary numeric array, up to 16 -[MiB](http://en.wikipedia.org/wiki/Mibibyte) -- `blob`: arbitrary numeric array, up to 64 -[KiB](http://en.wikipedia.org/wiki/Kibibyte) -- `tinyblob`: arbitrary numeric array, up to 256 bytes (actually smaller due to header -info). +- `datetime`: date and time as `'YYYY-MM-DD HH:MM:SS'`. + Use `CURRENT_TIMESTAMP` as default for auto-populated timestamps. + +**Timezone policy:** All `datetime` values should be stored as **UTC**. Timezone +conversion is a presentation concern handled by the application layer. This ensures +reproducible computations regardless of server location or timezone settings. + +### Binary + +- `bytes`: raw binary data (up to 4 GiB). Stores and returns raw bytes without + serialization. For serialized Python objects (arrays, dicts, etc.), use ``. 
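For a quick illustration, here is a minimal table sketch using the core types above — assuming the proposed types (`uint32`, `decimal`, `datetime`, `bytes`) are available in your DataJoint installation; the schema, table, and attribute names are hypothetical:

```python
import datajoint as dj

schema = dj.Schema("demo_core_types")  # hypothetical schema name

@schema
class Trial(dj.Manual):
    definition = """
    trial_id    : uint32                        # trial identifier
    ---
    reward_usd  : decimal(6,2)                  # exact fixed-point amount
    trial_start : datetime = CURRENT_TIMESTAMP  # stored as UTC
    raw_packet  : bytes                         # raw binary payload, not serialized
    """
```

Because `decimal` has well-defined precision, a field like `reward_usd` could also safely appear in a primary key, unlike `float32`/`float64`.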
+ +### Other + +- `uuid`: 128-bit universally unique identifier. +- `json`: JSON document for structured data. + +## Native datatypes (advanced) + +Native database types are available for advanced use cases but are **not recommended** +for portable pipelines. Using native types will generate a warning. + +- `tinyint`, `smallint`, `int`, `bigint` (with optional `unsigned`) +- `float`, `double`, `real` +- `tinyblob`, `blob`, `mediumblob`, `longblob` +- `text`, `mediumtext`, `longtext` +- `time`, `timestamp`, `year` +- `mediumint`, `serial` + +See the [storage types spec](storage-types-spec.md) for complete mappings. ## Special DataJoint-only datatypes diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index c645c9eb3..726cfaec8 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -88,6 +88,17 @@ MySQL and PostgreSQL backends. Users should prefer these over native database ty | `date` | Date only | `DATE` | `DATE` | | `datetime` | Date and time | `DATETIME` | `TIMESTAMP` | +**Timezone policy:** All `datetime` values should be stored as **UTC**. Timezone conversion is a +presentation concern handled by the application layer, not the database. This ensures: +- Reproducible computations regardless of server or client timezone settings +- Simple arithmetic on temporal values (no DST ambiguity) +- Portable data across systems and regions + +Use `CURRENT_TIMESTAMP` for auto-populated creation times: +``` +created_at : datetime = CURRENT_TIMESTAMP +``` + ### Binary Types The core `bytes` type stores raw bytes without any serialization. Use `` AttributeType From 5d2c9f6051ac7c10a2e8fee9ac8545210407fa28 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 1 Jan 2026 19:20:42 +0000 Subject: [PATCH 06/32] Add text to core types and document type modifier policy Core types: - Add `text` as a core type for unlimited-length text (TEXT in both MySQL and PostgreSQL) Type modifiers policy: - Document that SQL modifiers (NOT NULL, DEFAULT, PRIMARY KEY, UNIQUE, COMMENT) are not allowed - DataJoint has its own syntax - Document that AUTO_INCREMENT is discouraged but allowed with native types - UNSIGNED is allowed as part of type semantics Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/attributes.md | 12 ++++---- docs/src/design/tables/storage-types-spec.md | 29 ++++++++++++++++++-- src/datajoint/declare.py | 2 ++ 3 files changed, 35 insertions(+), 8 deletions(-) diff --git a/docs/src/design/tables/attributes.md b/docs/src/design/tables/attributes.md index 3bdd44d42..f8fadeed6 100644 --- a/docs/src/design/tables/attributes.md +++ b/docs/src/design/tables/attributes.md @@ -34,6 +34,7 @@ Use these portable, scientist-friendly types for cross-database compatibility. - `char(n)`: fixed-length string of exactly *n* characters. - `varchar(n)`: variable-length string up to *n* characters. +- `text`: unlimited-length text for long-form content (notes, descriptions, abstracts). - `enum(...)`: one of several enumerated values, e.g., `enum("low", "medium", "high")`. Do not use enums in primary keys due to difficulty changing definitions. @@ -65,9 +66,9 @@ for portable pipelines. Using native types will generate a warning. 
- `tinyint`, `smallint`, `int`, `bigint` (with optional `unsigned`) - `float`, `double`, `real` - `tinyblob`, `blob`, `mediumblob`, `longblob` -- `text`, `mediumtext`, `longtext` +- `tinytext`, `mediumtext`, `longtext` (size variants) - `time`, `timestamp`, `year` -- `mediumint`, `serial` +- `mediumint`, `serial`, `int auto_increment` See the [storage types spec](storage-types-spec.md) for complete mappings. @@ -133,10 +134,9 @@ class Measurement(dj.Manual): ## Datatypes not (yet) supported -- `binary` -- `text` -- `longtext` -- `bit` +- `binary(n)` / `varbinary(n)` - use `bytes` instead +- `bit(n)` - use `int` types with bitwise operations +- `set(...)` - use `json` for multiple selections For additional information about these datatypes, see http://dev.mysql.com/doc/refman/5.6/en/data-types.html diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 726cfaec8..6f0423997 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -18,7 +18,7 @@ This document defines a three-layer type architecture: │ Core DataJoint Types (Layer 2) │ │ │ │ float32 float64 int64 uint64 int32 uint32 int16 uint16 │ -│ int8 uint8 bool uuid json bytes date datetime │ +│ int8 uint8 bool uuid json bytes date datetime text │ │ char(n) varchar(n) enum(...) decimal(n,f) │ ├───────────────────────────────────────────────────────────────────┤ │ Native Database Types (Layer 1) │ @@ -74,6 +74,7 @@ MySQL and PostgreSQL backends. Users should prefer these over native database ty |-----------|-------------|-------|------------| | `char(n)` | Fixed-length | `CHAR(n)` | `CHAR(n)` | | `varchar(n)` | Variable-length | `VARCHAR(n)` | `VARCHAR(n)` | +| `text` | Unlimited text | `TEXT` | `TEXT` | ### Boolean @@ -118,10 +119,34 @@ for serialized Python objects. ### Native Passthrough Types -Users may use native database types directly (e.g., `text`, `mediumint auto_increment`), +Users may use native database types directly (e.g., `mediumint`, `tinyblob`), but these will generate a warning about non-standard usage. Native types are not recorded in field comments and may have portability issues across database backends. +### Type Modifiers Policy + +DataJoint table definitions have their own syntax for constraints and metadata. SQL type +modifiers are **not allowed** in type specifications because they conflict with DataJoint's +declarative syntax: + +| Modifier | Status | DataJoint Alternative | +|----------|--------|----------------------| +| `NOT NULL` / `NULL` | ❌ Not allowed | Position above/below `---` determines nullability | +| `DEFAULT value` | ❌ Not allowed | Use `= value` syntax after type | +| `PRIMARY KEY` | ❌ Not allowed | Position above `---` line | +| `UNIQUE` | ❌ Not allowed | Use DataJoint index syntax | +| `COMMENT 'text'` | ❌ Not allowed | Use `# comment` syntax | +| `AUTO_INCREMENT` | ⚠️ Discouraged | Allowed with native types only, generates warning | +| `UNSIGNED` | ✅ Allowed | Part of type semantics (use `uint*` core types) | + +**Auto-increment policy:** DataJoint discourages `AUTO_INCREMENT` / `SERIAL` because: +- Breaks reproducibility (IDs depend on insertion order) +- Makes pipelines non-deterministic +- Complicates data migration and replication +- Primary keys should be meaningful, not arbitrary + +If required, use native types: `int auto_increment` or `serial` (with warning). + ## AttributeTypes (Layer 3) AttributeTypes provide `encode()`/`decode()` semantics on top of core types. 
They are diff --git a/src/datajoint/declare.py b/src/datajoint/declare.py index 69dc10743..e3bbb96fd 100644 --- a/src/datajoint/declare.py +++ b/src/datajoint/declare.py @@ -42,6 +42,8 @@ # String types (with parameters) "char": (r"char\s*\(\d+\)$", None), "varchar": (r"varchar\s*\(\d+\)$", None), + # Unlimited text + "text": (r"text$", None), # Enumeration "enum": (r"enum\s*\(.+\)$", None), # Fixed-point decimal From e55c9a71fe978e31adc80f0c31992ef17565c86c Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 1 Jan 2026 19:27:21 +0000 Subject: [PATCH 07/32] Add encoding and collation policy documentation - UTF-8 required: utf8mb4 (MySQL) / UTF8 (PostgreSQL) - Case-sensitive by default: utf8mb4_bin / C collation - Database-level configuration via dj.config, not per-column - CHARACTER SET and COLLATE modifiers not allowed in type definitions - Like timezone, encoding is infrastructure configuration Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/attributes.md | 4 +++ docs/src/design/tables/storage-types-spec.md | 31 ++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/docs/src/design/tables/attributes.md b/docs/src/design/tables/attributes.md index f8fadeed6..908950559 100644 --- a/docs/src/design/tables/attributes.md +++ b/docs/src/design/tables/attributes.md @@ -38,6 +38,10 @@ Use these portable, scientist-friendly types for cross-database compatibility. - `enum(...)`: one of several enumerated values, e.g., `enum("low", "medium", "high")`. Do not use enums in primary keys due to difficulty changing definitions. +**Encoding policy:** All strings use UTF-8 encoding (`utf8mb4` in MySQL, `UTF8` in PostgreSQL). +Character encoding and collation are database-level configuration, not part of type definitions. +Comparisons are case-sensitive by default. + ### Date/Time - `date`: date as `'YYYY-MM-DD'`. diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 6f0423997..f4841058c 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -76,6 +76,9 @@ MySQL and PostgreSQL backends. Users should prefer these over native database ty | `varchar(n)` | Variable-length | `VARCHAR(n)` | `VARCHAR(n)` | | `text` | Unlimited text | `TEXT` | `TEXT` | +**Encoding:** All strings use UTF-8 (`utf8mb4` in MySQL, `UTF8` in PostgreSQL). +See [Encoding and Collation Policy](#encoding-and-collation-policy) for details. + ### Boolean | Core Type | Description | MySQL | PostgreSQL | @@ -136,6 +139,8 @@ declarative syntax: | `PRIMARY KEY` | ❌ Not allowed | Position above `---` line | | `UNIQUE` | ❌ Not allowed | Use DataJoint index syntax | | `COMMENT 'text'` | ❌ Not allowed | Use `# comment` syntax | +| `CHARACTER SET` | ❌ Not allowed | Database-level configuration | +| `COLLATE` | ❌ Not allowed | Database-level configuration | | `AUTO_INCREMENT` | ⚠️ Discouraged | Allowed with native types only, generates warning | | `UNSIGNED` | ✅ Allowed | Part of type semantics (use `uint*` core types) | @@ -147,6 +152,32 @@ declarative syntax: If required, use native types: `int auto_increment` or `serial` (with warning). +### Encoding and Collation Policy + +Character encoding and collation are **database-level configuration**, not part of type +definitions. This ensures consistent behavior across all tables and simplifies portability. 
+ +**Configuration** (in `dj.config` or `datajoint.json`): +```json +{ + "database.charset": "utf8mb4", + "database.collation": "utf8mb4_bin" +} +``` + +**Defaults:** + +| Setting | MySQL | PostgreSQL | +|---------|-------|------------| +| Charset | `utf8mb4` | `UTF8` | +| Collation | `utf8mb4_bin` | `C` | + +**Policy:** +- **UTF-8 required**: DataJoint validates charset is UTF-8 compatible at connection time +- **Case-sensitive by default**: Binary collation (`utf8mb4_bin` / `C`) ensures predictable comparisons +- **No per-column overrides**: `CHARACTER SET` and `COLLATE` are rejected in type definitions +- **Like timezone**: Encoding is infrastructure configuration, not part of the data model + ## AttributeTypes (Layer 3) AttributeTypes provide `encode()`/`decode()` semantics on top of core types. They are From 4fa802a51a6da28f1dc0c7fe1f9f08bfe70a5f5f Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 1 Jan 2026 19:30:52 +0000 Subject: [PATCH 08/32] Document AttributeType naming conventions comprehensively - Reorganize "Special DataJoint-only datatypes" as "AttributeTypes" - Add naming convention explanation (dj prefix, x prefix, @store suffix) - List all built-in AttributeTypes with categories: - Serialization types: , - File storage types: , - File attachment types: , - File reference types: - Fix inconsistent angle bracket notation throughout docs - Update example to use int32 core type and include - Expand naming conventions in Key Design Decisions section Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/attributes.md | 59 +++++++++++++++----- docs/src/design/tables/storage-types-spec.md | 12 ++-- 2 files changed, 50 insertions(+), 21 deletions(-) diff --git a/docs/src/design/tables/attributes.md b/docs/src/design/tables/attributes.md index 908950559..3a04008e4 100644 --- a/docs/src/design/tables/attributes.md +++ b/docs/src/design/tables/attributes.md @@ -76,27 +76,55 @@ for portable pipelines. Using native types will generate a warning. See the [storage types spec](storage-types-spec.md) for complete mappings. -## Special DataJoint-only datatypes +## AttributeTypes (special datatypes) -These types abstract certain kinds of non-database data to facilitate use -together with DataJoint. +AttributeTypes provide `encode()`/`decode()` semantics for complex data that doesn't +fit native database types. They are denoted with angle brackets: ``. + +### Naming conventions + +- **`dj` prefix**: DataJoint-specific internal serialization (``) +- **`x` prefix**: External/content-addressed variant (``, ``) +- **`@store` suffix**: Specifies which configured store to use + +### Built-in AttributeTypes + +**Serialization types** - for Python objects: - ``: DataJoint's native serialization format for Python objects. Supports -NumPy arrays, dicts, lists, datetime objects, and nested structures. Compatible with -MATLAB. See [custom types](customtype.md) for details. + NumPy arrays, dicts, lists, datetime objects, and nested structures. Stores in + database. Compatible with MATLAB. See [custom types](customtype.md) for details. + +- `` / ``: Like `` but stores externally with content- + addressed deduplication. Use for large arrays that may be duplicated across rows. + +**File storage types** - for managed files: + +- `` / ``: Managed file and folder storage with path derived + from primary key. Supports Zarr, HDF5, and direct writes via fsspec. Returns + `ObjectRef` for lazy access. See [object storage](object.md). 
+ +- `` / ``: Content-addressed storage for raw bytes with + SHA256 deduplication. Use via `` or `` rather than directly. + +**File attachment types** - for file transfer: + +- ``: File attachment stored in database with filename preserved. Similar + to email attachments. Good for small files (<16MB). See [attachments](attach.md). + +- `` / ``: Like `` but stores externally with + deduplication. Use for large files. -- `object`: managed [file and folder storage](object.md) with support for direct writes -(Zarr, HDF5) and fsspec integration. Recommended for new pipelines. +**File reference types** - for external files: -- `attach`: a [file attachment](attach.md) similar to email attachments facillitating -sending/receiving an opaque data file to/from a DataJoint pipeline. +- ``: Reference to existing file in a configured store. No file + copying occurs. Returns `ObjectRef` for lazy access. See [filepath](filepath.md). -- `filepath@store`: a [filepath](filepath.md) used to link non-DataJoint managed files -into a DataJoint pipeline. +### User-defined AttributeTypes -- ``: a [custom attribute type](customtype.md) that defines bidirectional -conversion between Python objects and database storage formats. Use this to store -complex data types like graphs, domain-specific objects, or custom data structures. +- ``: Define your own [custom attribute type](customtype.md) with + bidirectional conversion between Python objects and database storage. Use for + graphs, domain-specific objects, or custom data structures. ## Core type aliases @@ -125,7 +153,7 @@ Example usage: @schema class Measurement(dj.Manual): definition = """ - measurement_id : int + measurement_id : int32 --- temperature : float32 # single-precision temperature reading precise_value : float64 # double-precision measurement @@ -133,6 +161,7 @@ class Measurement(dj.Manual): sensor_flags : uint8 # 8-bit status flags is_valid : bool # boolean flag raw_data : bytes # raw binary data + processed : # serialized Python object """ ``` diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index f4841058c..3e8e6e4ac 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -652,11 +652,11 @@ def garbage_collect(project): 8. **No `uri` type**: For arbitrary URLs, use `varchar`—simpler and more transparent 9. **Content type**: Single-blob, content-addressed, deduplicated storage 10. **Parameterized types**: `` passes store parameter -11. **Naming convention**: - - `` = internal serialized (database) - - `` = external serialized (content-addressed) - - `` = internal file (single file) - - `` = external file (single file) +11. **Naming conventions**: + - `dj` prefix = DataJoint-specific internal serialization (``) + - `x` prefix = external/content-addressed variant (``, ``) + - `@store` suffix = specifies which configured store to use + - Types without prefix: core storage mechanisms (``, ``, ``, ``) 12. **Transparent access**: AttributeTypes return Python objects or file paths 13. 
**Lazy access**: ``, ``, and `` return ObjectRef @@ -668,7 +668,7 @@ def garbage_collect(project): | `blob@store` | `` | | `attach` | `` | | `attach@store` | `` | -| `filepath@store` (copy-based) | `filepath@store` (ObjectRef-based, upgraded) | +| `filepath@store` (copy-based) | `` (ObjectRef-based, upgraded) | ### Migration from Legacy `~external_*` Stores From 11d67a63cd0a6d59847ec66cb1754516b7554f13 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 1 Jan 2026 19:55:21 +0000 Subject: [PATCH 09/32] Redesign AttributeType naming with @ storage mode convention MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The @ character now indicates external storage (object store vs database): - No @ = internal (database): , - @ present = external (object store): , - @ alone = default store: - @name = named store: Key changes: - Rename to (internal) and to (external) - Rename to (external variant of ) - Mark , , as external-only types - Replace dtype property with get_dtype(is_external) method - Use core type 'bytes' instead of 'longblob' for portability - Add type resolution and chaining documentation - Update Storage Comparison and Built-in AttributeType Comparison tables - Simplify from 7 built-in types to 5: blob, attach, object, content, filepath Type chaining at declaration time: → get_dtype(False) → "bytes" → LONGBLOB/BYTEA → get_dtype(True) → "" → json → JSON/JSONB → get_dtype(True) → "json" → JSON/JSONB Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/attributes.md | 32 +- docs/src/design/tables/storage-types-spec.md | 381 ++++++++++--------- 2 files changed, 221 insertions(+), 192 deletions(-) diff --git a/docs/src/design/tables/attributes.md b/docs/src/design/tables/attributes.md index 3a04008e4..caac46a6a 100644 --- a/docs/src/design/tables/attributes.md +++ b/docs/src/design/tables/attributes.md @@ -55,7 +55,7 @@ reproducible computations regardless of server location or timezone settings. ### Binary - `bytes`: raw binary data (up to 4 GiB). Stores and returns raw bytes without - serialization. For serialized Python objects (arrays, dicts, etc.), use ``. + serialization. For serialized Python objects (arrays, dicts, etc.), use ``. ### Other @@ -81,44 +81,47 @@ See the [storage types spec](storage-types-spec.md) for complete mappings. AttributeTypes provide `encode()`/`decode()` semantics for complex data that doesn't fit native database types. They are denoted with angle brackets: ``. -### Naming conventions +### Storage mode: `@` convention -- **`dj` prefix**: DataJoint-specific internal serialization (``) -- **`x` prefix**: External/content-addressed variant (``, ``) -- **`@store` suffix**: Specifies which configured store to use +The `@` character indicates **external storage** (object store vs database): + +- **No `@`**: Internal storage (database) - e.g., ``, `` +- **`@` present**: External storage (object store) - e.g., ``, `` +- **`@` alone**: Use default store - e.g., `` +- **`@name`**: Use named store - e.g., `` ### Built-in AttributeTypes **Serialization types** - for Python objects: -- ``: DataJoint's native serialization format for Python objects. Supports +- ``: DataJoint's native serialization format for Python objects. Supports NumPy arrays, dicts, lists, datetime objects, and nested structures. Stores in database. Compatible with MATLAB. See [custom types](customtype.md) for details. -- `` / ``: Like `` but stores externally with content- +- `` / ``: Like `` but stores externally with content- addressed deduplication. 
Use for large arrays that may be duplicated across rows. **File storage types** - for managed files: -- `` / ``: Managed file and folder storage with path derived +- `` / ``: Managed file and folder storage with path derived from primary key. Supports Zarr, HDF5, and direct writes via fsspec. Returns - `ObjectRef` for lazy access. See [object storage](object.md). + `ObjectRef` for lazy access. External only. See [object storage](object.md). -- `` / ``: Content-addressed storage for raw bytes with - SHA256 deduplication. Use via `` or `` rather than directly. +- `` / ``: Content-addressed storage for raw bytes with + SHA256 deduplication. External only. Use via `` or `` rather than directly. **File attachment types** - for file transfer: - ``: File attachment stored in database with filename preserved. Similar to email attachments. Good for small files (<16MB). See [attachments](attach.md). -- `` / ``: Like `` but stores externally with +- `` / ``: Like `` but stores externally with deduplication. Use for large files. **File reference types** - for external files: - ``: Reference to existing file in a configured store. No file - copying occurs. Returns `ObjectRef` for lazy access. See [filepath](filepath.md). + copying occurs. Returns `ObjectRef` for lazy access. External only. See [filepath](filepath.md). ### User-defined AttributeTypes @@ -161,7 +164,8 @@ class Measurement(dj.Manual): sensor_flags : uint8 # 8-bit status flags is_valid : bool # boolean flag raw_data : bytes # raw binary data - processed : # serialized Python object + processed : # serialized Python object + large_array : # external storage with deduplication """ ``` diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 3e8e6e4ac..c951fc6bd 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -12,7 +12,7 @@ This document defines a three-layer type architecture: ┌───────────────────────────────────────────────────────────────────┐ │ AttributeTypes (Layer 3) │ │ │ -│ Built-in: │ +│ Built-in: │ │ User: ... │ ├───────────────────────────────────────────────────────────────────┤ │ Core DataJoint Types (Layer 2) │ @@ -31,7 +31,8 @@ This document defines a three-layer type architecture: **Syntax distinction:** - Core types: `int32`, `float64`, `varchar(255)` - no brackets -- AttributeTypes: ``, ``, `` - angle brackets +- AttributeTypes: ``, ``, `` - angle brackets +- The `@` character indicates external storage (object store vs database) ### OAS Storage Regions @@ -105,7 +106,7 @@ created_at : datetime = CURRENT_TIMESTAMP ### Binary Types -The core `bytes` type stores raw bytes without any serialization. Use `` AttributeType +The core `bytes` type stores raw bytes without any serialization. Use `` AttributeType for serialized Python objects. | Core Type | Description | MySQL | PostgreSQL | @@ -183,9 +184,44 @@ definitions. This ensures consistent behavior across all tables and simplifies p AttributeTypes provide `encode()`/`decode()` semantics on top of core types. They are composable and can be built-in or user-defined. 
-### `` / `` - Path-Addressed Storage +### Storage Mode: `@` Convention -**Built-in AttributeType.** OAS (Object-Augmented Schema) storage: +The `@` character in AttributeType syntax indicates **external storage** (object store): + +- **No `@`**: Internal storage (database) - e.g., ``, `` +- **`@` present**: External storage (object store) - e.g., ``, `` +- **`@` alone**: Use default store - e.g., `` +- **`@name`**: Use named store - e.g., `` + +Some types support both modes (``, ``), others are external-only (``, ``, ``). + +### Type Resolution and Chaining + +AttributeTypes resolve to core types through chaining. The `get_dtype(is_external)` method +returns the appropriate dtype based on storage mode: + +``` +Resolution at declaration time: + + → get_dtype(False) → "bytes" → LONGBLOB/BYTEA + → get_dtype(True) → "" → json → JSON/JSONB + → get_dtype(True) → "" → json (store=cold) + + → get_dtype(False) → "bytes" → LONGBLOB/BYTEA + → get_dtype(True) → "" → json → JSON/JSONB + + → get_dtype(True) → "json" → JSON/JSONB + → get_dtype(False) → ERROR (external only) + + → get_dtype(True) → "json" → JSON/JSONB + → get_dtype(True) → "json" → JSON/JSONB +``` + +### `` / `` - Path-Addressed Storage + +**Built-in AttributeType. External only.** + +OAS (Object-Augmented Schema) storage for files and folders: - Path derived from primary key: `{schema}/{table}/{pk}/{attribute}/` - One-to-one relationship with table row @@ -199,7 +235,7 @@ class Analysis(dj.Computed): definition = """ -> Recording --- - results : # default store + results : # default store archive : # specific store """ ``` @@ -208,30 +244,29 @@ class Analysis(dj.Computed): ```python class ObjectType(AttributeType): - """Built-in AttributeType for path-addressed OAS storage.""" + """Path-addressed OAS storage. External only.""" type_name = "object" - dtype = "json" + + def get_dtype(self, is_external: bool) -> str: + if not is_external: + raise DataJointError(" requires @ (external storage only)") + return "json" def encode(self, value, *, key=None, store_name=None) -> dict: store = get_store(store_name or dj.config['stores']['default']) path = self._compute_path(key) # {schema}/{table}/{pk}/{attr}/ store.put(path, value) - return { - "path": path, - "store": store_name, - # Additional metadata (size, timestamps, etc.) - } + return {"path": path, "store": store_name, ...} def decode(self, stored: dict, *, key=None) -> ObjectRef: - return ObjectRef( - store=get_store(stored["store"]), - path=stored["path"] - ) + return ObjectRef(store=get_store(stored["store"]), path=stored["path"]) ``` -### `` / `` - Content-Addressed Storage +### `` / `` - Content-Addressed Storage -**Built-in AttributeType.** Content-addressed storage with deduplication: +**Built-in AttributeType. External only.** + +Content-addressed storage with deduplication: - **Single blob only**: stores a single file or serialized object (not folders) - **Per-project scope**: content is shared across all schemas in a project (not per-schema) @@ -255,9 +290,13 @@ store_root/ ```python class ContentType(AttributeType): - """Built-in AttributeType for content-addressed storage.""" + """Content-addressed storage. 
External only.""" type_name = "content" - dtype = "json" + + def get_dtype(self, is_external: bool) -> str: + if not is_external: + raise DataJointError(" requires @ (external storage only)") + return "json" def encode(self, data: bytes, *, key=None, store_name=None) -> dict: """Store content, return metadata as JSON.""" @@ -273,11 +312,7 @@ class ContentType(AttributeType): 'size': len(data) }, skip_duplicates=True) - return { - "hash": content_hash, - "store": store_name, - "size": len(data) - } + return {"hash": content_hash, "store": store_name, "size": len(data)} def decode(self, stored: dict, *, key=None) -> bytes: """Retrieve content by hash.""" @@ -288,7 +323,7 @@ class ContentType(AttributeType): #### Database Column -The `` type stores JSON metadata: +The `` type stores JSON metadata: ```sql -- content column (MySQL) @@ -301,7 +336,9 @@ features JSONB NOT NULL ### `` - Portable External Reference -**Built-in AttributeType.** Relative path references within configured stores: +**Built-in AttributeType. External only (store required).** + +Relative path references within configured stores: - **Relative paths**: paths within a configured store (portable across environments) - **Store-aware**: resolves paths against configured store backend @@ -351,31 +388,22 @@ just use `varchar`. A string is simpler and more transparent. ```python class FilepathType(AttributeType): - """Built-in AttributeType for store-relative file references.""" + """Store-relative file references. External only.""" type_name = "filepath" - dtype = "json" - def encode(self, relative_path: str, *, key=None, store_name=None, - compute_checksum: bool = False) -> dict: + def get_dtype(self, is_external: bool) -> str: + if not is_external: + raise DataJointError(" requires @store") + return "json" + + def encode(self, relative_path: str, *, key=None, store_name=None) -> dict: """Register reference to file in store.""" store = get_store(store_name) # store_name required for filepath - metadata = {'path': relative_path, 'store': store_name} - - if compute_checksum: - full_path = store.resolve(relative_path) - if store.exists(full_path): - metadata['checksum'] = compute_file_checksum(store, full_path) - metadata['size'] = store.size(full_path) - - return metadata + return {'path': relative_path, 'store': store_name} def decode(self, stored: dict, *, key=None) -> ObjectRef: """Return ObjectRef for lazy access.""" - return ObjectRef( - store=get_store(stored['store']), - path=stored['path'], - checksum=stored.get('checksum') # optional verification - ) + return ObjectRef(store=get_store(stored['store']), path=stored['path']) ``` #### Database Column @@ -419,64 +447,29 @@ The `json` database type: - Automatically uses appropriate type for database backend - Supports JSON path queries where available -## Parameterized AttributeTypes - -AttributeTypes can be parameterized with `` syntax. 
The parameter specifies -which store to use: - -```python -class AttributeType: - type_name: str # Name used in or as bare type - dtype: str # Database type or built-in AttributeType - - # When user writes type_name@param, resolved store becomes param -``` - -**Resolution examples:** -``` - → uses type → default store - → uses type → cold store - → dtype = "longblob" → database (no store) - → uses type → cold store -``` +## Built-in AttributeTypes -AttributeTypes can use other AttributeTypes as their dtype (composition): -- `` uses `` - adds djblob serialization on top of content-addressed storage -- `` uses `` - adds filename preservation on top of content-addressed storage +### `` / `` - Serialized Python Objects -## User-Defined AttributeTypes +**Supports both internal and external storage.** -### `` - Internal Serialized Blob +Serializes Python objects (NumPy arrays, dicts, lists, etc.) using DataJoint's +blob format. Compatible with MATLAB. -Serialized Python object stored in database. +- **``**: Stored in database (`bytes` → `LONGBLOB`/`BYTEA`) +- **``**: Stored externally via `` with deduplication +- **``**: Stored in specific named store ```python @dj.register_type -class DJBlobType(AttributeType): - type_name = "djblob" - dtype = "longblob" # MySQL type - - def encode(self, value, *, key=None) -> bytes: - from . import blob - return blob.pack(value, compress=True) - - def decode(self, stored, *, key=None) -> Any: - from . import blob - return blob.unpack(stored) -``` - -### `` / `` - External Serialized Blob - -Serialized Python object stored in content-addressed storage. +class BlobType(AttributeType): + """Serialized Python objects. Supports internal and external.""" + type_name = "blob" -```python -@dj.register_type -class XBlobType(AttributeType): - type_name = "xblob" - dtype = "content" # Core type - uses default store - # dtype = "content@store" for specific store + def get_dtype(self, is_external: bool) -> str: + return "" if is_external else "bytes" - def encode(self, value, *, key=None) -> bytes: + def encode(self, value, *, key=None, store_name=None) -> bytes: from . import blob return blob.pack(value, compress=True) @@ -491,55 +484,39 @@ class ProcessedData(dj.Computed): definition = """ -> RawData --- - small_result : # internal (in database) - large_result : # external (default store) - archive_result : # external (specific store) + small_result : # internal (in database) + large_result : # external (default store) + archive_result : # external (specific store) """ ``` -### `` - Internal File Attachment +### `` / `` - File Attachments -File stored in database with filename preserved. +**Supports both internal and external storage.** + +Stores files with filename preserved. On fetch, extracts to configured download path. + +- **``**: Stored in database (`bytes` → `LONGBLOB`/`BYTEA`) +- **``**: Stored externally via `` with deduplication +- **``**: Stored in specific named store ```python @dj.register_type class AttachType(AttributeType): + """File attachment with filename. 
Supports internal and external.""" type_name = "attach" - dtype = "longblob" - - def encode(self, filepath, *, key=None) -> bytes: - path = Path(filepath) - return path.name.encode() + b"\0" + path.read_bytes() - - def decode(self, stored, *, key=None) -> str: - filename, contents = stored.split(b"\0", 1) - filename = filename.decode() - download_path = Path(dj.config['download_path']) / filename - download_path.parent.mkdir(parents=True, exist_ok=True) - download_path.write_bytes(contents) - return str(download_path) -``` - -### `` / `` - External File Attachment -File stored in content-addressed storage with filename preserved. + def get_dtype(self, is_external: bool) -> str: + return "" if is_external else "bytes" -```python -@dj.register_type -class XAttachType(AttributeType): - type_name = "xattach" - dtype = "content" # Core type - - def encode(self, filepath, *, key=None) -> bytes: + def encode(self, filepath, *, key=None, store_name=None) -> bytes: path = Path(filepath) - # Include filename in stored data return path.name.encode() + b"\0" + path.read_bytes() def decode(self, stored, *, key=None) -> str: filename, contents = stored.split(b"\0", 1) filename = filename.decode() download_path = Path(dj.config['download_path']) / filename - download_path.parent.mkdir(parents=True, exist_ok=True) download_path.write_bytes(contents) return str(download_path) ``` @@ -548,29 +525,75 @@ Usage: ```python class Attachments(dj.Manual): definition = """ - attachment_id : int + attachment_id : int32 --- config : # internal (small file in DB) - data_file : # external (default store) - archive : # external (specific store) + data_file : # external (default store) + archive : # external (specific store) """ ``` +## User-Defined AttributeTypes + +Users can define custom AttributeTypes for domain-specific data: + +```python +@dj.register_type +class GraphType(AttributeType): + """Store NetworkX graphs. Internal only (no external support).""" + type_name = "graph" + + def get_dtype(self, is_external: bool) -> str: + if is_external: + raise DataJointError(" does not support external storage") + return "" # Chain to blob for serialization + + def encode(self, graph, *, key=None, store_name=None): + return {'nodes': list(graph.nodes()), 'edges': list(graph.edges())} + + def decode(self, stored, *, key=None): + G = nx.Graph() + G.add_nodes_from(stored['nodes']) + G.add_edges_from(stored['edges']) + return G +``` + +Custom types can support both modes by returning different dtypes: + +```python +@dj.register_type +class ImageType(AttributeType): + """Store images. 
Supports both internal and external.""" + type_name = "image" + + def get_dtype(self, is_external: bool) -> str: + return "" if is_external else "bytes" + + def encode(self, image, *, key=None, store_name=None) -> bytes: + # Convert PIL Image to PNG bytes + buffer = io.BytesIO() + image.save(buffer, format='PNG') + return buffer.getvalue() + + def decode(self, stored: bytes, *, key=None): + return PIL.Image.open(io.BytesIO(stored)) +``` + ## Storage Comparison -| Type | dtype | Storage Location | Dedup | Returns | -|------|-------|------------------|-------|---------| -| `` | `json` | `{schema}/{table}/{pk}/` | No | ObjectRef | -| `` | `json` | `{schema}/{table}/{pk}/` | No | ObjectRef | -| `` | `json` | `_content/{hash}` | Yes | bytes | -| `` | `json` | `_content/{hash}` | Yes | bytes | -| `` | `json` | Configured store (relative path) | No | ObjectRef | -| `` | `longblob` | Database | No | Python object | -| `` | `` | `_content/{hash}` | Yes | Python object | -| `` | `` | `_content/{hash}` | Yes | Python object | -| `` | `longblob` | Database | No | Local file path | -| `` | `` | `_content/{hash}` | Yes | Local file path | -| `` | `` | `_content/{hash}` | Yes | Local file path | +| Type | get_dtype | Resolves To | Storage Location | Dedup | Returns | +|------|-----------|-------------|------------------|-------|---------| +| `` | `bytes` | `LONGBLOB`/`BYTEA` | Database | No | Python object | +| `` | `` | `json` | `_content/{hash}` | Yes | Python object | +| `` | `` | `json` | `_content/{hash}` | Yes | Python object | +| `` | `bytes` | `LONGBLOB`/`BYTEA` | Database | No | Local file path | +| `` | `` | `json` | `_content/{hash}` | Yes | Local file path | +| `` | `` | `json` | `_content/{hash}` | Yes | Local file path | +| `` | `json` | `JSON`/`JSONB` | `{schema}/{table}/{pk}/` | No | ObjectRef | +| `` | `json` | `JSON`/`JSONB` | `{schema}/{table}/{pk}/` | No | ObjectRef | +| `` | `json` | `JSON`/`JSONB` | `_content/{hash}` | Yes | bytes | +| `` | `json` | `JSON`/`JSONB` | `_content/{hash}` | Yes | bytes | +| `` | `json` | `JSON`/`JSONB` | Configured store | No | ObjectRef | ## Reference Counting for Content Type @@ -619,22 +642,23 @@ def garbage_collect(project): ## Built-in AttributeType Comparison -| Feature | `` | `` | `` | -|---------|------------|-------------|---------------------| -| dtype | `json` | `json` | `json` | -| Location | OAS store | OAS store | Configured store | -| Addressing | Primary key | Content hash | Relative path | -| Path control | DataJoint | DataJoint | User | -| Deduplication | No | Yes | No | -| Structure | Files, folders, Zarr | Single blob only | Any (via fsspec) | -| Access | ObjectRef (lazy) | Transparent (bytes) | ObjectRef (lazy) | -| GC | Deleted with row | Reference counted | N/A (user managed) | -| Integrity | DataJoint managed | DataJoint managed | User managed | +| Feature | `` | `` | `` | `` | `` | +|---------|----------|------------|-------------|--------------|---------------| +| Storage modes | Both | Both | External only | External only | External only | +| Internal dtype | `bytes` | `bytes` | N/A | N/A | N/A | +| External dtype | `` | `` | `json` | `json` | `json` | +| Addressing | Content hash | Content hash | Primary key | Content hash | Relative path | +| Deduplication | Yes (external) | Yes (external) | No | Yes | No | +| Structure | Single blob | Single file | Files, folders | Single blob | Any | +| Returns | Python object | Local path | ObjectRef | bytes | ObjectRef | +| GC | Ref counted | Ref counted | With row | Ref counted | User 
managed | **When to use each:** -- **``**: Large/complex objects where DataJoint controls organization (Zarr, HDF5) -- **``**: Deduplicated serialized data or file attachments via ``, `` -- **``**: Portable references to files in configured stores +- **``**: Serialized Python objects (NumPy arrays, dicts). Use `` for large/duplicated data +- **``**: File attachments with filename preserved. Use `` for large files +- **``**: Large/complex file structures (Zarr, HDF5) where DataJoint controls organization +- **``**: Raw bytes with deduplication (typically used via `` or ``) +- **``**: Portable references to externally-managed files - **`varchar`**: Arbitrary URLs/paths where ObjectRef semantics aren't needed ## Key Design Decisions @@ -643,32 +667,34 @@ def garbage_collect(project): - Layer 1: Native database types (backend-specific, discouraged) - Layer 2: Core DataJoint types (standardized, scientist-friendly) - Layer 3: AttributeTypes (encode/decode, composable) -2. **Core types are scientist-friendly**: `float32`, `uint8`, `bool` instead of `FLOAT`, `TINYINT UNSIGNED`, `TINYINT(1)` -3. **AttributeTypes use angle brackets**: ``, ``, `` - distinguishes from core types -4. **AttributeTypes are composable**: `` uses ``, which uses `json` -5. **Built-in AttributeTypes use JSON dtype**: Stores metadata (path, hash, store name, etc.) -6. **Two OAS regions**: object (PK-addressed) and content (hash-addressed) within managed stores -7. **Filepath for portability**: `` uses relative paths within stores for environment portability -8. **No `uri` type**: For arbitrary URLs, use `varchar`—simpler and more transparent -9. **Content type**: Single-blob, content-addressed, deduplicated storage -10. **Parameterized types**: `` passes store parameter +2. **Core types are scientist-friendly**: `float32`, `uint8`, `bool`, `bytes` instead of `FLOAT`, `TINYINT UNSIGNED`, `LONGBLOB` +3. **AttributeTypes use angle brackets**: ``, ``, `` - distinguishes from core types +4. **`@` indicates external storage**: No `@` = database, `@` present = object store +5. **`get_dtype(is_external)` method**: Types resolve dtype at declaration time based on storage mode +6. **AttributeTypes are composable**: `` uses ``, which uses `json` +7. **Built-in external types use JSON dtype**: Stores metadata (path, hash, store name, etc.) +8. **Two OAS regions**: object (PK-addressed) and content (hash-addressed) within managed stores +9. **Filepath for portability**: `` uses relative paths within stores for environment portability +10. **No `uri` type**: For arbitrary URLs, use `varchar`—simpler and more transparent 11. **Naming conventions**: - - `dj` prefix = DataJoint-specific internal serialization (``) - - `x` prefix = external/content-addressed variant (``, ``) - - `@store` suffix = specifies which configured store to use - - Types without prefix: core storage mechanisms (``, ``, ``, ``) -12. **Transparent access**: AttributeTypes return Python objects or file paths -13. **Lazy access**: ``, ``, and `` return ObjectRef + - `@` = external storage (object store) + - No `@` = internal storage (database) + - `@` alone = default store + - `@name` = named store +12. **Dual-mode types**: `` and `` support both internal and external storage +13. **External-only types**: ``, ``, `` require `@` +14. **Transparent access**: AttributeTypes return Python objects or file paths +15. 
**Lazy access**: `` and `` return ObjectRef ## Migration from Legacy Types | Legacy | New Equivalent | |--------|----------------| -| `longblob` (auto-serialized) | `` | -| `blob@store` | `` | +| `longblob` (auto-serialized) | `` | +| `blob@store` | `` | | `attach` | `` | -| `attach@store` | `` | -| `filepath@store` (copy-based) | `` (ObjectRef-based, upgraded) | +| `attach@store` | `` | +| `filepath@store` (copy-based) | `` (ObjectRef-based) | ### Migration from Legacy `~external_*` Stores @@ -728,6 +754,5 @@ def migrate_external_store(schema, store_name): ## Open Questions -1. Should `content` without `@store` use a default store, or require explicit store? -2. Should we support `` without `@store` syntax (implying default store)? -3. How long should the backward compatibility layer support legacy `~external_*` format? +1. How long should the backward compatibility layer support legacy `~external_*` format? +2. Should `` (without store name) use a default store or require explicit store name? From d6bdd8081bbda9044da11a49194a44b970fffb07 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 1 Jan 2026 20:01:40 +0000 Subject: [PATCH 10/32] Fix content@ notation in json dtype description Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index c951fc6bd..0f6c15e22 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -442,7 +442,7 @@ column_name JSONB NOT NULL ``` The `json` database type: -- Used as dtype by built-in AttributeTypes (``, ``, ``) +- Used as dtype by built-in AttributeTypes (``, ``, ``) - Stores arbitrary JSON-serializable data - Automatically uses appropriate type for database backend - Supports JSON path queries where available From 1ad327dc86b03dce00cdd035806c1a55857574e8 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 1 Jan 2026 20:07:58 +0000 Subject: [PATCH 11/32] Rename content type to hash for clarity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename to throughout documentation: - More descriptive: indicates hash-based addressing mechanism - Familiar concept: works like a hash data structure - Storage folder: _content/ → _hash/ - Registry: ContentRegistry → HashRegistry The type provides: - SHA256 hash-based addressing - Automatic deduplication - External-only storage (requires @) - Used as dtype by and Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/attributes.md | 4 +- docs/src/design/tables/storage-types-spec.md | 130 +++++++++---------- 2 files changed, 67 insertions(+), 67 deletions(-) diff --git a/docs/src/design/tables/attributes.md b/docs/src/design/tables/attributes.md index caac46a6a..dbdfe06b2 100644 --- a/docs/src/design/tables/attributes.md +++ b/docs/src/design/tables/attributes.md @@ -98,7 +98,7 @@ The `@` character indicates **external storage** (object store vs database): NumPy arrays, dicts, lists, datetime objects, and nested structures. Stores in database. Compatible with MATLAB. See [custom types](customtype.md) for details. -- `` / ``: Like `` but stores externally with content- +- `` / ``: Like `` but stores externally with hash- addressed deduplication. Use for large arrays that may be duplicated across rows. 
**File storage types** - for managed files: @@ -107,7 +107,7 @@ The `@` character indicates **external storage** (object store vs database): from primary key. Supports Zarr, HDF5, and direct writes via fsspec. Returns `ObjectRef` for lazy access. External only. See [object storage](object.md). -- `` / ``: Content-addressed storage for raw bytes with +- `` / ``: Hash-addressed storage for raw bytes with SHA256 deduplication. External only. Use via `` or `` rather than directly. **File attachment types** - for file transfer: diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 0f6c15e22..fd4967435 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -12,7 +12,7 @@ This document defines a three-layer type architecture: ┌───────────────────────────────────────────────────────────────────┐ │ AttributeTypes (Layer 3) │ │ │ -│ Built-in: │ +│ Built-in: │ │ User: ... │ ├───────────────────────────────────────────────────────────────────┤ │ Core DataJoint Types (Layer 2) │ @@ -39,7 +39,7 @@ This document defines a three-layer type architecture: | Region | Path Pattern | Addressing | Use Case | |--------|--------------|------------|----------| | Object | `{schema}/{table}/{pk}/` | Primary key | Large objects, Zarr, HDF5 | -| Content | `_content/{hash}` | Content hash | Deduplicated blobs/files | +| Hash | `_hash/{hash}` | SHA256 hash | Deduplicated blobs/files | ### External References @@ -193,7 +193,7 @@ The `@` character in AttributeType syntax indicates **external storage** (object - **`@` alone**: Use default store - e.g., `` - **`@name`**: Use named store - e.g., `` -Some types support both modes (``, ``), others are external-only (``, ``, ``). +Some types support both modes (``, ``), others are external-only (``, ``, ``). ### Type Resolution and Chaining @@ -204,16 +204,16 @@ returns the appropriate dtype based on storage mode: Resolution at declaration time: → get_dtype(False) → "bytes" → LONGBLOB/BYTEA - → get_dtype(True) → "" → json → JSON/JSONB - → get_dtype(True) → "" → json (store=cold) + → get_dtype(True) → "" → json → JSON/JSONB + → get_dtype(True) → "" → json (store=cold) → get_dtype(False) → "bytes" → LONGBLOB/BYTEA - → get_dtype(True) → "" → json → JSON/JSONB + → get_dtype(True) → "" → json → JSON/JSONB → get_dtype(True) → "json" → JSON/JSONB → get_dtype(False) → ERROR (external only) - → get_dtype(True) → "json" → JSON/JSONB + → get_dtype(True) → "json" → JSON/JSONB → get_dtype(True) → "json" → JSON/JSONB ``` @@ -262,15 +262,15 @@ class ObjectType(AttributeType): return ObjectRef(store=get_store(stored["store"]), path=stored["path"]) ``` -### `` / `` - Content-Addressed Storage +### `` / `` - Hash-Addressed Storage **Built-in AttributeType. 
External only.** -Content-addressed storage with deduplication: +Hash-addressed storage with deduplication: - **Single blob only**: stores a single file or serialized object (not folders) - **Per-project scope**: content is shared across all schemas in a project (not per-schema) -- Path derived from content hash: `_content/{hash[:2]}/{hash[2:4]}/{hash}` +- Path derived from content hash: `_hash/{hash[:2]}/{hash[2:4]}/{hash}` - Many-to-one: multiple rows (even across schemas) can reference same content - Reference counted for garbage collection - Deduplication: identical content stored once across the entire project @@ -282,48 +282,48 @@ store_root/ ├── {schema}/{table}/{pk}/ # object storage (path-addressed by PK) │ └── {attribute}/ │ -└── _content/ # content storage (content-addressed) +└── _hash/ # content storage (hash-addressed) └── {hash[:2]}/{hash[2:4]}/{hash} ``` #### Implementation ```python -class ContentType(AttributeType): - """Content-addressed storage. External only.""" - type_name = "content" +class HashType(AttributeType): + """Hash-addressed storage. External only.""" + type_name = "hash" def get_dtype(self, is_external: bool) -> str: if not is_external: - raise DataJointError(" requires @ (external storage only)") + raise DataJointError(" requires @ (external storage only)") return "json" def encode(self, data: bytes, *, key=None, store_name=None) -> dict: """Store content, return metadata as JSON.""" - content_hash = hashlib.sha256(data).hexdigest() + hash_id = hashlib.sha256(data).hexdigest() store = get_store(store_name or dj.config['stores']['default']) - path = f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" + path = f"_hash/{hash_id[:2]}/{hash_id[2:4]}/{hash_id}" if not store.exists(path): store.put(path, data) - ContentRegistry().insert1({ - 'content_hash': content_hash, + HashRegistry().insert1({ + 'hash_id': hash_id, 'store': store_name, 'size': len(data) }, skip_duplicates=True) - return {"hash": content_hash, "store": store_name, "size": len(data)} + return {"hash": hash_id, "store": store_name, "size": len(data)} def decode(self, stored: dict, *, key=None) -> bytes: """Retrieve content by hash.""" store = get_store(stored["store"]) - path = f"_content/{stored['hash'][:2]}/{stored['hash'][2:4]}/{stored['hash']}" + path = f"_hash/{stored['hash'][:2]}/{stored['hash'][2:4]}/{stored['hash']}" return store.get(path) ``` #### Database Column -The `` type stores JSON metadata: +The `` type stores JSON metadata: ```sql -- content column (MySQL) @@ -442,7 +442,7 @@ column_name JSONB NOT NULL ``` The `json` database type: -- Used as dtype by built-in AttributeTypes (``, ``, ``) +- Used as dtype by built-in AttributeTypes (``, ``, ``) - Stores arbitrary JSON-serializable data - Automatically uses appropriate type for database backend - Supports JSON path queries where available @@ -457,7 +457,7 @@ Serializes Python objects (NumPy arrays, dicts, lists, etc.) using DataJoint's blob format. Compatible with MATLAB. - **``**: Stored in database (`bytes` → `LONGBLOB`/`BYTEA`) -- **``**: Stored externally via `` with deduplication +- **``**: Stored externally via `` with deduplication - **``**: Stored in specific named store ```python @@ -467,7 +467,7 @@ class BlobType(AttributeType): type_name = "blob" def get_dtype(self, is_external: bool) -> str: - return "" if is_external else "bytes" + return "" if is_external else "bytes" def encode(self, value, *, key=None, store_name=None) -> bytes: from . 
import blob @@ -497,7 +497,7 @@ class ProcessedData(dj.Computed): Stores files with filename preserved. On fetch, extracts to configured download path. - **``**: Stored in database (`bytes` → `LONGBLOB`/`BYTEA`) -- **``**: Stored externally via `` with deduplication +- **``**: Stored externally via `` with deduplication - **``**: Stored in specific named store ```python @@ -507,7 +507,7 @@ class AttachType(AttributeType): type_name = "attach" def get_dtype(self, is_external: bool) -> str: - return "" if is_external else "bytes" + return "" if is_external else "bytes" def encode(self, filepath, *, key=None, store_name=None) -> bytes: path = Path(filepath) @@ -567,7 +567,7 @@ class ImageType(AttributeType): type_name = "image" def get_dtype(self, is_external: bool) -> str: - return "" if is_external else "bytes" + return "" if is_external else "bytes" def encode(self, image, *, key=None, store_name=None) -> bytes: # Convert PIL Image to PNG bytes @@ -584,31 +584,31 @@ class ImageType(AttributeType): | Type | get_dtype | Resolves To | Storage Location | Dedup | Returns | |------|-----------|-------------|------------------|-------|---------| | `` | `bytes` | `LONGBLOB`/`BYTEA` | Database | No | Python object | -| `` | `` | `json` | `_content/{hash}` | Yes | Python object | -| `` | `` | `json` | `_content/{hash}` | Yes | Python object | +| `` | `` | `json` | `_hash/{hash}` | Yes | Python object | +| `` | `` | `json` | `_hash/{hash}` | Yes | Python object | | `` | `bytes` | `LONGBLOB`/`BYTEA` | Database | No | Local file path | -| `` | `` | `json` | `_content/{hash}` | Yes | Local file path | -| `` | `` | `json` | `_content/{hash}` | Yes | Local file path | +| `` | `` | `json` | `_hash/{hash}` | Yes | Local file path | +| `` | `` | `json` | `_hash/{hash}` | Yes | Local file path | | `` | `json` | `JSON`/`JSONB` | `{schema}/{table}/{pk}/` | No | ObjectRef | | `` | `json` | `JSON`/`JSONB` | `{schema}/{table}/{pk}/` | No | ObjectRef | -| `` | `json` | `JSON`/`JSONB` | `_content/{hash}` | Yes | bytes | -| `` | `json` | `JSON`/`JSONB` | `_content/{hash}` | Yes | bytes | +| `` | `json` | `JSON`/`JSONB` | `_hash/{hash}` | Yes | bytes | +| `` | `json` | `JSON`/`JSONB` | `_hash/{hash}` | Yes | bytes | | `` | `json` | `JSON`/`JSONB` | Configured store | No | ObjectRef | -## Reference Counting for Content Type +## Reference Counting for Hash Type -The `ContentRegistry` is a **project-level** table that tracks content-addressed objects +The `HashRegistry` is a **project-level** table that tracks hash-addressed objects across all schemas. This differs from the legacy `~external_*` tables which were per-schema. ```python -class ContentRegistry: +class HashRegistry: """ - Project-level content registry. - Stored in a designated database (e.g., `{project}_content`). + Project-level hash registry. + Stored in a designated database (e.g., `{project}_hash`). 
""" definition = """ - # Content-addressed object registry (project-wide) - content_hash : char(64) # SHA256 hex + # Hash-addressed object registry (project-wide) + hash_id : char(64) # SHA256 hex --- store : varchar(64) # Store name size : bigint unsigned # Size in bytes @@ -620,34 +620,34 @@ Garbage collection scans **all schemas** in the project: ```python def garbage_collect(project): - """Remove content not referenced by any table in any schema.""" + """Remove data not referenced by any table in any schema.""" # Get all registered hashes - registered = set(ContentRegistry().fetch('content_hash', 'store')) + registered = set(HashRegistry().fetch('hash_id', 'store')) # Get all referenced hashes from ALL schemas in the project referenced = set() for schema in project.schemas: for table in schema.tables: for attr in table.heading.attributes: - if attr.type in ('content', 'content@...'): + if attr.type in ('hash', 'hash@...'): hashes = table.fetch(attr.name) referenced.update((h, attr.store) for h in hashes) - # Delete orphaned content - for content_hash, store in (registered - referenced): + # Delete orphaned data + for hash_id, store in (registered - referenced): store_backend = get_store(store) - store_backend.delete(content_path(content_hash)) - (ContentRegistry() & {'content_hash': content_hash}).delete() + store_backend.delete(hash_path(hash_id)) + (HashRegistry() & {'hash_id': hash_id}).delete() ``` ## Built-in AttributeType Comparison -| Feature | `` | `` | `` | `` | `` | +| Feature | `` | `` | `` | `` | `` | |---------|----------|------------|-------------|--------------|---------------| | Storage modes | Both | Both | External only | External only | External only | | Internal dtype | `bytes` | `bytes` | N/A | N/A | N/A | -| External dtype | `` | `` | `json` | `json` | `json` | -| Addressing | Content hash | Content hash | Primary key | Content hash | Relative path | +| External dtype | `` | `` | `json` | `json` | `json` | +| Addressing | Hash | Hash | Primary key | Hash | Relative path | | Deduplication | Yes (external) | Yes (external) | No | Yes | No | | Structure | Single blob | Single file | Files, folders | Single blob | Any | | Returns | Python object | Local path | ObjectRef | bytes | ObjectRef | @@ -657,7 +657,7 @@ def garbage_collect(project): - **``**: Serialized Python objects (NumPy arrays, dicts). Use `` for large/duplicated data - **``**: File attachments with filename preserved. Use `` for large files - **``**: Large/complex file structures (Zarr, HDF5) where DataJoint controls organization -- **``**: Raw bytes with deduplication (typically used via `` or ``) +- **``**: Raw bytes with deduplication (typically used via `` or ``) - **``**: Portable references to externally-managed files - **`varchar`**: Arbitrary URLs/paths where ObjectRef semantics aren't needed @@ -671,9 +671,9 @@ def garbage_collect(project): 3. **AttributeTypes use angle brackets**: ``, ``, `` - distinguishes from core types 4. **`@` indicates external storage**: No `@` = database, `@` present = object store 5. **`get_dtype(is_external)` method**: Types resolve dtype at declaration time based on storage mode -6. **AttributeTypes are composable**: `` uses ``, which uses `json` +6. **AttributeTypes are composable**: `` uses ``, which uses `json` 7. **Built-in external types use JSON dtype**: Stores metadata (path, hash, store name, etc.) -8. **Two OAS regions**: object (PK-addressed) and content (hash-addressed) within managed stores +8. 
**Two OAS regions**: object (PK-addressed) and hash (hash-addressed) within managed stores 9. **Filepath for portability**: `` uses relative paths within stores for environment portability 10. **No `uri` type**: For arbitrary URLs, use `varchar`—simpler and more transparent 11. **Naming conventions**: @@ -682,7 +682,7 @@ def garbage_collect(project): - `@` alone = default store - `@name` = named store 12. **Dual-mode types**: `` and `` support both internal and external storage -13. **External-only types**: ``, ``, `` require `@` +13. **External-only types**: ``, ``, `` require `@` 14. **Transparent access**: AttributeTypes return Python objects or file paths 15. **Lazy access**: `` and `` return ObjectRef @@ -699,20 +699,20 @@ def garbage_collect(project): ### Migration from Legacy `~external_*` Stores Legacy external storage used per-schema `~external_{store}` tables. Migration to the new -per-project `ContentRegistry` requires: +per-project `HashRegistry` requires: ```python def migrate_external_store(schema, store_name): """ - Migrate legacy ~external_{store} to new ContentRegistry. + Migrate legacy ~external_{store} to new HashRegistry. 1. Read all entries from ~external_{store} 2. For each entry: - Fetch content from legacy location - Compute SHA256 hash - - Copy to _content/{hash}/ if not exists + - Copy to _hash/{hash}/ if not exists - Update table column from UUID to hash - - Register in ContentRegistry + - Register in HashRegistry 3. After all schemas migrated, drop ~external_{store} tables """ external_table = schema.external[store_name] @@ -724,17 +724,17 @@ def migrate_external_store(schema, store_name): content = external_table.get(legacy_uuid) # Compute new content hash - content_hash = hashlib.sha256(content).hexdigest() + hash_id = hashlib.sha256(content).hexdigest() # Store in new location if not exists - new_path = f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" + new_path = f"_hash/{hash_id[:2]}/{hash_id[2:4]}/{hash_id}" store = get_store(store_name) if not store.exists(new_path): store.put(new_path, content) - # Register in project-wide ContentRegistry - ContentRegistry().insert1({ - 'content_hash': content_hash, + # Register in project-wide HashRegistry + HashRegistry().insert1({ + 'hash_id': hash_id, 'store': store_name, 'size': len(content) }, skip_duplicates=True) @@ -755,4 +755,4 @@ def migrate_external_store(schema, store_name): ## Open Questions 1. How long should the backward compatibility layer support legacy `~external_*` format? -2. Should `` (without store name) use a default store or require explicit store name? +2. Should `` (without store name) use a default store or require explicit store name? 
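A minimal declaration sketch illustrating the `@` storage convention specified in this patch (the schema, table, and field names are hypothetical; the `cold` store is assumed to be configured as in the examples above):

```python
import datajoint as dj

schema = dj.Schema("demo_storage")


@schema
class Recording(dj.Manual):
    definition = """
    recording_id : int32
    ---
    params     : <blob>          # small serialized object, kept in the database
    waveforms  : <blob@cold>     # large array, deduplicated in the named 'cold' store
    notes_file : <attach>        # small file stored in the database with its filename
    raw_data   : <object@>       # folder/Zarr written under the row's primary key, default store
    """
```

Per the storage comparison table above, fetching `params` returns the original Python object, `notes_file` a local file path, and `raw_data` an `ObjectRef` for lazy access.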
From 92647f115841b0c066000b69897f6b86b19f8ec7 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 1 Jan 2026 20:08:31 +0000 Subject: [PATCH 12/32] Fix DataJoint syntax in HashRegistry definition - Use '= CURRENT_TIMESTAMP : datetime' syntax (not SQL DEFAULT) - Use uint64 core type instead of 'bigint unsigned' native type Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index fd4967435..d7e722496 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -611,8 +611,8 @@ class HashRegistry: hash_id : char(64) # SHA256 hex --- store : varchar(64) # Store name - size : bigint unsigned # Size in bytes - created : timestamp DEFAULT CURRENT_TIMESTAMP + size : uint64 # Size in bytes + created = CURRENT_TIMESTAMP : datetime """ ``` From 7d0a5a5448ad30d5274e5e019d24ce2b60886763 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 1 Jan 2026 20:11:03 +0000 Subject: [PATCH 13/32] Clarify nullability and default value handling DataJoint handles nullability through the default value syntax: - Attribute is nullable iff default is NULL - No separate NOT NULL / NULL modifier needed - Examples: required, nullable, and default value cases Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index d7e722496..e287e7725 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -135,8 +135,8 @@ declarative syntax: | Modifier | Status | DataJoint Alternative | |----------|--------|----------------------| -| `NOT NULL` / `NULL` | ❌ Not allowed | Position above/below `---` determines nullability | -| `DEFAULT value` | ❌ Not allowed | Use `= value` syntax after type | +| `NOT NULL` / `NULL` | ❌ Not allowed | Use `= NULL` for nullable; omit default for required | +| `DEFAULT value` | ❌ Not allowed | Use `= value` syntax before the type | | `PRIMARY KEY` | ❌ Not allowed | Position above `---` line | | `UNIQUE` | ❌ Not allowed | Use DataJoint index syntax | | `COMMENT 'text'` | ❌ Not allowed | Use `# comment` syntax | @@ -145,6 +145,20 @@ declarative syntax: | `AUTO_INCREMENT` | ⚠️ Discouraged | Allowed with native types only, generates warning | | `UNSIGNED` | ✅ Allowed | Part of type semantics (use `uint*` core types) | +**Nullability and defaults:** DataJoint handles nullability through the default value syntax. +An attribute is nullable if and only if its default is `NULL`: + +``` +# Required (NOT NULL, no default) +name : varchar(100) + +# Nullable (default is NULL) +nickname = NULL : varchar(100) + +# Required with default value +status = "active" : varchar(20) +``` + **Auto-increment policy:** DataJoint discourages `AUTO_INCREMENT` / `SERIAL` because: - Breaks reproducibility (IDs depend on insertion order) - Makes pipelines non-deterministic From af7c76c4e2a4c15ea428b9031815c6088bd6ea51 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 1 Jan 2026 20:15:41 +0000 Subject: [PATCH 14/32] Remove HashRegistry table, use JSON field scanning for GC Hash metadata (hash, store, size) is stored directly in each table's JSON column - no separate registry table is needed. 
Garbage collection now scans all tables to find referenced hashes in JSON fields directly. Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 52 +++++++------------- 1 file changed, 17 insertions(+), 35 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index e287e7725..185ec721a 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -609,49 +609,31 @@ class ImageType(AttributeType): | `` | `json` | `JSON`/`JSONB` | `_hash/{hash}` | Yes | bytes | | `` | `json` | `JSON`/`JSONB` | Configured store | No | ObjectRef | -## Reference Counting for Hash Type +## Garbage Collection for Hash Storage -The `HashRegistry` is a **project-level** table that tracks hash-addressed objects -across all schemas. This differs from the legacy `~external_*` tables which were per-schema. +Hash metadata (hash, store, size) is stored directly in each table's JSON column - no separate +registry table is needed. Garbage collection scans all tables to find referenced hashes: ```python -class HashRegistry: - """ - Project-level hash registry. - Stored in a designated database (e.g., `{project}_hash`). - """ - definition = """ - # Hash-addressed object registry (project-wide) - hash_id : char(64) # SHA256 hex - --- - store : varchar(64) # Store name - size : uint64 # Size in bytes - created = CURRENT_TIMESTAMP : datetime - """ -``` - -Garbage collection scans **all schemas** in the project: - -```python -def garbage_collect(project): - """Remove data not referenced by any table in any schema.""" - # Get all registered hashes - registered = set(HashRegistry().fetch('hash_id', 'store')) +def garbage_collect(store_name): + """Remove hash-addressed data not referenced by any table.""" + # Scan store for all hash files + store = get_store(store_name) + all_hashes = set(store.list_hashes()) # from _hash/ directory - # Get all referenced hashes from ALL schemas in the project + # Scan all tables for referenced hashes referenced = set() for schema in project.schemas: for table in schema.tables: for attr in table.heading.attributes: - if attr.type in ('hash', 'hash@...'): - hashes = table.fetch(attr.name) - referenced.update((h, attr.store) for h in hashes) - - # Delete orphaned data - for hash_id, store in (registered - referenced): - store_backend = get_store(store) - store_backend.delete(hash_path(hash_id)) - (HashRegistry() & {'hash_id': hash_id}).delete() + if uses_hash_storage(attr): # , , + for row in table.fetch(attr.name): + if row and row.get('store') == store_name: + referenced.add(row['hash']) + + # Delete orphaned files + for hash_id in (all_hashes - referenced): + store.delete(hash_path(hash_id)) ``` ## Built-in AttributeType Comparison From 691edcc91bdc5a0cbf203045145b6e2fb2a70012 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 1 Jan 2026 20:27:21 +0000 Subject: [PATCH 15/32] Use MD5 instead of SHA256 for content hashing MD5 (128-bit, 32-char hex) is sufficient for content-addressed deduplication: - Birthday bound ~2^64 provides adequate collision resistance for scientific data - 32-char vs 64-char hashes reduces storage overhead in JSON metadata - MD5 is ~2-3x faster than SHA256 for large files - Consistent with existing dj.hash module (key_hash, uuid_from_buffer) - Simplifies migration since only storage format changes, not the algorithm Added Hash Algorithm Choice section documenting the rationale. 
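For reference, a small sketch of the resulting addressing scheme (it mirrors the
encode() example in storage-types-spec.md; the payload value is illustrative):

```python
import hashlib

data = b"example payload"
hash_id = hashlib.md5(data).hexdigest()                 # 32-char hex digest
path = f"_hash/{hash_id[:2]}/{hash_id[2:4]}/{hash_id}"  # fan-out storage path
```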
Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/attributes.md | 2 +- docs/src/design/tables/storage-types-spec.md | 71 +++++++++++++------- 2 files changed, 49 insertions(+), 24 deletions(-) diff --git a/docs/src/design/tables/attributes.md b/docs/src/design/tables/attributes.md index dbdfe06b2..2120099d1 100644 --- a/docs/src/design/tables/attributes.md +++ b/docs/src/design/tables/attributes.md @@ -108,7 +108,7 @@ The `@` character indicates **external storage** (object store vs database): `ObjectRef` for lazy access. External only. See [object storage](object.md). - `` / ``: Hash-addressed storage for raw bytes with - SHA256 deduplication. External only. Use via `` or `` rather than directly. + MD5 deduplication. External only. Use via `` or `` rather than directly. **File attachment types** - for file transfer: diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 185ec721a..f0e28a704 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -39,7 +39,7 @@ This document defines a three-layer type architecture: | Region | Path Pattern | Addressing | Use Case | |--------|--------------|------------|----------| | Object | `{schema}/{table}/{pk}/` | Primary key | Large objects, Zarr, HDF5 | -| Hash | `_hash/{hash}` | SHA256 hash | Deduplicated blobs/files | +| Hash | `_hash/{hash}` | MD5 hash | Deduplicated blobs/files | ### External References @@ -314,18 +314,14 @@ class HashType(AttributeType): def encode(self, data: bytes, *, key=None, store_name=None) -> dict: """Store content, return metadata as JSON.""" - hash_id = hashlib.sha256(data).hexdigest() + hash_id = hashlib.md5(data).hexdigest() # 32-char hex store = get_store(store_name or dj.config['stores']['default']) path = f"_hash/{hash_id[:2]}/{hash_id[2:4]}/{hash_id}" if not store.exists(path): store.put(path, data) - HashRegistry().insert1({ - 'hash_id': hash_id, - 'store': store_name, - 'size': len(data) - }, skip_duplicates=True) + # Metadata stored in JSON column (no separate registry) return {"hash": hash_id, "store": store_name, "size": len(data)} def decode(self, stored: dict, *, key=None) -> bytes: @@ -681,6 +677,41 @@ def garbage_collect(store_name): 13. **External-only types**: ``, ``, `` require `@` 14. **Transparent access**: AttributeTypes return Python objects or file paths 15. **Lazy access**: `` and `` return ObjectRef +16. **MD5 for content hashing**: See [Hash Algorithm Choice](#hash-algorithm-choice) below +17. **No separate registry**: Hash metadata stored in JSON columns, not a separate table + +### Hash Algorithm Choice + +Content-addressed storage uses **MD5** (128-bit, 32-char hex) rather than SHA256 (256-bit, 64-char hex). + +**Rationale:** + +1. **Practical collision resistance is sufficient**: The birthday bound for MD5 is ~2^64 operations + before 50% collision probability. No scientific project will store anywhere near 10^19 files. + For content deduplication (not cryptographic verification), MD5 provides adequate uniqueness. + +2. **Storage efficiency**: 32-char hashes vs 64-char hashes in every JSON metadata field. + With millions of records, this halves the storage overhead for hash identifiers. + +3. **Performance**: MD5 is ~2-3x faster than SHA256 for large files. While both are fast, + the difference is measurable when hashing large scientific datasets. + +4. **Legacy compatibility**: DataJoint's existing `uuid_from_buffer()` function uses MD5. 
+ The new system changes only the storage format (hex string in JSON vs binary UUID), + not the underlying hash algorithm. This simplifies migration. + +5. **Consistency with existing codebase**: The `dj.hash` module already uses MD5 for + `key_hash()` (job reservation) and `uuid_from_buffer()` (query caching). + +**Why not SHA256?** + +SHA256 is the modern standard for content-addressable storage (Git, Docker, IPFS). However: +- These systems prioritize cryptographic security against adversarial collision attacks +- Scientific data pipelines face no adversarial threat model +- The practical benefits (storage, speed, compatibility) outweigh theoretical security gains + +**Note**: If cryptographic verification is ever needed (e.g., for compliance or reproducibility +audits), SHA256 checksums can be computed on-demand without changing the storage addressing scheme. ## Migration from Legacy Types @@ -694,8 +725,8 @@ def garbage_collect(store_name): ### Migration from Legacy `~external_*` Stores -Legacy external storage used per-schema `~external_{store}` tables. Migration to the new -per-project `HashRegistry` requires: +Legacy external storage used per-schema `~external_{store}` tables with UUID references. +Migration to the new JSON-based hash storage requires: ```python def migrate_external_store(schema, store_name): @@ -705,10 +736,9 @@ def migrate_external_store(schema, store_name): 1. Read all entries from ~external_{store} 2. For each entry: - Fetch content from legacy location - - Compute SHA256 hash + - Compute MD5 hash - Copy to _hash/{hash}/ if not exists - - Update table column from UUID to hash - - Register in HashRegistry + - Update table column to new hash format 3. After all schemas migrated, drop ~external_{store} tables """ external_table = schema.external[store_name] @@ -720,7 +750,7 @@ def migrate_external_store(schema, store_name): content = external_table.get(legacy_uuid) # Compute new content hash - hash_id = hashlib.sha256(content).hexdigest() + hash_id = hashlib.md5(content).hexdigest() # Store in new location if not exists new_path = f"_hash/{hash_id[:2]}/{hash_id[2:4]}/{hash_id}" @@ -728,14 +758,8 @@ def migrate_external_store(schema, store_name): if not store.exists(new_path): store.put(new_path, content) - # Register in project-wide HashRegistry - HashRegistry().insert1({ - 'hash_id': hash_id, - 'store': store_name, - 'size': len(content) - }, skip_duplicates=True) - - # Update referencing tables (UUID -> hash) + # Update referencing tables: convert UUID column to JSON with hash metadata + # The JSON column stores {"hash": hash_id, "store": store_name, "size": len(content)} # ... update all tables that reference this UUID ... 
# After migration complete for all schemas: @@ -743,8 +767,9 @@ def migrate_external_store(schema, store_name): ``` **Migration considerations:** -- Legacy UUIDs were based on content hash but stored as `binary(16)` -- New system uses `char(64)` SHA256 hex strings +- Legacy UUIDs were based on MD5 content hash stored as `binary(16)` (UUID format) +- New system uses `char(32)` MD5 hex strings stored in JSON +- The hash algorithm is unchanged (MD5), only the storage format differs - Migration can be done incrementally per schema - Backward compatibility layer can read both formats during transition From 905c463ada322a332a3c4e235ebab324f68ae120 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 1 Jan 2026 20:33:20 +0000 Subject: [PATCH 16/32] Remove dead code: uuid_from_file and uuid_from_stream - uuid_from_file was never called anywhere in the codebase - uuid_from_stream only existed to support uuid_from_file - Inlined the logic directly into uuid_from_buffer - Removed unused io and pathlib imports Co-authored-by: dimitri-yatsenko --- src/datajoint/hash.py | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/src/datajoint/hash.py b/src/datajoint/hash.py index f58c65732..88a737fb7 100644 --- a/src/datajoint/hash.py +++ b/src/datajoint/hash.py @@ -1,7 +1,5 @@ import hashlib -import io import uuid -from pathlib import Path def key_hash(mapping): @@ -16,24 +14,14 @@ def key_hash(mapping): return hashed.hexdigest() -def uuid_from_stream(stream, *, init_string=""): +def uuid_from_buffer(buffer=b"", *, init_string=""): """ - :return: 16-byte digest of stream data - :stream: stream object or open file handle - :init_string: string to initialize the checksum + Compute MD5 hash of buffer data, returned as UUID. + + :param buffer: bytes to hash + :param init_string: string to initialize the checksum (for namespacing) + :return: UUID based on MD5 digest """ hashed = hashlib.md5(init_string.encode()) - chunk = True - chunk_size = 1 << 14 - while chunk: - chunk = stream.read(chunk_size) - hashed.update(chunk) + hashed.update(buffer) return uuid.UUID(bytes=hashed.digest()) - - -def uuid_from_buffer(buffer=b"", *, init_string=""): - return uuid_from_stream(io.BytesIO(buffer), init_string=init_string) - - -def uuid_from_file(filepath, *, init_string=""): - return uuid_from_stream(Path(filepath).open("rb"), init_string=init_string) From 57af0e8f5875a87c88c8c25126767536617f599c Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 1 Jan 2026 20:40:12 +0000 Subject: [PATCH 17/32] Remove outdated storage-types-implementation-plan.md The implementation plan was heavily outdated with: - Old type names (, , vs , , ) - Wrong hash algorithm (SHA256 vs MD5) - Wrong paths (_content/ vs _hash/) - References to removed HashRegistry table All relevant design information is now in storage-types-spec.md. Implementation details (ObjectRef API, staged_insert) will be documented in user-facing API docs when implemented. 
Co-authored-by: dimitri-yatsenko --- .../storage-types-implementation-plan.md | 464 ------------------ 1 file changed, 464 deletions(-) delete mode 100644 docs/src/design/tables/storage-types-implementation-plan.md diff --git a/docs/src/design/tables/storage-types-implementation-plan.md b/docs/src/design/tables/storage-types-implementation-plan.md deleted file mode 100644 index f3e4debcc..000000000 --- a/docs/src/design/tables/storage-types-implementation-plan.md +++ /dev/null @@ -1,464 +0,0 @@ -# DataJoint Storage Types Redesign - Implementation Plan - -## Executive Summary - -This plan describes the implementation of a three-layer type architecture for DataJoint, building on the existing `AttributeType` infrastructure. The key goals are: - -1. Establish a clean three-layer type hierarchy (native DB types, core DataJoint types, AttributeTypes) -2. Implement content-addressed storage with deduplication -3. Provide composable, user-friendly types (``, ``, ``) -4. Enable project-wide garbage collection -5. Maintain backward compatibility with existing schemas - ---- - -## Implementation Status - -| Phase | Status | Notes | -|-------|--------|-------| -| Phase 1: Core Type System | ✅ Complete | CORE_TYPES dict, type chain resolution | -| Phase 2: Content-Addressed Storage | ✅ Complete | Function-based, no registry table | -| Phase 2b: Path-Addressed Storage | ✅ Complete | ObjectType for files/folders | -| Phase 3: User-Defined AttributeTypes | ✅ Complete | AttachType, XAttachType, FilepathType | -| Phase 4: Insert and Fetch Integration | ✅ Complete | Type chain encoding/decoding | -| Phase 5: Garbage Collection | ✅ Complete | gc.py with scan/collect functions | -| Phase 6: Documentation and Testing | ✅ Complete | Test files for all new types | - ---- - -## Phase 1: Core Type System Foundation ✅ - -**Status**: Complete - -### Implemented in `src/datajoint/declare.py`: - -```python -CORE_TYPES = { - # Numeric types (aliased to native SQL) - "float32": (r"float32$", "float"), - "float64": (r"float64$", "double"), - "int64": (r"int64$", "bigint"), - "uint64": (r"uint64$", "bigint unsigned"), - "int32": (r"int32$", "int"), - "uint32": (r"uint32$", "int unsigned"), - "int16": (r"int16$", "smallint"), - "uint16": (r"uint16$", "smallint unsigned"), - "int8": (r"int8$", "tinyint"), - "uint8": (r"uint8$", "tinyint unsigned"), - "bool": (r"bool$", "tinyint"), - # UUID (stored as binary) - "uuid": (r"uuid$", "binary(16)"), - # JSON - "json": (r"json$", None), - # Binary (bytes maps to longblob in MySQL, bytea in PostgreSQL) - "bytes": (r"bytes$", "longblob"), - # Temporal - "date": (r"date$", None), - "datetime": (r"datetime$", None), - # String types (with parameters) - "char": (r"char\s*\(\d+\)$", None), - "varchar": (r"varchar\s*\(\d+\)$", None), - # Enumeration - "enum": (r"enum\s*\(.+\)$", None), -} -``` - -### Key changes: -- Removed `SERIALIZED_TYPES`, `BINARY_TYPES`, `EXTERNAL_TYPES` -- Core types are recorded in field comments with `:type:` syntax -- Non-standard native types pass through with warning -- `parse_type_spec()` handles `` syntax -- `resolve_dtype()` returns `(final_dtype, type_chain, store_name)` tuple - ---- - -## Phase 2: Content-Addressed Storage ✅ - -**Status**: Complete (simplified design) - -### Design Decision: Functions vs Class - -The original plan proposed a `ContentRegistry` class with a database table. We implemented a simpler, stateless approach using functions in `content_registry.py`: - -**Why functions instead of a registry table:** -1. 
**Simpler** - No additional database table to manage -2. **Decoupled** - Content storage is independent of any schema -3. **GC by scanning** - Garbage collection scans tables for references rather than maintaining reference counts -4. **Less state** - No synchronization issues between registry and actual storage - -### Implemented in `src/datajoint/content_registry.py`: - -```python -def compute_content_hash(data: bytes) -> str: - """Compute SHA256 hash of content.""" - return hashlib.sha256(data).hexdigest() - -def build_content_path(content_hash: str) -> str: - """Build path: _content/{hash[:2]}/{hash[2:4]}/{hash}""" - return f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" - -def put_content(data: bytes, store_name: str | None = None) -> dict[str, Any]: - """Store content with deduplication. Returns {hash, store, size}.""" - ... - -def get_content(content_hash: str, store_name: str | None = None) -> bytes: - """Retrieve content by hash with verification.""" - ... - -def content_exists(content_hash: str, store_name: str | None = None) -> bool: - """Check if content exists.""" - ... - -def delete_content(content_hash: str, store_name: str | None = None) -> bool: - """Delete content (use with caution - verify no references first).""" - ... -``` - -### Implemented AttributeTypes in `src/datajoint/attribute_type.py`: - -```python -class ContentType(AttributeType): - """Content-addressed storage. Stores bytes, returns JSON metadata.""" - type_name = "content" - dtype = "json" - - def encode(self, value: bytes, *, key=None, store_name=None) -> dict: - return put_content(value, store_name=store_name) - - def decode(self, stored: dict, *, key=None) -> bytes: - return get_content(stored["hash"], store_name=stored.get("store")) - - -class XBlobType(AttributeType): - """External serialized blob using content-addressed storage.""" - type_name = "xblob" - dtype = "" # Composition - - def encode(self, value, *, key=None, store_name=None) -> bytes: - return blob.pack(value, compress=True) - - def decode(self, stored: bytes, *, key=None) -> Any: - return blob.unpack(stored, squeeze=False) -``` - ---- - -## Phase 2b: Path-Addressed Storage (ObjectType) ✅ - -**Status**: Complete - -### Design: Path vs Content Addressing - -| Aspect | `` | `` | -|--------|-------------|------------| -| Addressing | Content-hash (SHA256) | Path (from primary key) | -| Path Format | `_content/{hash[:2]}/{hash[2:4]}/{hash}` | `{schema}/{table}/objects/{pk}/{field}_{token}.ext` | -| Deduplication | Yes (same content = same hash) | No (each row has unique path) | -| Deletion | GC when unreferenced | Deleted with row | -| Use case | Serialized blobs, attachments | Zarr, HDF5, folders | - -### Implemented in `src/datajoint/builtin_types.py`: - -```python -@register_type -class ObjectType(AttributeType): - """Path-addressed storage for files and folders.""" - type_name = "object" - dtype = "json" - - def encode(self, value, *, key=None, store_name=None) -> dict: - # value can be bytes, str path, or Path - # key contains _schema, _table, _field for path construction - path, token = build_object_path(schema, table, field, primary_key, ext) - backend.put_buffer(content, path) # or put_folder for directories - return { - "path": path, - "store": store_name, - "size": size, - "ext": ext, - "is_dir": is_dir, - "timestamp": timestamp.isoformat(), - } - - def decode(self, stored: dict, *, key=None) -> ObjectRef: - # Returns lazy handle for fsspec-based access - return ObjectRef.from_json(stored, backend=backend) -``` - 
-### ObjectRef Features: -- `ref.path` - Storage path -- `ref.read()` - Read file content -- `ref.open()` - Open as file handle -- `ref.fsmap` - For `zarr.open(ref.fsmap)` -- `ref.download(dest)` - Download to local path -- `ref.listdir()` / `ref.walk()` - For directories - -### Staged Insert for Object Types - -For large objects like Zarr arrays, `staged_insert.py` provides direct writes to storage: - -```python -with table.staged_insert1 as staged: - # 1. Set primary key first (required for path construction) - staged.rec['subject_id'] = 123 - staged.rec['session_id'] = 45 - - # 2. Get storage handle and write directly - z = zarr.open(staged.store('raw_data', '.zarr'), mode='w') - z[:] = large_array - - # 3. On exit: metadata computed, record inserted -``` - -**Flow comparison:** - -| Normal Insert | Staged Insert | -|--------------|---------------| -| `ObjectType.encode()` uploads content | Direct writes via `staged.store()` | -| Single operation | Two-phase: write then finalize | -| Good for files/folders | Ideal for Zarr, HDF5, streaming | - -Both produce the same JSON metadata format compatible with `ObjectRef.from_json()`. - -**Key methods:** -- `staged.store(field, ext)` - Returns `FSMap` for Zarr/xarray -- `staged.open(field, ext)` - Returns file handle for binary writes -- `staged.fs` - Raw fsspec filesystem access - ---- - -## Phase 3: User-Defined AttributeTypes ✅ - -**Status**: Complete - -All built-in AttributeTypes are implemented in `src/datajoint/builtin_types.py`. - -### 3.1 XBlobType ✅ -External serialized blobs using content-addressed storage. Composes with ``. - -### 3.2 AttachType ✅ - -```python -@register_type -class AttachType(AttributeType): - """Internal file attachment stored in database.""" - type_name = "attach" - dtype = "longblob" - - def encode(self, filepath, *, key=None, store_name=None) -> bytes: - # Returns: filename (UTF-8) + null byte + contents - return path.name.encode("utf-8") + b"\x00" + path.read_bytes() - - def decode(self, stored, *, key=None) -> str: - # Extracts to download_path, returns local path - ... 
-``` - -### 3.3 XAttachType ✅ - -```python -@register_type -class XAttachType(AttributeType): - """External file attachment using content-addressed storage.""" - type_name = "xattach" - dtype = "" # Composes with ContentType - # Same encode/decode as AttachType, but stored externally with dedup -``` - -### 3.4 FilepathType ✅ - -```python -@register_type -class FilepathType(AttributeType): - """Reference to existing file in configured store.""" - type_name = "filepath" - dtype = "json" - - def encode(self, relative_path: str, *, key=None, store_name=None) -> dict: - # Verifies file exists, returns metadata - return {'path': path, 'store': store_name, 'size': size, ...} - - def decode(self, stored: dict, *, key=None) -> ObjectRef: - # Returns ObjectRef for lazy access - return ObjectRef.from_json(stored, backend=backend) -``` - -### Type Comparison - -| Type | Storage | Copies File | Dedup | Returns | -|------|---------|-------------|-------|---------| -| `` | Database | Yes | No | Local path | -| `` | External | Yes | Yes | Local path | -| `` | Reference | No | N/A | ObjectRef | -| `` | External | Yes | No | ObjectRef | - ---- - -## Phase 4: Insert and Fetch Integration ✅ - -**Status**: Complete - -### Updated in `src/datajoint/table.py`: - -```python -def __make_placeholder(self, name, value, ...): - if attr.adapter: - from .attribute_type import resolve_dtype - attr.adapter.validate(value) - _, type_chain, resolved_store = resolve_dtype( - f"<{attr.adapter.type_name}>", store_name=attr.store - ) - # Apply type chain: outermost → innermost - for attr_type in type_chain: - try: - value = attr_type.encode(value, key=None, store_name=resolved_store) - except TypeError: - value = attr_type.encode(value, key=None) -``` - -### Updated in `src/datajoint/fetch.py`: - -```python -def _get(connection, attr, data, squeeze, download_path): - if attr.adapter: - from .attribute_type import resolve_dtype - final_dtype, type_chain, _ = resolve_dtype(f"<{attr.adapter.type_name}>") - - # Parse JSON if final storage is JSON - if final_dtype.lower() == "json": - data = json.loads(data) - - # Apply type chain in reverse: innermost → outermost - for attr_type in reversed(type_chain): - data = attr_type.decode(data, key=None) - - return data -``` - ---- - -## Phase 5: Garbage Collection ✅ - -**Status**: Complete - -### Implemented in `src/datajoint/gc.py`: - -```python -import datajoint as dj - -# Scan schemas and find orphaned content/objects -stats = dj.gc.scan(schema1, schema2, store_name='mystore') - -# Remove orphaned content/objects (dry_run=False to actually delete) -stats = dj.gc.collect(schema1, schema2, store_name='mystore', dry_run=True) - -# Format statistics for display -print(dj.gc.format_stats(stats)) -``` - -**Supported storage patterns:** - -1. **Content-Addressed Storage** (``, ``, ``): - - Stored at: `_content/{hash[:2]}/{hash[2:4]}/{hash}` - - Referenced by SHA256 hash in JSON metadata - -2. 
**Path-Addressed Storage** (``): - - Stored at: `{schema}/{table}/objects/{pk}/{field}_{token}/` - - Referenced by path in JSON metadata - -**Key functions:** -- `scan_references(*schemas, store_name=None)` - Scan tables for content hashes -- `scan_object_references(*schemas, store_name=None)` - Scan tables for object paths -- `list_stored_content(store_name=None)` - List all content in `_content/` directory -- `list_stored_objects(store_name=None)` - List all objects in `*/objects/` directories -- `scan(*schemas, store_name=None)` - Find orphaned content/objects without deleting -- `collect(*schemas, store_name=None, dry_run=True)` - Remove orphaned content/objects -- `delete_object(path, store_name=None)` - Delete an object directory -- `format_stats(stats)` - Human-readable statistics output - -**GC Process:** -1. Scan all tables in provided schemas for content-type and object-type attributes -2. Extract content hashes and object paths from JSON metadata columns -3. Scan storage for all stored content (`_content/`) and objects (`*/objects/`) -4. Compute orphaned = stored - referenced (for both types) -5. Optionally delete orphaned items (when `dry_run=False`) - ---- - -## Phase 6: Documentation and Testing ✅ - -**Status**: Complete - -### Test files created: -- `tests/test_content_storage.py` - Content-addressed storage functions -- `tests/test_type_composition.py` - Type chain encoding/decoding -- `tests/test_gc.py` - Garbage collection -- `tests/test_attribute_type.py` - AttributeType registry and DJBlobType (existing) - ---- - -## Critical Files Summary - -| File | Status | Changes | -|------|--------|---------| -| `src/datajoint/declare.py` | ✅ | CORE_TYPES, type parsing, SQL generation | -| `src/datajoint/heading.py` | ✅ | Simplified attribute properties | -| `src/datajoint/attribute_type.py` | ✅ | Base class, registry, type chain resolution | -| `src/datajoint/builtin_types.py` | ✅ | DJBlobType, ContentType, XBlobType, ObjectType | -| `src/datajoint/content_registry.py` | ✅ | Content storage functions (put, get, delete) | -| `src/datajoint/objectref.py` | ✅ | ObjectRef handle for lazy access | -| `src/datajoint/storage.py` | ✅ | StorageBackend, build_object_path | -| `src/datajoint/staged_insert.py` | ✅ | Staged insert for direct object storage writes | -| `src/datajoint/table.py` | ✅ | Type chain encoding on insert | -| `src/datajoint/fetch.py` | ✅ | Type chain decoding on fetch | -| `src/datajoint/blob.py` | ✅ | Removed bypass_serialization | -| `src/datajoint/gc.py` | ✅ | Garbage collection for content storage | -| `tests/test_content_storage.py` | ✅ | Tests for content_registry.py | -| `tests/test_type_composition.py` | ✅ | Tests for type chain encoding/decoding | -| `tests/test_gc.py` | ✅ | Tests for garbage collection | - ---- - -## Removed/Deprecated - -- `src/datajoint/attribute_adapter.py` - Deleted (hard deprecated) -- `bypass_serialization` flag in `blob.py` - Removed -- `database` field in Attribute - Removed (unused) -- `SERIALIZED_TYPES`, `BINARY_TYPES`, `EXTERNAL_TYPES` - Removed -- `is_attachment`, `is_filepath`, `is_object`, `is_external` flags - Removed - ---- - -## Architecture Summary - -``` -Layer 3: AttributeTypes (user-facing) - , , , , , , - ↓ encode() / ↑ decode() - -Layer 2: Core DataJoint Types - float32, int64, uuid, json, blob, varchar(n), etc. - ↓ SQL mapping - -Layer 1: Native Database Types - FLOAT, BIGINT, BINARY(16), JSON, LONGBLOB, VARCHAR(n), etc. 
-``` - -**Built-in AttributeTypes:** -``` - → longblob (internal serialized storage) - → longblob (internal file attachment) - → json (path-addressed, for Zarr/HDF5/folders) - → json (reference to existing file in store) - → json (content-addressed with deduplication) - → json (external serialized with dedup) - → json (external file attachment with dedup) -``` - -**Type Composition Example:** -``` - → json (in DB) - -Insert: Python object → blob.pack() → put_content() → JSON metadata -Fetch: JSON metadata → get_content() → blob.unpack() → Python object -``` From 40d787199b6066a28539a338180ecaccca71106c Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Thu, 1 Jan 2026 15:01:09 -0600 Subject: [PATCH 18/32] Fix type pattern conflicts and BLOB reference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename DECIMAL to NUMERIC in native types (decimal is in core types) - Rename TEXT to NATIVE_TEXT (text is in core types) - Change BLOB references to BYTES in heading.py (bytes is the core type name) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/datajoint/declare.py | 4 ++-- src/datajoint/heading.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/datajoint/declare.py b/src/datajoint/declare.py index e3bbb96fd..82ed02b15 100644 --- a/src/datajoint/declare.py +++ b/src/datajoint/declare.py @@ -70,12 +70,12 @@ **{name.upper(): pattern for name, (pattern, _) in CORE_TYPES.items()}, # Native SQL types (passthrough with warning for non-standard use) INTEGER=r"((tiny|small|medium|big|)int|integer)(\s*\(.+\))?(\s+unsigned)?(\s+auto_increment)?|serial$", - DECIMAL=r"(decimal|numeric)(\s*\(.+\))?(\s+unsigned)?$", + NUMERIC=r"numeric(\s*\(.+\))?(\s+unsigned)?$", # numeric is SQL alias, use decimal instead FLOAT=r"(double|float|real)(\s*\(.+\))?(\s+unsigned)?$", STRING=r"(var)?char\s*\(.+\)$", # Catches char/varchar not matched by core types TEMPORAL=r"(time|timestamp|year)(\s*\(.+\))?$", # time, timestamp, year (not date/datetime) NATIVE_BLOB=r"(tiny|small|medium|long)blob$", # Specific blob variants - TEXT=r"(tiny|small|medium|long)?text$", # Text types + NATIVE_TEXT=r"(tiny|small|medium|long)text$", # Text variants (use plain 'text' instead) # AttributeTypes use angle brackets ADAPTED=r"<.+>$", ).items() diff --git a/src/datajoint/heading.py b/src/datajoint/heading.py index 78b6af779..5c4482f3e 100644 --- a/src/datajoint/heading.py +++ b/src/datajoint/heading.py @@ -283,7 +283,7 @@ def _init_from_database(self): autoincrement=bool(re.search(r"auto_increment", attr["Extra"], flags=re.I)), numeric=any(TYPE_PATTERN[t].match(attr["type"]) for t in ("DECIMAL", "INTEGER", "FLOAT")), string=any(TYPE_PATTERN[t].match(attr["type"]) for t in ("ENUM", "TEMPORAL", "STRING")), - is_blob=any(TYPE_PATTERN[t].match(attr["type"]) for t in ("BLOB", "NATIVE_BLOB")), + is_blob=any(TYPE_PATTERN[t].match(attr["type"]) for t in ("BYTES", "NATIVE_BLOB")), uuid=False, json=bool(TYPE_PATTERN["JSON"].match(attr["type"])), adapter=None, @@ -329,8 +329,8 @@ def _init_from_database(self): attr["type"] = attr["adapter"].dtype if not any(r.match(attr["type"]) for r in TYPE_PATTERN.values()): raise DataJointError(f"Invalid dtype '{attr['type']}' in attribute type <{adapter_name}>.") - # Update is_blob based on resolved dtype (check both BLOB and NATIVE_BLOB patterns) - attr["is_blob"] = any(TYPE_PATTERN[t].match(attr["type"]) for t in ("BLOB", "NATIVE_BLOB")) + # Update is_blob based on resolved dtype 
(check both BYTES and NATIVE_BLOB patterns) + attr["is_blob"] = any(TYPE_PATTERN[t].match(attr["type"]) for t in ("BYTES", "NATIVE_BLOB")) # Handle core type aliases (uuid, float32, etc.) if special: From 5cb5ae4f6493b1928f0c4ae85f6a7f90fca246b5 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 1 Jan 2026 21:09:52 +0000 Subject: [PATCH 19/32] Rename AttributeTypes to Codec Types in documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Terminology changes in spec and user docs: - "AttributeTypes" → "Codec Types" (category name) - "AttributeType" → "Codec" (base class) - "@register_type" → "@dj.codec" (decorator) - "type_name" → "name" (class attribute) The term "Codec" better conveys the encode/decode semantics of these types, drawing on the familiar audio/video codec analogy. Code changes (class renaming, backward-compat aliases) to follow. Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/attributes.md | 12 +-- docs/src/design/tables/storage-types-spec.md | 92 ++++++++++---------- 2 files changed, 52 insertions(+), 52 deletions(-) diff --git a/docs/src/design/tables/attributes.md b/docs/src/design/tables/attributes.md index 2120099d1..e122253ef 100644 --- a/docs/src/design/tables/attributes.md +++ b/docs/src/design/tables/attributes.md @@ -76,10 +76,10 @@ for portable pipelines. Using native types will generate a warning. See the [storage types spec](storage-types-spec.md) for complete mappings. -## AttributeTypes (special datatypes) +## Codec types (special datatypes) -AttributeTypes provide `encode()`/`decode()` semantics for complex data that doesn't -fit native database types. They are denoted with angle brackets: ``. +Codecs provide `encode()`/`decode()` semantics for complex data that doesn't +fit native database types. They are denoted with angle brackets: ``. ### Storage mode: `@` convention @@ -90,7 +90,7 @@ The `@` character indicates **external storage** (object store vs database): - **`@` alone**: Use default store - e.g., `` - **`@name`**: Use named store - e.g., `` -### Built-in AttributeTypes +### Built-in codecs **Serialization types** - for Python objects: @@ -123,9 +123,9 @@ The `@` character indicates **external storage** (object store vs database): - ``: Reference to existing file in a configured store. No file copying occurs. Returns `ObjectRef` for lazy access. External only. See [filepath](filepath.md). -### User-defined AttributeTypes +### User-defined codecs -- ``: Define your own [custom attribute type](customtype.md) with +- ``: Define your own [custom codec](customtype.md) with bidirectional conversion between Python objects and database storage. Use for graphs, domain-specific objects, or custom data structures. diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index f0e28a704..9493f295b 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -6,13 +6,13 @@ This document defines a three-layer type architecture: 1. **Native database types** - Backend-specific (`FLOAT`, `TINYINT UNSIGNED`, `LONGBLOB`). Discouraged for direct use. 2. **Core DataJoint types** - Standardized across backends, scientist-friendly (`float32`, `uint8`, `bool`, `json`). -3. **AttributeTypes** - Programmatic types with `encode()`/`decode()` semantics. Composable. +3. **Codec Types** - Programmatic types with `encode()`/`decode()` semantics. Composable. 
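+
+For illustration, a single table definition can combine Layer 2 core types with a
+Layer 3 codec type (a minimal sketch; the table, its attributes, and the `cold` store
+are assumed, not part of this spec):
+
+```python
+@schema
+class Recording(dj.Manual):
+    definition = """
+    recording_id  : uint32        # core type (Layer 2)
+    ---
+    sampling_rate : float32       # core type (Layer 2)
+    notes         : varchar(255)  # core type (Layer 2)
+    waveforms     : <blob@cold>   # codec type (Layer 3), external storage in store 'cold'
+    """
+```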
``` ┌───────────────────────────────────────────────────────────────────┐ -│ AttributeTypes (Layer 3) │ +│ Codec Types (Layer 3) │ │ │ -│ Built-in: │ +│ Built-in: │ │ User: ... │ ├───────────────────────────────────────────────────────────────────┤ │ Core DataJoint Types (Layer 2) │ @@ -31,7 +31,7 @@ This document defines a three-layer type architecture: **Syntax distinction:** - Core types: `int32`, `float64`, `varchar(255)` - no brackets -- AttributeTypes: ``, ``, `` - angle brackets +- Codec types: ``, ``, `` - angle brackets - The `@` character indicates external storage (object store vs database) ### OAS Storage Regions @@ -106,7 +106,7 @@ created_at : datetime = CURRENT_TIMESTAMP ### Binary Types -The core `bytes` type stores raw bytes without any serialization. Use `` AttributeType +The core `bytes` type stores raw bytes without any serialization. Use the `` codec for serialized Python objects. | Core Type | Description | MySQL | PostgreSQL | @@ -193,25 +193,25 @@ definitions. This ensures consistent behavior across all tables and simplifies p - **No per-column overrides**: `CHARACTER SET` and `COLLATE` are rejected in type definitions - **Like timezone**: Encoding is infrastructure configuration, not part of the data model -## AttributeTypes (Layer 3) +## Codec Types (Layer 3) -AttributeTypes provide `encode()`/`decode()` semantics on top of core types. They are +Codec types provide `encode()`/`decode()` semantics on top of core types. They are composable and can be built-in or user-defined. ### Storage Mode: `@` Convention -The `@` character in AttributeType syntax indicates **external storage** (object store): +The `@` character in codec syntax indicates **external storage** (object store): - **No `@`**: Internal storage (database) - e.g., ``, `` - **`@` present**: External storage (object store) - e.g., ``, `` - **`@` alone**: Use default store - e.g., `` - **`@name`**: Use named store - e.g., `` -Some types support both modes (``, ``), others are external-only (``, ``, ``). +Some codecs support both modes (``, ``), others are external-only (``, ``, ``). -### Type Resolution and Chaining +### Codec Resolution and Chaining -AttributeTypes resolve to core types through chaining. The `get_dtype(is_external)` method +Codecs resolve to core types through chaining. The `get_dtype(is_external)` method returns the appropriate dtype based on storage mode: ``` @@ -233,7 +233,7 @@ Resolution at declaration time: ### `` / `` - Path-Addressed Storage -**Built-in AttributeType. External only.** +**Built-in codec. External only.** OAS (Object-Augmented Schema) storage for files and folders: @@ -257,9 +257,9 @@ class Analysis(dj.Computed): #### Implementation ```python -class ObjectType(AttributeType): +class ObjectCodec(dj.Codec): """Path-addressed OAS storage. External only.""" - type_name = "object" + name = "object" def get_dtype(self, is_external: bool) -> str: if not is_external: @@ -278,7 +278,7 @@ class ObjectType(AttributeType): ### `` / `` - Hash-Addressed Storage -**Built-in AttributeType. External only.** +**Built-in codec. External only.** Hash-addressed storage with deduplication: @@ -303,9 +303,9 @@ store_root/ #### Implementation ```python -class HashType(AttributeType): +class HashCodec(dj.Codec): """Hash-addressed storage. External only.""" - type_name = "hash" + name = "hash" def get_dtype(self, is_external: bool) -> str: if not is_external: @@ -346,7 +346,7 @@ features JSONB NOT NULL ### `` - Portable External Reference -**Built-in AttributeType. 
External only (store required).** +**Built-in codec. External only (store required).** Relative path references within configured stores: @@ -397,9 +397,9 @@ just use `varchar`. A string is simpler and more transparent. #### Implementation ```python -class FilepathType(AttributeType): +class FilepathCodec(dj.Codec): """Store-relative file references. External only.""" - type_name = "filepath" + name = "filepath" def get_dtype(self, is_external: bool) -> str: if not is_external: @@ -452,12 +452,12 @@ column_name JSONB NOT NULL ``` The `json` database type: -- Used as dtype by built-in AttributeTypes (``, ``, ``) +- Used as dtype by built-in codecs (``, ``, ``) - Stores arbitrary JSON-serializable data - Automatically uses appropriate type for database backend - Supports JSON path queries where available -## Built-in AttributeTypes +## Built-in Codecs ### `` / `` - Serialized Python Objects @@ -471,10 +471,10 @@ blob format. Compatible with MATLAB. - **``**: Stored in specific named store ```python -@dj.register_type -class BlobType(AttributeType): +@dj.codec +class BlobCodec(dj.Codec): """Serialized Python objects. Supports internal and external.""" - type_name = "blob" + name = "blob" def get_dtype(self, is_external: bool) -> str: return "" if is_external else "bytes" @@ -511,10 +511,10 @@ Stores files with filename preserved. On fetch, extracts to configured download - **``**: Stored in specific named store ```python -@dj.register_type -class AttachType(AttributeType): +@dj.codec +class AttachCodec(dj.Codec): """File attachment with filename. Supports internal and external.""" - type_name = "attach" + name = "attach" def get_dtype(self, is_external: bool) -> str: return "" if is_external else "bytes" @@ -543,15 +543,15 @@ class Attachments(dj.Manual): """ ``` -## User-Defined AttributeTypes +## User-Defined Codecs -Users can define custom AttributeTypes for domain-specific data: +Users can define custom codecs for domain-specific data: ```python -@dj.register_type -class GraphType(AttributeType): +@dj.codec +class GraphCodec(dj.Codec): """Store NetworkX graphs. Internal only (no external support).""" - type_name = "graph" + name = "graph" def get_dtype(self, is_external: bool) -> str: if is_external: @@ -568,13 +568,13 @@ class GraphType(AttributeType): return G ``` -Custom types can support both modes by returning different dtypes: +Custom codecs can support both modes by returning different dtypes: ```python -@dj.register_type -class ImageType(AttributeType): +@dj.codec +class ImageCodec(dj.Codec): """Store images. Supports both internal and external.""" - type_name = "image" + name = "image" def get_dtype(self, is_external: bool) -> str: return "" if is_external else "bytes" @@ -632,7 +632,7 @@ def garbage_collect(store_name): store.delete(hash_path(hash_id)) ``` -## Built-in AttributeType Comparison +## Built-in Codec Comparison | Feature | `` | `` | `` | `` | `` | |---------|----------|------------|-------------|--------------|---------------| @@ -658,13 +658,13 @@ def garbage_collect(store_name): 1. **Three-layer architecture**: - Layer 1: Native database types (backend-specific, discouraged) - Layer 2: Core DataJoint types (standardized, scientist-friendly) - - Layer 3: AttributeTypes (encode/decode, composable) + - Layer 3: Codec types (encode/decode, composable) 2. **Core types are scientist-friendly**: `float32`, `uint8`, `bool`, `bytes` instead of `FLOAT`, `TINYINT UNSIGNED`, `LONGBLOB` -3. **AttributeTypes use angle brackets**: ``, ``, `` - distinguishes from core types +3. 
**Codecs use angle brackets**: ``, ``, `` - distinguishes from core types 4. **`@` indicates external storage**: No `@` = database, `@` present = object store -5. **`get_dtype(is_external)` method**: Types resolve dtype at declaration time based on storage mode -6. **AttributeTypes are composable**: `` uses ``, which uses `json` -7. **Built-in external types use JSON dtype**: Stores metadata (path, hash, store name, etc.) +5. **`get_dtype(is_external)` method**: Codecs resolve dtype at declaration time based on storage mode +6. **Codecs are composable**: `` uses ``, which uses `json` +7. **Built-in external codecs use JSON dtype**: Stores metadata (path, hash, store name, etc.) 8. **Two OAS regions**: object (PK-addressed) and hash (hash-addressed) within managed stores 9. **Filepath for portability**: `` uses relative paths within stores for environment portability 10. **No `uri` type**: For arbitrary URLs, use `varchar`—simpler and more transparent @@ -673,9 +673,9 @@ def garbage_collect(store_name): - No `@` = internal storage (database) - `@` alone = default store - `@name` = named store -12. **Dual-mode types**: `` and `` support both internal and external storage -13. **External-only types**: ``, ``, `` require `@` -14. **Transparent access**: AttributeTypes return Python objects or file paths +12. **Dual-mode codecs**: `` and `` support both internal and external storage +13. **External-only codecs**: ``, ``, `` require `@` +14. **Transparent access**: Codecs return Python objects or file paths 15. **Lazy access**: `` and `` return ObjectRef 16. **MD5 for content hashing**: See [Hash Algorithm Choice](#hash-algorithm-choice) below 17. **No separate registry**: Hash metadata stored in JSON columns, not a separate table From 2dde3d9409f83a133ba290ce90853ec7b2c6b8e2 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 1 Jan 2026 21:18:42 +0000 Subject: [PATCH 20/32] Add Codec base class with __init_subclass__ auto-registration Design improvements for Python 3.10+: - Codecs auto-register when subclassed via __init_subclass__ - No decorator needed - just inherit from dj.Codec and set name - Use register=False for abstract base classes - Removed @dj.codec decorator from all examples New API: class GraphCodec(dj.Codec): name = "graph" def encode(...): ... def decode(...): ... Abstract bases: class ExternalOnlyCodec(dj.Codec, register=False): ... Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 119 ++++++++++++++++++- 1 file changed, 115 insertions(+), 4 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 9493f295b..f7aead7de 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -209,6 +209,119 @@ The `@` character in codec syntax indicates **external storage** (object store): Some codecs support both modes (``, ``), others are external-only (``, ``, ``). +### Codec Base Class + +Codecs auto-register when subclassed using Python's `__init_subclass__` mechanism. +No decorator is needed. + +```python +from abc import ABC, abstractmethod +from typing import Any + +# Global codec registry +_codec_registry: dict[str, "Codec"] = {} + + +class Codec(ABC): + """ + Base class for codec types. Subclasses auto-register by name. + + Requires Python 3.10+. 
+ """ + name: str | None = None # Must be set by concrete subclasses + + def __init_subclass__(cls, *, register: bool = True, **kwargs): + """Auto-register concrete codecs when subclassed.""" + super().__init_subclass__(**kwargs) + + if not register: + return # Skip registration for abstract bases + + if cls.name is None: + return # Skip registration if no name (abstract) + + if cls.name in _codec_registry: + existing = _codec_registry[cls.name] + if type(existing) is not cls: + raise DataJointError( + f"Codec <{cls.name}> already registered by {type(existing).__name__}" + ) + return # Same class, idempotent + + _codec_registry[cls.name] = cls() + + def get_dtype(self, is_external: bool) -> str: + """ + Return the storage dtype for this codec. + + Args: + is_external: True if @ modifier present (external storage) + + Returns: + A core type (e.g., "bytes", "json") or another codec (e.g., "") + """ + raise NotImplementedError + + @abstractmethod + def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> Any: + """Encode Python value for storage.""" + ... + + @abstractmethod + def decode(self, stored: Any, *, key: dict | None = None) -> Any: + """Decode stored value back to Python.""" + ... + + def validate(self, value: Any) -> None: + """Optional validation before encoding. Override to add constraints.""" + pass + + +def list_codecs() -> list[str]: + """Return list of registered codec names.""" + return sorted(_codec_registry.keys()) + + +def get_codec(name: str) -> Codec: + """Get codec by name. Raises DataJointError if not found.""" + if name not in _codec_registry: + raise DataJointError(f"Unknown codec: <{name}>") + return _codec_registry[name] +``` + +**Usage - no decorator needed:** + +```python +class GraphCodec(dj.Codec): + """Auto-registered as .""" + name = "graph" + + def get_dtype(self, is_external: bool) -> str: + return "" + + def encode(self, graph, *, key=None, store_name=None): + return {'nodes': list(graph.nodes()), 'edges': list(graph.edges())} + + def decode(self, stored, *, key=None): + import networkx as nx + G = nx.Graph() + G.add_nodes_from(stored['nodes']) + G.add_edges_from(stored['edges']) + return G +``` + +**Skip registration for abstract bases:** + +```python +class ExternalOnlyCodec(dj.Codec, register=False): + """Abstract base for external-only codecs. Not registered.""" + + def get_dtype(self, is_external: bool) -> str: + if not is_external: + raise DataJointError(f"<{self.name}> requires @ (external only)") + return "json" +``` + ### Codec Resolution and Chaining Codecs resolve to core types through chaining. The `get_dtype(is_external)` method @@ -471,7 +584,6 @@ blob format. Compatible with MATLAB. - **``**: Stored in specific named store ```python -@dj.codec class BlobCodec(dj.Codec): """Serialized Python objects. Supports internal and external.""" name = "blob" @@ -511,7 +623,6 @@ Stores files with filename preserved. On fetch, extracts to configured download - **``**: Stored in specific named store ```python -@dj.codec class AttachCodec(dj.Codec): """File attachment with filename. Supports internal and external.""" name = "attach" @@ -548,7 +659,6 @@ class Attachments(dj.Manual): Users can define custom codecs for domain-specific data: ```python -@dj.codec class GraphCodec(dj.Codec): """Store NetworkX graphs. 
Internal only (no external support).""" name = "graph" @@ -562,6 +672,7 @@ class GraphCodec(dj.Codec): return {'nodes': list(graph.nodes()), 'edges': list(graph.edges())} def decode(self, stored, *, key=None): + import networkx as nx G = nx.Graph() G.add_nodes_from(stored['nodes']) G.add_edges_from(stored['edges']) @@ -571,7 +682,6 @@ class GraphCodec(dj.Codec): Custom codecs can support both modes by returning different dtypes: ```python -@dj.codec class ImageCodec(dj.Codec): """Store images. Supports both internal and external.""" name = "image" @@ -679,6 +789,7 @@ def garbage_collect(store_name): 15. **Lazy access**: `` and `` return ObjectRef 16. **MD5 for content hashing**: See [Hash Algorithm Choice](#hash-algorithm-choice) below 17. **No separate registry**: Hash metadata stored in JSON columns, not a separate table +18. **Auto-registration via `__init_subclass__`**: Codecs register automatically when subclassed—no decorator needed. Use `register=False` for abstract bases. Requires Python 3.10+. ### Hash Algorithm Choice From 8218fa728b3384b8fb243940da62537f1fae6c07 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 1 Jan 2026 21:30:46 +0000 Subject: [PATCH 21/32] Update Codec API with get_dtype(is_external) method - Codec.get_dtype(is_external) now determines storage type based on whether @ modifier is present in the declaration - BlobCodec returns "bytes" for internal, "" for external - AttachCodec returns "bytes" for internal, "" for external - HashCodec, ObjectCodec, FilepathCodec enforce external-only usage - Consolidates / and / into unified codecs - Adds backward compatibility aliases for old type names - Updates __init__.py with new codec exports (Codec, list_codecs, get_codec) --- src/datajoint/__init__.py | 14 +- src/datajoint/attribute_type.py | 484 +++++++++++++++++--------------- src/datajoint/builtin_types.py | 396 +++++++++----------------- 3 files changed, 410 insertions(+), 484 deletions(-) diff --git a/src/datajoint/__init__.py b/src/datajoint/__init__.py index a19aae6d0..f3744f7d6 100644 --- a/src/datajoint/__init__.py +++ b/src/datajoint/__init__.py @@ -45,6 +45,11 @@ "kill", "MatCell", "MatStruct", + # New codec API + "Codec", + "list_codecs", + "get_codec", + # Backward compatibility aliases "AttributeType", "register_type", "list_types", @@ -61,7 +66,14 @@ from . import errors from . import migrate from .admin import kill -from .attribute_type import AttributeType, list_types, register_type +from .attribute_type import ( + AttributeType, + Codec, + get_codec, + list_codecs, + list_types, + register_type, +) from .blob import MatCell, MatStruct from .cli import cli from .connection import Connection, conn diff --git a/src/datajoint/attribute_type.py b/src/datajoint/attribute_type.py index 37fae88ca..75732b680 100644 --- a/src/datajoint/attribute_type.py +++ b/src/datajoint/attribute_type.py @@ -1,23 +1,28 @@ """ -Custom attribute type system for DataJoint. +Codec type system for DataJoint. -This module provides the AttributeType base class and registration mechanism -for creating custom data types that extend DataJoint's native type system. +This module provides the Codec base class for creating custom data types +that extend DataJoint's native type system. Codecs provide encode/decode +semantics for complex Python objects. -Custom types enable seamless integration of complex Python objects (like NumPy arrays, -graphs, or domain-specific structures) with DataJoint's relational storage. +Codecs auto-register when subclassed - no decorator needed (Python 3.10+). 
Example: - @dj.register_type - class GraphType(dj.AttributeType): - type_name = "graph" - dtype = "longblob" + class GraphCodec(dj.Codec): + name = "graph" - def encode(self, graph: nx.Graph) -> list: - return list(graph.edges) + def get_dtype(self, is_external: bool) -> str: + return "" - def decode(self, edges: list) -> nx.Graph: - return nx.Graph(edges) + def encode(self, graph, *, key=None, store_name=None): + return {'nodes': list(graph.nodes()), 'edges': list(graph.edges())} + + def decode(self, stored, *, key=None): + import networkx as nx + G = nx.Graph() + G.add_nodes_from(stored['nodes']) + G.add_edges_from(stored['edges']) + return G # Then use in table definitions: class MyTable(dj.Manual): @@ -31,47 +36,46 @@ class MyTable(dj.Manual): from __future__ import annotations import logging +import warnings from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any +from typing import Any from .errors import DataJointError -if TYPE_CHECKING: - pass - logger = logging.getLogger(__name__.split(".")[0]) -# Global type registry - maps type_name to AttributeType instance -_type_registry: dict[str, AttributeType] = {} +# Global codec registry - maps name to Codec instance +_codec_registry: dict[str, Codec] = {} _entry_points_loaded: bool = False -class AttributeType(ABC): +class Codec(ABC): """ - Base class for custom DataJoint attribute types. + Base class for codec types. Subclasses auto-register by name. - Subclass this to create custom types that can be used in table definitions - with the ```` syntax. Custom types define bidirectional conversion - between Python objects and DataJoint's storage format. + Requires Python 3.10+. Attributes: - type_name: Unique identifier used in ```` syntax - dtype: Underlying DataJoint storage type + name: Unique identifier used in ```` syntax. Must be set by subclasses. Example: - @dj.register_type - class GraphType(dj.AttributeType): - type_name = "graph" - dtype = "longblob" + class GraphCodec(dj.Codec): + name = "graph" - def encode(self, graph): - return list(graph.edges) + def get_dtype(self, is_external: bool) -> str: + return "" - def decode(self, edges): + def encode(self, graph, *, key=None, store_name=None): + return {'nodes': list(graph.nodes()), 'edges': list(graph.edges())} + + def decode(self, stored, *, key=None): import networkx as nx - return nx.Graph(edges) + G = nx.Graph() + G.add_nodes_from(stored['nodes']) + G.add_edges_from(stored['edges']) + return G - The type can then be used in table definitions:: + The codec can then be used in table definitions:: class Connectivity(dj.Manual): definition = ''' @@ -79,73 +83,79 @@ class Connectivity(dj.Manual): --- graph_data : ''' + + To skip auto-registration (for abstract base classes):: + + class ExternalOnlyCodec(dj.Codec, register=False): + '''Abstract base - not registered.''' + ... """ - @property - @abstractmethod - def type_name(self) -> str: - """ - Unique identifier for this type, used in table definitions as ````. + name: str | None = None # Must be set by concrete subclasses - This name must be unique across all registered types. It should be lowercase - with underscores (e.g., "graph", "zarr_array", "compressed_image"). + def __init_subclass__(cls, *, register: bool = True, **kwargs): + """Auto-register concrete codecs when subclassed.""" + super().__init_subclass__(**kwargs) - Returns: - The type name string without angle brackets. - """ - ... 
+ if not register: + return # Skip registration for abstract bases - @property - @abstractmethod - def dtype(self) -> str: + if cls.name is None: + return # Skip registration if no name (abstract) + + if not isinstance(cls.name, str) or not cls.name: + raise DataJointError(f"Codec name must be a non-empty string, got {cls.name!r}") + + if cls.name in _codec_registry: + existing = _codec_registry[cls.name] + if type(existing) is not cls: + raise DataJointError( + f"Codec <{cls.name}> already registered by " + f"{type(existing).__module__}.{type(existing).__name__}" + ) + return # Same class, idempotent + + _codec_registry[cls.name] = cls() + logger.debug(f"Registered codec <{cls.name}> from {cls.__module__}.{cls.__name__}") + + def get_dtype(self, is_external: bool) -> str: """ - The underlying DataJoint type used for storage. + Return the storage dtype for this codec. - Can be: - - A native type: ``"longblob"``, ``"blob"``, ``"varchar(255)"``, ``"int"``, ``"json"`` - - An external type: ``"blob@store"``, ``"attach@store"`` - - The object type: ``"object"`` - - Another custom type: ``""`` (enables type chaining) + Args: + is_external: True if @ modifier present (external storage) Returns: - The storage type specification string. + A core type (e.g., "bytes", "json") or another codec (e.g., "") + + Raises: + NotImplementedError: If not overridden by subclass. + DataJointError: If external storage not supported but requested. """ - ... + raise NotImplementedError(f"Codec <{self.name}> must implement get_dtype()") @abstractmethod - def encode(self, value: Any, *, key: dict | None = None) -> Any: + def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> Any: """ - Convert a Python object to the storable format. - - Called during INSERT operations to transform user-provided objects - into a format suitable for storage in the underlying ``dtype``. + Encode Python value for storage. Args: value: The Python object to store. - key: Primary key values as a dict. Available when the dtype uses - object storage and may be needed for path construction. + key: Primary key values as a dict. May be needed for path construction. + store_name: Target store name for external storage. Returns: - Value in the format expected by ``dtype``. For example: - - For ``dtype="longblob"``: any picklable Python object - - For ``dtype="object"``: path string or file-like object - - For ``dtype="varchar(N)"``: string + Value in the format expected by the dtype. """ ... @abstractmethod def decode(self, stored: Any, *, key: dict | None = None) -> Any: """ - Convert stored data back to a Python object. - - Called during FETCH operations to reconstruct the original Python - object from the stored format. + Decode stored value back to Python. Args: - stored: Data retrieved from storage. Type depends on ``dtype``: - - For ``"object"``: an ``ObjectRef`` handle - - For blob types: the unpacked Python object - - For native types: the native Python value (str, int, etc.) + stored: Data retrieved from storage. key: Primary key values as a dict. Returns: @@ -170,76 +180,73 @@ def validate(self, value: Any) -> None: """ pass - def default(self) -> Any: - """ - Return a default value for this type. + # ========================================================================= + # Backward compatibility properties + # ========================================================================= - Override if the type has a sensible default value. 
The default - implementation raises NotImplementedError, indicating no default exists. + @property + def type_name(self) -> str | None: + """Backward compatibility alias for `name`.""" + return self.name - Returns: - The default value for this type. + @property + def dtype(self) -> str: + """ + Backward compatibility property. - Raises: - NotImplementedError: If no default exists (the default behavior). + Deprecated: Use get_dtype(is_external) instead. """ - raise NotImplementedError(f"No default value for type <{self.type_name}>") + warnings.warn( + "Codec.dtype property is deprecated. Use get_dtype(is_external) instead.", + DeprecationWarning, + stacklevel=2, + ) + return self.get_dtype(is_external=False) def __repr__(self) -> str: - return f"<{self.__class__.__name__}(type_name={self.type_name!r}, dtype={self.dtype!r})>" + return f"<{self.__class__.__name__}(name={self.name!r})>" + + +# Backward compatibility alias +AttributeType = Codec -def register_type(cls: type[AttributeType]) -> type[AttributeType]: +def register_type(cls: type[Codec]) -> type[Codec]: """ - Register a custom attribute type with DataJoint. + Register a codec with DataJoint. - Can be used as a decorator or called directly. The type becomes available - for use in table definitions with the ```` syntax. + Deprecated: Codecs now auto-register when subclassed. This function + is kept for backward compatibility but is no longer needed. Args: - cls: An AttributeType subclass to register. + cls: A Codec subclass to register. Returns: - The same class, unmodified (allows use as decorator). + The same class, unmodified. + """ + warnings.warn( + "@dj.register_type is deprecated. Codecs auto-register when subclassed. " + "Just inherit from dj.Codec and set the 'name' class attribute.", + DeprecationWarning, + stacklevel=2, + ) - Raises: - DataJointError: If a type with the same name is already registered - by a different class. - TypeError: If cls is not an AttributeType subclass. + if not isinstance(cls, type) or not issubclass(cls, Codec): + raise TypeError(f"register_type requires a Codec subclass, got {cls!r}") - Example: - As a decorator:: + # If already registered via __init_subclass__, this is a no-op + if cls.name and cls.name in _codec_registry: + return cls - @dj.register_type - class GraphType(dj.AttributeType): - type_name = "graph" - ... 
+ # Manual registration for classes that didn't auto-register + if cls.name: + _codec_registry[cls.name] = cls() - Or called directly:: + return cls - dj.register_type(GraphType) - """ - if not isinstance(cls, type) or not issubclass(cls, AttributeType): - raise TypeError(f"register_type requires an AttributeType subclass, got {cls!r}") - - instance = cls() - name = instance.type_name - - if not isinstance(name, str) or not name: - raise DataJointError(f"type_name must be a non-empty string, got {name!r}") - - if name in _type_registry: - existing = _type_registry[name] - if type(existing) is not cls: - raise DataJointError( - f"Type <{name}> is already registered by " f"{type(existing).__module__}.{type(existing).__name__}" - ) - # Same class registered twice - idempotent, no error - return cls - _type_registry[name] = instance - logger.debug(f"Registered attribute type <{name}> from {cls.__module__}.{cls.__name__}") - return cls +# Backward compatibility alias +codec = register_type def parse_type_spec(spec: str) -> tuple[str, str | None]: @@ -247,16 +254,18 @@ def parse_type_spec(spec: str) -> tuple[str, str | None]: Parse a type specification into type name and optional store parameter. Handles formats like: - - "" -> ("xblob", None) - - "" -> ("xblob", "cold") - - "xblob@cold" -> ("xblob", "cold") - - "xblob" -> ("xblob", None) + - "" -> ("blob", None) + - "" -> ("blob", "cold") + - "" -> ("blob", "") # default store + - "blob@cold" -> ("blob", "cold") + - "blob" -> ("blob", None) Args: spec: Type specification string, with or without angle brackets. Returns: - Tuple of (type_name, store_name). store_name is None if not specified. + Tuple of (type_name, store_name). store_name is None if not specified, + empty string if @ present without name (default store). """ # Strip angle brackets spec = spec.strip("<>").strip() @@ -268,97 +277,115 @@ def parse_type_spec(spec: str) -> tuple[str, str | None]: return spec, None -def unregister_type(name: str) -> None: +def unregister_codec(name: str) -> None: """ - Remove a type from the registry. + Remove a codec from the registry. Primarily useful for testing. Use with caution in production code. Args: - name: The type_name to unregister. + name: The codec name to unregister. Raises: - DataJointError: If the type is not registered. + DataJointError: If the codec is not registered. """ name = name.strip("<>") - if name not in _type_registry: - raise DataJointError(f"Type <{name}> is not registered") - del _type_registry[name] + if name not in _codec_registry: + raise DataJointError(f"Codec <{name}> is not registered") + del _codec_registry[name] + + +# Backward compatibility alias +unregister_type = unregister_codec -def get_type(name: str) -> AttributeType: +def get_codec(name: str) -> Codec: """ - Retrieve a registered attribute type by name. + Retrieve a registered codec by name. - Looks up the type in the explicit registry first, then attempts + Looks up the codec in the explicit registry first, then attempts to load from installed packages via entry points. Args: - name: The type name, with or without angle brackets. - Store parameters (e.g., "") are stripped. + name: The codec name, with or without angle brackets. + Store parameters (e.g., "") are stripped. Returns: - The registered AttributeType instance. + The registered Codec instance. Raises: - DataJointError: If the type is not found. + DataJointError: If the codec is not found. 
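+
+    Example (a minimal sketch, assuming the built-in blob codec is registered)::
+
+        codec = get_codec("blob")          # bare name
+        codec = get_codec("<blob@cold>")   # angle brackets and store suffix are stripped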
""" # Strip angle brackets and store parameter type_name, _ = parse_type_spec(name) # Check explicit registry first - if type_name in _type_registry: - return _type_registry[type_name] + if type_name in _codec_registry: + return _codec_registry[type_name] # Lazy-load entry points _load_entry_points() - if type_name in _type_registry: - return _type_registry[type_name] + if type_name in _codec_registry: + return _codec_registry[type_name] raise DataJointError( - f"Unknown attribute type: <{type_name}>. " - f"Ensure the type is registered via @dj.register_type or installed as a package." + f"Unknown codec: <{type_name}>. " + f"Ensure the codec is defined (inherit from dj.Codec with name='{type_name}')." ) -def list_types() -> list[str]: +# Backward compatibility alias +get_type = get_codec + + +def list_codecs() -> list[str]: """ - List all registered type names. + List all registered codec names. Returns: - Sorted list of registered type names. + Sorted list of registered codec names. """ _load_entry_points() - return sorted(_type_registry.keys()) + return sorted(_codec_registry.keys()) + + +# Backward compatibility alias +list_types = list_codecs -def is_type_registered(name: str) -> bool: +def is_codec_registered(name: str) -> bool: """ - Check if a type name is registered. + Check if a codec name is registered. Args: - name: The type name to check (store parameters are ignored). + name: The codec name to check (store parameters are ignored). Returns: - True if the type is registered. + True if the codec is registered. """ type_name, _ = parse_type_spec(name) - if type_name in _type_registry: + if type_name in _codec_registry: return True _load_entry_points() - return type_name in _type_registry + return type_name in _codec_registry + + +# Backward compatibility alias +is_type_registered = is_codec_registered def _load_entry_points() -> None: """ - Load attribute types from installed packages via entry points. + Load codecs from installed packages via entry points. + + Codecs are discovered from the ``datajoint.codecs`` entry point group + (also checks legacy ``datajoint.types`` for backward compatibility). - Types are discovered from the ``datajoint.types`` entry point group. - Packages declare types in pyproject.toml:: + Packages declare codecs in pyproject.toml:: - [project.entry-points."datajoint.types"] - zarr_array = "dj_zarr:ZarrArrayType" + [project.entry-points."datajoint.codecs"] + zarr_array = "dj_zarr:ZarrArrayCodec" This function is idempotent - entry points are only loaded once. 
""" @@ -371,89 +398,99 @@ def _load_entry_points() -> None: try: from importlib.metadata import entry_points except ImportError: - # Python < 3.10 fallback - try: - from importlib_metadata import entry_points - except ImportError: - logger.debug("importlib.metadata not available, skipping entry point discovery") - return + logger.debug("importlib.metadata not available, skipping entry point discovery") + return - try: - # Python 3.10+ / importlib_metadata 3.6+ - eps = entry_points(group="datajoint.types") - except TypeError: - # Older API - eps = entry_points().get("datajoint.types", []) - - for ep in eps: - if ep.name in _type_registry: - # Already registered explicitly, skip entry point - continue + # Load from both new and legacy entry point groups + for group in ("datajoint.codecs", "datajoint.types"): try: - type_class = ep.load() - register_type(type_class) - logger.debug(f"Loaded attribute type <{ep.name}> from entry point {ep.value}") - except Exception as e: - logger.warning(f"Failed to load attribute type '{ep.name}' from {ep.value}: {e}") + eps = entry_points(group=group) + except TypeError: + # Older API fallback + eps = entry_points().get(group, []) + + for ep in eps: + if ep.name in _codec_registry: + # Already registered explicitly, skip entry point + continue + try: + codec_class = ep.load() + # The class should auto-register via __init_subclass__ + # But if it's an old-style class, manually register + if ep.name not in _codec_registry and hasattr(codec_class, 'name'): + _codec_registry[ep.name] = codec_class() + logger.debug(f"Loaded codec <{ep.name}> from entry point {ep.value}") + except Exception as e: + logger.warning(f"Failed to load codec '{ep.name}' from {ep.value}: {e}") def resolve_dtype( dtype: str, seen: set[str] | None = None, store_name: str | None = None -) -> tuple[str, list[AttributeType], str | None]: +) -> tuple[str, list[Codec], str | None]: """ - Resolve a dtype string, following type chains. + Resolve a dtype string, following codec chains. - If dtype references another custom type (e.g., ""), recursively + If dtype references another codec (e.g., ""), recursively resolves to find the ultimate storage type. Store parameters are propagated through the chain. Args: - dtype: The dtype string to resolve (e.g., "", "", "longblob"). - seen: Set of already-seen type names (for cycle detection). + dtype: The dtype string to resolve (e.g., "", "", "bytes"). + seen: Set of already-seen codec names (for cycle detection). store_name: Store name from outer type specification (propagated inward). Returns: - Tuple of (final_storage_type, list_of_types_in_chain, resolved_store_name). - The chain is ordered from outermost to innermost type. + Tuple of (final_storage_type, list_of_codecs_in_chain, resolved_store_name). + The chain is ordered from outermost to innermost codec. Raises: DataJointError: If a circular type reference is detected. 
Examples: - >>> resolve_dtype("") - ("json", [XBlobType, ContentType], None) + >>> resolve_dtype("") + ("bytes", [BlobCodec], None) - >>> resolve_dtype("") - ("json", [XBlobType, ContentType], "cold") + >>> resolve_dtype("") + ("", [BlobCodec], "cold") # BlobCodec.get_dtype(True) returns "" - >>> resolve_dtype("longblob") - ("longblob", [], None) + >>> resolve_dtype("bytes") + ("bytes", [], None) """ if seen is None: seen = set() - chain: list[AttributeType] = [] + chain: list[Codec] = [] - # Check if dtype is a custom type reference + # Check if dtype is a codec reference if dtype.startswith("<") and dtype.endswith(">"): type_name, dtype_store = parse_type_spec(dtype) # Store from this level overrides inherited store - effective_store = dtype_store if dtype_store is not None else store_name + # Empty string means default store (@), None means no store specified + if dtype_store is not None: + effective_store = dtype_store + else: + effective_store = store_name if type_name in seen: - raise DataJointError(f"Circular type reference detected: <{type_name}>") + raise DataJointError(f"Circular codec reference detected: <{type_name}>") seen.add(type_name) - attr_type = get_type(type_name) - chain.append(attr_type) + codec = get_codec(type_name) + chain.append(codec) + + # Determine if external based on whether @ is present + is_external = effective_store is not None + + # Get the inner dtype from the codec + inner_dtype = codec.get_dtype(is_external) # Recursively resolve the inner dtype, propagating store - inner_dtype, inner_chain, resolved_store = resolve_dtype(attr_type.dtype, seen, effective_store) + final_dtype, inner_chain, resolved_store = resolve_dtype(inner_dtype, seen, effective_store) chain.extend(inner_chain) - return inner_dtype, chain, resolved_store + return final_dtype, chain, resolved_store - # Not a custom type - check if it has a store suffix (e.g., "blob@store") + # Not a codec - check if it has a store suffix (e.g., "blob@store") if "@" in dtype: base_type, dtype_store = dtype.split("@", 1) effective_store = dtype_store if dtype_store else store_name @@ -463,35 +500,38 @@ def resolve_dtype( return dtype, chain, store_name -def get_adapter(context: dict | None, adapter_name: str) -> tuple[AttributeType, str | None]: +def get_adapter(context: dict | None, adapter_name: str) -> tuple[Codec, str | None]: """ - Get an attribute type by name. + Get a codec by name. This is a compatibility function used by heading and declare modules. Args: context: Ignored (legacy parameter, kept for API compatibility). - adapter_name: The type name, with or without angle brackets. - May include store parameter (e.g., ""). + adapter_name: The codec name, with or without angle brackets. + May include store parameter (e.g., ""). Returns: - Tuple of (AttributeType instance, store_name or None). + Tuple of (Codec instance, store_name or None). Raises: - DataJointError: If the type is not found. + DataJointError: If the codec is not found. """ type_name, store_name = parse_type_spec(adapter_name) - if is_type_registered(type_name): - return get_type(type_name), store_name + if is_codec_registered(type_name): + return get_codec(type_name), store_name - raise DataJointError(f"Attribute type <{type_name}> is not registered. " "Use @dj.register_type to register custom types.") + raise DataJointError( + f"Codec <{type_name}> is not registered. " + "Define a Codec subclass with name='{type_name}'." 
+ ) # ============================================================================= -# Auto-register built-in types +# Auto-register built-in codecs # ============================================================================= -# Import builtin_types module to register built-in types (DJBlobType, ContentType, etc.) -# This import has a side effect: it registers the types via @register_type decorators +# Import builtin_types module to register built-in codecs +# This import has a side effect: it registers the codecs via __init_subclass__ from . import builtin_types as _builtin_types # noqa: F401, E402 diff --git a/src/datajoint/builtin_types.py b/src/datajoint/builtin_types.py index 3c1654a61..611042348 100644 --- a/src/datajoint/builtin_types.py +++ b/src/datajoint/builtin_types.py @@ -1,31 +1,30 @@ """ -Built-in DataJoint attribute types. +Built-in DataJoint codecs. -This module defines the standard AttributeTypes that ship with DataJoint. -These serve as both useful built-in types and as examples for users who -want to create their own custom types. +This module defines the standard codecs that ship with DataJoint. +These serve as both useful built-in codecs and as examples for users who +want to create their own custom codecs. -Built-in Types: - - ````: Serialize Python objects to DataJoint's blob format (internal storage) - - ````: Content-addressed storage with SHA256 deduplication - - ````: External serialized blobs using content-addressed storage +Built-in Codecs: + - ````: Serialize Python objects (internal) or external with dedup + - ````: Hash-addressed storage with MD5 deduplication - ````: Path-addressed storage for files/folders (Zarr, HDF5) - - ````: Internal file attachment stored in database - - ````: External file attachment with deduplication + - ````: File attachment (internal) or external with dedup - ````: Reference to existing file in store -Example - Creating a Custom Type: - Here's how to define your own AttributeType, modeled after the built-in types:: +Example - Creating a Custom Codec: + Here's how to define your own codec, modeled after the built-in codecs:: import datajoint as dj import networkx as nx - @dj.register_type - class GraphType(dj.AttributeType): + class GraphCodec(dj.Codec): '''Store NetworkX graphs as edge lists.''' - type_name = "graph" # Use as in definitions - dtype = "" # Compose with djblob for serialization + name = "graph" # Use as in definitions + + def get_dtype(self, is_external: bool) -> str: + return "" # Compose with blob for serialization def encode(self, graph, *, key=None, store_name=None): # Convert graph to a serializable format @@ -59,22 +58,26 @@ class Networks(dj.Manual): from typing import Any -from .attribute_type import AttributeType, register_type +from .attribute_type import Codec +from .errors import DataJointError # ============================================================================= -# DJBlob Types - DataJoint's native serialization +# Blob Codec - DataJoint's native serialization # ============================================================================= -@register_type -class DJBlobType(AttributeType): +class BlobCodec(Codec): """ Serialize Python objects using DataJoint's blob format. - The ```` type handles serialization of arbitrary Python objects + The ```` codec handles serialization of arbitrary Python objects including NumPy arrays, dictionaries, lists, datetime objects, and UUIDs. - Data is stored in a MySQL ``LONGBLOB`` column. 
+ + Supports both internal and external storage: + - ````: Stored in database (bytes → LONGBLOB) + - ````: Stored externally via ```` with deduplication + - ````: Stored in specific named store Format Features: - Protocol headers (``mYm`` for MATLAB-compatible, ``dj0`` for Python-native) @@ -88,19 +91,20 @@ class ProcessedData(dj.Manual): definition = ''' data_id : int --- - results : # Serialized Python objects + small_result : # internal (in database) + large_result : # external (default store) + archive : # external (specific store) ''' # Insert any serializable object - table.insert1({'data_id': 1, 'results': {'scores': [0.9, 0.8], 'labels': ['a', 'b']}}) - - Note: - Plain ``longblob`` columns store raw bytes without serialization. - Use ```` when you need automatic serialization. + table.insert1({'data_id': 1, 'small_result': {'scores': [0.9, 0.8]}}) """ - type_name = "djblob" - dtype = "longblob" + name = "blob" + + def get_dtype(self, is_external: bool) -> str: + """Return bytes for internal, for external storage.""" + return "" if is_external else "bytes" def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes: """Serialize a Python object to DataJoint's blob format.""" @@ -115,23 +119,28 @@ def decode(self, stored: bytes, *, key: dict | None = None) -> Any: return blob.unpack(stored, squeeze=False) +# Backward compatibility alias +DJBlobType = BlobCodec + + # ============================================================================= -# Content-Addressed Storage Types +# Hash-Addressed Storage Codec # ============================================================================= -@register_type -class ContentType(AttributeType): +class HashCodec(Codec): """ - Content-addressed storage with SHA256 deduplication. + Hash-addressed storage with MD5 deduplication. - The ```` type stores raw bytes using content-addressed storage. - Data is identified by its SHA256 hash and stored in a hierarchical directory: - ``_content/{hash[:2]}/{hash[2:4]}/{hash}`` + The ```` codec stores raw bytes using content-addressed storage. + Data is identified by its MD5 hash and stored in a hierarchical directory: + ``_hash/{hash[:2]}/{hash[2:4]}/{hash}`` The database column stores JSON metadata: ``{hash, store, size}``. Duplicate content is automatically deduplicated. + External only - requires @ modifier. + Example:: @schema @@ -139,20 +148,24 @@ class RawContent(dj.Manual): definition = ''' content_id : int --- - data : + data : ''' # Insert raw bytes table.insert1({'content_id': 1, 'data': b'raw binary content'}) Note: - This type accepts only ``bytes``. For Python objects, use ````. - A store must be specified (e.g., ````) unless a default - store is configured. + This codec accepts only ``bytes``. For Python objects, use ````. + Typically used indirectly via ```` or ```` rather than directly. 
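+
+    Stored metadata (illustrative values; the hash is an MD5 hex digest and the
+    store name is assumed)::
+
+        {"hash": "9e107d9d372bb6826bd81d3542a419d6", "store": "cold", "size": 4096}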
""" - type_name = "content" - dtype = "json" + name = "hash" + + def get_dtype(self, is_external: bool) -> str: + """Hash storage is external only.""" + if not is_external: + raise DataJointError(" requires @ (external storage only)") + return "json" def encode(self, value: bytes, *, key: dict | None = None, store_name: str | None = None) -> dict: """ @@ -166,9 +179,9 @@ def encode(self, value: bytes, *, key: dict | None = None, store_name: str | Non Returns: Metadata dict: {hash, store, size} """ - from .content_registry import put_content + from .hash_registry import put_hash_content - return put_content(value, store_name=store_name) + return put_hash_content(value, store_name=store_name) def decode(self, stored: dict, *, key: dict | None = None) -> bytes: """ @@ -181,82 +194,33 @@ def decode(self, stored: dict, *, key: dict | None = None) -> bytes: Returns: Original bytes. """ - from .content_registry import get_content + from .hash_registry import get_hash_content - return get_content(stored["hash"], store_name=stored.get("store")) + return get_hash_content(stored["hash"], store_name=stored.get("store")) def validate(self, value: Any) -> None: """Validate that value is bytes.""" if not isinstance(value, bytes): - raise TypeError(f" expects bytes, got {type(value).__name__}") + raise TypeError(f" expects bytes, got {type(value).__name__}") -@register_type -class XBlobType(AttributeType): - """ - External serialized blobs with content-addressed storage. - - The ```` type combines DataJoint's blob serialization with - content-addressed storage. Objects are serialized, then stored externally - with automatic deduplication. - - This is ideal for large objects (NumPy arrays, DataFrames) that may be - duplicated across rows. - - Example:: - - @schema - class LargeArrays(dj.Manual): - definition = ''' - array_id : int - --- - data : - ''' - - import numpy as np - table.insert1({'array_id': 1, 'data': np.random.rand(1000, 1000)}) - - Type Composition: - ```` composes with ````:: - - Insert: object → blob.pack() → put_content() → JSON metadata - Fetch: JSON → get_content() → blob.unpack() → object - - Note: - - For internal storage, use ```` - - For raw bytes without serialization, use ```` - """ - - type_name = "xblob" - dtype = "" # Composition: uses ContentType - - def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes: - """Serialize object to bytes (passed to ContentType).""" - from . import blob - - return blob.pack(value, compress=True) - - def decode(self, stored: bytes, *, key: dict | None = None) -> Any: - """Deserialize bytes back to Python object.""" - from . import blob - - return blob.unpack(stored, squeeze=False) +# Backward compatibility alias +ContentType = HashCodec # ============================================================================= -# Path-Addressed Storage Types (OAS - Object-Augmented Schema) +# Path-Addressed Storage Codec (OAS - Object-Augmented Schema) # ============================================================================= -@register_type -class ObjectType(AttributeType): +class ObjectCodec(Codec): """ Path-addressed storage for files and folders. 
- The ```` type provides managed file/folder storage where the path - is derived from the primary key: ``{schema}/{table}/objects/{pk}/{field}_{token}.{ext}`` + The ```` codec provides managed file/folder storage where the path + is derived from the primary key: ``{schema}/{table}/{pk}/{field}/`` - Unlike ```` (content-addressed), each row has its own storage path, + Unlike ```` (hash-addressed), each row has its own storage path, and content is deleted when the row is deleted. This is ideal for: - Zarr arrays (hierarchical chunked data) @@ -264,6 +228,8 @@ class ObjectType(AttributeType): - Complex multi-file outputs - Any content that shouldn't be deduplicated + External only - requires @ modifier. + Example:: @schema @@ -287,26 +253,25 @@ def make(self, key): Storage Structure: Objects are stored at:: - {store_root}/{schema}/{table}/objects/{pk}/{field}_{token}.ext + {store_root}/{schema}/{table}/{pk}/{field}/ - The token ensures uniqueness even if content is replaced. + Comparison with ````:: - Comparison with ````:: - - | Aspect | | | + | Aspect | | | |----------------|-------------------|---------------------| | Addressing | Path (by PK) | Hash (by content) | | Deduplication | No | Yes | | Deletion | With row | GC when unreferenced| | Use case | Zarr, HDF5 | Blobs, attachments | - - Note: - A store must be specified (````) unless a default store - is configured. Returns ``ObjectRef`` on fetch for lazy access. """ - type_name = "object" - dtype = "json" + name = "object" + + def get_dtype(self, is_external: bool) -> str: + """Object storage is external only.""" + if not is_external: + raise DataJointError(" requires @ (external storage only)") + return "json" def encode( self, @@ -335,8 +300,7 @@ def encode( from datetime import datetime, timezone from pathlib import Path - from .content_registry import get_store_backend - from .storage import build_object_path + from .storage import build_object_path, get_store_backend # Extract context from key key = key or {} @@ -355,7 +319,6 @@ def encode( ext = None size = None item_count = None - source_path = None if isinstance(value, bytes): content = value @@ -371,8 +334,6 @@ def encode( elif isinstance(value, (str, Path)): source_path = Path(value) if not source_path.exists(): - from .errors import DataJointError - raise DataJointError(f"Source path not found: {source_path}") is_dir = source_path.is_dir() ext = source_path.suffix if not is_dir else None @@ -434,8 +395,8 @@ def decode(self, stored: dict, *, key: dict | None = None) -> Any: Returns: ObjectRef for accessing the stored content. """ - from .content_registry import get_store_backend from .objectref import ObjectRef + from .storage import get_store_backend store_name = stored.get("store") backend = get_store_backend(store_name) @@ -459,17 +420,24 @@ def validate(self, value: Any) -> None: raise TypeError(f" expects bytes or path, got {type(value).__name__}") +# Backward compatibility alias +ObjectType = ObjectCodec + + # ============================================================================= -# File Attachment Types +# File Attachment Codecs # ============================================================================= -@register_type -class AttachType(AttributeType): +class AttachCodec(Codec): """ - Internal file attachment stored in database. + File attachment with filename preserved. 
+ + Supports both internal and external storage: + - ````: Stored in database (bytes → LONGBLOB) + - ````: Stored externally via ```` with deduplication + - ````: Stored in specific named store - The ```` type stores a file directly in the database as a ``LONGBLOB``. The filename is preserved and the file is extracted to the configured download path on fetch. @@ -480,26 +448,27 @@ class Documents(dj.Manual): definition = ''' doc_id : int --- - report : + config : # internal (small file in DB) + dataset : # external (default store) + archive : # external (specific store) ''' # Insert a file - table.insert1({'doc_id': 1, 'report': '/path/to/report.pdf'}) + table.insert1({'doc_id': 1, 'config': '/path/to/config.json'}) # Fetch extracts to download_path and returns local path - local_path = (table & 'doc_id=1').fetch1('report') + local_path = (table & 'doc_id=1').fetch1('config') - Storage Format: + Storage Format (internal): The blob contains: ``filename\\0contents`` - Filename (UTF-8 encoded) + null byte + raw file contents - - Note: - - For large files, use ```` (external storage with deduplication) - - For files that shouldn't be copied, use ```` """ - type_name = "attach" - dtype = "longblob" + name = "attach" + + def get_dtype(self, is_external: bool) -> str: + """Return bytes for internal, for external storage.""" + return "" if is_external else "bytes" def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes: """ @@ -576,138 +545,26 @@ def validate(self, value: Any) -> None: raise TypeError(f" expects a file path, got {type(value).__name__}") -@register_type -class XAttachType(AttributeType): - """ - External file attachment with content-addressed storage. - - The ```` type stores files externally using content-addressed - storage. Like ````, the filename is preserved and the file is - extracted on fetch. Unlike ````, files are stored externally - with automatic deduplication. - - Example:: - - @schema - class LargeDocuments(dj.Manual): - definition = ''' - doc_id : int - --- - dataset : - ''' - - # Insert a large file - table.insert1({'doc_id': 1, 'dataset': '/path/to/large_file.h5'}) - - # Fetch downloads and returns local path - local_path = (table & 'doc_id=1').fetch1('dataset') - - Type Composition: - ```` composes with ````:: - - Insert: file → read + encode filename → put_content() → JSON - Fetch: JSON → get_content() → extract → local path - - Comparison:: - - | Type | Storage | Deduplication | Best for | - |------------|----------|---------------|---------------------| - | | Database | No | Small files (<16MB) | - | | External | Yes | Large files | - """ - - type_name = "xattach" - dtype = "" # Composition: uses ContentType - - def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes: - """ - Read file and encode as filename + contents. - - Args: - value: Path to file (str or Path). - key: Primary key values (unused). - store_name: Passed to ContentType for storage. - - Returns: - Bytes: filename (UTF-8) + null byte + file contents - """ - from pathlib import Path - - path = Path(value) - if not path.exists(): - raise FileNotFoundError(f"Attachment file not found: {path}") - if path.is_dir(): - raise IsADirectoryError(f" does not support directories: {path}") - - filename = path.name - contents = path.read_bytes() - return filename.encode("utf-8") + b"\x00" + contents - - def decode(self, stored: bytes, *, key: dict | None = None) -> str: - """ - Extract file to download path and return local path. 
- - Args: - stored: Bytes containing filename + null + contents. - key: Primary key values (unused). - - Returns: - Path to extracted file as string. - """ - from pathlib import Path - - from .settings import config - - # Split on first null byte - null_pos = stored.index(b"\x00") - filename = stored[:null_pos].decode("utf-8") - contents = stored[null_pos + 1 :] - - # Write to download path - download_path = Path(config.get("download_path", ".")) - download_path.mkdir(parents=True, exist_ok=True) - local_path = download_path / filename - - # Handle filename collision - if file exists with different content, add suffix - if local_path.exists(): - existing_contents = local_path.read_bytes() - if existing_contents != contents: - # Find unique filename - stem = local_path.stem - suffix = local_path.suffix - counter = 1 - while local_path.exists() and local_path.read_bytes() != contents: - local_path = download_path / f"{stem}_{counter}{suffix}" - counter += 1 - - # Only write if file doesn't exist or has different content - if not local_path.exists(): - local_path.write_bytes(contents) - - return str(local_path) - - def validate(self, value: Any) -> None: - """Validate that value is a valid file path.""" - from pathlib import Path - - if not isinstance(value, (str, Path)): - raise TypeError(f" expects a file path, got {type(value).__name__}") +# Backward compatibility aliases +AttachType = AttachCodec +XAttachType = AttachCodec # is now just AttachCodec with external storage # ============================================================================= -# Filepath Reference Type +# Filepath Reference Codec # ============================================================================= -@register_type -class FilepathType(AttributeType): +class FilepathCodec(Codec): """ Reference to existing file in configured store. - The ```` type stores a reference to a file that already - exists in the storage backend. Unlike ```` or ````, no + The ```` codec stores a reference to a file that already + exists in the storage backend. Unlike ```` or ````, no file copying occurs - only the path is recorded. + External only - requires @store. + This is useful when: - Files are managed externally (e.g., by acquisition software) - Files are too large to copy @@ -739,8 +596,13 @@ class Recordings(dj.Manual): DataJoint does not manage the lifecycle of referenced files. """ - type_name = "filepath" - dtype = "json" + name = "filepath" + + def get_dtype(self, is_external: bool) -> str: + """Filepath is external only.""" + if not is_external: + raise DataJointError(" requires @store") + return "json" def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> dict: """ @@ -756,7 +618,7 @@ def encode(self, value: Any, *, key: dict | None = None, store_name: str | None """ from datetime import datetime, timezone - from .content_registry import get_store_backend + from .storage import get_store_backend path = str(value) @@ -790,8 +652,8 @@ def decode(self, stored: dict, *, key: dict | None = None) -> Any: Returns: ObjectRef for accessing the file. 
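A hedged usage sketch for the external-only `<filepath>` codec discussed above; the schema name "demo" and store name "raw" are assumptions for illustration:

```python
import datajoint as dj

schema = dj.Schema("demo")

@schema
class Recording(dj.Manual):
    definition = """
    recording_id : int
    ---
    raw_file : <filepath@raw>   # reference only; the file already lives in the 'raw' store
    """

# Declaring plain <filepath> without @store is rejected at declaration time,
# since the codec's get_dtype raises unless external storage is requested.
```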
""" - from .content_registry import get_store_backend from .objectref import ObjectRef + from .storage import get_store_backend store_name = stored.get("store") backend = get_store_backend(store_name) @@ -803,3 +665,15 @@ def validate(self, value: Any) -> None: if not isinstance(value, (str, Path)): raise TypeError(f" expects a path string or Path, got {type(value).__name__}") + + +# Backward compatibility alias +FilepathType = FilepathCodec + + +# ============================================================================= +# Legacy aliases for backward compatibility +# ============================================================================= + +# Old names that mapped to content-addressed storage +XBlobType = BlobCodec # is now From e7054dd642db278f11991efaf571906adbb70c0a Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Thu, 1 Jan 2026 16:37:47 -0600 Subject: [PATCH 22/32] Unify codec names and fix external storage chain MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove legacy codecs (djblob, xblob, xattach, content) - Use unified codecs: , , , , - All codecs support both internal and external modes via @store modifier - Fix dtype chain resolution to propagate store to inner codecs - Fix fetch.py to resolve correct chain for external storage - Update tests to use new codec API (name, get_dtype method) - Fix imports: use content_registry for get_store_backend - Add 'local' store to mock_object_storage fixture All 471 tests pass. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/datajoint/attribute_type.py | 22 +- src/datajoint/builtin_types.py | 31 +-- src/datajoint/content_registry.py | 2 +- src/datajoint/declare.py | 12 +- src/datajoint/fetch.py | 12 +- src/datajoint/gc.py | 32 ++- src/datajoint/heading.py | 4 +- src/datajoint/jobs.py | 4 +- src/datajoint/migrate.py | 20 +- src/datajoint/table.py | 2 +- tests/conftest.py | 9 + tests/integration/test_autopopulate.py | 4 +- tests/integration/test_blob_matlab.py | 2 +- tests/integration/test_fetch_same.py | 2 +- tests/integration/test_gc.py | 30 +-- tests/integration/test_type_composition.py | 184 +++++++++-------- tests/integration/test_update1.py | 4 +- tests/schema.py | 8 +- tests/schema_adapted.py | 28 +-- tests/schema_alter.py | 2 +- tests/schema_external.py | 12 +- tests/schema_object.py | 10 +- tests/schema_simple.py | 2 +- tests/unit/test_attribute_type.py | 226 +++++++++++---------- 24 files changed, 367 insertions(+), 297 deletions(-) diff --git a/src/datajoint/attribute_type.py b/src/datajoint/attribute_type.py index 75732b680..1a40abfa6 100644 --- a/src/datajoint/attribute_type.py +++ b/src/datajoint/attribute_type.py @@ -110,8 +110,7 @@ def __init_subclass__(cls, *, register: bool = True, **kwargs): existing = _codec_registry[cls.name] if type(existing) is not cls: raise DataJointError( - f"Codec <{cls.name}> already registered by " - f"{type(existing).__module__}.{type(existing).__name__}" + f"Codec <{cls.name}> already registered by " f"{type(existing).__module__}.{type(existing).__name__}" ) return # Same class, idempotent @@ -234,9 +233,14 @@ def register_type(cls: type[Codec]) -> type[Codec]: if not isinstance(cls, type) or not issubclass(cls, Codec): raise TypeError(f"register_type requires a Codec subclass, got {cls!r}") - # If already registered via __init_subclass__, this is a no-op + # Check if already registered if cls.name and cls.name in _codec_registry: - return cls + existing = 
_codec_registry[cls.name] + if type(existing) is not cls: + raise DataJointError( + f"Codec <{cls.name}> already registered by " f"{type(existing).__module__}.{type(existing).__name__}" + ) + return cls # Same class, idempotent # Manual registration for classes that didn't auto-register if cls.name: @@ -330,8 +334,7 @@ def get_codec(name: str) -> Codec: return _codec_registry[type_name] raise DataJointError( - f"Unknown codec: <{type_name}>. " - f"Ensure the codec is defined (inherit from dj.Codec with name='{type_name}')." + f"Unknown codec: <{type_name}>. " f"Ensure the codec is defined (inherit from dj.Codec with name='{type_name}')." ) @@ -417,7 +420,7 @@ def _load_entry_points() -> None: codec_class = ep.load() # The class should auto-register via __init_subclass__ # But if it's an old-style class, manually register - if ep.name not in _codec_registry and hasattr(codec_class, 'name'): + if ep.name not in _codec_registry and hasattr(codec_class, "name"): _codec_registry[ep.name] = codec_class() logger.debug(f"Loaded codec <{ep.name}> from entry point {ep.value}") except Exception as e: @@ -522,10 +525,7 @@ def get_adapter(context: dict | None, adapter_name: str) -> tuple[Codec, str | N if is_codec_registered(type_name): return get_codec(type_name), store_name - raise DataJointError( - f"Codec <{type_name}> is not registered. " - "Define a Codec subclass with name='{type_name}'." - ) + raise DataJointError(f"Codec <{type_name}> is not registered. " "Define a Codec subclass with name='{type_name}'.") # ============================================================================= diff --git a/src/datajoint/builtin_types.py b/src/datajoint/builtin_types.py index 611042348..d734e8e0b 100644 --- a/src/datajoint/builtin_types.py +++ b/src/datajoint/builtin_types.py @@ -119,8 +119,7 @@ def decode(self, stored: bytes, *, key: dict | None = None) -> Any: return blob.unpack(stored, squeeze=False) -# Backward compatibility alias -DJBlobType = BlobCodec +# Note: DJBlobType is defined at end of file as DJBlobCodec (not BlobCodec) # ============================================================================= @@ -179,9 +178,9 @@ def encode(self, value: bytes, *, key: dict | None = None, store_name: str | Non Returns: Metadata dict: {hash, store, size} """ - from .hash_registry import put_hash_content + from .content_registry import put_content - return put_hash_content(value, store_name=store_name) + return put_content(value, store_name=store_name) def decode(self, stored: dict, *, key: dict | None = None) -> bytes: """ @@ -194,9 +193,9 @@ def decode(self, stored: dict, *, key: dict | None = None) -> bytes: Returns: Original bytes. 
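For orientation, a sketch of the `<hash>` round trip through the content registry as wired above; the payload and the store name "cold" are illustrative:

```python
from datajoint.content_registry import get_content, put_content

payload = b"raw bytes to deduplicate"

# put_content returns the metadata dict stored in the json column: {hash, store, size}
meta = put_content(payload, store_name="cold")

# get_content resolves the hash back to the original bytes on fetch
assert get_content(meta["hash"], store_name=meta.get("store")) == payload
```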
""" - from .hash_registry import get_hash_content + from .content_registry import get_content - return get_hash_content(stored["hash"], store_name=stored.get("store")) + return get_content(stored["hash"], store_name=stored.get("store")) def validate(self, value: Any) -> None: """Validate that value is bytes.""" @@ -204,8 +203,7 @@ def validate(self, value: Any) -> None: raise TypeError(f" expects bytes, got {type(value).__name__}") -# Backward compatibility alias -ContentType = HashCodec +# Note: ContentType is defined at end of file as ContentCodec (not HashCodec) # ============================================================================= @@ -300,7 +298,8 @@ def encode( from datetime import datetime, timezone from pathlib import Path - from .storage import build_object_path, get_store_backend + from .content_registry import get_store_backend + from .storage import build_object_path # Extract context from key key = key or {} @@ -396,7 +395,7 @@ def decode(self, stored: dict, *, key: dict | None = None) -> Any: ObjectRef for accessing the stored content. """ from .objectref import ObjectRef - from .storage import get_store_backend + from .content_registry import get_store_backend store_name = stored.get("store") backend = get_store_backend(store_name) @@ -618,7 +617,7 @@ def encode(self, value: Any, *, key: dict | None = None, store_name: str | None """ from datetime import datetime, timezone - from .storage import get_store_backend + from .content_registry import get_store_backend path = str(value) @@ -653,7 +652,7 @@ def decode(self, stored: dict, *, key: dict | None = None) -> Any: ObjectRef for accessing the file. """ from .objectref import ObjectRef - from .storage import get_store_backend + from .content_registry import get_store_backend store_name = stored.get("store") backend = get_store_backend(store_name) @@ -669,11 +668,3 @@ def validate(self, value: Any) -> None: # Backward compatibility alias FilepathType = FilepathCodec - - -# ============================================================================= -# Legacy aliases for backward compatibility -# ============================================================================= - -# Old names that mapped to content-addressed storage -XBlobType = BlobCodec # is now diff --git a/src/datajoint/content_registry.py b/src/datajoint/content_registry.py index 652f35def..f9747cca7 100644 --- a/src/datajoint/content_registry.py +++ b/src/datajoint/content_registry.py @@ -1,7 +1,7 @@ """ Content-addressed storage registry for DataJoint. -This module provides content-addressed storage with deduplication for the +This module provides content-addressed storage with deduplication for the AttributeType. 
Content is identified by its SHA256 hash and stored in a hierarchical directory structure: _content/{hash[:2]}/{hash[2:4]}/{hash} diff --git a/src/datajoint/declare.py b/src/datajoint/declare.py index 82ed02b15..eb23debeb 100644 --- a/src/datajoint/declare.py +++ b/src/datajoint/declare.py @@ -471,7 +471,17 @@ def substitute_special_type(match, category, foreign_key_sql, context): attr_type, store_name = get_adapter(context, match["type"]) if store_name is not None: match["store"] = store_name - match["type"] = attr_type.dtype + # Determine if external storage is used (store_name is present, even if empty string for default) + is_external = store_name is not None + inner_dtype = attr_type.get_dtype(is_external=is_external) + + # If inner dtype is a codec without store, propagate the store from outer type + # e.g., returns , we need to resolve as + if inner_dtype.startswith("<") and "@" not in inner_dtype and match.get("store") is not None: + # Append store to the inner dtype + inner_dtype = inner_dtype[:-1] + "@" + match["store"] + ">" + + match["type"] = inner_dtype # Recursively resolve if dtype is also a special type category = match_type(match["type"]) if category in SPECIAL_TYPES: diff --git a/src/datajoint/fetch.py b/src/datajoint/fetch.py index bd97dfd11..fc1ec435a 100644 --- a/src/datajoint/fetch.py +++ b/src/datajoint/fetch.py @@ -42,7 +42,7 @@ def _get(connection, attr, data, squeeze, download_path): - Blob types return raw bytes (unless an adapter handles them) - Adapters (AttributeTypes) handle all custom encoding/decoding via type chains - For composed types (e.g., using ), decoders are applied + For composed types (e.g., using ), decoders are applied in reverse order: innermost first, then outermost. :param connection: a dj.Connection object @@ -61,7 +61,13 @@ def _get(connection, attr, data, squeeze, download_path): if attr.adapter: from .attribute_type import resolve_dtype - final_dtype, type_chain, _ = resolve_dtype(f"<{attr.adapter.type_name}>") + # Include store if present to get correct chain for external storage + store = getattr(attr, "store", None) + if store is not None: + dtype_spec = f"<{attr.adapter.type_name}@{store}>" + else: + dtype_spec = f"<{attr.adapter.type_name}>" + final_dtype, type_chain, _ = resolve_dtype(dtype_spec) # First, process the final dtype (what's stored in the database) if final_dtype.lower() == "json": @@ -95,7 +101,7 @@ def _get(connection, attr, data, squeeze, download_path): return uuid_module.UUID(bytes=data) if attr.is_blob: - return data # raw bytes (use for automatic deserialization) + return data # raw bytes (use for automatic deserialization) # Native types - pass through unchanged return data diff --git a/src/datajoint/gc.py b/src/datajoint/gc.py index e0b7aaafe..73b56abd5 100644 --- a/src/datajoint/gc.py +++ b/src/datajoint/gc.py @@ -6,10 +6,10 @@ referencing it are deleted. Supports two storage patterns: -- Content-addressed storage: , , +- Content-addressed storage: , , Stored at: _content/{hash[:2]}/{hash[2:4]}/{hash} -- Path-addressed storage: +- Path-addressed storage: Stored at: {schema}/{table}/objects/{pk}/{field}_{token}/ Usage: @@ -41,10 +41,10 @@ def _uses_content_storage(attr) -> bool: """ Check if an attribute uses content-addressed storage. 
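Stepping back to the `declare.py` change above, a standalone restatement of the store-propagation rule; the helper below mirrors the inline logic for illustration and is not part of the patch:

```python
def propagate_store(inner_dtype: str, store: str | None) -> str:
    # If the inner dtype is itself a codec spec without an explicit @store,
    # carry the outer store down before recursing.
    if inner_dtype.startswith("<") and "@" not in inner_dtype and store is not None:
        return inner_dtype[:-1] + "@" + store + ">"
    return inner_dtype

assert propagate_store("<hash>", "cold") == "<hash@cold>"   # e.g. <blob@cold> resolves via <hash@cold>
assert propagate_store("bytes", None) == "bytes"            # internal <blob> resolves to plain bytes
```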
- This includes types that compose with : - - directly - - (composes with ) - - (composes with ) + This includes types that chain to for external storage: + - directly + - (chains to ) + - (chains to ) Args: attr: Attribute from table heading @@ -55,9 +55,19 @@ def _uses_content_storage(attr) -> bool: if not attr.adapter: return False - # Check if this type or its composition chain uses content storage + # Check if this type uses content storage type_name = getattr(attr.adapter, "type_name", "") - return type_name in ("content", "xblob", "xattach") + store = getattr(attr, "store", None) + + # always uses content storage (external only) + if type_name == "hash": + return True + + # and use content storage when external (has store) + if type_name in ("blob", "attach") and store is not None: + return True + + return False def _uses_object_storage(attr) -> bool: @@ -144,7 +154,7 @@ def scan_references( Scan schemas for content references. Examines all tables in the given schemas and extracts content hashes - from columns that use content-addressed storage (, , ). + from columns that use content-addressed storage (, , ). Args: *schemas: Schema instances to scan @@ -384,7 +394,7 @@ def scan( """ Scan for orphaned content and objects without deleting. - Scans both content-addressed storage (for , , ) + Scans both content-addressed storage (for , , ) and path-addressed storage (for ). Args: @@ -542,7 +552,7 @@ def format_stats(stats: dict[str, Any]) -> str: # Show content-addressed storage stats if present if "content_referenced" in stats: lines.append("") - lines.append("Content-Addressed Storage (, , ):") + lines.append("Content-Addressed Storage (, , ):") lines.append(f" Referenced: {stats['content_referenced']}") lines.append(f" Stored: {stats['content_stored']}") lines.append(f" Orphaned: {stats['content_orphaned']}") diff --git a/src/datajoint/heading.py b/src/datajoint/heading.py index 5c4482f3e..c2904c85a 100644 --- a/src/datajoint/heading.py +++ b/src/datajoint/heading.py @@ -326,7 +326,9 @@ def _init_from_database(self): # if no adapter, then delay the error until the first invocation attr["adapter"] = _MissingType(adapter_name) else: - attr["type"] = attr["adapter"].dtype + # Determine if external storage based on store presence + is_external = attr.get("store") is not None + attr["type"] = attr["adapter"].get_dtype(is_external=is_external) if not any(r.match(attr["type"]) for r in TYPE_PATTERN.values()): raise DataJointError(f"Invalid dtype '{attr['type']}' in attribute type <{adapter_name}>.") # Update is_blob based on resolved dtype (check both BYTES and NATIVE_BLOB patterns) diff --git a/src/datajoint/jobs.py b/src/datajoint/jobs.py index 18bf5730e..b542f9364 100644 --- a/src/datajoint/jobs.py +++ b/src/datajoint/jobs.py @@ -26,9 +26,9 @@ def __init__(self, conn, database): key_hash :char(32) # key hash --- status :enum('reserved','error','ignore') # if tuple is missing, the job is available - key=null : # structure containing the key + key=null : # structure containing the key error_message="" :varchar({error_message_length}) # error message returned if failed - error_stack=null : # error stack if failed + error_stack=null : # error stack if failed user="" :varchar(255) # database user host="" :varchar(255) # system hostname pid=0 :int unsigned # system process id diff --git a/src/datajoint/migrate.py b/src/datajoint/migrate.py index 696ca380e..b8937e9ca 100644 --- a/src/datajoint/migrate.py +++ b/src/datajoint/migrate.py @@ -3,7 +3,7 @@ This module provides tools for migrating 
existing schemas to use the new AttributeType system, particularly for upgrading blob columns to use -explicit `` type declarations. +explicit `` type declarations. """ from __future__ import annotations @@ -25,7 +25,7 @@ def analyze_blob_columns(schema: Schema) -> list[dict]: """ - Analyze a schema to find blob columns that could be migrated to . + Analyze a schema to find blob columns that could be migrated to . This function identifies blob columns that: 1. Have a MySQL blob type (tinyblob, blob, mediumblob, longblob) @@ -98,19 +98,19 @@ def analyze_blob_columns(schema: Schema) -> list[dict]: def generate_migration_sql( schema: Schema, - target_type: str = "djblob", + target_type: str = "blob", dry_run: bool = True, ) -> list[str]: """ - Generate SQL statements to migrate blob columns to use . + Generate SQL statements to migrate blob columns to use . This generates ALTER TABLE statements that update column comments to - include the `::` prefix, marking them as using explicit + include the `::` prefix, marking them as using explicit DataJoint blob serialization. Args: schema: The DataJoint schema to migrate. - target_type: The type name to migrate to (default: "djblob"). + target_type: The type name to migrate to (default: "blob"). dry_run: If True, only return SQL without executing. Returns: @@ -156,18 +156,18 @@ def generate_migration_sql( def migrate_blob_columns( schema: Schema, - target_type: str = "djblob", + target_type: str = "blob", dry_run: bool = True, ) -> dict: """ - Migrate blob columns in a schema to use explicit type. + Migrate blob columns in a schema to use explicit type. This updates column comments in the database to include the type declaration. The data format remains unchanged. Args: schema: The DataJoint schema to migrate. - target_type: The type name to migrate to (default: "djblob"). + target_type: The type name to migrate to (default: "blob"). dry_run: If True, only preview changes without applying. Returns: @@ -188,7 +188,7 @@ def migrate_blob_columns( Warning: After migration, table definitions should be updated to use - `` instead of `longblob` for consistency. The migration + `` instead of `longblob` for consistency. The migration only updates database metadata; source code changes are manual. 
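A hedged usage sketch for the migration helpers above; the schema name is an assumption, and `dry_run=True` keeps the preview side-effect free:

```python
import datajoint as dj

schema = dj.Schema("my_lab_pipeline")

# Preview the ALTER TABLE statements that would mark blob columns as <blob>
for stmt in dj.migrate.generate_migration_sql(schema, target_type="blob", dry_run=True):
    print(stmt)

# Apply once the preview looks right; only column comments change, data stays put
result = dj.migrate.migrate_blob_columns(schema, target_type="blob", dry_run=False)
```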
""" columns = analyze_blob_columns(schema) diff --git a/src/datajoint/table.py b/src/datajoint/table.py index 1ce7e816f..0c35f2b13 100644 --- a/src/datajoint/table.py +++ b/src/datajoint/table.py @@ -790,7 +790,7 @@ def __make_placeholder(self, name, value, ignore_extra_fields=False, row=None): # Numeric - convert to string elif attr.numeric: value = str(int(value) if isinstance(value, bool) else value) - # Blob - pass through as bytes (use for automatic serialization) + # Blob - pass through as bytes (use for automatic serialization) return name, placeholder, value diff --git a/tests/conftest.py b/tests/conftest.py index d64404230..18e5a539a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -701,6 +701,7 @@ def mock_object_storage(object_storage_config): "protocol": dj.config.object_storage.protocol, "location": dj.config.object_storage.location, "token_length": dj.config.object_storage.token_length, + "stores": dict(dj.config.object_storage.stores), } # Set test values @@ -709,6 +710,12 @@ def mock_object_storage(object_storage_config): dj.config.object_storage.location = object_storage_config["location"] dj.config.object_storage.token_length = object_storage_config.get("token_length", 8) + # Configure 'local' store using same location + dj.config.object_storage.stores["local"] = { + "protocol": "file", + "location": object_storage_config["location"], + } + yield object_storage_config # Restore original values @@ -716,6 +723,8 @@ def mock_object_storage(object_storage_config): dj.config.object_storage.protocol = original["protocol"] dj.config.object_storage.location = original["location"] dj.config.object_storage.token_length = original["token_length"] + dj.config.object_storage.stores.clear() + dj.config.object_storage.stores.update(original["stores"]) @pytest.fixture diff --git a/tests/integration/test_autopopulate.py b/tests/integration/test_autopopulate.py index 6bde3b49e..de9dc95a3 100644 --- a/tests/integration/test_autopopulate.py +++ b/tests/integration/test_autopopulate.py @@ -121,7 +121,7 @@ class Image(dj.Imported): definition = """ -> ImageSource --- - image_data: + image_data: """ def make(self, key): @@ -134,7 +134,7 @@ class Crop(dj.Computed): definition = """ -> Image --- - crop_image: + crop_image: """ def make(self, key): diff --git a/tests/integration/test_blob_matlab.py b/tests/integration/test_blob_matlab.py index 8e5e9235d..07f42660a 100644 --- a/tests/integration/test_blob_matlab.py +++ b/tests/integration/test_blob_matlab.py @@ -11,7 +11,7 @@ class Blob(dj.Manual): id : int ----- comment : varchar(255) - blob : + blob : """ diff --git a/tests/integration/test_fetch_same.py b/tests/integration/test_fetch_same.py index ad830616f..886af2b94 100644 --- a/tests/integration/test_fetch_same.py +++ b/tests/integration/test_fetch_same.py @@ -10,7 +10,7 @@ class ProjData(dj.Manual): --- resp : float sim : float - big : + big : blah : varchar(10) """ diff --git a/tests/integration/test_gc.py b/tests/integration/test_gc.py index 2c312bcc0..8eab3584f 100644 --- a/tests/integration/test_gc.py +++ b/tests/integration/test_gc.py @@ -20,35 +20,39 @@ def test_returns_false_for_no_adapter(self): assert gc._uses_content_storage(attr) is False - def test_returns_true_for_content_type(self): - """Test that True is returned for type.""" + def test_returns_true_for_hash_type(self): + """Test that True is returned for type.""" attr = MagicMock() attr.adapter = MagicMock() - attr.adapter.type_name = "content" + attr.adapter.type_name = "hash" + attr.store = "mystore" assert 
gc._uses_content_storage(attr) is True - def test_returns_true_for_xblob_type(self): - """Test that True is returned for type.""" + def test_returns_true_for_blob_external(self): + """Test that True is returned for type (external).""" attr = MagicMock() attr.adapter = MagicMock() - attr.adapter.type_name = "xblob" + attr.adapter.type_name = "blob" + attr.store = "mystore" assert gc._uses_content_storage(attr) is True - def test_returns_true_for_xattach_type(self): - """Test that True is returned for type.""" + def test_returns_true_for_attach_external(self): + """Test that True is returned for type (external).""" attr = MagicMock() attr.adapter = MagicMock() - attr.adapter.type_name = "xattach" + attr.adapter.type_name = "attach" + attr.store = "mystore" assert gc._uses_content_storage(attr) is True - def test_returns_false_for_other_types(self): - """Test that False is returned for non-content types.""" + def test_returns_false_for_blob_internal(self): + """Test that False is returned for internal storage.""" attr = MagicMock() attr.adapter = MagicMock() - attr.adapter.type_name = "djblob" + attr.adapter.type_name = "blob" + attr.store = None assert gc._uses_content_storage(attr) is False @@ -107,7 +111,7 @@ def test_returns_false_for_other_types(self): """Test that False is returned for non-object types.""" attr = MagicMock() attr.adapter = MagicMock() - attr.adapter.type_name = "xblob" + attr.adapter.type_name = "blob" assert gc._uses_object_storage(attr) is False diff --git a/tests/integration/test_type_composition.py b/tests/integration/test_type_composition.py index 0b51b3d68..0e24da54d 100644 --- a/tests/integration/test_type_composition.py +++ b/tests/integration/test_type_composition.py @@ -1,14 +1,13 @@ """ Tests for type composition (type chain encoding/decoding). -This tests the → json composition pattern +This tests the → json composition pattern and similar type chains. 
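To make the composition pattern concrete, a minimal user-defined codec that chains to the unified `<blob>` codec; the class, codec name, and column below are illustrative, not taken from the test suite:

```python
import datajoint as dj

class SetCodec(dj.Codec):
    """Store Python sets by delegating serialization to <blob>."""

    name = "intset"

    def get_dtype(self, is_external: bool) -> str:
        # Chain to <blob>; an @store on the column propagates down, making it external
        return "<blob>"

    def encode(self, value, *, key=None, store_name=None):
        return sorted(value)            # lists serialize cleanly via <blob>

    def decode(self, stored, *, key=None):
        return set(stored)
```

A column declared as `intset_attr : <intset>` then resolves through the chain `<intset>` → `<blob>` → `bytes`.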
""" from datajoint.attribute_type import ( AttributeType, - _type_registry, - register_type, + _codec_registry, resolve_dtype, ) @@ -18,23 +17,24 @@ class TestTypeChainResolution: def setup_method(self): """Clear test types from registry before each test.""" - for name in list(_type_registry.keys()): + for name in list(_codec_registry.keys()): if name.startswith("test_"): - del _type_registry[name] + del _codec_registry[name] def teardown_method(self): """Clean up test types after each test.""" - for name in list(_type_registry.keys()): + for name in list(_codec_registry.keys()): if name.startswith("test_"): - del _type_registry[name] + del _codec_registry[name] def test_single_type_chain(self): """Test resolving a single-type chain.""" - @register_type class TestSingle(AttributeType): - type_name = "test_single" - dtype = "varchar(100)" + name = "test_single" + + def get_dtype(self, is_external: bool) -> str: + return "varchar(100)" def encode(self, value, *, key=None, store_name=None): return str(value) @@ -52,10 +52,11 @@ def decode(self, stored, *, key=None): def test_two_type_chain(self): """Test resolving a two-type chain.""" - @register_type class TestInner(AttributeType): - type_name = "test_inner" - dtype = "longblob" + name = "test_inner" + + def get_dtype(self, is_external: bool) -> str: + return "bytes" def encode(self, value, *, key=None, store_name=None): return value @@ -63,10 +64,11 @@ def encode(self, value, *, key=None, store_name=None): def decode(self, stored, *, key=None): return stored - @register_type class TestOuter(AttributeType): - type_name = "test_outer" - dtype = "" + name = "test_outer" + + def get_dtype(self, is_external: bool) -> str: + return "" def encode(self, value, *, key=None, store_name=None): return value @@ -76,7 +78,7 @@ def decode(self, stored, *, key=None): final_dtype, chain, store = resolve_dtype("") - assert final_dtype == "longblob" + assert final_dtype == "bytes" assert len(chain) == 2 assert chain[0].type_name == "test_outer" assert chain[1].type_name == "test_inner" @@ -84,10 +86,11 @@ def decode(self, stored, *, key=None): def test_three_type_chain(self): """Test resolving a three-type chain.""" - @register_type class TestBase(AttributeType): - type_name = "test_base" - dtype = "json" + name = "test_base" + + def get_dtype(self, is_external: bool) -> str: + return "json" def encode(self, value, *, key=None, store_name=None): return value @@ -95,10 +98,11 @@ def encode(self, value, *, key=None, store_name=None): def decode(self, stored, *, key=None): return stored - @register_type class TestMiddle(AttributeType): - type_name = "test_middle" - dtype = "" + name = "test_middle" + + def get_dtype(self, is_external: bool) -> str: + return "" def encode(self, value, *, key=None, store_name=None): return value @@ -106,10 +110,11 @@ def encode(self, value, *, key=None, store_name=None): def decode(self, stored, *, key=None): return stored - @register_type class TestTop(AttributeType): - type_name = "test_top" - dtype = "" + name = "test_top" + + def get_dtype(self, is_external: bool) -> str: + return "" def encode(self, value, *, key=None, store_name=None): return value @@ -131,24 +136,25 @@ class TestTypeChainEncodeDecode: def setup_method(self): """Clear test types from registry before each test.""" - for name in list(_type_registry.keys()): + for name in list(_codec_registry.keys()): if name.startswith("test_"): - del _type_registry[name] + del _codec_registry[name] def teardown_method(self): """Clean up test types after each test.""" - for 
name in list(_type_registry.keys()): + for name in list(_codec_registry.keys()): if name.startswith("test_"): - del _type_registry[name] + del _codec_registry[name] def test_encode_order(self): """Test that encode is applied outer → inner.""" encode_order = [] - @register_type class TestInnerEnc(AttributeType): - type_name = "test_inner_enc" - dtype = "longblob" + name = "test_inner_enc" + + def get_dtype(self, is_external: bool) -> str: + return "bytes" def encode(self, value, *, key=None, store_name=None): encode_order.append("inner") @@ -157,10 +163,11 @@ def encode(self, value, *, key=None, store_name=None): def decode(self, stored, *, key=None): return stored - @register_type class TestOuterEnc(AttributeType): - type_name = "test_outer_enc" - dtype = "" + name = "test_outer_enc" + + def get_dtype(self, is_external: bool) -> str: + return "" def encode(self, value, *, key=None, store_name=None): encode_order.append("outer") @@ -183,10 +190,11 @@ def test_decode_order(self): """Test that decode is applied inner → outer (reverse of encode).""" decode_order = [] - @register_type class TestInnerDec(AttributeType): - type_name = "test_inner_dec" - dtype = "longblob" + name = "test_inner_dec" + + def get_dtype(self, is_external: bool) -> str: + return "bytes" def encode(self, value, *, key=None, store_name=None): return value @@ -195,10 +203,11 @@ def decode(self, stored, *, key=None): decode_order.append("inner") return stored.replace(b"_inner", b"") - @register_type class TestOuterDec(AttributeType): - type_name = "test_outer_dec" - dtype = "" + name = "test_outer_dec" + + def get_dtype(self, is_external: bool) -> str: + return "" def encode(self, value, *, key=None, store_name=None): return value @@ -220,10 +229,11 @@ def decode(self, stored, *, key=None): def test_roundtrip(self): """Test encode/decode roundtrip through a type chain.""" - @register_type class TestInnerRt(AttributeType): - type_name = "test_inner_rt" - dtype = "longblob" + name = "test_inner_rt" + + def get_dtype(self, is_external: bool) -> str: + return "bytes" def encode(self, value, *, key=None, store_name=None): # Compress (just add prefix for testing) @@ -233,10 +243,11 @@ def decode(self, stored, *, key=None): # Decompress return stored.replace(b"COMPRESSED:", b"") - @register_type class TestOuterRt(AttributeType): - type_name = "test_outer_rt" - dtype = "" + name = "test_outer_rt" + + def get_dtype(self, is_external: bool) -> str: + return "" def encode(self, value, *, key=None, store_name=None): # Serialize (just encode string for testing) @@ -269,63 +280,68 @@ def decode(self, stored, *, key=None): class TestBuiltinTypeComposition: """Tests for built-in type composition.""" - def test_xblob_resolves_to_json(self): - """Test that → json.""" - final_dtype, chain, _ = resolve_dtype("") + def test_blob_internal_resolves_to_bytes(self): + """Test that (internal) → bytes.""" + final_dtype, chain, _ = resolve_dtype("") - assert final_dtype == "json" - assert len(chain) == 2 - assert chain[0].type_name == "xblob" - assert chain[1].type_name == "content" + assert final_dtype == "bytes" + assert len(chain) == 1 + assert chain[0].type_name == "blob" - def test_xattach_resolves_to_json(self): - """Test that → json.""" - final_dtype, chain, _ = resolve_dtype("") + def test_blob_external_resolves_to_json(self): + """Test that → json.""" + final_dtype, chain, store = resolve_dtype("") assert final_dtype == "json" assert len(chain) == 2 - assert chain[0].type_name == "xattach" - assert chain[1].type_name == "content" + assert 
chain[0].type_name == "blob" + assert chain[1].type_name == "hash" + assert store == "store" - def test_djblob_resolves_to_longblob(self): - """Test that → longblob (no chain).""" - final_dtype, chain, _ = resolve_dtype("") + def test_attach_internal_resolves_to_bytes(self): + """Test that (internal) → bytes.""" + final_dtype, chain, _ = resolve_dtype("") - assert final_dtype == "longblob" + assert final_dtype == "bytes" assert len(chain) == 1 - assert chain[0].type_name == "djblob" + assert chain[0].type_name == "attach" - def test_content_resolves_to_json(self): - """Test that → json.""" - final_dtype, chain, _ = resolve_dtype("") + def test_attach_external_resolves_to_json(self): + """Test that → json.""" + final_dtype, chain, store = resolve_dtype("") assert final_dtype == "json" - assert len(chain) == 1 - assert chain[0].type_name == "content" + assert len(chain) == 2 + assert chain[0].type_name == "attach" + assert chain[1].type_name == "hash" + assert store == "store" - def test_object_resolves_to_json(self): - """Test that → json.""" - final_dtype, chain, _ = resolve_dtype("") + def test_hash_external_resolves_to_json(self): + """Test that → json (external only).""" + final_dtype, chain, store = resolve_dtype("") assert final_dtype == "json" assert len(chain) == 1 - assert chain[0].type_name == "object" + assert chain[0].type_name == "hash" + assert store == "store" - def test_attach_resolves_to_longblob(self): - """Test that → longblob.""" - final_dtype, chain, _ = resolve_dtype("") + def test_object_external_resolves_to_json(self): + """Test that → json (external only).""" + final_dtype, chain, store = resolve_dtype("") - assert final_dtype == "longblob" + assert final_dtype == "json" assert len(chain) == 1 - assert chain[0].type_name == "attach" + assert chain[0].type_name == "object" + assert store == "store" - def test_filepath_resolves_to_json(self): - """Test that → json.""" - final_dtype, chain, _ = resolve_dtype("") + def test_filepath_external_resolves_to_json(self): + """Test that → json (external only).""" + final_dtype, chain, store = resolve_dtype("") assert final_dtype == "json" assert len(chain) == 1 assert chain[0].type_name == "filepath" + assert store == "store" class TestStoreNameParsing: @@ -333,14 +349,14 @@ class TestStoreNameParsing: def test_type_with_store(self): """Test parsing type with store name.""" - final_dtype, chain, store = resolve_dtype("") + final_dtype, chain, store = resolve_dtype("") assert final_dtype == "json" assert store == "mystore" def test_type_without_store(self): """Test parsing type without store name.""" - final_dtype, chain, store = resolve_dtype("") + final_dtype, chain, store = resolve_dtype("") assert store is None diff --git a/tests/integration/test_update1.py b/tests/integration/test_update1.py index d09f70c4e..92f68a8d4 100644 --- a/tests/integration/test_update1.py +++ b/tests/integration/test_update1.py @@ -14,8 +14,8 @@ class Thing(dj.Manual): --- number=0 : int frac : float - picture = null : - params = null : + picture = null : + params = null : img_file = null: timestamp = CURRENT_TIMESTAMP : datetime """ diff --git a/tests/schema.py b/tests/schema.py index b4ffa7f0b..99a7c457d 100644 --- a/tests/schema.py +++ b/tests/schema.py @@ -200,8 +200,8 @@ class Channel(dj.Part): -> master channel :tinyint unsigned # channel number within Ephys ---- - voltage : - current = null : # optional current to test null handling + voltage : + current = null : # optional current to test null handling """ def _make_tuples(self, key): @@ 
-228,7 +228,7 @@ class Image(dj.Manual): # table for testing blob inserts id : int # image identifier --- - img : # image + img : # image """ @@ -454,7 +454,7 @@ class Longblob(dj.Manual): definition = """ id: int --- - data: + data: """ diff --git a/tests/schema_adapted.py b/tests/schema_adapted.py index a2b3e4924..8edf8d65d 100644 --- a/tests/schema_adapted.py +++ b/tests/schema_adapted.py @@ -5,14 +5,16 @@ import datajoint as dj -@dj.register_type -class GraphType(dj.AttributeType): - """Custom type for storing NetworkX graphs as edge lists.""" +class GraphType(dj.Codec): + """Custom codec for storing NetworkX graphs as edge lists.""" - type_name = "graph" - dtype = "" # Use djblob for proper serialization + name = "graph" - def encode(self, obj, *, key=None): + def get_dtype(self, is_external: bool) -> str: + """Chain to djblob for serialization.""" + return "" + + def encode(self, obj, *, key=None, store_name=None): """Convert graph object into an edge list.""" assert isinstance(obj, nx.Graph) return list(obj.edges) @@ -22,14 +24,16 @@ def decode(self, stored, *, key=None): return nx.Graph(stored) -@dj.register_type -class LayoutToFilepathType(dj.AttributeType): - """Custom type that saves a graph layout as serialized JSON blob.""" +class LayoutToFilepathType(dj.Codec): + """Custom codec that saves a graph layout as serialized JSON blob.""" + + name = "layout_to_filepath" - type_name = "layout_to_filepath" - dtype = "" # Use djblob for serialization + def get_dtype(self, is_external: bool) -> str: + """Chain to djblob for serialization.""" + return "" - def encode(self, layout, *, key=None): + def encode(self, layout, *, key=None, store_name=None): """Serialize layout dict.""" return layout # djblob handles serialization diff --git a/tests/schema_alter.py b/tests/schema_alter.py index 6f18448e4..ef8b35f0c 100644 --- a/tests/schema_alter.py +++ b/tests/schema_alter.py @@ -20,7 +20,7 @@ class Experiment(dj.Imported): experiment_id :smallint # experiment number for this subject --- data_path : int # some number - extra=null : # just testing + extra=null : # just testing -> [nullable] User subject_notes=null :varchar(2048) # {notes} e.g. 
purpose of experiment entry_time=CURRENT_TIMESTAMP :timestamp # automatic timestamp diff --git a/tests/schema_external.py b/tests/schema_external.py index 5a2db1e86..ae1803f5e 100644 --- a/tests/schema_external.py +++ b/tests/schema_external.py @@ -13,7 +13,7 @@ class Simple(dj.Manual): definition = """ simple : int --- - item : + item : """ @@ -21,7 +21,7 @@ class SimpleRemote(dj.Manual): definition = """ simple : int --- - item : + item : """ @@ -36,7 +36,7 @@ class Dimension(dj.Lookup): definition = """ dim : int --- - dimensions : + dimensions : """ contents = ([0, [100, 50]], [1, [3, 4, 8, 6]]) @@ -47,8 +47,8 @@ class Image(dj.Computed): -> Seed -> Dimension ---- - img : # objects are stored as specified by dj.config['stores']['share'] - neg : # objects are stored as specified by dj.config['stores']['local'] + img : # objects are stored as specified by dj.config['stores']['share'] + neg : # objects are stored as specified by dj.config['stores']['local'] """ def make(self, key): @@ -62,7 +62,7 @@ class Attach(dj.Manual): # table for storing attachments attach : int ---- - img : # attachments are stored as specified by: dj.config['stores']['share'] + img : # attachments are stored as specified by: dj.config['stores']['share'] txt : # attachments are stored directly in the database """ diff --git a/tests/schema_object.py b/tests/schema_object.py index 7caf7e16c..ef1d957dc 100644 --- a/tests/schema_object.py +++ b/tests/schema_object.py @@ -13,7 +13,7 @@ class ObjectFile(dj.Manual): definition = """ file_id : int --- - data_file : # stored file + data_file : # stored file """ @@ -23,7 +23,7 @@ class ObjectFolder(dj.Manual): definition = """ folder_id : int --- - data_folder : # stored folder + data_folder : # stored folder """ @@ -33,8 +33,8 @@ class ObjectMultiple(dj.Manual): definition = """ record_id : int --- - raw_data : # raw data file - processed : # processed data file + raw_data : # raw data file + processed : # processed data file """ @@ -46,6 +46,6 @@ class ObjectWithOther(dj.Manual): session_id : int --- name : varchar(100) - data_file : + data_file : notes : varchar(255) """ diff --git a/tests/schema_simple.py b/tests/schema_simple.py index 0d4ebd53b..3ac71469f 100644 --- a/tests/schema_simple.py +++ b/tests/schema_simple.py @@ -250,7 +250,7 @@ class TTestUpdate(dj.Lookup): --- string_attr : varchar(255) num_attr=null : float - blob_attr : + blob_attr : """ contents = [ diff --git a/tests/unit/test_attribute_type.py b/tests/unit/test_attribute_type.py index afc6674af..0c9b6811e 100644 --- a/tests/unit/test_attribute_type.py +++ b/tests/unit/test_attribute_type.py @@ -7,7 +7,7 @@ import datajoint as dj from datajoint.attribute_type import ( AttributeType, - _type_registry, + _codec_registry, get_type, is_type_registered, list_types, @@ -23,25 +23,26 @@ class TestAttributeTypeRegistry: def setup_method(self): """Clear any test types from registry before each test.""" - for name in list(_type_registry.keys()): + for name in list(_codec_registry.keys()): if name.startswith("test_"): - del _type_registry[name] + del _codec_registry[name] def teardown_method(self): """Clean up test types after each test.""" - for name in list(_type_registry.keys()): + for name in list(_codec_registry.keys()): if name.startswith("test_"): - del _type_registry[name] + del _codec_registry[name] - def test_register_type_decorator(self): - """Test registering a type using the decorator.""" + def test_register_type_auto(self): + """Test auto-registration via __init_subclass__.""" - @register_type class 
TestType(AttributeType): - type_name = "test_decorator" - dtype = "longblob" + name = "test_decorator" - def encode(self, value, *, key=None): + def get_dtype(self, is_external: bool) -> str: + return "bytes" + + def encode(self, value, *, key=None, store_name=None): return value def decode(self, stored, *, key=None): @@ -53,11 +54,13 @@ def decode(self, stored, *, key=None): def test_register_type_direct(self): """Test registering a type by calling register_type directly.""" - class TestType(AttributeType): - type_name = "test_direct" - dtype = "varchar(255)" + class TestType(AttributeType, register=False): + name = "test_direct" - def encode(self, value, *, key=None): + def get_dtype(self, is_external: bool) -> str: + return "varchar(255)" + + def encode(self, value, *, key=None, store_name=None): return str(value) def decode(self, stored, *, key=None): @@ -69,12 +72,13 @@ def decode(self, stored, *, key=None): def test_register_type_idempotent(self): """Test that registering the same type twice is idempotent.""" - @register_type class TestType(AttributeType): - type_name = "test_idempotent" - dtype = "int" + name = "test_idempotent" + + def get_dtype(self, is_external: bool) -> str: + return "int32" - def encode(self, value, *, key=None): + def encode(self, value, *, key=None, store_name=None): return value def decode(self, stored, *, key=None): @@ -87,22 +91,25 @@ def decode(self, stored, *, key=None): def test_register_duplicate_name_different_class(self): """Test that registering different classes with same name raises error.""" - @register_type class TestType1(AttributeType): - type_name = "test_duplicate" - dtype = "int" + name = "test_duplicate" - def encode(self, value, *, key=None): + def get_dtype(self, is_external: bool) -> str: + return "int32" + + def encode(self, value, *, key=None, store_name=None): return value def decode(self, stored, *, key=None): return stored - class TestType2(AttributeType): - type_name = "test_duplicate" - dtype = "varchar(100)" + class TestType2(AttributeType, register=False): + name = "test_duplicate" + + def get_dtype(self, is_external: bool) -> str: + return "varchar(100)" - def encode(self, value, *, key=None): + def encode(self, value, *, key=None, store_name=None): return str(value) def decode(self, stored, *, key=None): @@ -114,12 +121,13 @@ def decode(self, stored, *, key=None): def test_unregister_type(self): """Test unregistering a type.""" - @register_type class TestType(AttributeType): - type_name = "test_unregister" - dtype = "int" + name = "test_unregister" - def encode(self, value, *, key=None): + def get_dtype(self, is_external: bool) -> str: + return "int32" + + def encode(self, value, *, key=None, store_name=None): return value def decode(self, stored, *, key=None): @@ -131,18 +139,19 @@ def decode(self, stored, *, key=None): def test_get_type_not_found(self): """Test that getting an unregistered type raises error.""" - with pytest.raises(DataJointError, match="Unknown attribute type"): + with pytest.raises(DataJointError, match="Unknown codec"): get_type("nonexistent_type") def test_list_types(self): """Test listing registered types.""" - @register_type class TestType(AttributeType): - type_name = "test_list" - dtype = "int" + name = "test_list" + + def get_dtype(self, is_external: bool) -> str: + return "int32" - def encode(self, value, *, key=None): + def encode(self, value, *, key=None, store_name=None): return value def decode(self, stored, *, key=None): @@ -155,12 +164,13 @@ def decode(self, stored, *, key=None): def 
test_get_type_strips_brackets(self): """Test that get_type accepts names with or without angle brackets.""" - @register_type class TestType(AttributeType): - type_name = "test_brackets" - dtype = "int" + name = "test_brackets" - def encode(self, value, *, key=None): + def get_dtype(self, is_external: bool) -> str: + return "int32" + + def encode(self, value, *, key=None, store_name=None): return value def decode(self, stored, *, key=None): @@ -173,24 +183,25 @@ class TestAttributeTypeValidation: """Tests for the validate method.""" def setup_method(self): - for name in list(_type_registry.keys()): + for name in list(_codec_registry.keys()): if name.startswith("test_"): - del _type_registry[name] + del _codec_registry[name] def teardown_method(self): - for name in list(_type_registry.keys()): + for name in list(_codec_registry.keys()): if name.startswith("test_"): - del _type_registry[name] + del _codec_registry[name] def test_validate_called_default(self): """Test that default validate accepts any value.""" - @register_type class TestType(AttributeType): - type_name = "test_validate_default" - dtype = "longblob" + name = "test_validate_default" + + def get_dtype(self, is_external: bool) -> str: + return "bytes" - def encode(self, value, *, key=None): + def encode(self, value, *, key=None, store_name=None): return value def decode(self, stored, *, key=None): @@ -206,12 +217,13 @@ def decode(self, stored, *, key=None): def test_validate_custom(self): """Test custom validation logic.""" - @register_type class PositiveIntType(AttributeType): - type_name = "test_positive_int" - dtype = "int" + name = "test_positive_int" - def encode(self, value, *, key=None): + def get_dtype(self, is_external: bool) -> str: + return "int32" + + def encode(self, value, *, key=None, store_name=None): return value def decode(self, stored, *, key=None): @@ -237,31 +249,32 @@ class TestTypeChaining: """Tests for type chaining (dtype referencing another custom type).""" def setup_method(self): - for name in list(_type_registry.keys()): + for name in list(_codec_registry.keys()): if name.startswith("test_"): - del _type_registry[name] + del _codec_registry[name] def teardown_method(self): - for name in list(_type_registry.keys()): + for name in list(_codec_registry.keys()): if name.startswith("test_"): - del _type_registry[name] + del _codec_registry[name] def test_resolve_native_dtype(self): """Test resolving a native dtype.""" - final_dtype, chain, store = resolve_dtype("longblob") - assert final_dtype == "longblob" + final_dtype, chain, store = resolve_dtype("bytes") + assert final_dtype == "bytes" assert chain == [] assert store is None def test_resolve_custom_dtype(self): """Test resolving a custom dtype.""" - @register_type class TestType(AttributeType): - type_name = "test_resolve" - dtype = "varchar(100)" + name = "test_resolve" + + def get_dtype(self, is_external: bool) -> str: + return "varchar(100)" - def encode(self, value, *, key=None): + def encode(self, value, *, key=None, store_name=None): return value def decode(self, stored, *, key=None): @@ -276,30 +289,32 @@ def decode(self, stored, *, key=None): def test_resolve_chained_dtype(self): """Test resolving a chained dtype.""" - @register_type class InnerType(AttributeType): - type_name = "test_inner" - dtype = "longblob" + name = "test_inner" - def encode(self, value, *, key=None): + def get_dtype(self, is_external: bool) -> str: + return "bytes" + + def encode(self, value, *, key=None, store_name=None): return value def decode(self, stored, *, 
key=None): return stored - @register_type class OuterType(AttributeType): - type_name = "test_outer" - dtype = "" + name = "test_outer" + + def get_dtype(self, is_external: bool) -> str: + return "" - def encode(self, value, *, key=None): + def encode(self, value, *, key=None, store_name=None): return value def decode(self, stored, *, key=None): return stored final_dtype, chain, store = resolve_dtype("") - assert final_dtype == "longblob" + assert final_dtype == "bytes" assert len(chain) == 2 assert chain[0].type_name == "test_outer" assert chain[1].type_name == "test_inner" @@ -308,29 +323,31 @@ def decode(self, stored, *, key=None): def test_circular_reference_detection(self): """Test that circular type references are detected.""" - @register_type class TypeA(AttributeType): - type_name = "test_circular_a" - dtype = "" + name = "test_circular_a" - def encode(self, value, *, key=None): + def get_dtype(self, is_external: bool) -> str: + return "" + + def encode(self, value, *, key=None, store_name=None): return value def decode(self, stored, *, key=None): return stored - @register_type class TypeB(AttributeType): - type_name = "test_circular_b" - dtype = "" + name = "test_circular_b" + + def get_dtype(self, is_external: bool) -> str: + return "" - def encode(self, value, *, key=None): + def encode(self, value, *, key=None, store_name=None): return value def decode(self, stored, *, key=None): return stored - with pytest.raises(DataJointError, match="Circular type reference"): + with pytest.raises(DataJointError, match="Circular codec reference"): resolve_dtype("") @@ -344,24 +361,25 @@ def test_exports_from_datajoint(self): assert hasattr(dj, "list_types") -class TestDJBlobType: - """Tests for the built-in DJBlobType.""" +class TestBlobCodec: + """Tests for the built-in BlobCodec.""" - def test_djblob_is_registered(self): - """Test that djblob is automatically registered.""" - assert is_type_registered("djblob") + def test_blob_is_registered(self): + """Test that blob is automatically registered.""" + assert is_type_registered("blob") - def test_djblob_properties(self): - """Test DJBlobType properties.""" - blob_type = get_type("djblob") - assert blob_type.type_name == "djblob" - assert blob_type.dtype == "longblob" + def test_blob_properties(self): + """Test BlobCodec properties.""" + blob_type = get_type("blob") + assert blob_type.type_name == "blob" + assert blob_type.get_dtype(is_external=False) == "bytes" + assert blob_type.get_dtype(is_external=True) == "" - def test_djblob_encode_decode_roundtrip(self): + def test_blob_encode_decode_roundtrip(self): """Test that encode/decode is a proper roundtrip.""" import numpy as np - blob_type = get_type("djblob") + blob_type = get_type("blob") # Test with various data types test_data = [ @@ -382,34 +400,34 @@ def test_djblob_encode_decode_roundtrip(self): else: assert decoded == original - def test_djblob_encode_produces_valid_blob_format(self): + def test_blob_encode_produces_valid_blob_format(self): """Test that encoded data has valid blob protocol header.""" - blob_type = get_type("djblob") + blob_type = get_type("blob") encoded = blob_type.encode({"test": "data"}) # Should start with compression prefix or protocol header valid_prefixes = (b"ZL123\0", b"mYm\0", b"dj0\0") assert any(encoded.startswith(p) for p in valid_prefixes) - def test_djblob_in_list_types(self): - """Test that djblob appears in list_types.""" + def test_blob_in_list_types(self): + """Test that blob appears in list_types.""" types = list_types() - assert "djblob" in types 
+ assert "blob" in types - def test_djblob_handles_serialization(self): - """Test that DJBlobType handles serialization internally. + def test_blob_handles_serialization(self): + """Test that BlobCodec handles serialization internally. With the new design: - - Plain longblob columns store/return raw bytes (no serialization) - - handles pack/unpack in encode/decode + - Plain bytes columns store/return raw bytes (no serialization) + - handles pack/unpack in encode/decode """ - blob_type = get_type("djblob") + blob_type = get_type("blob") - # DJBlobType.encode() should produce packed bytes + # BlobCodec.encode() should produce packed bytes data = {"key": "value"} encoded = blob_type.encode(data) assert isinstance(encoded, bytes) - # DJBlobType.decode() should unpack back to original + # BlobCodec.decode() should unpack back to original decoded = blob_type.decode(encoded) assert decoded == data From 898d0edc0e0ae62eb4051b02be5d9ef8dbe5df72 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Thu, 1 Jan 2026 16:57:50 -0600 Subject: [PATCH 23/32] Rename AttributeType/adapter terminology to Codec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename attribute_type.py → codecs.py - Rename builtin_types.py → builtin_codecs.py - Rename test_attribute_type.py → test_codecs.py - Rename get_adapter() → lookup_codec() - Rename attr.adapter → attr.codec in Attribute namedtuple - Update all imports and references throughout codebase - Update comments and docstrings to use codec terminology All 471 tests pass. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/datajoint/__init__.py | 2 +- .../{builtin_types.py => builtin_codecs.py} | 2 +- .../{attribute_type.py => codecs.py} | 18 +++--- src/datajoint/declare.py | 10 +-- src/datajoint/fetch.py | 16 ++--- src/datajoint/gc.py | 8 +-- src/datajoint/heading.py | 64 +++++++++---------- src/datajoint/migrate.py | 8 +-- src/datajoint/preview.py | 4 +- src/datajoint/staged_insert.py | 2 +- src/datajoint/table.py | 8 +-- tests/integration/test_adapted_attributes.py | 2 +- tests/integration/test_gc.py | 32 +++++----- tests/integration/test_type_composition.py | 2 +- ...{test_attribute_type.py => test_codecs.py} | 4 +- 15 files changed, 89 insertions(+), 93 deletions(-) rename src/datajoint/{builtin_types.py => builtin_codecs.py} (99%) rename src/datajoint/{attribute_type.py => codecs.py} (96%) rename tests/unit/{test_attribute_type.py => test_codecs.py} (99%) diff --git a/src/datajoint/__init__.py b/src/datajoint/__init__.py index f3744f7d6..c245c049f 100644 --- a/src/datajoint/__init__.py +++ b/src/datajoint/__init__.py @@ -66,7 +66,7 @@ from . import errors from . 
import migrate from .admin import kill -from .attribute_type import ( +from .codecs import ( AttributeType, Codec, get_codec, diff --git a/src/datajoint/builtin_types.py b/src/datajoint/builtin_codecs.py similarity index 99% rename from src/datajoint/builtin_types.py rename to src/datajoint/builtin_codecs.py index d734e8e0b..56aef6779 100644 --- a/src/datajoint/builtin_types.py +++ b/src/datajoint/builtin_codecs.py @@ -58,7 +58,7 @@ class Networks(dj.Manual): from typing import Any -from .attribute_type import Codec +from .codecs import Codec from .errors import DataJointError diff --git a/src/datajoint/attribute_type.py b/src/datajoint/codecs.py similarity index 96% rename from src/datajoint/attribute_type.py rename to src/datajoint/codecs.py index 1a40abfa6..6ed3e4f05 100644 --- a/src/datajoint/attribute_type.py +++ b/src/datajoint/codecs.py @@ -503,16 +503,16 @@ def resolve_dtype( return dtype, chain, store_name -def get_adapter(context: dict | None, adapter_name: str) -> tuple[Codec, str | None]: +def lookup_codec(codec_spec: str) -> tuple[Codec, str | None]: """ - Get a codec by name. + Look up a codec from a type specification string. - This is a compatibility function used by heading and declare modules. + Parses a codec specification (e.g., "") and returns + the codec instance along with any store name. Args: - context: Ignored (legacy parameter, kept for API compatibility). - adapter_name: The codec name, with or without angle brackets. - May include store parameter (e.g., ""). + codec_spec: The codec specification, with or without angle brackets. + May include store parameter (e.g., ""). Returns: Tuple of (Codec instance, store_name or None). @@ -520,7 +520,7 @@ def get_adapter(context: dict | None, adapter_name: str) -> tuple[Codec, str | N Raises: DataJointError: If the codec is not found. """ - type_name, store_name = parse_type_spec(adapter_name) + type_name, store_name = parse_type_spec(codec_spec) if is_codec_registered(type_name): return get_codec(type_name), store_name @@ -532,6 +532,6 @@ def get_adapter(context: dict | None, adapter_name: str) -> tuple[Codec, str | N # Auto-register built-in codecs # ============================================================================= -# Import builtin_types module to register built-in codecs +# Import builtin_codecs module to register built-in codecs # This import has a side effect: it registers the codecs via __init_subclass__ -from . import builtin_types as _builtin_types # noqa: F401, E402 +from . 
import builtin_codecs as _builtin_codecs # noqa: F401, E402 diff --git a/src/datajoint/declare.py b/src/datajoint/declare.py index eb23debeb..777136f0b 100644 --- a/src/datajoint/declare.py +++ b/src/datajoint/declare.py @@ -9,7 +9,7 @@ import pyparsing as pp -from .attribute_type import get_adapter +from .codecs import lookup_codec from .condition import translate_attribute from .errors import DataJointError from .settings import config @@ -464,16 +464,16 @@ def substitute_special_type(match, category, foreign_key_sql, context): :param match: dict containing with keys "type" and "comment" -- will be modified in place :param category: attribute type category from TYPE_PATTERN :param foreign_key_sql: list of foreign key declarations to add to - :param context: context for looking up user-defined attribute_type adapters + :param context: context for looking up user-defined codecs (unused, kept for compatibility) """ if category == "ADAPTED": - # AttributeType - resolve to underlying dtype - attr_type, store_name = get_adapter(context, match["type"]) + # Codec - resolve to underlying dtype + codec, store_name = lookup_codec(match["type"]) if store_name is not None: match["store"] = store_name # Determine if external storage is used (store_name is present, even if empty string for default) is_external = store_name is not None - inner_dtype = attr_type.get_dtype(is_external=is_external) + inner_dtype = codec.get_dtype(is_external=is_external) # If inner dtype is a codec without store, propagate the store from outer type # e.g., returns , we need to resolve as diff --git a/src/datajoint/fetch.py b/src/datajoint/fetch.py index fc1ec435a..44551c4be 100644 --- a/src/datajoint/fetch.py +++ b/src/datajoint/fetch.py @@ -39,8 +39,8 @@ def _get(connection, attr, data, squeeze, download_path): - Native types pass through unchanged - JSON types are parsed - UUID types are converted from bytes - - Blob types return raw bytes (unless an adapter handles them) - - Adapters (AttributeTypes) handle all custom encoding/decoding via type chains + - Blob types return raw bytes (unless a codec handles them) + - Codecs handle all custom encoding/decoding via type chains For composed types (e.g., using ), decoders are applied in reverse order: innermost first, then outermost. 
@@ -57,16 +57,16 @@ def _get(connection, attr, data, squeeze, download_path): if data is None: return None - # Get the final storage type and type chain if adapter present - if attr.adapter: - from .attribute_type import resolve_dtype + # Get the final storage type and type chain if codec present + if attr.codec: + from .codecs import resolve_dtype # Include store if present to get correct chain for external storage store = getattr(attr, "store", None) if store is not None: - dtype_spec = f"<{attr.adapter.type_name}@{store}>" + dtype_spec = f"<{attr.codec.type_name}@{store}>" else: - dtype_spec = f"<{attr.adapter.type_name}>" + dtype_spec = f"<{attr.codec.type_name}>" final_dtype, type_chain, _ = resolve_dtype(dtype_spec) # First, process the final dtype (what's stored in the database) @@ -93,7 +93,7 @@ def _get(connection, attr, data, squeeze, download_path): return data - # No adapter - handle native types + # No codec - handle native types if attr.json: return json.loads(data) diff --git a/src/datajoint/gc.py b/src/datajoint/gc.py index 73b56abd5..cff0296ee 100644 --- a/src/datajoint/gc.py +++ b/src/datajoint/gc.py @@ -52,11 +52,11 @@ def _uses_content_storage(attr) -> bool: Returns: True if the attribute stores content hashes """ - if not attr.adapter: + if not attr.codec: return False # Check if this type uses content storage - type_name = getattr(attr.adapter, "type_name", "") + type_name = getattr(attr.codec, "type_name", "") store = getattr(attr, "store", None) # always uses content storage (external only) @@ -80,10 +80,10 @@ def _uses_object_storage(attr) -> bool: Returns: True if the attribute stores object paths """ - if not attr.adapter: + if not attr.codec: return False - type_name = getattr(attr.adapter, "type_name", "") + type_name = getattr(attr.codec, "type_name", "") return type_name == "object" diff --git a/src/datajoint/heading.py b/src/datajoint/heading.py index c2904c85a..cc11eb760 100644 --- a/src/datajoint/heading.py +++ b/src/datajoint/heading.py @@ -5,8 +5,8 @@ import numpy as np -from .attribute_type import get_adapter -from .attribute_type import AttributeType +from .codecs import lookup_codec +from .codecs import Codec from .declare import ( CORE_TYPE_NAMES, SPECIAL_TYPES, @@ -15,33 +15,31 @@ from .errors import DataJointError -class _MissingType(AttributeType): - """Placeholder for missing/unregistered attribute types. Raises error on use.""" +class _MissingType(Codec, register=False): + """Placeholder for missing/unregistered codecs. Raises error on use.""" - def __init__(self, name: str): - self._name = name + name = None # Don't auto-register + + def __init__(self, codec_name: str): + self._codec_name = codec_name @property def type_name(self) -> str: - return self._name + return self._codec_name - @property - def dtype(self) -> str: + def get_dtype(self, is_external: bool) -> str: raise DataJointError( - f"Attribute type <{self._name}> is not registered. " - "Register it with @dj.register_type or include it in the schema context." + f"Codec <{self._codec_name}> is not registered. " f"Define a Codec subclass with name='{self._codec_name}'." ) - def encode(self, value, *, key=None): + def encode(self, value, *, key=None, store_name=None): raise DataJointError( - f"Attribute type <{self._name}> is not registered. " - "Register it with @dj.register_type or include it in the schema context." + f"Codec <{self._codec_name}> is not registered. " f"Define a Codec subclass with name='{self._codec_name}'." 
) def decode(self, stored, *, key=None): raise DataJointError( - f"Attribute type <{self._name}> is not registered. " - "Register it with @dj.register_type or include it in the schema context." + f"Codec <{self._codec_name}> is not registered. " f"Define a Codec subclass with name='{self._codec_name}'." ) @@ -62,7 +60,7 @@ def decode(self, stored, *, key=None): json=None, is_blob=False, is_hidden=False, - adapter=None, + codec=None, store=None, unsupported=False, attribute_expression=None, @@ -286,7 +284,7 @@ def _init_from_database(self): is_blob=any(TYPE_PATTERN[t].match(attr["type"]) for t in ("BYTES", "NATIVE_BLOB")), uuid=False, json=bool(TYPE_PATTERN["JSON"].match(attr["type"])), - adapter=None, + codec=None, store=None, attribute_expression=None, is_hidden=attr["name"].startswith("_"), @@ -311,26 +309,24 @@ def _init_from_database(self): # Store the original type name for display but keep db_type for SQL attr["original_type"] = special["type"] - # process AttributeTypes (adapted types in angle brackets) + # process Codecs (adapted types in angle brackets) if special and TYPE_PATTERN["ADAPTED"].match(attr["type"]): # Context can be None for built-in types that are globally registered - adapter_name = special["type"] + codec_spec = special["type"] try: - adapter_result = get_adapter(context, adapter_name) - # get_adapter returns (adapter, store_name) tuple - if isinstance(adapter_result, tuple): - attr["adapter"], attr["store"] = adapter_result - else: - attr["adapter"] = adapter_result + codec_instance, codec_store = lookup_codec(codec_spec) + attr["codec"] = codec_instance + if codec_store is not None: + attr["store"] = codec_store except DataJointError: - # if no adapter, then delay the error until the first invocation - attr["adapter"] = _MissingType(adapter_name) + # if no codec, then delay the error until the first invocation + attr["codec"] = _MissingType(codec_spec) else: # Determine if external storage based on store presence is_external = attr.get("store") is not None - attr["type"] = attr["adapter"].get_dtype(is_external=is_external) + attr["type"] = attr["codec"].get_dtype(is_external=is_external) if not any(r.match(attr["type"]) for r in TYPE_PATTERN.values()): - raise DataJointError(f"Invalid dtype '{attr['type']}' in attribute type <{adapter_name}>.") + raise DataJointError(f"Invalid dtype '{attr['type']}' in codec <{codec_spec}>.") # Update is_blob based on resolved dtype (check both BYTES and NATIVE_BLOB patterns) attr["is_blob"] = any(TYPE_PATTERN[t].match(attr["type"]) for t in ("BYTES", "NATIVE_BLOB")) @@ -367,7 +363,7 @@ def _init_from_database(self): # fill out dtype. 
All floats and non-nullable integers are turned into specific dtypes attr["dtype"] = object - if attr["numeric"] and not attr["adapter"]: + if attr["numeric"] and not attr["codec"]: is_integer = TYPE_PATTERN["INTEGER"].match(attr["type"]) is_float = TYPE_PATTERN["FLOAT"].match(attr["type"]) if is_integer and not attr["nullable"] or is_float: @@ -377,9 +373,9 @@ def _init_from_database(self): assert (t, is_unsigned) in numeric_types, "dtype not found for type %s" % t attr["dtype"] = numeric_types[(t, is_unsigned)] - if attr["adapter"]: - # restore adapted type name for display - attr["type"] = adapter_name + if attr["codec"]: + # restore codec type name for display + attr["type"] = codec_spec self._attributes = dict(((q["name"], Attribute(**q)) for q in attributes)) diff --git a/src/datajoint/migrate.py b/src/datajoint/migrate.py index b8937e9ca..0bfc355db 100644 --- a/src/datajoint/migrate.py +++ b/src/datajoint/migrate.py @@ -29,7 +29,7 @@ def analyze_blob_columns(schema: Schema) -> list[dict]: This function identifies blob columns that: 1. Have a MySQL blob type (tinyblob, blob, mediumblob, longblob) - 2. Do NOT already have an adapter/type specified in their comment + 2. Do NOT already have a codec/type specified in their comment All blob size variants are included in the analysis. @@ -80,8 +80,8 @@ def analyze_blob_columns(schema: Schema) -> list[dict]: columns = connection.query(columns_query, args=(schema.database, table_name)).fetchall() for column_name, column_type, comment in columns: - # Check if comment already has an adapter type (starts with :type:) - has_adapter = comment and comment.startswith(":") + # Check if comment already has a codec type (starts with :type:) + has_codec = comment and comment.startswith(":") results.append( { @@ -89,7 +89,7 @@ def analyze_blob_columns(schema: Schema) -> list[dict]: "column_name": column_name, "column_type": column_type, "current_comment": comment or "", - "needs_migration": not has_adapter, + "needs_migration": not has_codec, } ) diff --git a/src/datajoint/preview.py b/src/datajoint/preview.py index 7572125e9..0ef096d2c 100644 --- a/src/datajoint/preview.py +++ b/src/datajoint/preview.py @@ -27,7 +27,7 @@ def _format_object_display(json_data): def preview(query_expression, limit, width): heading = query_expression.heading rel = query_expression.proj(*heading.non_blobs) - # Object fields are AttributeTypes with adapters - not specially handled in simplified model + # Object fields use codecs - not specially handled in simplified model object_fields = [] if limit is None: limit = config["display.limit"] @@ -88,7 +88,7 @@ def get_display_value(tup, f, idx): def repr_html(query_expression): heading = query_expression.heading rel = query_expression.proj(*heading.non_blobs) - # Object fields are AttributeTypes with adapters - not specially handled in simplified model + # Object fields use codecs - not specially handled in simplified model object_fields = [] info = heading.table_status tuples = rel.fetch(limit=config["display.limit"] + 1, format="array") diff --git a/src/datajoint/staged_insert.py b/src/datajoint/staged_insert.py index dbf51c6bc..aa5635968 100644 --- a/src/datajoint/staged_insert.py +++ b/src/datajoint/staged_insert.py @@ -99,7 +99,7 @@ def _get_storage_path(self, field: str, ext: str = "") -> str: attr = self._table.heading[field] # Check if this is an object AttributeType (has adapter with "object" in type_name) - if not (attr.adapter and hasattr(attr.adapter, "type_name") and "object" in attr.adapter.type_name): + if not 
(attr.codec and hasattr(attr.codec, "type_name") and "object" in attr.codec.type_name): raise DataJointError(f"Attribute '{field}' is not an type") # Extract primary key from rec diff --git a/src/datajoint/table.py b/src/datajoint/table.py index 0c35f2b13..cb8dff8a0 100644 --- a/src/datajoint/table.py +++ b/src/datajoint/table.py @@ -748,17 +748,17 @@ def __make_placeholder(self, name, value, ignore_extra_fields=False, row=None): attr = self.heading[name] # Apply adapter encoding with type chain support - if attr.adapter: - from .attribute_type import resolve_dtype + if attr.codec: + from .codecs import resolve_dtype # Skip validation and encoding for None values (nullable columns) if value is None: return name, "DEFAULT", None - attr.adapter.validate(value) + attr.codec.validate(value) # Resolve full type chain - _, type_chain, resolved_store = resolve_dtype(f"<{attr.adapter.type_name}>", store_name=attr.store) + _, type_chain, resolved_store = resolve_dtype(f"<{attr.codec.type_name}>", store_name=attr.store) # Apply encoders from outermost to innermost for attr_type in type_chain: diff --git a/tests/integration/test_adapted_attributes.py b/tests/integration/test_adapted_attributes.py index ee88c6fcd..58a6e4cef 100644 --- a/tests/integration/test_adapted_attributes.py +++ b/tests/integration/test_adapted_attributes.py @@ -50,7 +50,7 @@ def local_schema(schema_ad, schema_name): @pytest.fixture def schema_virtual_module(schema_ad, schema_name): """Fixture for testing virtual modules""" - # Types are registered globally, no need to add_objects for adapters + # Types are registered globally, no need to add_objects for codecs schema_virtual_module = dj.VirtualModule("virtual_module", schema_name, connection=schema_ad.connection) return schema_virtual_module diff --git a/tests/integration/test_gc.py b/tests/integration/test_gc.py index 8eab3584f..3b4f4a6b1 100644 --- a/tests/integration/test_gc.py +++ b/tests/integration/test_gc.py @@ -14,17 +14,17 @@ class TestUsesContentStorage: """Tests for _uses_content_storage helper function.""" def test_returns_false_for_no_adapter(self): - """Test that False is returned when attribute has no adapter.""" + """Test that False is returned when attribute has no codec.""" attr = MagicMock() - attr.adapter = None + attr.codec = None assert gc._uses_content_storage(attr) is False def test_returns_true_for_hash_type(self): """Test that True is returned for type.""" attr = MagicMock() - attr.adapter = MagicMock() - attr.adapter.type_name = "hash" + attr.codec = MagicMock() + attr.codec.type_name = "hash" attr.store = "mystore" assert gc._uses_content_storage(attr) is True @@ -32,8 +32,8 @@ def test_returns_true_for_hash_type(self): def test_returns_true_for_blob_external(self): """Test that True is returned for type (external).""" attr = MagicMock() - attr.adapter = MagicMock() - attr.adapter.type_name = "blob" + attr.codec = MagicMock() + attr.codec.type_name = "blob" attr.store = "mystore" assert gc._uses_content_storage(attr) is True @@ -41,8 +41,8 @@ def test_returns_true_for_blob_external(self): def test_returns_true_for_attach_external(self): """Test that True is returned for type (external).""" attr = MagicMock() - attr.adapter = MagicMock() - attr.adapter.type_name = "attach" + attr.codec = MagicMock() + attr.codec.type_name = "attach" attr.store = "mystore" assert gc._uses_content_storage(attr) is True @@ -50,8 +50,8 @@ def test_returns_true_for_attach_external(self): def test_returns_false_for_blob_internal(self): """Test that False is returned for 
internal storage.""" attr = MagicMock() - attr.adapter = MagicMock() - attr.adapter.type_name = "blob" + attr.codec = MagicMock() + attr.codec.type_name = "blob" attr.store = None assert gc._uses_content_storage(attr) is False @@ -93,25 +93,25 @@ class TestUsesObjectStorage: """Tests for _uses_object_storage helper function.""" def test_returns_false_for_no_adapter(self): - """Test that False is returned when attribute has no adapter.""" + """Test that False is returned when attribute has no codec.""" attr = MagicMock() - attr.adapter = None + attr.codec = None assert gc._uses_object_storage(attr) is False def test_returns_true_for_object_type(self): """Test that True is returned for type.""" attr = MagicMock() - attr.adapter = MagicMock() - attr.adapter.type_name = "object" + attr.codec = MagicMock() + attr.codec.type_name = "object" assert gc._uses_object_storage(attr) is True def test_returns_false_for_other_types(self): """Test that False is returned for non-object types.""" attr = MagicMock() - attr.adapter = MagicMock() - attr.adapter.type_name = "blob" + attr.codec = MagicMock() + attr.codec.type_name = "blob" assert gc._uses_object_storage(attr) is False diff --git a/tests/integration/test_type_composition.py b/tests/integration/test_type_composition.py index 0e24da54d..00f81a6c0 100644 --- a/tests/integration/test_type_composition.py +++ b/tests/integration/test_type_composition.py @@ -5,7 +5,7 @@ and similar type chains. """ -from datajoint.attribute_type import ( +from datajoint.codecs import ( AttributeType, _codec_registry, resolve_dtype, diff --git a/tests/unit/test_attribute_type.py b/tests/unit/test_codecs.py similarity index 99% rename from tests/unit/test_attribute_type.py rename to tests/unit/test_codecs.py index 0c9b6811e..288b96aa4 100644 --- a/tests/unit/test_attribute_type.py +++ b/tests/unit/test_codecs.py @@ -1,11 +1,11 @@ """ -Tests for the new AttributeType system. +Tests for the Codec system. """ import pytest import datajoint as dj -from datajoint.attribute_type import ( +from datajoint.codecs import ( AttributeType, _codec_registry, get_type, From f3e4489bd9e539f60fadb233dd396e2dc9c809a8 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Thu, 1 Jan 2026 17:23:31 -0600 Subject: [PATCH 24/32] Remove backward compatibility aliases from Codec API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove AttributeType alias (use Codec directly) - Remove register_type function (codecs auto-register) - Remove deprecated type_name property (use name) - Remove list_types, get_type, is_type_registered, unregister_type aliases - Update all internal usages from type_name to name - Update tests to use new API The previous implementation was experimental; no backward compatibility is needed for the v2.0 release. 
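A minimal usage sketch of the resulting API (illustrative only; `LabelCodec` and its
definition are hypothetical and not part of this diff):

```python
import datajoint as dj

class LabelCodec(dj.Codec):                   # was: dj.AttributeType + @dj.register_type
    name = "label"                            # auto-registers at class-definition time

    def get_dtype(self, is_external: bool) -> str:
        return "varchar(64)"

    def encode(self, value, *, key=None, store_name=None):
        return str(value)

    def decode(self, stored, *, key=None):
        return stored

assert "label" in dj.list_codecs()            # was: dj.list_types()
assert dj.get_codec("label").name == "label"  # was: get_type("label").type_name
```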
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/datajoint/__init__.py | 9 +- src/datajoint/codecs.py | 87 --------- src/datajoint/fetch.py | 4 +- src/datajoint/gc.py | 10 +- src/datajoint/heading.py | 4 +- src/datajoint/staged_insert.py | 4 +- src/datajoint/table.py | 2 +- tests/integration/test_adapted_attributes.py | 4 +- tests/integration/test_gc.py | 12 +- tests/integration/test_type_composition.py | 56 +++--- tests/unit/test_codecs.py | 182 +++++++++---------- 11 files changed, 137 insertions(+), 237 deletions(-) diff --git a/src/datajoint/__init__.py b/src/datajoint/__init__.py index c245c049f..684ffd083 100644 --- a/src/datajoint/__init__.py +++ b/src/datajoint/__init__.py @@ -45,14 +45,10 @@ "kill", "MatCell", "MatStruct", - # New codec API + # Codec API "Codec", "list_codecs", "get_codec", - # Backward compatibility aliases - "AttributeType", - "register_type", - "list_types", "errors", "migrate", "DataJointError", @@ -67,12 +63,9 @@ from . import migrate from .admin import kill from .codecs import ( - AttributeType, Codec, get_codec, list_codecs, - list_types, - register_type, ) from .blob import MatCell, MatStruct from .cli import cli diff --git a/src/datajoint/codecs.py b/src/datajoint/codecs.py index 6ed3e4f05..cc592badd 100644 --- a/src/datajoint/codecs.py +++ b/src/datajoint/codecs.py @@ -36,7 +36,6 @@ class MyTable(dj.Manual): from __future__ import annotations import logging -import warnings from abc import ABC, abstractmethod from typing import Any @@ -179,80 +178,10 @@ def validate(self, value: Any) -> None: """ pass - # ========================================================================= - # Backward compatibility properties - # ========================================================================= - - @property - def type_name(self) -> str | None: - """Backward compatibility alias for `name`.""" - return self.name - - @property - def dtype(self) -> str: - """ - Backward compatibility property. - - Deprecated: Use get_dtype(is_external) instead. - """ - warnings.warn( - "Codec.dtype property is deprecated. Use get_dtype(is_external) instead.", - DeprecationWarning, - stacklevel=2, - ) - return self.get_dtype(is_external=False) - def __repr__(self) -> str: return f"<{self.__class__.__name__}(name={self.name!r})>" -# Backward compatibility alias -AttributeType = Codec - - -def register_type(cls: type[Codec]) -> type[Codec]: - """ - Register a codec with DataJoint. - - Deprecated: Codecs now auto-register when subclassed. This function - is kept for backward compatibility but is no longer needed. - - Args: - cls: A Codec subclass to register. - - Returns: - The same class, unmodified. - """ - warnings.warn( - "@dj.register_type is deprecated. Codecs auto-register when subclassed. 
" - "Just inherit from dj.Codec and set the 'name' class attribute.", - DeprecationWarning, - stacklevel=2, - ) - - if not isinstance(cls, type) or not issubclass(cls, Codec): - raise TypeError(f"register_type requires a Codec subclass, got {cls!r}") - - # Check if already registered - if cls.name and cls.name in _codec_registry: - existing = _codec_registry[cls.name] - if type(existing) is not cls: - raise DataJointError( - f"Codec <{cls.name}> already registered by " f"{type(existing).__module__}.{type(existing).__name__}" - ) - return cls # Same class, idempotent - - # Manual registration for classes that didn't auto-register - if cls.name: - _codec_registry[cls.name] = cls() - - return cls - - -# Backward compatibility alias -codec = register_type - - def parse_type_spec(spec: str) -> tuple[str, str | None]: """ Parse a type specification into type name and optional store parameter. @@ -299,10 +228,6 @@ def unregister_codec(name: str) -> None: del _codec_registry[name] -# Backward compatibility alias -unregister_type = unregister_codec - - def get_codec(name: str) -> Codec: """ Retrieve a registered codec by name. @@ -338,10 +263,6 @@ def get_codec(name: str) -> Codec: ) -# Backward compatibility alias -get_type = get_codec - - def list_codecs() -> list[str]: """ List all registered codec names. @@ -353,10 +274,6 @@ def list_codecs() -> list[str]: return sorted(_codec_registry.keys()) -# Backward compatibility alias -list_types = list_codecs - - def is_codec_registered(name: str) -> bool: """ Check if a codec name is registered. @@ -374,10 +291,6 @@ def is_codec_registered(name: str) -> bool: return type_name in _codec_registry -# Backward compatibility alias -is_type_registered = is_codec_registered - - def _load_entry_points() -> None: """ Load codecs from installed packages via entry points. 
diff --git a/src/datajoint/fetch.py b/src/datajoint/fetch.py index 44551c4be..575f3cbfe 100644 --- a/src/datajoint/fetch.py +++ b/src/datajoint/fetch.py @@ -64,9 +64,9 @@ def _get(connection, attr, data, squeeze, download_path): # Include store if present to get correct chain for external storage store = getattr(attr, "store", None) if store is not None: - dtype_spec = f"<{attr.codec.type_name}@{store}>" + dtype_spec = f"<{attr.codec.name}@{store}>" else: - dtype_spec = f"<{attr.codec.type_name}>" + dtype_spec = f"<{attr.codec.name}>" final_dtype, type_chain, _ = resolve_dtype(dtype_spec) # First, process the final dtype (what's stored in the database) diff --git a/src/datajoint/gc.py b/src/datajoint/gc.py index cff0296ee..db327f37e 100644 --- a/src/datajoint/gc.py +++ b/src/datajoint/gc.py @@ -56,15 +56,15 @@ def _uses_content_storage(attr) -> bool: return False # Check if this type uses content storage - type_name = getattr(attr.codec, "type_name", "") + codec_name = getattr(attr.codec, "name", "") store = getattr(attr, "store", None) # always uses content storage (external only) - if type_name == "hash": + if codec_name == "hash": return True # and use content storage when external (has store) - if type_name in ("blob", "attach") and store is not None: + if codec_name in ("blob", "attach") and store is not None: return True return False @@ -83,8 +83,8 @@ def _uses_object_storage(attr) -> bool: if not attr.codec: return False - type_name = getattr(attr.codec, "type_name", "") - return type_name == "object" + codec_name = getattr(attr.codec, "name", "") + return codec_name == "object" def _extract_content_refs(value: Any) -> list[tuple[str, str | None]]: diff --git a/src/datajoint/heading.py b/src/datajoint/heading.py index cc11eb760..c451089c0 100644 --- a/src/datajoint/heading.py +++ b/src/datajoint/heading.py @@ -18,13 +18,11 @@ class _MissingType(Codec, register=False): """Placeholder for missing/unregistered codecs. 
Raises error on use.""" - name = None # Don't auto-register - def __init__(self, codec_name: str): self._codec_name = codec_name @property - def type_name(self) -> str: + def name(self) -> str: return self._codec_name def get_dtype(self, is_external: bool) -> str: diff --git a/src/datajoint/staged_insert.py b/src/datajoint/staged_insert.py index aa5635968..8f9c94d2c 100644 --- a/src/datajoint/staged_insert.py +++ b/src/datajoint/staged_insert.py @@ -98,8 +98,8 @@ def _get_storage_path(self, field: str, ext: str = "") -> str: raise DataJointError(f"Attribute '{field}' not found in table heading") attr = self._table.heading[field] - # Check if this is an object AttributeType (has adapter with "object" in type_name) - if not (attr.codec and hasattr(attr.codec, "type_name") and "object" in attr.codec.type_name): + # Check if this is an object Codec (has codec with "object" as name) + if not (attr.codec and attr.codec.name == "object"): raise DataJointError(f"Attribute '{field}' is not an type") # Extract primary key from rec diff --git a/src/datajoint/table.py b/src/datajoint/table.py index cb8dff8a0..9a3328a48 100644 --- a/src/datajoint/table.py +++ b/src/datajoint/table.py @@ -758,7 +758,7 @@ def __make_placeholder(self, name, value, ignore_extra_fields=False, row=None): attr.codec.validate(value) # Resolve full type chain - _, type_chain, resolved_store = resolve_dtype(f"<{attr.codec.type_name}>", store_name=attr.store) + _, type_chain, resolved_store = resolve_dtype(f"<{attr.codec.name}>", store_name=attr.store) # Apply encoders from outermost to innermost for attr_type in type_chain: diff --git a/tests/integration/test_adapted_attributes.py b/tests/integration/test_adapted_attributes.py index 58a6e4cef..3fe67a96a 100644 --- a/tests/integration/test_adapted_attributes.py +++ b/tests/integration/test_adapted_attributes.py @@ -1,7 +1,7 @@ """ Tests for adapted/custom attribute types. -These tests verify the AttributeType system for custom data types. +These tests verify the Codec system for custom data types. 
""" from itertools import zip_longest @@ -29,7 +29,7 @@ def schema_ad( schema_name, ): dj.config["stores"] = {"repo-s3": dict(s3_creds, protocol="s3", location="adapted/repo", stage=str(tmpdir))} - # Types are registered globally via @dj.register_type decorator in schema_adapted + # Codecs are auto-registered via __init_subclass__ in schema_adapted context = {**schema_adapted.LOCALS_ADAPTED} schema = dj.schema(schema_name, context=context, connection=connection_test) schema(schema_adapted.Connectivity) diff --git a/tests/integration/test_gc.py b/tests/integration/test_gc.py index 3b4f4a6b1..e0c5fafca 100644 --- a/tests/integration/test_gc.py +++ b/tests/integration/test_gc.py @@ -24,7 +24,7 @@ def test_returns_true_for_hash_type(self): """Test that True is returned for type.""" attr = MagicMock() attr.codec = MagicMock() - attr.codec.type_name = "hash" + attr.codec.name = "hash" attr.store = "mystore" assert gc._uses_content_storage(attr) is True @@ -33,7 +33,7 @@ def test_returns_true_for_blob_external(self): """Test that True is returned for type (external).""" attr = MagicMock() attr.codec = MagicMock() - attr.codec.type_name = "blob" + attr.codec.name = "blob" attr.store = "mystore" assert gc._uses_content_storage(attr) is True @@ -42,7 +42,7 @@ def test_returns_true_for_attach_external(self): """Test that True is returned for type (external).""" attr = MagicMock() attr.codec = MagicMock() - attr.codec.type_name = "attach" + attr.codec.name = "attach" attr.store = "mystore" assert gc._uses_content_storage(attr) is True @@ -51,7 +51,7 @@ def test_returns_false_for_blob_internal(self): """Test that False is returned for internal storage.""" attr = MagicMock() attr.codec = MagicMock() - attr.codec.type_name = "blob" + attr.codec.name = "blob" attr.store = None assert gc._uses_content_storage(attr) is False @@ -103,7 +103,7 @@ def test_returns_true_for_object_type(self): """Test that True is returned for type.""" attr = MagicMock() attr.codec = MagicMock() - attr.codec.type_name = "object" + attr.codec.name = "object" assert gc._uses_object_storage(attr) is True @@ -111,7 +111,7 @@ def test_returns_false_for_other_types(self): """Test that False is returned for non-object types.""" attr = MagicMock() attr.codec = MagicMock() - attr.codec.type_name = "blob" + attr.codec.name = "blob" assert gc._uses_object_storage(attr) is False diff --git a/tests/integration/test_type_composition.py b/tests/integration/test_type_composition.py index 00f81a6c0..23ca927b0 100644 --- a/tests/integration/test_type_composition.py +++ b/tests/integration/test_type_composition.py @@ -6,7 +6,7 @@ """ from datajoint.codecs import ( - AttributeType, + Codec, _codec_registry, resolve_dtype, ) @@ -30,7 +30,7 @@ def teardown_method(self): def test_single_type_chain(self): """Test resolving a single-type chain.""" - class TestSingle(AttributeType): + class TestSingle(Codec): name = "test_single" def get_dtype(self, is_external: bool) -> str: @@ -46,13 +46,13 @@ def decode(self, stored, *, key=None): assert final_dtype == "varchar(100)" assert len(chain) == 1 - assert chain[0].type_name == "test_single" + assert chain[0].name == "test_single" assert store is None def test_two_type_chain(self): """Test resolving a two-type chain.""" - class TestInner(AttributeType): + class TestInner(Codec): name = "test_inner" def get_dtype(self, is_external: bool) -> str: @@ -64,7 +64,7 @@ def encode(self, value, *, key=None, store_name=None): def decode(self, stored, *, key=None): return stored - class TestOuter(AttributeType): + class 
TestOuter(Codec): name = "test_outer" def get_dtype(self, is_external: bool) -> str: @@ -80,13 +80,13 @@ def decode(self, stored, *, key=None): assert final_dtype == "bytes" assert len(chain) == 2 - assert chain[0].type_name == "test_outer" - assert chain[1].type_name == "test_inner" + assert chain[0].name == "test_outer" + assert chain[1].name == "test_inner" def test_three_type_chain(self): """Test resolving a three-type chain.""" - class TestBase(AttributeType): + class TestBase(Codec): name = "test_base" def get_dtype(self, is_external: bool) -> str: @@ -98,7 +98,7 @@ def encode(self, value, *, key=None, store_name=None): def decode(self, stored, *, key=None): return stored - class TestMiddle(AttributeType): + class TestMiddle(Codec): name = "test_middle" def get_dtype(self, is_external: bool) -> str: @@ -110,7 +110,7 @@ def encode(self, value, *, key=None, store_name=None): def decode(self, stored, *, key=None): return stored - class TestTop(AttributeType): + class TestTop(Codec): name = "test_top" def get_dtype(self, is_external: bool) -> str: @@ -126,9 +126,9 @@ def decode(self, stored, *, key=None): assert final_dtype == "json" assert len(chain) == 3 - assert chain[0].type_name == "test_top" - assert chain[1].type_name == "test_middle" - assert chain[2].type_name == "test_base" + assert chain[0].name == "test_top" + assert chain[1].name == "test_middle" + assert chain[2].name == "test_base" class TestTypeChainEncodeDecode: @@ -150,7 +150,7 @@ def test_encode_order(self): """Test that encode is applied outer → inner.""" encode_order = [] - class TestInnerEnc(AttributeType): + class TestInnerEnc(Codec): name = "test_inner_enc" def get_dtype(self, is_external: bool) -> str: @@ -163,7 +163,7 @@ def encode(self, value, *, key=None, store_name=None): def decode(self, stored, *, key=None): return stored - class TestOuterEnc(AttributeType): + class TestOuterEnc(Codec): name = "test_outer_enc" def get_dtype(self, is_external: bool) -> str: @@ -190,7 +190,7 @@ def test_decode_order(self): """Test that decode is applied inner → outer (reverse of encode).""" decode_order = [] - class TestInnerDec(AttributeType): + class TestInnerDec(Codec): name = "test_inner_dec" def get_dtype(self, is_external: bool) -> str: @@ -203,7 +203,7 @@ def decode(self, stored, *, key=None): decode_order.append("inner") return stored.replace(b"_inner", b"") - class TestOuterDec(AttributeType): + class TestOuterDec(Codec): name = "test_outer_dec" def get_dtype(self, is_external: bool) -> str: @@ -229,7 +229,7 @@ def decode(self, stored, *, key=None): def test_roundtrip(self): """Test encode/decode roundtrip through a type chain.""" - class TestInnerRt(AttributeType): + class TestInnerRt(Codec): name = "test_inner_rt" def get_dtype(self, is_external: bool) -> str: @@ -243,7 +243,7 @@ def decode(self, stored, *, key=None): # Decompress return stored.replace(b"COMPRESSED:", b"") - class TestOuterRt(AttributeType): + class TestOuterRt(Codec): name = "test_outer_rt" def get_dtype(self, is_external: bool) -> str: @@ -286,7 +286,7 @@ def test_blob_internal_resolves_to_bytes(self): assert final_dtype == "bytes" assert len(chain) == 1 - assert chain[0].type_name == "blob" + assert chain[0].name == "blob" def test_blob_external_resolves_to_json(self): """Test that → json.""" @@ -294,8 +294,8 @@ def test_blob_external_resolves_to_json(self): assert final_dtype == "json" assert len(chain) == 2 - assert chain[0].type_name == "blob" - assert chain[1].type_name == "hash" + assert chain[0].name == "blob" + assert chain[1].name == 
"hash" assert store == "store" def test_attach_internal_resolves_to_bytes(self): @@ -304,7 +304,7 @@ def test_attach_internal_resolves_to_bytes(self): assert final_dtype == "bytes" assert len(chain) == 1 - assert chain[0].type_name == "attach" + assert chain[0].name == "attach" def test_attach_external_resolves_to_json(self): """Test that → json.""" @@ -312,8 +312,8 @@ def test_attach_external_resolves_to_json(self): assert final_dtype == "json" assert len(chain) == 2 - assert chain[0].type_name == "attach" - assert chain[1].type_name == "hash" + assert chain[0].name == "attach" + assert chain[1].name == "hash" assert store == "store" def test_hash_external_resolves_to_json(self): @@ -322,7 +322,7 @@ def test_hash_external_resolves_to_json(self): assert final_dtype == "json" assert len(chain) == 1 - assert chain[0].type_name == "hash" + assert chain[0].name == "hash" assert store == "store" def test_object_external_resolves_to_json(self): @@ -331,7 +331,7 @@ def test_object_external_resolves_to_json(self): assert final_dtype == "json" assert len(chain) == 1 - assert chain[0].type_name == "object" + assert chain[0].name == "object" assert store == "store" def test_filepath_external_resolves_to_json(self): @@ -340,7 +340,7 @@ def test_filepath_external_resolves_to_json(self): assert final_dtype == "json" assert len(chain) == 1 - assert chain[0].type_name == "filepath" + assert chain[0].name == "filepath" assert store == "store" diff --git a/tests/unit/test_codecs.py b/tests/unit/test_codecs.py index 288b96aa4..ada626748 100644 --- a/tests/unit/test_codecs.py +++ b/tests/unit/test_codecs.py @@ -6,37 +6,36 @@ import datajoint as dj from datajoint.codecs import ( - AttributeType, + Codec, _codec_registry, - get_type, - is_type_registered, - list_types, - register_type, + get_codec, + is_codec_registered, + list_codecs, resolve_dtype, - unregister_type, + unregister_codec, ) from datajoint.errors import DataJointError -class TestAttributeTypeRegistry: - """Tests for the type registry functionality.""" +class TestCodecRegistry: + """Tests for the codec registry functionality.""" def setup_method(self): - """Clear any test types from registry before each test.""" + """Clear any test codecs from registry before each test.""" for name in list(_codec_registry.keys()): if name.startswith("test_"): del _codec_registry[name] def teardown_method(self): - """Clean up test types after each test.""" + """Clean up test codecs after each test.""" for name in list(_codec_registry.keys()): if name.startswith("test_"): del _codec_registry[name] - def test_register_type_auto(self): + def test_register_codec_auto(self): """Test auto-registration via __init_subclass__.""" - class TestType(AttributeType): + class TestCodec(Codec): name = "test_decorator" def get_dtype(self, is_external: bool) -> str: @@ -48,14 +47,14 @@ def encode(self, value, *, key=None, store_name=None): def decode(self, stored, *, key=None): return stored - assert is_type_registered("test_decorator") - assert get_type("test_decorator").type_name == "test_decorator" + assert is_codec_registered("test_decorator") + assert get_codec("test_decorator").name == "test_decorator" - def test_register_type_direct(self): - """Test registering a type by calling register_type directly.""" + def test_register_codec_skip(self): + """Test skipping registration with register=False.""" - class TestType(AttributeType, register=False): - name = "test_direct" + class TestCodec(Codec, register=False): + name = "test_skip" def get_dtype(self, is_external: bool) -> 
str: return "varchar(255)" @@ -66,13 +65,12 @@ def encode(self, value, *, key=None, store_name=None): def decode(self, stored, *, key=None): return stored - register_type(TestType) - assert is_type_registered("test_direct") + assert not is_codec_registered("test_skip") - def test_register_type_idempotent(self): - """Test that registering the same type twice is idempotent.""" + def test_register_codec_idempotent(self): + """Test that defining the same codec class twice is idempotent.""" - class TestType(AttributeType): + class TestCodec(Codec): name = "test_idempotent" def get_dtype(self, is_external: bool) -> str: @@ -84,14 +82,13 @@ def encode(self, value, *, key=None, store_name=None): def decode(self, stored, *, key=None): return stored - # Second registration should not raise - register_type(TestType) - assert is_type_registered("test_idempotent") + # Redefine the same name should not raise (same class) + assert is_codec_registered("test_idempotent") def test_register_duplicate_name_different_class(self): """Test that registering different classes with same name raises error.""" - class TestType1(AttributeType): + class TestCodec1(Codec): name = "test_duplicate" def get_dtype(self, is_external: bool) -> str: @@ -103,25 +100,24 @@ def encode(self, value, *, key=None, store_name=None): def decode(self, stored, *, key=None): return stored - class TestType2(AttributeType, register=False): - name = "test_duplicate" + with pytest.raises(DataJointError, match="already registered"): - def get_dtype(self, is_external: bool) -> str: - return "varchar(100)" + class TestCodec2(Codec): + name = "test_duplicate" - def encode(self, value, *, key=None, store_name=None): - return str(value) + def get_dtype(self, is_external: bool) -> str: + return "varchar(100)" - def decode(self, stored, *, key=None): - return stored + def encode(self, value, *, key=None, store_name=None): + return str(value) - with pytest.raises(DataJointError, match="already registered"): - register_type(TestType2) + def decode(self, stored, *, key=None): + return stored - def test_unregister_type(self): - """Test unregistering a type.""" + def test_unregister_codec(self): + """Test unregistering a codec.""" - class TestType(AttributeType): + class TestCodec(Codec): name = "test_unregister" def get_dtype(self, is_external: bool) -> str: @@ -133,19 +129,19 @@ def encode(self, value, *, key=None, store_name=None): def decode(self, stored, *, key=None): return stored - assert is_type_registered("test_unregister") - unregister_type("test_unregister") - assert not is_type_registered("test_unregister") + assert is_codec_registered("test_unregister") + unregister_codec("test_unregister") + assert not is_codec_registered("test_unregister") - def test_get_type_not_found(self): - """Test that getting an unregistered type raises error.""" + def test_get_codec_not_found(self): + """Test that getting an unregistered codec raises error.""" with pytest.raises(DataJointError, match="Unknown codec"): - get_type("nonexistent_type") + get_codec("nonexistent_codec") - def test_list_types(self): - """Test listing registered types.""" + def test_list_codecs(self): + """Test listing registered codecs.""" - class TestType(AttributeType): + class TestCodec(Codec): name = "test_list" def get_dtype(self, is_external: bool) -> str: @@ -157,14 +153,14 @@ def encode(self, value, *, key=None, store_name=None): def decode(self, stored, *, key=None): return stored - types = list_types() - assert "test_list" in types - assert types == sorted(types) # Should be 
sorted + codecs = list_codecs() + assert "test_list" in codecs + assert codecs == sorted(codecs) # Should be sorted - def test_get_type_strips_brackets(self): - """Test that get_type accepts names with or without angle brackets.""" + def test_get_codec_strips_brackets(self): + """Test that get_codec accepts names with or without angle brackets.""" - class TestType(AttributeType): + class TestCodec(Codec): name = "test_brackets" def get_dtype(self, is_external: bool) -> str: @@ -176,10 +172,10 @@ def encode(self, value, *, key=None, store_name=None): def decode(self, stored, *, key=None): return stored - assert get_type("test_brackets") is get_type("") + assert get_codec("test_brackets") is get_codec("") -class TestAttributeTypeValidation: +class TestCodecValidation: """Tests for the validate method.""" def setup_method(self): @@ -195,7 +191,7 @@ def teardown_method(self): def test_validate_called_default(self): """Test that default validate accepts any value.""" - class TestType(AttributeType): + class TestCodec(Codec): name = "test_validate_default" def get_dtype(self, is_external: bool) -> str: @@ -207,7 +203,7 @@ def encode(self, value, *, key=None, store_name=None): def decode(self, stored, *, key=None): return stored - t = get_type("test_validate_default") + t = get_codec("test_validate_default") # Default validate should not raise for any value t.validate(None) t.validate(42) @@ -217,7 +213,7 @@ def decode(self, stored, *, key=None): def test_validate_custom(self): """Test custom validation logic.""" - class PositiveIntType(AttributeType): + class PositiveIntCodec(Codec): name = "test_positive_int" def get_dtype(self, is_external: bool) -> str: @@ -235,7 +231,7 @@ def validate(self, value): if value < 0: raise ValueError("Value must be positive") - t = get_type("test_positive_int") + t = get_codec("test_positive_int") t.validate(42) # Should pass with pytest.raises(TypeError): @@ -245,8 +241,8 @@ def validate(self, value): t.validate(-1) -class TestTypeChaining: - """Tests for type chaining (dtype referencing another custom type).""" +class TestCodecChaining: + """Tests for codec chaining (dtype referencing another codec).""" def setup_method(self): for name in list(_codec_registry.keys()): @@ -268,7 +264,7 @@ def test_resolve_native_dtype(self): def test_resolve_custom_dtype(self): """Test resolving a custom dtype.""" - class TestType(AttributeType): + class TestCodec(Codec): name = "test_resolve" def get_dtype(self, is_external: bool) -> str: @@ -283,13 +279,13 @@ def decode(self, stored, *, key=None): final_dtype, chain, store = resolve_dtype("") assert final_dtype == "varchar(100)" assert len(chain) == 1 - assert chain[0].type_name == "test_resolve" + assert chain[0].name == "test_resolve" assert store is None def test_resolve_chained_dtype(self): """Test resolving a chained dtype.""" - class InnerType(AttributeType): + class InnerCodec(Codec): name = "test_inner" def get_dtype(self, is_external: bool) -> str: @@ -301,7 +297,7 @@ def encode(self, value, *, key=None, store_name=None): def decode(self, stored, *, key=None): return stored - class OuterType(AttributeType): + class OuterCodec(Codec): name = "test_outer" def get_dtype(self, is_external: bool) -> str: @@ -316,14 +312,14 @@ def decode(self, stored, *, key=None): final_dtype, chain, store = resolve_dtype("") assert final_dtype == "bytes" assert len(chain) == 2 - assert chain[0].type_name == "test_outer" - assert chain[1].type_name == "test_inner" + assert chain[0].name == "test_outer" + assert chain[1].name == 
"test_inner" assert store is None def test_circular_reference_detection(self): - """Test that circular type references are detected.""" + """Test that circular codec references are detected.""" - class TypeA(AttributeType): + class CodecA(Codec): name = "test_circular_a" def get_dtype(self, is_external: bool) -> str: @@ -335,7 +331,7 @@ def encode(self, value, *, key=None, store_name=None): def decode(self, stored, *, key=None): return stored - class TypeB(AttributeType): + class CodecB(Codec): name = "test_circular_b" def get_dtype(self, is_external: bool) -> str: @@ -355,10 +351,10 @@ class TestExportsAndAPI: """Test that the public API is properly exported.""" def test_exports_from_datajoint(self): - """Test that AttributeType and helpers are exported from datajoint.""" - assert hasattr(dj, "AttributeType") - assert hasattr(dj, "register_type") - assert hasattr(dj, "list_types") + """Test that Codec and helpers are exported from datajoint.""" + assert hasattr(dj, "Codec") + assert hasattr(dj, "get_codec") + assert hasattr(dj, "list_codecs") class TestBlobCodec: @@ -366,20 +362,20 @@ class TestBlobCodec: def test_blob_is_registered(self): """Test that blob is automatically registered.""" - assert is_type_registered("blob") + assert is_codec_registered("blob") def test_blob_properties(self): """Test BlobCodec properties.""" - blob_type = get_type("blob") - assert blob_type.type_name == "blob" - assert blob_type.get_dtype(is_external=False) == "bytes" - assert blob_type.get_dtype(is_external=True) == "" + blob_codec = get_codec("blob") + assert blob_codec.name == "blob" + assert blob_codec.get_dtype(is_external=False) == "bytes" + assert blob_codec.get_dtype(is_external=True) == "" def test_blob_encode_decode_roundtrip(self): """Test that encode/decode is a proper roundtrip.""" import numpy as np - blob_type = get_type("blob") + blob_codec = get_codec("blob") # Test with various data types test_data = [ @@ -392,9 +388,9 @@ def test_blob_encode_decode_roundtrip(self): ] for original in test_data: - encoded = blob_type.encode(original) + encoded = blob_codec.encode(original) assert isinstance(encoded, bytes) - decoded = blob_type.decode(encoded) + decoded = blob_codec.decode(encoded) if isinstance(original, np.ndarray): np.testing.assert_array_equal(decoded, original) else: @@ -402,17 +398,17 @@ def test_blob_encode_decode_roundtrip(self): def test_blob_encode_produces_valid_blob_format(self): """Test that encoded data has valid blob protocol header.""" - blob_type = get_type("blob") - encoded = blob_type.encode({"test": "data"}) + blob_codec = get_codec("blob") + encoded = blob_codec.encode({"test": "data"}) # Should start with compression prefix or protocol header valid_prefixes = (b"ZL123\0", b"mYm\0", b"dj0\0") assert any(encoded.startswith(p) for p in valid_prefixes) - def test_blob_in_list_types(self): - """Test that blob appears in list_types.""" - types = list_types() - assert "blob" in types + def test_blob_in_list_codecs(self): + """Test that blob appears in list_codecs.""" + codecs = list_codecs() + assert "blob" in codecs def test_blob_handles_serialization(self): """Test that BlobCodec handles serialization internally. 
@@ -421,13 +417,13 @@ def test_blob_handles_serialization(self): - Plain bytes columns store/return raw bytes (no serialization) - handles pack/unpack in encode/decode """ - blob_type = get_type("blob") + blob_codec = get_codec("blob") # BlobCodec.encode() should produce packed bytes data = {"key": "value"} - encoded = blob_type.encode(data) + encoded = blob_codec.encode(data) assert isinstance(encoded, bytes) # BlobCodec.decode() should unpack back to original - decoded = blob_type.decode(encoded) + decoded = blob_codec.decode(encoded) assert decoded == data From 4417ec1d56b132a78b45201d086a423039a15f66 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Thu, 1 Jan 2026 17:28:26 -0600 Subject: [PATCH 25/32] Add Codec documentation with plugin specification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add codec-spec.md: detailed API specification for creating codecs - Add codecs.md: user guide with examples (replaces customtype.md) - Remove customtype.md (replaced by codecs.md) Documentation covers: - Codec base class and required methods - Auto-registration via __init_subclass__ - Codec composition/chaining - Plugin system via entry points - Built-in codecs (blob, hash, object, attach, filepath) - Complete examples for neuroscience workflows 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- docs/src/design/tables/codec-spec.md | 766 +++++++++++++++++++++++++++ docs/src/design/tables/codecs.md | 553 +++++++++++++++++++ docs/src/design/tables/customtype.md | 615 --------------------- 3 files changed, 1319 insertions(+), 615 deletions(-) create mode 100644 docs/src/design/tables/codec-spec.md create mode 100644 docs/src/design/tables/codecs.md delete mode 100644 docs/src/design/tables/customtype.md diff --git a/docs/src/design/tables/codec-spec.md b/docs/src/design/tables/codec-spec.md new file mode 100644 index 000000000..a3eefa578 --- /dev/null +++ b/docs/src/design/tables/codec-spec.md @@ -0,0 +1,766 @@ +# Codec Specification + +This document specifies the DataJoint Codec API for creating custom attribute types +that extend DataJoint's native type system. + +## Overview + +Codecs define bidirectional conversion between Python objects and database storage. +They enable storing complex data types (graphs, models, custom formats) while +maintaining DataJoint's query capabilities. + +``` +┌─────────────────┐ ┌─────────────────┐ +│ Python Object │ ──── encode ────► │ Storage Type │ +│ (e.g. Graph) │ │ (e.g. 
bytes) │ +│ │ ◄─── decode ──── │ │ +└─────────────────┘ └─────────────────┘ +``` + +## Quick Start + +```python +import datajoint as dj +import networkx as nx + +class GraphCodec(dj.Codec): + """Store NetworkX graphs.""" + + name = "graph" # Use as in definitions + + def get_dtype(self, is_external: bool) -> str: + return "" # Delegate to blob for serialization + + def encode(self, graph, *, key=None, store_name=None): + return { + 'nodes': list(graph.nodes(data=True)), + 'edges': list(graph.edges(data=True)), + } + + def decode(self, stored, *, key=None): + G = nx.Graph() + G.add_nodes_from(stored['nodes']) + G.add_edges_from(stored['edges']) + return G + +# Use in table definition +@schema +class Connectivity(dj.Manual): + definition = ''' + conn_id : int + --- + network : + ''' +``` + +## The Codec Base Class + +All custom codecs inherit from `dj.Codec`: + +```python +class Codec(ABC): + """Base class for codec types.""" + + name: str | None = None # Required: unique identifier + + def get_dtype(self, is_external: bool) -> str: + """Return the storage dtype.""" + raise NotImplementedError + + @abstractmethod + def encode(self, value, *, key=None, store_name=None) -> Any: + """Encode Python value for storage.""" + ... + + @abstractmethod + def decode(self, stored, *, key=None) -> Any: + """Decode stored value back to Python.""" + ... + + def validate(self, value) -> None: + """Optional: validate value before encoding.""" + pass +``` + +## Required Components + +### 1. The `name` Attribute + +The `name` class attribute is a unique identifier used in table definitions with +`` syntax: + +```python +class MyCodec(dj.Codec): + name = "mycodec" # Use as in definitions +``` + +Naming conventions: +- Use lowercase with underscores: `spike_train`, `graph_embedding` +- Avoid generic names that might conflict: prefer `lab_model` over `model` +- Names must be unique across all registered codecs + +### 2. The `get_dtype()` Method + +Returns the underlying storage type. The `is_external` parameter indicates whether +the `@` modifier is present in the table definition: + +```python +def get_dtype(self, is_external: bool) -> str: + """ + Args: + is_external: True if @ modifier present (e.g., ) + + Returns: + - A core type: "bytes", "json", "varchar(N)", "int32", etc. + - Another codec: "", "", etc. + + Raises: + DataJointError: If external storage not supported but @ is present + """ +``` + +Examples: + +```python +# Simple: always store as bytes +def get_dtype(self, is_external: bool) -> str: + return "bytes" + +# Different behavior for internal/external +def get_dtype(self, is_external: bool) -> str: + return "" if is_external else "bytes" + +# External-only codec +def get_dtype(self, is_external: bool) -> str: + if not is_external: + raise DataJointError(" requires @ (external storage only)") + return "json" +``` + +### 3. The `encode()` Method + +Converts Python objects to the format expected by `get_dtype()`: + +```python +def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> Any: + """ + Args: + value: The Python object to store + key: Primary key values (for context-dependent encoding) + store_name: Target store name (for external storage) + + Returns: + Value in the format expected by get_dtype() + """ +``` + +### 4. 
The `decode()` Method + +Converts stored values back to Python objects: + +```python +def decode(self, stored: Any, *, key: dict | None = None) -> Any: + """ + Args: + stored: Data retrieved from storage + key: Primary key values (for context-dependent decoding) + + Returns: + The reconstructed Python object + """ +``` + +### 5. The `validate()` Method (Optional) + +Called automatically before `encode()` during INSERT operations: + +```python +def validate(self, value: Any) -> None: + """ + Args: + value: The value to validate + + Raises: + TypeError: If the value has an incompatible type + ValueError: If the value fails domain validation + """ + if not isinstance(value, ExpectedType): + raise TypeError(f"Expected ExpectedType, got {type(value).__name__}") +``` + +## Auto-Registration + +Codecs automatically register when their class is defined. No decorator needed: + +```python +# This codec is registered automatically when the class is defined +class MyCodec(dj.Codec): + name = "mycodec" + # ... +``` + +### Skipping Registration + +For abstract base classes that shouldn't be registered: + +```python +class BaseCodec(dj.Codec, register=False): + """Abstract base - not registered.""" + name = None # Or omit entirely + +class ConcreteCodec(BaseCodec): + name = "concrete" # This one IS registered + # ... +``` + +### Registration Timing + +Codecs are registered at class definition time. Ensure your codec classes are +imported before any table definitions that use them: + +```python +# myproject/codecs.py +class GraphCodec(dj.Codec): + name = "graph" + ... + +# myproject/tables.py +import myproject.codecs # Ensure codecs are registered + +@schema +class Networks(dj.Manual): + definition = ''' + id : int + --- + network : + ''' +``` + +## Codec Composition (Chaining) + +Codecs can delegate to other codecs by returning `` from `get_dtype()`. +This enables layered functionality: + +```python +class CompressedJsonCodec(dj.Codec): + """Compress JSON data with zlib.""" + + name = "zjson" + + def get_dtype(self, is_external: bool) -> str: + return "" # Delegate serialization to blob codec + + def encode(self, value, *, key=None, store_name=None): + import json, zlib + json_bytes = json.dumps(value).encode('utf-8') + return zlib.compress(json_bytes) + + def decode(self, stored, *, key=None): + import json, zlib + json_bytes = zlib.decompress(stored) + return json.loads(json_bytes.decode('utf-8')) +``` + +### How Chaining Works + +When DataJoint encounters ``: + +1. Calls `ZjsonCodec.get_dtype(is_external=False)` → returns `""` +2. Calls `BlobCodec.get_dtype(is_external=False)` → returns `"bytes"` +3. Final storage type is `bytes` (LONGBLOB in MySQL) + +During INSERT: +1. `ZjsonCodec.encode()` converts Python dict → compressed bytes +2. `BlobCodec.encode()` packs bytes → DJ blob format +3. Stored in database + +During FETCH: +1. Read from database +2. `BlobCodec.decode()` unpacks DJ blob → compressed bytes +3. `ZjsonCodec.decode()` decompresses → Python dict + +### Built-in Codec Chains + +DataJoint's built-in codecs form these chains: + +``` + → bytes (internal) + → json (external) + + → bytes (internal) + → json (external) + + → json (external only) + → json (external only) + → json (external only) +``` + +### Store Name Propagation + +When using external storage (`@`), the store name propagates through the chain: + +```python +# Table definition +data : + +# Resolution: +# 1. MyCodec.get_dtype(is_external=True) → "" +# 2. BlobCodec.get_dtype(is_external=True) → "" +# 3. 
HashCodec.get_dtype(is_external=True) → "json" +# 4. store_name="coldstore" passed to HashCodec.encode() +``` + +## Plugin System (Entry Points) + +Codecs can be distributed as installable packages using Python entry points. + +### Package Structure + +``` +dj-graph-codecs/ +├── pyproject.toml +└── src/ + └── dj_graph_codecs/ + ├── __init__.py + └── codecs.py +``` + +### pyproject.toml + +```toml +[project] +name = "dj-graph-codecs" +version = "1.0.0" +dependencies = ["datajoint>=2.0", "networkx"] + +[project.entry-points."datajoint.codecs"] +graph = "dj_graph_codecs.codecs:GraphCodec" +weighted_graph = "dj_graph_codecs.codecs:WeightedGraphCodec" +``` + +### Codec Implementation + +```python +# src/dj_graph_codecs/codecs.py +import datajoint as dj +import networkx as nx + +class GraphCodec(dj.Codec): + name = "graph" + + def get_dtype(self, is_external: bool) -> str: + return "" + + def encode(self, graph, *, key=None, store_name=None): + return { + 'nodes': list(graph.nodes(data=True)), + 'edges': list(graph.edges(data=True)), + } + + def decode(self, stored, *, key=None): + G = nx.Graph() + G.add_nodes_from(stored['nodes']) + G.add_edges_from(stored['edges']) + return G + +class WeightedGraphCodec(dj.Codec): + name = "weighted_graph" + + def get_dtype(self, is_external: bool) -> str: + return "" + + def encode(self, graph, *, key=None, store_name=None): + return { + 'nodes': list(graph.nodes(data=True)), + 'edges': [(u, v, d) for u, v, d in graph.edges(data=True)], + } + + def decode(self, stored, *, key=None): + G = nx.Graph() + G.add_nodes_from(stored['nodes']) + for u, v, d in stored['edges']: + G.add_edge(u, v, **d) + return G +``` + +### Usage After Installation + +```bash +pip install dj-graph-codecs +``` + +```python +# Codecs are automatically discovered and available +@schema +class Networks(dj.Manual): + definition = ''' + network_id : int + --- + topology : + weights : + ''' +``` + +### Entry Point Discovery + +DataJoint loads entry points lazily when a codec is first requested: + +1. Check explicit registry (codecs defined in current process) +2. Load entry points from `datajoint.codecs` group +3. Also checks legacy `datajoint.types` group for compatibility + +## API Reference + +### Module Functions + +```python +import datajoint as dj + +# List all registered codec names +dj.list_codecs() # Returns: ['blob', 'hash', 'object', 'attach', 'filepath', ...] 
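+# Custom codecs appear in this list only after the module defining them
+# has been imported (see "Registration Timing" above).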
+ +# Get a codec instance by name +codec = dj.get_codec("blob") +codec = dj.get_codec("") # Angle brackets are optional +codec = dj.get_codec("") # Store parameter is stripped +``` + +### Internal Functions (for advanced use) + +```python +from datajoint.codecs import ( + is_codec_registered, # Check if codec exists + unregister_codec, # Remove codec (testing only) + resolve_dtype, # Resolve codec chain + parse_type_spec, # Parse "" syntax +) +``` + +## Built-in Codecs + +DataJoint provides these built-in codecs: + +| Codec | Internal | External | Description | +|-------|----------|----------|-------------| +| `` | `bytes` | `` | DataJoint serialization for Python objects | +| `` | N/A | `json` | Content-addressed storage with MD5 deduplication | +| `` | N/A | `json` | Path-addressed storage for files/folders | +| `` | `bytes` | `` | File attachments with filename preserved | +| `` | N/A | `json` | Reference to existing files in store | + +## Complete Examples + +### Example 1: Simple Serialization + +```python +import datajoint as dj +import numpy as np + +class SpikeTrainCodec(dj.Codec): + """Efficient storage for sparse spike timing data.""" + + name = "spike_train" + + def get_dtype(self, is_external: bool) -> str: + return "" + + def validate(self, value): + if not isinstance(value, np.ndarray): + raise TypeError("Expected numpy array of spike times") + if value.ndim != 1: + raise ValueError("Spike train must be 1-dimensional") + if len(value) > 1 and not np.all(np.diff(value) >= 0): + raise ValueError("Spike times must be sorted") + + def encode(self, spike_times, *, key=None, store_name=None): + # Store as differences (smaller values, better compression) + return np.diff(spike_times, prepend=0).astype(np.float32) + + def decode(self, stored, *, key=None): + # Reconstruct original spike times + return np.cumsum(stored).astype(np.float64) +``` + +### Example 2: External Storage + +```python +import datajoint as dj +import pickle + +class ModelCodec(dj.Codec): + """Store ML models with optional external storage.""" + + name = "model" + + def get_dtype(self, is_external: bool) -> str: + # Use hash-addressed storage for large models + return "" if is_external else "" + + def encode(self, model, *, key=None, store_name=None): + return pickle.dumps(model, protocol=pickle.HIGHEST_PROTOCOL) + + def decode(self, stored, *, key=None): + return pickle.loads(stored) + + def validate(self, value): + # Check that model has required interface + if not hasattr(value, 'predict'): + raise TypeError("Model must have a predict() method") +``` + +Usage: +```python +@schema +class Models(dj.Manual): + definition = ''' + model_id : int + --- + small_model : # Internal storage + large_model : # External (default store) + archive_model : # External (specific store) + ''' +``` + +### Example 3: JSON with Schema Validation + +```python +import datajoint as dj +import jsonschema + +class ConfigCodec(dj.Codec): + """Store validated JSON configuration.""" + + name = "config" + + SCHEMA = { + "type": "object", + "properties": { + "version": {"type": "integer", "minimum": 1}, + "settings": {"type": "object"}, + }, + "required": ["version", "settings"], + } + + def get_dtype(self, is_external: bool) -> str: + return "json" + + def validate(self, value): + jsonschema.validate(value, self.SCHEMA) + + def encode(self, config, *, key=None, store_name=None): + return config # JSON type handles serialization + + def decode(self, stored, *, key=None): + return stored +``` + +### Example 4: Context-Dependent Encoding + 
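+The codec below selects its encoding format from a primary-key attribute
+(here, a `schema_version` value), which lets rows written under the old and
+new formats coexist in the same table:
+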
+```python +import datajoint as dj + +class VersionedDataCodec(dj.Codec): + """Handle different encoding versions based on primary key.""" + + name = "versioned" + + def get_dtype(self, is_external: bool) -> str: + return "" + + def encode(self, value, *, key=None, store_name=None): + version = key.get("schema_version", 1) if key else 1 + if version >= 2: + return {"v": 2, "data": self._encode_v2(value)} + return {"v": 1, "data": self._encode_v1(value)} + + def decode(self, stored, *, key=None): + version = stored.get("v", 1) + if version >= 2: + return self._decode_v2(stored["data"]) + return self._decode_v1(stored["data"]) + + def _encode_v1(self, value): + return value + + def _decode_v1(self, data): + return data + + def _encode_v2(self, value): + # New encoding format + return {"optimized": True, "payload": value} + + def _decode_v2(self, data): + return data["payload"] +``` + +### Example 5: External-Only Codec + +```python +import datajoint as dj +from pathlib import Path + +class ZarrCodec(dj.Codec): + """Store Zarr arrays in object storage.""" + + name = "zarr" + + def get_dtype(self, is_external: bool) -> str: + if not is_external: + raise dj.DataJointError(" requires @ (external storage only)") + return "" # Delegate to object storage + + def encode(self, value, *, key=None, store_name=None): + import zarr + import tempfile + + # If already a path, pass through + if isinstance(value, (str, Path)): + return str(value) + + # If zarr array, save to temp and return path + if isinstance(value, zarr.Array): + tmpdir = tempfile.mkdtemp() + path = Path(tmpdir) / "data.zarr" + zarr.save(path, value) + return str(path) + + raise TypeError(f"Expected zarr.Array or path, got {type(value)}") + + def decode(self, stored, *, key=None): + # ObjectCodec returns ObjectRef, use its fsmap for zarr + import zarr + return zarr.open(stored.fsmap, mode='r') +``` + +## Best Practices + +### 1. Choose Appropriate Storage Types + +| Data Type | Recommended `get_dtype()` | +|-----------|---------------------------| +| Python objects (dicts, arrays) | `""` | +| Large binary data | `""` (external) | +| Files/folders (Zarr, HDF5) | `""` (external) | +| Simple JSON-serializable | `"json"` | +| Short strings | `"varchar(N)"` | +| Numeric identifiers | `"int32"`, `"int64"` | + +### 2. Handle None Values + +Nullable columns may pass `None` to your codec: + +```python +def encode(self, value, *, key=None, store_name=None): + if value is None: + return None # Pass through for nullable columns + return self._actual_encode(value) + +def decode(self, stored, *, key=None): + if stored is None: + return None + return self._actual_decode(stored) +``` + +### 3. Test Round-Trips + +Always verify that `decode(encode(x)) == x`: + +```python +def test_codec_roundtrip(): + codec = MyCodec() + + test_values = [ + {"key": "value"}, + [1, 2, 3], + np.array([1.0, 2.0]), + ] + + for original in test_values: + encoded = codec.encode(original) + decoded = codec.decode(encoded) + assert decoded == original or np.array_equal(decoded, original) +``` + +### 4. Include Validation + +Catch errors early with `validate()`: + +```python +def validate(self, value): + if not isinstance(value, ExpectedType): + raise TypeError(f"Expected ExpectedType, got {type(value).__name__}") + + if not self._is_valid(value): + raise ValueError("Value fails validation constraints") +``` + +### 5. Document Expected Formats + +Include docstrings explaining input/output formats: + +```python +class MyCodec(dj.Codec): + """ + Store MyType objects. 
+ + Input format (encode): + MyType instance with attributes: x, y, z + + Storage format: + Dict with keys: 'x', 'y', 'z' + + Output format (decode): + MyType instance reconstructed from storage + """ +``` + +### 6. Consider Versioning + +If your encoding format might change: + +```python +def encode(self, value, *, key=None, store_name=None): + return { + "_version": 2, + "_data": self._encode_v2(value), + } + +def decode(self, stored, *, key=None): + version = stored.get("_version", 1) + data = stored.get("_data", stored) + + if version == 1: + return self._decode_v1(data) + return self._decode_v2(data) +``` + +## Error Handling + +### Common Errors + +| Error | Cause | Solution | +|-------|-------|----------| +| `Unknown codec: ` | Codec not registered | Import module defining codec before table definition | +| `Codec already registered` | Duplicate name | Use unique names; check for conflicts | +| ` requires @` | External-only codec used without @ | Add `@` or `@store` to attribute type | +| `Circular codec reference` | Codec chain forms a loop | Check `get_dtype()` return values | + +### Debugging + +```python +# Check what codecs are registered +print(dj.list_codecs()) + +# Inspect a codec +codec = dj.get_codec("mycodec") +print(f"Name: {codec.name}") +print(f"Internal dtype: {codec.get_dtype(is_external=False)}") +print(f"External dtype: {codec.get_dtype(is_external=True)}") + +# Resolve full chain +from datajoint.codecs import resolve_dtype +final_type, chain, store = resolve_dtype("") +print(f"Final storage type: {final_type}") +print(f"Codec chain: {[c.name for c in chain]}") +print(f"Store: {store}") +``` diff --git a/docs/src/design/tables/codecs.md b/docs/src/design/tables/codecs.md new file mode 100644 index 000000000..ccc9db1f7 --- /dev/null +++ b/docs/src/design/tables/codecs.md @@ -0,0 +1,553 @@ +# Custom Codecs + +In modern scientific research, data pipelines often involve complex workflows that +generate diverse data types. From high-dimensional imaging data to machine learning +models, these data types frequently exceed the basic representations supported by +traditional relational databases. For example: + ++ A lab working on neural connectivity might use graph objects to represent brain + networks. ++ Researchers processing raw imaging data might store custom objects for pre-processing + configurations. ++ Computational biologists might store fitted machine learning models or parameter + objects for downstream predictions. + +To handle these diverse needs, DataJoint provides the **Codec** system. It +enables researchers to store and retrieve complex, non-standard data types—like Python +objects or data structures—in a relational database while maintaining the +reproducibility, modularity, and query capabilities required for scientific workflows. + +## Overview + +Custom codecs define bidirectional conversion between: + +- **Python objects** (what your code works with) +- **Storage format** (what gets stored in the database) + +``` +┌─────────────────┐ encode() ┌─────────────────┐ +│ Python Object │ ───────────────► │ Storage Type │ +│ (e.g. Graph) │ │ (e.g. bytes) │ +└─────────────────┘ decode() └─────────────────┘ + ◄─────────────── +``` + +## Defining Custom Codecs + +Create a custom codec by subclassing `dj.Codec` and implementing the required +methods. 
Codecs auto-register when their class is defined:
+
+```python
+import datajoint as dj
+import networkx as nx
+
+class GraphCodec(dj.Codec):
+    """Custom codec for storing networkx graphs."""
+
+    # Required: unique identifier used in table definitions
+    name = "graph"
+
+    def get_dtype(self, is_external: bool) -> str:
+        """Return the underlying storage type."""
+        return "<blob>"  # Delegate to blob for serialization
+
+    def encode(self, graph, *, key=None, store_name=None):
+        """Convert graph to storable format (called on INSERT)."""
+        return {
+            'nodes': list(graph.nodes(data=True)),
+            'edges': list(graph.edges(data=True)),
+        }
+
+    def decode(self, stored, *, key=None):
+        """Convert stored data back to graph (called on FETCH)."""
+        G = nx.Graph()
+        G.add_nodes_from(stored['nodes'])
+        G.add_edges_from(stored['edges'])
+        return G
+```
+
+### Required Components
+
+| Component | Description |
+|-----------|-------------|
+| `name` | Unique identifier used in table definitions with `<name>` syntax |
+| `get_dtype(is_external)` | Returns underlying storage type (e.g., `"<blob>"`, `"bytes"`, `"json"`) |
+| `encode(value, *, key=None, store_name=None)` | Converts Python object to storable format |
+| `decode(stored, *, key=None)` | Converts stored data back to Python object |
+
+### Using Custom Codecs in Tables
+
+Once defined, use the codec in table definitions with angle brackets:
+
+```python
+@schema
+class Connectivity(dj.Manual):
+    definition = """
+    conn_id : int
+    ---
+    conn_graph = null : <graph>  # Uses the GraphCodec we defined
+    """
+```
+
+Insert and fetch work seamlessly:
+
+```python
+import networkx as nx
+
+# Insert - encode() is called automatically
+g = nx.lollipop_graph(4, 2)
+Connectivity.insert1({"conn_id": 1, "conn_graph": g})
+
+# Fetch - decode() is called automatically
+result = (Connectivity & "conn_id = 1").fetch1("conn_graph")
+assert isinstance(result, nx.Graph)
+```
+
+## Auto-Registration
+
+Codecs automatically register when their class is defined. No decorator needed:
+
+```python
+# This codec is registered automatically when the class is defined
+class MyCodec(dj.Codec):
+    name = "mycodec"
+    ...
+```
+
+### Skipping Registration
+
+For abstract base classes that shouldn't be registered:
+
+```python
+class BaseCodec(dj.Codec, register=False):
+    """Abstract base - not registered."""
+    name = None
+
+class ConcreteCodec(BaseCodec):
+    name = "concrete"  # This one IS registered
+    ...
+```
+
+### Listing Registered Codecs
+
+```python
+# List all registered codec names
+print(dj.list_codecs())
+```
+
+## Validation
+
+Add data validation by overriding the `validate()` method. It's called automatically
+before `encode()` during INSERT operations:
+
+```python
+class PositiveArrayCodec(dj.Codec):
+    name = "positive_array"
+
+    def get_dtype(self, is_external: bool) -> str:
+        return "<blob>"
+
+    def validate(self, value):
+        """Ensure all values are positive."""
+        import numpy as np
+        if not isinstance(value, np.ndarray):
+            raise TypeError(f"Expected numpy array, got {type(value).__name__}")
+        if np.any(value < 0):
+            raise ValueError("Array must contain only positive values")
+
+    def encode(self, array, *, key=None, store_name=None):
+        return array
+
+    def decode(self, stored, *, key=None):
+        return stored
+```
+
+## The `get_dtype()` Method
+
+The `get_dtype()` method specifies how data is stored. 
The `is_external` parameter +indicates whether the `@` modifier is present: + +```python +def get_dtype(self, is_external: bool) -> str: + """ + Args: + is_external: True if @ modifier present (e.g., ) + + Returns: + - A core type: "bytes", "json", "varchar(N)", etc. + - Another codec: "", "", etc. + """ +``` + +### Storage Type Options + +| Return Value | Use Case | Database Type | +|--------------|----------|---------------| +| `"bytes"` | Raw binary data | LONGBLOB | +| `"json"` | JSON-serializable data | JSON | +| `"varchar(N)"` | String representations | VARCHAR(N) | +| `"int32"` | Integer identifiers | INT | +| `""` | Serialized Python objects | Depends on internal/external | +| `""` | Large objects with deduplication | JSON (external only) | +| `""` | Chain to another codec | Varies | + +### External Storage + +For large data, use external storage with the `@` modifier: + +```python +class LargeArrayCodec(dj.Codec): + name = "large_array" + + def get_dtype(self, is_external: bool) -> str: + # Use hash-addressed external storage for large data + return "" if is_external else "" + + def encode(self, array, *, key=None, store_name=None): + import pickle + return pickle.dumps(array) + + def decode(self, stored, *, key=None): + import pickle + return pickle.loads(stored) +``` + +Usage: +```python +@schema +class Data(dj.Manual): + definition = ''' + id : int + --- + small_array : # Internal (in database) + big_array : # External (default store) + archive : # External (specific store) + ''' +``` + +## Codec Chaining + +Custom codecs can build on other codecs by returning `` from `get_dtype()`: + +```python +class CompressedGraphCodec(dj.Codec): + name = "compressed_graph" + + def get_dtype(self, is_external: bool) -> str: + return "" # Chain to the GraphCodec + + def encode(self, graph, *, key=None, store_name=None): + # Compress before passing to GraphCodec + return self._compress(graph) + + def decode(self, stored, *, key=None): + # GraphCodec's decode already ran, decompress result + return self._decompress(stored) +``` + +DataJoint automatically resolves the chain to find the final storage type. + +### How Chaining Works + +When DataJoint encounters ``: + +1. `CompressedGraphCodec.get_dtype()` returns `""` +2. `GraphCodec.get_dtype()` returns `""` +3. `BlobCodec.get_dtype()` returns `"bytes"` +4. Final storage type is `bytes` (LONGBLOB in MySQL) + +During INSERT, encoders run outer → inner: +1. `CompressedGraphCodec.encode()` → compressed graph +2. `GraphCodec.encode()` → edge list dict +3. `BlobCodec.encode()` → serialized bytes + +During FETCH, decoders run inner → outer (reverse order). + +## The Key Parameter + +The `key` parameter provides access to primary key values during encode/decode +operations. This is useful when the conversion depends on record context: + +```python +class ContextAwareCodec(dj.Codec): + name = "context_aware" + + def get_dtype(self, is_external: bool) -> str: + return "" + + def encode(self, value, *, key=None, store_name=None): + if key and key.get("version") == 2: + return self._encode_v2(value) + return self._encode_v1(value) + + def decode(self, stored, *, key=None): + if key and key.get("version") == 2: + return self._decode_v2(stored) + return self._decode_v1(stored) +``` + +## Publishing Codecs as Packages + +Custom codecs can be distributed as installable packages using Python entry points. +This allows codecs to be automatically discovered when the package is installed. 
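+
+For example (a minimal sketch, assuming the `dj-graph-codecs` package described
+below is installed), a packaged codec can be retrieved by name without importing
+the package explicitly:
+
+```python
+import datajoint as dj
+
+# Entry points are loaded lazily, so requesting the codec by name is enough
+codec = dj.get_codec("graph")   # "graph" is the entry-point name declared below
+print(codec.name)               # -> "graph"
+```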
+ +### Package Structure + +``` +dj-graph-codecs/ +├── pyproject.toml +└── src/ + └── dj_graph_codecs/ + ├── __init__.py + └── codecs.py +``` + +### pyproject.toml + +```toml +[project] +name = "dj-graph-codecs" +version = "1.0.0" +dependencies = ["datajoint>=2.0", "networkx"] + +[project.entry-points."datajoint.codecs"] +graph = "dj_graph_codecs.codecs:GraphCodec" +weighted_graph = "dj_graph_codecs.codecs:WeightedGraphCodec" +``` + +### Codec Implementation + +```python +# src/dj_graph_codecs/codecs.py +import datajoint as dj +import networkx as nx + +class GraphCodec(dj.Codec): + name = "graph" + + def get_dtype(self, is_external: bool) -> str: + return "" + + def encode(self, graph, *, key=None, store_name=None): + return { + 'nodes': list(graph.nodes(data=True)), + 'edges': list(graph.edges(data=True)), + } + + def decode(self, stored, *, key=None): + G = nx.Graph() + G.add_nodes_from(stored['nodes']) + G.add_edges_from(stored['edges']) + return G + +class WeightedGraphCodec(dj.Codec): + name = "weighted_graph" + + def get_dtype(self, is_external: bool) -> str: + return "" + + def encode(self, graph, *, key=None, store_name=None): + return [(u, v, d) for u, v, d in graph.edges(data=True)] + + def decode(self, edges, *, key=None): + g = nx.Graph() + for u, v, d in edges: + g.add_edge(u, v, **d) + return g +``` + +### Usage After Installation + +```bash +pip install dj-graph-codecs +``` + +```python +# Codecs are automatically available after package installation +@schema +class MyTable(dj.Manual): + definition = """ + id : int + --- + network : + weighted_network : + """ +``` + +## Complete Example + +Here's a complete example demonstrating custom codecs for a neuroscience workflow: + +```python +import datajoint as dj +import numpy as np + +# Define custom codecs +class SpikeTrainCodec(dj.Codec): + """Efficient storage for sparse spike timing data.""" + name = "spike_train" + + def get_dtype(self, is_external: bool) -> str: + return "" + + def validate(self, value): + if not isinstance(value, np.ndarray): + raise TypeError("Expected numpy array of spike times") + if value.ndim != 1: + raise ValueError("Spike train must be 1-dimensional") + if len(value) > 1 and not np.all(np.diff(value) >= 0): + raise ValueError("Spike times must be sorted") + + def encode(self, spike_times, *, key=None, store_name=None): + # Store as differences (smaller values, better compression) + return np.diff(spike_times, prepend=0).astype(np.float32) + + def decode(self, stored, *, key=None): + # Reconstruct original spike times + return np.cumsum(stored).astype(np.float64) + + +class WaveformCodec(dj.Codec): + """Storage for spike waveform templates with metadata.""" + name = "waveform" + + def get_dtype(self, is_external: bool) -> str: + return "" + + def encode(self, waveform_dict, *, key=None, store_name=None): + return { + "data": waveform_dict["data"].astype(np.float32), + "sampling_rate": waveform_dict["sampling_rate"], + "channel_ids": list(waveform_dict["channel_ids"]), + } + + def decode(self, stored, *, key=None): + return { + "data": stored["data"].astype(np.float64), + "sampling_rate": stored["sampling_rate"], + "channel_ids": np.array(stored["channel_ids"]), + } + + +# Create schema and tables +schema = dj.schema("ephys_analysis") + +@schema +class Unit(dj.Manual): + definition = """ + unit_id : int + --- + spike_times : + waveform : + quality : enum('good', 'mua', 'noise') + """ + + +# Usage +spike_times = np.array([0.1, 0.15, 0.23, 0.45, 0.67, 0.89]) +waveform = { + "data": np.random.randn(82, 
4), + "sampling_rate": 30000, + "channel_ids": [10, 11, 12, 13], +} + +Unit.insert1({ + "unit_id": 1, + "spike_times": spike_times, + "waveform": waveform, + "quality": "good", +}) + +# Fetch - automatically decoded +result = (Unit & "unit_id = 1").fetch1() +print(f"Spike times: {result['spike_times']}") +print(f"Waveform shape: {result['waveform']['data'].shape}") +``` + +## Built-in Codecs + +DataJoint includes several built-in codecs: + +### `` - DataJoint Blob Serialization + +The `` codec provides DataJoint's native binary serialization. It supports: + +- NumPy arrays (compatible with MATLAB) +- Python dicts, lists, tuples, sets +- datetime objects, Decimals, UUIDs +- Nested data structures +- Optional compression + +```python +@schema +class ProcessedData(dj.Manual): + definition = """ + data_id : int + --- + results : # Internal (serialized in database) + large_results : # External (hash-addressed storage) + """ +``` + +### `` - Content-Addressed Storage + +Stores raw bytes using MD5 content hashing with automatic deduplication. +External storage only. + +### `` - Path-Addressed Storage + +Stores files and folders at paths derived from primary keys. Ideal for +Zarr arrays, HDF5 files, and multi-file outputs. External storage only. + +### `` - File Attachments + +Stores files with filename preserved. Supports internal and external storage. + +### `` - File References + +References existing files in configured stores without copying. +External storage only. + +## Best Practices + +1. **Choose descriptive codec names**: Use lowercase with underscores (e.g., `spike_train`, `graph_embedding`) + +2. **Select appropriate storage types**: Use `` for complex objects, `json` for simple structures, `` or `` for large data + +3. **Add validation**: Use `validate()` to catch data errors early + +4. **Document your codecs**: Include docstrings explaining the expected input/output formats + +5. **Handle None values**: Your encode/decode methods may receive `None` for nullable attributes + +6. **Consider versioning**: If your encoding format might change, include version information + +7. **Test round-trips**: Ensure `decode(encode(x)) == x` for all valid inputs + +```python +def test_graph_codec_roundtrip(): + import networkx as nx + g = nx.lollipop_graph(4, 2) + codec = GraphCodec() + + encoded = codec.encode(g) + decoded = codec.decode(encoded) + + assert set(g.edges) == set(decoded.edges) +``` + +## API Reference + +```python +import datajoint as dj + +# List all registered codecs +dj.list_codecs() + +# Get a codec instance +codec = dj.get_codec("blob") +codec = dj.get_codec("") # Angle brackets optional +codec = dj.get_codec("") # Store parameter stripped +``` + +For the complete Codec API specification, see [Codec Specification](codec-spec.md). diff --git a/docs/src/design/tables/customtype.md b/docs/src/design/tables/customtype.md deleted file mode 100644 index c1844fb99..000000000 --- a/docs/src/design/tables/customtype.md +++ /dev/null @@ -1,615 +0,0 @@ -# Custom Attribute Types - -In modern scientific research, data pipelines often involve complex workflows that -generate diverse data types. From high-dimensional imaging data to machine learning -models, these data types frequently exceed the basic representations supported by -traditional relational databases. For example: - -+ A lab working on neural connectivity might use graph objects to represent brain - networks. -+ Researchers processing raw imaging data might store custom objects for pre-processing - configurations. 
-+ Computational biologists might store fitted machine learning models or parameter - objects for downstream predictions. - -To handle these diverse needs, DataJoint provides the **AttributeType** system. It -enables researchers to store and retrieve complex, non-standard data types—like Python -objects or data structures—in a relational database while maintaining the -reproducibility, modularity, and query capabilities required for scientific workflows. - -## Overview - -Custom attribute types define bidirectional conversion between: - -- **Python objects** (what your code works with) -- **Storage format** (what gets stored in the database) - -``` -┌─────────────────┐ encode() ┌─────────────────┐ -│ Python Object │ ───────────────► │ Storage Type │ -│ (e.g. Graph) │ │ (e.g. blob) │ -└─────────────────┘ decode() └─────────────────┘ - ◄─────────────── -``` - -## Defining Custom Types - -Create a custom type by subclassing `dj.AttributeType` and implementing the required -methods: - -```python -import datajoint as dj -import networkx as nx - -@dj.register_type -class GraphType(dj.AttributeType): - """Custom type for storing networkx graphs.""" - - # Required: unique identifier used in table definitions - type_name = "graph" - - # Required: underlying DataJoint storage type - dtype = "longblob" - - def encode(self, graph, *, key=None): - """Convert graph to storable format (called on INSERT).""" - return list(graph.edges) - - def decode(self, edges, *, key=None): - """Convert stored data back to graph (called on FETCH).""" - return nx.Graph(edges) -``` - -### Required Components - -| Component | Description | -|-----------|-------------| -| `type_name` | Unique identifier used in table definitions with `` syntax | -| `dtype` | Underlying DataJoint type for storage (e.g., `"longblob"`, `"varchar(255)"`, `"json"`) | -| `encode(value, *, key=None)` | Converts Python object to storable format | -| `decode(stored, *, key=None)` | Converts stored data back to Python object | - -### Using Custom Types in Tables - -Once registered, use the type in table definitions with angle brackets: - -```python -@schema -class Connectivity(dj.Manual): - definition = """ - conn_id : int - --- - conn_graph = null : # Uses the GraphType we defined - """ -``` - -Insert and fetch work seamlessly: - -```python -import networkx as nx - -# Insert - encode() is called automatically -g = nx.lollipop_graph(4, 2) -Connectivity.insert1({"conn_id": 1, "conn_graph": g}) - -# Fetch - decode() is called automatically -result = (Connectivity & "conn_id = 1").fetch1("conn_graph") -assert isinstance(result, nx.Graph) -``` - -## Type Registration - -### Decorator Registration - -The simplest way to register a type is with the `@dj.register_type` decorator: - -```python -@dj.register_type -class MyType(dj.AttributeType): - type_name = "my_type" - ... -``` - -### Direct Registration - -You can also register types explicitly: - -```python -class MyType(dj.AttributeType): - type_name = "my_type" - ... - -dj.register_type(MyType) -``` - -### Listing Registered Types - -```python -# List all registered type names -print(dj.list_types()) -``` - -## Validation - -Add data validation by overriding the `validate()` method. 
It's called automatically -before `encode()` during INSERT operations: - -```python -@dj.register_type -class PositiveArrayType(dj.AttributeType): - type_name = "positive_array" - dtype = "longblob" - - def validate(self, value): - """Ensure all values are positive.""" - import numpy as np - if not isinstance(value, np.ndarray): - raise TypeError(f"Expected numpy array, got {type(value).__name__}") - if np.any(value < 0): - raise ValueError("Array must contain only positive values") - - def encode(self, array, *, key=None): - return array - - def decode(self, stored, *, key=None): - return stored -``` - -## Storage Types (dtype) - -The `dtype` property specifies how data is stored in the database: - -| dtype | Use Case | Stored Format | -|-------|----------|---------------| -| `"bytes"` | Raw binary data (core type) | Binary (LONGBLOB/BYTEA) | -| `"longblob"` | Raw binary data (native type, MySQL-specific) | Binary | -| `"json"` | JSON-serializable data | JSON string | -| `"varchar(N)"` | String representations | Text | -| `"int"` | Integer identifiers | Integer | -| `"blob@store"` | Large objects in external storage | UUID reference | -| `"object"` | Files/folders in object storage | JSON metadata | -| `""` | Serialized Python objects | DJ blob format | -| `""` | Chain to another custom type | Varies | - -### External Storage - -For large data, use external blob storage: - -```python -@dj.register_type -class LargeArrayType(dj.AttributeType): - type_name = "large_array" - dtype = "blob@mystore" # Uses external store named "mystore" - - def encode(self, array, *, key=None): - return array - - def decode(self, stored, *, key=None): - return stored -``` - -## Type Chaining - -Custom types can build on other custom types by referencing them in `dtype`: - -```python -@dj.register_type -class CompressedGraphType(dj.AttributeType): - type_name = "compressed_graph" - dtype = "" # Chain to the GraphType - - def encode(self, graph, *, key=None): - # Compress before passing to GraphType - return self._compress(graph) - - def decode(self, stored, *, key=None): - # GraphType's decode already ran - return self._decompress(stored) -``` - -DataJoint automatically resolves the chain to find the final storage type. - -## The Key Parameter - -The `key` parameter provides access to primary key values during encode/decode -operations. This is useful when the conversion depends on record context: - -```python -@dj.register_type -class ContextAwareType(dj.AttributeType): - type_name = "context_aware" - dtype = "longblob" - - def encode(self, value, *, key=None): - if key and key.get("version") == 2: - return self._encode_v2(value) - return self._encode_v1(value) - - def decode(self, stored, *, key=None): - if key and key.get("version") == 2: - return self._decode_v2(stored) - return self._decode_v1(stored) -``` - -## Publishing Custom Types as Packages - -Custom types can be distributed as installable packages using Python entry points. -This allows types to be automatically discovered when the package is installed. 
- -### Package Structure - -``` -dj-graph-types/ -├── pyproject.toml -└── src/ - └── dj_graph_types/ - ├── __init__.py - └── types.py -``` - -### pyproject.toml - -```toml -[project] -name = "dj-graph-types" -version = "1.0.0" - -[project.entry-points."datajoint.types"] -graph = "dj_graph_types.types:GraphType" -weighted_graph = "dj_graph_types.types:WeightedGraphType" -``` - -### Type Implementation - -```python -# src/dj_graph_types/types.py -import datajoint as dj -import networkx as nx - -class GraphType(dj.AttributeType): - type_name = "graph" - dtype = "longblob" - - def encode(self, graph, *, key=None): - return list(graph.edges) - - def decode(self, edges, *, key=None): - return nx.Graph(edges) - -class WeightedGraphType(dj.AttributeType): - type_name = "weighted_graph" - dtype = "longblob" - - def encode(self, graph, *, key=None): - return [(u, v, d) for u, v, d in graph.edges(data=True)] - - def decode(self, edges, *, key=None): - g = nx.Graph() - g.add_weighted_edges_from(edges) - return g -``` - -### Usage After Installation - -```bash -pip install dj-graph-types -``` - -```python -# Types are automatically available after package installation -@schema -class MyTable(dj.Manual): - definition = """ - id : int - --- - network : - weighted_network : - """ -``` - -## Complete Example - -Here's a complete example demonstrating custom types for a neuroscience workflow: - -```python -import datajoint as dj -import numpy as np - -# Configure DataJoint -dj.config["database.host"] = "localhost" -dj.config["database.user"] = "root" -dj.config["database.password"] = "password" - -# Define custom types -@dj.register_type -class SpikeTrainType(dj.AttributeType): - """Efficient storage for sparse spike timing data.""" - type_name = "spike_train" - dtype = "longblob" - - def validate(self, value): - if not isinstance(value, np.ndarray): - raise TypeError("Expected numpy array of spike times") - if value.ndim != 1: - raise ValueError("Spike train must be 1-dimensional") - if not np.all(np.diff(value) >= 0): - raise ValueError("Spike times must be sorted") - - def encode(self, spike_times, *, key=None): - # Store as differences (smaller values, better compression) - return np.diff(spike_times, prepend=0).astype(np.float32) - - def decode(self, stored, *, key=None): - # Reconstruct original spike times - return np.cumsum(stored).astype(np.float64) - - -@dj.register_type -class WaveformType(dj.AttributeType): - """Storage for spike waveform templates with metadata.""" - type_name = "waveform" - dtype = "longblob" - - def encode(self, waveform_dict, *, key=None): - return { - "data": waveform_dict["data"].astype(np.float32), - "sampling_rate": waveform_dict["sampling_rate"], - "channel_ids": list(waveform_dict["channel_ids"]), - } - - def decode(self, stored, *, key=None): - return { - "data": stored["data"].astype(np.float64), - "sampling_rate": stored["sampling_rate"], - "channel_ids": np.array(stored["channel_ids"]), - } - - -# Create schema and tables -schema = dj.schema("ephys_analysis") - -@schema -class Unit(dj.Manual): - definition = """ - unit_id : int - --- - spike_times : - waveform : - quality : enum('good', 'mua', 'noise') - """ - - -# Usage -spike_times = np.array([0.1, 0.15, 0.23, 0.45, 0.67, 0.89]) -waveform = { - "data": np.random.randn(82, 4), - "sampling_rate": 30000, - "channel_ids": [10, 11, 12, 13], -} - -Unit.insert1({ - "unit_id": 1, - "spike_times": spike_times, - "waveform": waveform, - "quality": "good", -}) - -# Fetch - automatically decoded -result = (Unit & "unit_id = 
1").fetch1() -print(f"Spike times: {result['spike_times']}") -print(f"Waveform shape: {result['waveform']['data'].shape}") -``` - -## Migration from AttributeAdapter - -The `AttributeAdapter` class is deprecated. Migrate to `AttributeType`: - -### Before (deprecated) - -```python -class GraphAdapter(dj.AttributeAdapter): - attribute_type = "longblob" - - def put(self, obj): - return list(obj.edges) - - def get(self, value): - return nx.Graph(value) - -# Required context-based registration -graph = GraphAdapter() -schema = dj.schema("mydb", context={"graph": graph}) -``` - -### After (recommended) - -```python -@dj.register_type -class GraphType(dj.AttributeType): - type_name = "graph" - dtype = "longblob" - - def encode(self, obj, *, key=None): - return list(obj.edges) - - def decode(self, value, *, key=None): - return nx.Graph(value) - -# Global registration - no context needed -schema = dj.schema("mydb") -``` - -### Key Differences - -| Aspect | AttributeAdapter (deprecated) | AttributeType (recommended) | -|--------|-------------------------------|----------------------------| -| Methods | `put()` / `get()` | `encode()` / `decode()` | -| Storage type | `attribute_type` | `dtype` | -| Type name | Variable name in context | `type_name` property | -| Registration | Context dict per schema | Global `@register_type` decorator | -| Validation | Manual | Built-in `validate()` method | -| Distribution | Copy adapter code | Entry point packages | -| Key access | Not available | Optional `key` parameter | - -## Best Practices - -1. **Choose descriptive type names**: Use lowercase with underscores (e.g., `spike_train`, `graph_embedding`) - -2. **Select appropriate storage types**: Use `` for complex objects, `json` for simple structures, external storage for large data - -3. **Add validation**: Use `validate()` to catch data errors early - -4. **Document your types**: Include docstrings explaining the expected input/output formats - -5. **Handle None values**: Your encode/decode methods may receive `None` for nullable attributes - -6. **Consider versioning**: If your encoding format might change, include version information - -7. **Test round-trips**: Ensure `decode(encode(x)) == x` for all valid inputs - -```python -def test_graph_type_roundtrip(): - g = nx.lollipop_graph(4, 2) - t = GraphType() - - encoded = t.encode(g) - decoded = t.decode(encoded) - - assert set(g.edges) == set(decoded.edges) -``` - -## Built-in Types - -DataJoint includes a built-in type for explicit blob serialization: - -### `` - DataJoint Blob Serialization - -The `` type provides explicit control over DataJoint's native binary -serialization. It supports: - -- NumPy arrays (compatible with MATLAB) -- Python dicts, lists, tuples, sets -- datetime objects, Decimals, UUIDs -- Nested data structures -- Optional compression - -```python -@schema -class ProcessedData(dj.Manual): - definition = """ - data_id : int - --- - results : # Serialized Python objects - raw_bytes : longblob # Raw bytes (no serialization) - """ -``` - -#### When to Use `` - -- **Serialized data**: When storing Python objects (dicts, arrays, etc.) 
-- **New tables**: Prefer `` for automatic serialization -- **Migration**: Existing schemas with implicit serialization must migrate - -#### Raw Blob Behavior - -Plain `longblob` (and other blob variants) columns now store and return -**raw bytes** without automatic serialization: - -```python -@schema -class RawData(dj.Manual): - definition = """ - id : int - --- - raw_bytes : longblob # Stores/returns raw bytes - serialized : # Stores Python objects with serialization - """ - -# Raw bytes - no serialization -RawData.insert1({"id": 1, "raw_bytes": b"raw binary data", "serialized": {"key": "value"}}) - -row = (RawData & "id=1").fetch1() -row["raw_bytes"] # Returns: b"raw binary data" -row["serialized"] # Returns: {"key": "value"} -``` - -**Important**: Existing schemas that relied on implicit blob serialization -must be migrated to `` to preserve their behavior. - -## Schema Migration - -When upgrading existing schemas to use explicit type declarations, DataJoint -provides migration utilities. - -### Analyzing Blob Columns - -```python -import datajoint as dj - -schema = dj.schema("my_database") - -# Check migration status -status = dj.migrate.check_migration_status(schema) -print(f"Blob columns: {status['total_blob_columns']}") -print(f"Already migrated: {status['migrated']}") -print(f"Pending migration: {status['pending']}") -``` - -### Generating Migration SQL - -```python -# Preview migration (dry run) -result = dj.migrate.migrate_blob_columns(schema, dry_run=True) -for sql in result['sql_statements']: - print(sql) -``` - -### Applying Migration - -```python -# Apply migration -result = dj.migrate.migrate_blob_columns(schema, dry_run=False) -print(f"Migrated {result['migrated']} columns") -``` - -### Migration Details - -The migration updates MySQL column comments to include the type declaration. -This is a **metadata-only** change - the actual blob data format is unchanged. - -All blob type variants are handled: `tinyblob`, `blob`, `mediumblob`, `longblob`. - -Before migration: -- Column: `longblob` (or `blob`, `mediumblob`, etc.) -- Comment: `user comment` -- Behavior: Auto-serialization (implicit) - -After migration: -- Column: `longblob` (unchanged) -- Comment: `::user comment` -- Behavior: Explicit serialization via `` - -### Updating Table Definitions - -After database migration, update your Python table definitions for consistency: - -```python -# Before -class MyTable(dj.Manual): - definition = """ - id : int - --- - data : longblob # stored data - """ - -# After -class MyTable(dj.Manual): - definition = """ - id : int - --- - data : # stored data - """ -``` - -Both definitions work identically after migration, but using `` makes -the serialization explicit and documents the intended behavior. From 568d71a79266ec97fb28f9425fd209ad3c280253 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Thu, 1 Jan 2026 17:55:58 -0600 Subject: [PATCH 26/32] Remove object-type-spec.md (implementation complete) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The detailed implementation specification has served its purpose. User documentation is now in object.md, codec API in codec-spec.md, and type architecture in storage-types-spec.md. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- docs/src/design/tables/object-type-spec.md | 1473 -------------------- 1 file changed, 1473 deletions(-) delete mode 100644 docs/src/design/tables/object-type-spec.md diff --git a/docs/src/design/tables/object-type-spec.md b/docs/src/design/tables/object-type-spec.md deleted file mode 100644 index 24fb2b4a7..000000000 --- a/docs/src/design/tables/object-type-spec.md +++ /dev/null @@ -1,1473 +0,0 @@ -# Object Column Type Specification - -## Overview - -The `object` type introduces a new paradigm for managed file storage in DataJoint. Unlike existing `attach@store` and `filepath@store` types that reference named stores, the `object` type uses a **unified storage backend** that is tightly coupled with the schema and configured at the pipeline level. - -The `object` type supports both **files and folders**. Content is copied to storage at insert time, referenced via handle on fetch, and deleted when the record is deleted. - -### Immutability Contract - -Objects stored via the `object` type are **immutable after finalization**. Users agree to: -- **Insert (copy)**: Copy existing content to storage -- **Insert (staged)**: Reserve path, write directly, then finalize -- **Fetch**: Read content via handle (no modification) -- **Delete**: Remove content when record is deleted (only way to remove) - -Once an object is **finalized** (either via copy-insert or staged-insert completion), users must not directly modify it in the object store. - -#### Two Insert Modes - -| Mode | Use Case | Workflow | -|------|----------|----------| -| **Copy** | Small files, existing data | Local file → copy to storage → insert record | -| **Staged** | Large objects, Zarr, TileDB | Reserve path → write directly to storage → finalize record | - -### Augmented Schema vs External References - -The `object` type implements **Augmented Schema (AUS)** — a paradigm where the object store becomes a true extension of the relational database: - -- **DataJoint fully controls** the object store lifecycle -- **Only DataJoint writes** to the object store (users may have direct read access) -- **Tight coupling** between database and object store -- **Joint transaction management** on objects and database records -- **Single backend per pipeline** — all managed objects live together - -This is fundamentally different from **external references**, where DataJoint merely points to user-managed data: - -| Aspect | `object` (Augmented Schema) | `filepath@store` (External Reference) | -|--------|----------------------------|--------------------------------------| -| **Ownership** | DataJoint owns the data | User owns the data | -| **Writes** | Only via DataJoint | User writes directly | -| **Deletion** | DataJoint deletes on record delete | User manages lifecycle | -| **Multi-backend** | Single backend per pipeline | Multiple named stores | -| **Use case** | Pipeline-generated data | Collaborator data, legacy assets | - -**When to use each:** - -- Use `object` for data that DataJoint should own and manage as part of the schema (e.g., processed results, derived datasets) -- Use `filepath@store` for referencing externally-managed data across multiple backends (e.g., collaborator data on different cloud providers, legacy data that shouldn't be moved) - -## Storage Architecture - -### Default and Named Stores - -Each DataJoint pipeline has a **default storage backend** plus optional **named stores**, all configured in `datajoint.json`. 
DataJoint fully controls the path structure within each store. - -```python -@schema -class Recording(dj.Manual): - definition = """ - subject_id : int - session_id : int - --- - raw_data : object # uses default store - published : object@public # uses 'public' named store - """ -``` - -**All stores follow OAS principles:** -- DataJoint owns the lifecycle (insert/delete/fetch as a unit) -- Same deterministic path structure (`project/schema/Table/objects/...`) -- Same access control alignment with database -- Each store has its own `datajoint_store.json` metadata file - -**Why support multiple stores?** -- Different access policies (private vs public buckets) -- Different storage tiers (hot vs cold storage) -- Organizational requirements (data sovereignty, compliance) - -**Why require explicit store configuration?** -- All stores must be registered for OAS semantics -- Credential management aligns with database access control (platform-managed) -- Orphan cleanup operates per-store with full knowledge of configured stores - -### Access Control Patterns - -The deterministic path structure (`project/schema/Table/objects/pk=val/...`) enables **prefix-based access control policies** on each storage backend. - -**Supported access control levels:** - -| Level | Implementation | Example Policy Prefix | -|-------|---------------|----------------------| -| Project-level | IAM/bucket policy | `my-bucket/my_project/*` | -| Schema-level | IAM/bucket policy | `my-bucket/my_project/lab_internal/*` | -| Table-level | IAM/bucket policy | `my-bucket/my_project/schema/SensitiveTable/*` | -| Row-level | Per-object ACL or signed URLs | Future enhancement | - -**Example: Private and public data in separate stores** - -``` -# Default store (private) -s3://internal-bucket/my_project/ -└── lab_schema/ - └── ProcessingResults/ - └── objects/... - -# Named 'public' store -s3://public-bucket/my_project/ -└── lab_schema/ - └── PublishedDatasets/ - └── objects/... -``` - -Alternatively, use prefix-based policies within a single bucket if preferred. - -**Row-level access control** (access to objects for specific primary key values) is not directly supported by object store policies. Future versions may address this via DataJoint-generated signed URLs that project database permissions onto object access. - -### Supported Backends - -DataJoint uses **[`fsspec`](https://filesystem-spec.readthedocs.io/en/latest/)** to ensure compatibility across multiple storage backends: - -- **Local storage** – POSIX-compliant file systems (e.g., NFS, SMB) -- **Cloud-based object storage** – Amazon S3, Google Cloud Storage, Azure Blob, MinIO - -## Project Structure - -A DataJoint project creates a structured hierarchical storage pattern: - -``` -📁 project_name/ -├── datajoint_store.json # store metadata (not client config) -├── 📁 schema_name/ -│ ├── 📁 Table1/ -│ │ ├── data.parquet # tabular data export (future) -│ │ └── 📁 objects/ # object storage for this table -│ │ ├── pk1=val1/pk2=val2/field1_token.dat -│ │ └── pk1=val1/pk2=val2/field2_token.zarr -│ ├── 📁 Table2/ -│ │ ├── data.parquet -│ │ └── 📁 objects/ -│ │ └── ... 
-``` - -### Object Storage Keys - -When using cloud object storage: - -``` -s3://bucket/project_name/schema_name/Table1/objects/pk1=val1/field_token.dat -s3://bucket/project_name/schema_name/Table1/objects/pk1=val1/field_token.zarr -``` - -## Configuration - -### Settings Structure - -Object storage is configured in `datajoint.json` using the existing settings system: - -```json -{ - "database.host": "localhost", - "database.user": "datajoint", - - "object_storage.project_name": "my_project", - "object_storage.protocol": "s3", - "object_storage.endpoint": "s3.amazonaws.com", - "object_storage.bucket": "my-bucket", - "object_storage.location": "my_project", - "object_storage.partition_pattern": "{subject_id}/{session_id}" -} -``` - -For local filesystem storage: - -```json -{ - "object_storage.project_name": "my_project", - "object_storage.protocol": "file", - "object_storage.location": "/data/my_project", - "object_storage.partition_pattern": "{subject_id}/{session_id}" -} -``` - -### Named Stores - -Additional stores can be defined using the `object_storage.stores.` prefix: - -```json -{ - "object_storage.project_name": "my_project", - "object_storage.protocol": "s3", - "object_storage.bucket": "internal-bucket", - "object_storage.location": "my_project", - - "object_storage.stores.public.protocol": "s3", - "object_storage.stores.public.bucket": "public-bucket", - "object_storage.stores.public.location": "my_project" -} -``` - -Named stores inherit `project_name` from the default configuration but can override all other settings. Use named stores with the `object@store_name` syntax: - -```python -@schema -class Dataset(dj.Manual): - definition = """ - dataset_id : int - --- - internal_data : object # default store (internal-bucket) - published_data : object@public # public store (public-bucket) - """ -``` - -Each named store: -- Must be explicitly configured (no ad-hoc URLs) -- Has its own `datajoint_store.json` metadata file -- Follows the same OAS lifecycle semantics as the default store -- Credentials are managed at the platform level, aligned with database access control - -### Settings Schema - -| Setting | Type | Required | Description | -|---------|------|----------|-------------| -| `object_storage.project_name` | string | Yes | Unique project identifier (must match store metadata) | -| `object_storage.protocol` | string | Yes | Storage backend: `file`, `s3`, `gcs`, `azure` | -| `object_storage.location` | string | Yes | Base path or bucket prefix | -| `object_storage.bucket` | string | For cloud | Bucket name (S3, GCS, Azure) | -| `object_storage.endpoint` | string | For S3 | S3 endpoint URL | -| `object_storage.partition_pattern` | string | No | Path pattern with `{attribute}` placeholders | -| `object_storage.token_length` | int | No | Random suffix length for filenames (default: 8, range: 4-16) | -| `object_storage.access_key` | string | For cloud | Access key (can use secrets file) | -| `object_storage.secret_key` | string | For cloud | Secret key (can use secrets file) | - -### Configuration Immutability - -**CRITICAL**: Once a project has been instantiated (i.e., `datajoint_store.json` has been created and the first object stored), the following settings MUST NOT be changed: - -- `object_storage.project_name` -- `object_storage.protocol` -- `object_storage.bucket` -- `object_storage.location` -- `object_storage.partition_pattern` - -Changing these settings after objects have been stored will result in **broken references**—existing paths stored in the database will no 
longer resolve to valid storage locations. - -DataJoint validates `project_name` against `datajoint_store.json` on connect, but administrators must ensure other settings remain consistent across all clients for the lifetime of the project. - -### Environment Variables - -Settings can be overridden via environment variables: - -```bash -DJ_OBJECT_STORAGE_PROTOCOL=s3 -DJ_OBJECT_STORAGE_BUCKET=my-bucket -DJ_OBJECT_STORAGE_LOCATION=my_project -DJ_OBJECT_STORAGE_PARTITION_PATTERN="subject{subject_id}/session{session_id}" -``` - -### Secrets - -Credentials can be stored in the `.secrets/` directory: - -``` -.secrets/ -├── object_storage.access_key -└── object_storage.secret_key -``` - -### Partition Pattern - -The partition pattern is configured **per pipeline** (one per settings file). Placeholders use `{attribute_name}` syntax and are replaced with primary key values. - -```json -{ - "object_storage.partition_pattern": "subject{subject_id}/session{session_id}" -} -``` - -**Example with partitioning:** - -``` -s3://my-bucket/my_project/subject_id=123/session_id=45/schema_name/Recording/objects/raw_data_Ax7bQ2kM.dat -``` - -If no partition pattern is specified, files are organized directly under `{location}/{schema}/{Table}/objects/`. - -## Store Metadata (`datajoint_store.json`) - -Each object store contains a metadata file at its root that identifies the store and enables verification by DataJoint clients. This file is named `datajoint_store.json` to distinguish it from client configuration files (`datajoint.json`). - -### Location - -``` -{location}/datajoint_store.json -``` - -For cloud storage: -``` -s3://bucket/my_project/datajoint_store.json -``` - -### Content - -```json -{ - "project_name": "my_project", - "created": "2025-01-15T10:30:00Z", - "format_version": "1.0", - "datajoint_version": "0.15.0", - "database_host": "db.example.com", - "database_name": "my_project_db" -} -``` - -### Schema - -| Field | Type | Required | Description | -|-------|------|----------|-------------| -| `project_name` | string | Yes | Unique project identifier | -| `created` | string | Yes | ISO 8601 timestamp of store creation | -| `format_version` | string | Yes | Store format version for compatibility | -| `datajoint_version` | string | Yes | DataJoint version that created the store | -| `database_host` | string | No | Database server hostname (for bidirectional mapping) | -| `database_name` | string | No | Database name on the server (for bidirectional mapping) | - -The `database_name` field exists for DBMS platforms that support multiple databases on a single server (e.g., PostgreSQL, MySQL). The object storage configuration is **shared across all schemas comprising the pipeline**—it's a pipeline-level setting, not a per-schema setting. - -The optional `database_host` and `database_name` fields enable bidirectional mapping between object stores and databases: - -- **Forward**: Client settings → object store location -- **Reverse**: Object store metadata → originating database - -This is informational only—not enforced at runtime. Administrators can alternatively ensure unique `project_name` values across their namespace, and managed platforms may handle this mapping externally. - -### Store Initialization - -The store metadata file is created when the first `object` attribute is used: - -``` -┌─────────────────────────────────────────────────────────┐ -│ 1. Client attempts first file operation │ -├─────────────────────────────────────────────────────────┤ -│ 2. 
Check if datajoint_store.json exists │ -│ ├─ If exists: verify project_name matches │ -│ └─ If not: create with current project_name │ -├─────────────────────────────────────────────────────────┤ -│ 3. On mismatch: raise DataJointError │ -└─────────────────────────────────────────────────────────┘ -``` - -### Client Verification - -DataJoint performs a basic verification on connect to ensure store-database cohesion: - -1. **On connect**: Client reads `datajoint_store.json` from store -2. **Verify**: `project_name` in client settings matches store metadata -3. **On mismatch**: Raise `DataJointError` with descriptive message - -```python -# Example error -DataJointError: Object store project name mismatch. - Client configured: "project_a" - Store metadata: "project_b" - Ensure all clients use the same object_storage.project_name setting. -``` - -### Administrative Responsibility - -A 1:1 correspondence is assumed between: -- Database location + `project_name` in client settings -- Object store + `project_name` in store metadata - -DataJoint performs basic verification but does **not** enforce this mapping. Administrators are responsible for ensuring correct configuration across all clients. - -## Syntax - -```python -@schema -class Recording(dj.Manual): - definition = """ - subject_id : int - session_id : int - --- - raw_data : object # uses default store - processed : object # another object attribute (default store) - published : object@public # uses named 'public' store - """ -``` - -- `object` — uses the default storage backend -- `object@store_name` — uses a named store (must be configured in settings) - -## Database Storage - -The `object` type is stored as a `JSON` column in MySQL containing: - -**File in default store:** -```json -{ - "store": null, - "url": "s3://my-bucket/my_project/my_schema/Recording/objects/subject_id=123/session_id=45/raw_data_Ax7bQ2kM.dat", - "path": "my_schema/Recording/objects/subject_id=123/session_id=45/raw_data_Ax7bQ2kM.dat", - "size": 12345, - "hash": null, - "ext": ".dat", - "is_dir": false, - "timestamp": "2025-01-15T10:30:00Z", - "mime_type": "application/octet-stream" -} -``` - -**File in named store:** -```json -{ - "store": "public", - "url": "s3://public-bucket/my_project/my_schema/Dataset/objects/dataset_id=1/published_data_Bx8cD3kM.dat", - "path": "my_schema/Dataset/objects/dataset_id=1/published_data_Bx8cD3kM.dat", - "size": 12345, - "hash": "sha256:abcdef1234...", - "ext": ".dat", - "is_dir": false, - "timestamp": "2025-01-15T10:30:00Z", - "mime_type": "application/octet-stream" -} -``` - -**Folder example:** -```json -{ - "store": null, - "url": "s3://my-bucket/my_project/my_schema/Recording/objects/subject_id=123/session_id=45/raw_data_pL9nR4wE", - "path": "my_schema/Recording/objects/subject_id=123/session_id=45/raw_data_pL9nR4wE", - "size": 567890, - "hash": null, - "ext": null, - "is_dir": true, - "timestamp": "2025-01-15T10:30:00Z", - "item_count": 42 -} -``` - -**Zarr example (large dataset, metadata fields omitted for performance):** -```json -{ - "store": null, - "url": "s3://my-bucket/my_project/my_schema/Recording/objects/subject_id=123/session_id=45/neural_data_kM3nP2qR.zarr", - "path": "my_schema/Recording/objects/subject_id=123/session_id=45/neural_data_kM3nP2qR.zarr", - "size": null, - "hash": null, - "ext": ".zarr", - "is_dir": true, - "timestamp": "2025-01-15T10:30:00Z" -} -``` - -### JSON Schema - -| Field | Type | Required | Description | -|-------|------|----------|-------------| -| `store` | string/null | Yes | Store name 
(e.g., `"public"`), or `null` for default store | -| `url` | string | Yes | Full URL including protocol and bucket (e.g., `s3://bucket/path`) | -| `path` | string | Yes | Relative path within store (excludes protocol/bucket, includes token) | -| `size` | integer/null | No | Total size in bytes (sum for folders), or null if not computed. See [Performance Considerations](#performance-considerations). | -| `hash` | string/null | Yes | Content hash with algorithm prefix, or null (default) | -| `ext` | string/null | Yes | File extension as tooling hint (e.g., `.dat`, `.zarr`) or null. See [Extension Field](#extension-field). | -| `is_dir` | boolean | Yes | True if stored content is a directory/key-prefix (e.g., Zarr store) | -| `timestamp` | string | Yes | ISO 8601 upload timestamp | -| `mime_type` | string | No | MIME type (files only, auto-detected from extension) | -| `item_count` | integer | No | Number of files (folders only), or null if not computed. See [Performance Considerations](#performance-considerations). | - -**Why both `url` and `path`?** -- `url`: Self-describing, enables cross-validation, robust to config changes -- `path`: Enables store name re-derivation at migration time, consistent structure across stores -- At migration, the store name can be derived by matching `url` against configured stores - -### Extension Field - -The `ext` field is a **tooling hint** that preserves the original file extension or provides a conventional suffix for directory-based formats. It is: - -- **Not a content-type declaration**: Unlike `mime_type`, it does not attempt to describe the internal content format -- **Useful for tooling**: Enables file browsers, IDEs, and other tools to display appropriate icons or suggest applications -- **Conventional for formats like Zarr**: The `.zarr` extension is recognized by the ecosystem even though a Zarr store contains mixed content (JSON metadata + binary chunks) - -For single files, `ext` is extracted from the source filename. For staged inserts (like Zarr), it can be explicitly provided. - -### Performance Considerations - -For large hierarchical data like Zarr stores, computing certain metadata can be expensive: - -- **`size`**: Requires listing all objects and summing their sizes. For stores with millions of chunks, this can take minutes or hours. -- **`item_count`**: Requires listing all objects. Same performance concern as `size`. -- **`hash`**: Requires reading all content. Explicitly not supported for staged inserts. - -**These fields are optional** and default to `null` for staged inserts. Users can explicitly request computation when needed, understanding the performance implications. - -### Content Hashing - -By default, **no content hash is computed** to avoid performance overhead for large objects. Storage backend integrity is trusted. 
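When a hash is requested (see the insert kwarg below), the `hash` metadata field stores it as `algorithm:hexdigest` (e.g., `sha256:abcdef1234...`). A minimal sketch of computing such a value for a local file follows; the helper name `compute_content_hash` is hypothetical, and `xxhash` (xxh3) would come from the third-party `xxhash` package rather than `hashlib`:

```python
# Hypothetical helper -- not part of the DataJoint API.
import hashlib
from pathlib import Path


def compute_content_hash(path: str | Path, algorithm: str = "sha256") -> str:
    """Stream the file in chunks so large objects never load fully into memory."""
    hasher = hashlib.new(algorithm)  # "sha256" or "md5"; xxh3 would need the xxhash package
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            hasher.update(chunk)
    return f"{algorithm}:{hasher.hexdigest()}"


# compute_content_hash("/path/to/important.dat") -> "sha256:<hexdigest>"
```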
- -**Explicit hash control** via insert kwarg: - -```python -# Default - no hash (fast) -Recording.insert1({..., "raw_data": "/path/to/large.dat"}) - -# Explicit hash request - user specifies algorithm -Recording.insert1({..., "raw_data": "/path/to/important.dat"}, hash="sha256") - -# Other supported algorithms -Recording.insert1({..., "raw_data": "/path/to/data.bin"}, hash="md5") -Recording.insert1({..., "raw_data": "/path/to/large.bin"}, hash="xxhash") # xxh3, faster for large files -``` - -**Design principles:** - -- **Explicit over implicit**: No automatic hashing based on file size or other heuristics -- **User controls the tradeoff**: User decides when integrity verification is worth the performance cost -- **Files only**: Hash applies to files, not folders (folders use manifests for integrity) -- **Staged inserts**: Hash is always `null` regardless of kwarg—data flows directly to storage without a local copy to hash - -Supported hash algorithms: `sha256`, `md5`, `xxhash` (xxh3, faster for large files) - -### Folder Manifests - -For folders (directories), a **manifest file** is created alongside the folder in the object store to enable integrity verification without computing content hashes: - -``` -raw_data_pL9nR4wE/ -raw_data_pL9nR4wE.manifest.json -``` - -**Manifest content:** -```json -{ - "files": [ - {"path": "file1.dat", "size": 1234}, - {"path": "subdir/file2.dat", "size": 5678}, - {"path": "subdir/file3.dat", "size": 91011} - ], - "total_size": 567890, - "item_count": 42, - "created": "2025-01-15T10:30:00Z" -} -``` - -**Design rationale:** -- Stored in object store (not database) to avoid bloating the JSON for folders with many files -- Placed alongside folder (not inside) to avoid polluting folder contents and interfering with tools like Zarr -- Enables self-contained verification without database access - -The manifest enables: -- Quick verification that all expected files exist -- Size validation without reading file contents -- Detection of missing or extra files - -### Filename Convention - -The stored filename is **always derived from the field name**: -- **Base name**: The attribute/field name (e.g., `raw_data`) -- **Extension**: Adopted from source file (copy insert) or optionally provided (staged insert) -- **Token**: Random suffix for collision avoidance - -``` -Stored filename = {field}_{token}{ext} - -Examples: - raw_data_Ax7bQ2kM.dat # file with .dat extension - raw_data_pL9nR4wE.zarr # Zarr directory with .zarr extension - raw_data_kM3nP2qR # directory without extension -``` - -This convention ensures: -- Consistent, predictable naming across all objects -- Field name visible in storage for easier debugging -- Extension preserved for MIME type detection and tooling compatibility - -## Path Generation - -Storage paths are **deterministically constructed** from record metadata, enabling bidirectional lookup between database records and stored files. - -### Path Components - -1. **Location** - from configuration (`object_storage.location`) -2. **Partition attributes** - promoted PK attributes (if `partition_pattern` configured) -3. **Schema name** - from the table's schema -4. **Table name** - the table class name -5. **Object directory** - `objects/` -6. **Primary key encoding** - remaining PK attributes and values -7. 
**Suffixed filename** - `{field}_{token}{ext}` - -### Path Template - -**Without partitioning:** -``` -{location}/{schema}/{Table}/objects/{pk_attr1}={pk_val1}/{pk_attr2}={pk_val2}/.../{field}_{token}{ext} -``` - -**With partitioning:** -``` -{location}/{partition_attr}={val}/.../schema/{Table}/objects/{remaining_pk_attrs}/.../{field}_{token}{ext} -``` - -Note: The `objects/` directory follows the table name, allowing each table folder to also contain tabular data exports (e.g., `data.parquet`) alongside the objects. - -### Partitioning - -The **partition pattern** allows promoting certain primary key attributes to the beginning of the path (after `location`). This organizes storage by high-level attributes like subject or experiment, enabling: -- Efficient data locality for related records -- Easier manual browsing of storage -- Potential for storage tiering by partition - -**Configuration:** -```json -{ - "object_storage.partition_pattern": "{subject_id}/{experiment_id}" -} -``` - -Partition attributes are extracted from the primary key and placed at the path root. Remaining PK attributes appear in their normal position. - -### Example Without Partitioning - -For a table: -```python -@schema -class Recording(dj.Manual): - definition = """ - subject_id : int - session_id : int - --- - raw_data : object - """ -``` - -Inserting `{"subject_id": 123, "session_id": 45, "raw_data": "/path/to/recording.dat"}` produces: - -``` -my_project/my_schema/Recording/objects/subject_id=123/session_id=45/raw_data_Ax7bQ2kM.dat -``` - -Note: The filename is `raw_data` (field name) with `.dat` extension (from source file). - -### Example With Partitioning - -With `partition_pattern = "{subject_id}"`: - -``` -my_project/subject_id=123/my_schema/Recording/objects/session_id=45/raw_data_Ax7bQ2kM.dat -``` - -The `subject_id` is promoted to the path root, grouping all files for subject 123 together regardless of schema or table. - -### Deterministic Bidirectional Mapping - -The path structure (excluding the random token) is fully deterministic: -- **Record → File**: Given a record's primary key, construct the path prefix to locate its file -- **File → Record**: Parse the path to extract schema, table, field, and primary key values - -This enables: -- Finding all files for a specific record -- Identifying which record a file belongs to -- Auditing storage against database contents - -The **random token** is stored in the JSON metadata to complete the full path. 
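As a concrete illustration of this bidirectional mapping, the sketch below builds a path from a primary key and parses one back. The helper names (`build_object_path`, `parse_object_path`) are hypothetical, not part of the DataJoint API, and partitioning is omitted:

```python
# Illustrative sketch of the deterministic path scheme (no partitioning).
import posixpath
import secrets


def build_object_path(location, schema, table, primary_key, field, ext=""):
    """{location}/{schema}/{Table}/objects/{pk_attr}={pk_val}/.../{field}_{token}{ext}"""
    token = secrets.token_urlsafe(6)  # 8 URL- and filename-safe characters
    pk_part = "/".join(f"{k}={v}" for k, v in primary_key.items())
    return f"{location}/{schema}/{table}/objects/{pk_part}/{field}_{token}{ext}"


def parse_object_path(path):
    """Recover (schema, table, primary_key, field) from a stored path."""
    parts = path.split("/")
    schema, table = parts[1], parts[2]  # parts[0] is the location
    assert parts[3] == "objects"
    primary_key = dict(p.split("=", 1) for p in parts[4:-1])
    stem, _ext = posixpath.splitext(parts[-1])
    field, token = stem[:-9], stem[-8:]  # "_" + 8-character token (default token_length)
    return schema, table, primary_key, field


p = build_object_path(
    "my_project", "my_schema", "Recording",
    {"subject_id": 123, "session_id": 45}, "raw_data", ".dat",
)
print(parse_object_path(p))
# ('my_schema', 'Recording', {'subject_id': '123', 'session_id': '45'}, 'raw_data')
```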
- -### Primary Key Value Encoding - -Primary key values are encoded directly in paths when they are simple, path-safe types: -- **Integers**: Used directly (`subject_id=123`) -- **Dates**: ISO format (`session_date=2025-01-15`) -- **Timestamps**: ISO format with safe separators (`created=2025-01-15T10-30-00`) -- **Simple strings**: Used directly if path-safe (`experiment=baseline`) - -**Conversion to path-safe strings** is applied only when necessary: -- Strings containing `/`, `\`, or other path-unsafe characters -- Very long strings (truncated with hash suffix) -- Binary or complex types (hashed) - -```python -# Direct encoding (no conversion needed) -subject_id=123 -session_date=2025-01-15 -trial_type=control - -# Converted encoding (path-unsafe characters) -filename=my%2Ffile.dat # "/" encoded -description=a1b2c3d4_abc123 # long string truncated + hash -``` - -### Filename Collision Avoidance - -To prevent filename collisions, each stored object receives a **random token suffix** appended to the field name: - -``` -field: raw_data, source: recording.dat -stored: raw_data_Ax7bQ2kM.dat - -field: image, source: scan.tiff -stored: image_pL9nR4wE.tiff - -field: neural_data (staged with .zarr) -stored: neural_data_kM3nP2qR.zarr -``` - -#### Token Suffix Specification - -- **Alphabet**: URL-safe and filename-safe Base64 characters: `A-Z`, `a-z`, `0-9`, `-`, `_` -- **Length**: Configurable via `object_storage.token_length` (default: 8, range: 4-16) -- **Generation**: Cryptographically random using `secrets.token_urlsafe()` - -At 8 characters with 64 possible values per character: 64^8 = 281 trillion combinations. - -#### Rationale - -- Avoids collisions without requiring existence checks -- Field name visible in storage for easier debugging/auditing -- URL-safe for web-based access to cloud storage -- Filesystem-safe across all supported platforms - -### No Deduplication - -Each insert stores a separate copy of the file, even if identical content was previously stored. This ensures: -- Clear 1:1 relationship between records and files -- Simplified delete behavior -- No reference counting complexity - -## Insert Behavior - -At insert time, the `object` attribute accepts: - -1. **Local file path** (string or `Path`): Path to an existing local file (extension extracted) -2. **Local folder path** (string or `Path`): Path to an existing local directory -3. **Remote URL** (string): URL to remote file or folder (`s3://`, `gs://`, `az://`, `http://`, `https://`) -4. 
**Tuple of (ext, stream)**: File-like object with explicit extension - -```python -# From local file path - extension (.dat) extracted from source -Recording.insert1({ - "subject_id": 123, - "session_id": 45, - "raw_data": "/local/path/to/recording.dat" -}) -# Stored as: raw_data_Ax7bQ2kM.dat - -# From local folder path - no extension -Recording.insert1({ - "subject_id": 123, - "session_id": 45, - "raw_data": "/local/path/to/data_folder/" -}) -# Stored as: raw_data_pL9nR4wE/ - -# From remote URL - copies from source to managed storage -Recording.insert1({ - "subject_id": 123, - "session_id": 45, - "raw_data": "s3://source-bucket/path/to/data.dat" -}) -# Stored as: raw_data_kM3nP2qR.dat - -# From remote Zarr store (e.g., collaborator data on GCS) -Recording.insert1({ - "subject_id": 123, - "session_id": 45, - "neural_data": "gs://collaborator-bucket/shared/experiment.zarr" -}) -# Copied to managed storage as: neural_data_pL9nR4wE.zarr - -# From stream with explicit extension -with open("/local/path/data.bin", "rb") as f: - Recording.insert1({ - "subject_id": 123, - "session_id": 45, - "raw_data": (".bin", f) - }) -# Stored as: raw_data_xY8zW3vN.bin -``` - -### Remote URL Support - -Remote URLs are detected by protocol prefix and handled via fsspec: - -| Protocol | Example | Notes | -|----------|---------|-------| -| `s3://` | `s3://bucket/path/file.dat` | AWS S3, MinIO | -| `gs://` | `gs://bucket/path/file.dat` | Google Cloud Storage | -| `az://` | `az://container/path/file.dat` | Azure Blob Storage | -| `http://` | `http://server/path/file.dat` | HTTP (read-only source) | -| `https://` | `https://server/path/file.dat` | HTTPS (read-only source) | - -**Authentication**: Remote sources may require credentials. fsspec uses standard credential discovery (environment variables, config files, IAM roles). For cross-cloud copies, ensure credentials are configured for both source and destination. - -**Performance note**: For large remote-to-remote copies, data flows through the client. This is acceptable for most use cases but may be slow for very large datasets. Future optimizations could include server-side copy for same-provider transfers. - -### Insert Processing Steps - -1. Validate input (file/folder exists, stream is readable) -2. Generate deterministic storage path with random token -3. **Copy content to storage backend** via `fsspec` -4. **If copy fails: abort insert** (no database operation attempted) -5. Compute content hash if requested (optional, default: no hash) -6. Build JSON metadata structure -7. Execute database INSERT - -### Copy-First Semantics - -The file/folder is copied to storage **before** the database insert is attempted: -- If the copy fails, the insert does not proceed -- If the copy succeeds but the database insert fails, an orphaned file may remain -- Orphaned files are acceptable due to the random token (no collision with future inserts) - -### Staged Insert (Direct Write Mode) - -For large objects like Zarr arrays, copying from local storage is inefficient. **Staged insert** allows writing directly to the destination. - -#### Why a Separate Method? - -Staged insert uses a dedicated `staged_insert1` method rather than co-opting `insert1` because: - -1. **Explicit over implicit** - Staged inserts have fundamentally different semantics (file creation happens during context, commit on exit). A separate method makes this explicit. -2. **Backward compatibility** - `insert1` returns `None` and doesn't support context manager protocol. 
Changing this could break existing code. -3. **Clear error handling** - The context manager semantics (success = commit, exception = rollback) are obvious with `staged_insert1`. -4. **Type safety** - The staged context exposes `.store()` for object fields. A dedicated method can return a properly-typed `StagedInsert` object. - -**Staged inserts are limited to `insert1`** (one row at a time). Multi-row inserts are not supported for staged operations. - -#### Basic Usage - -```python -# Stage an insert with direct object storage writes -with Recording.staged_insert1 as staged: - # Set primary key values - staged.rec['subject_id'] = 123 - staged.rec['session_id'] = 45 - - # Create object storage directly using store() - # Extension is optional - .zarr is conventional for Zarr arrays - z = zarr.open(staged.store('raw_data', '.zarr'), mode='w', shape=(10000, 10000), dtype='f4') - z[:] = compute_large_array() - - # Assign the created object to the record - staged.rec['raw_data'] = z - -# On successful exit: metadata computed, record inserted -# On exception: storage cleaned up, no record inserted -# Stored as: raw_data_Ax7bQ2kM.zarr -``` - -#### StagedInsert Interface - -```python -class StagedInsert: - """Context manager for staged insert operations.""" - - rec: dict[str, Any] # Record dict for setting attribute values - - def store(self, field: str, ext: str = "") -> fsspec.FSMap: - """ - Get an FSMap store for direct writes to an object field. - - Args: - field: Name of the object attribute - ext: Optional extension (e.g., ".zarr", ".hdf5") - - Returns: - fsspec.FSMap suitable for Zarr/xarray - """ - ... - - def open(self, field: str, ext: str = "", mode: str = "wb") -> IO: - """ - Open a file for direct writes to an object field. - - Args: - field: Name of the object attribute - ext: Optional extension (e.g., ".bin", ".dat") - mode: File mode (default: "wb") - - Returns: - File-like object for writing - """ - ... - - @property - def fs(self) -> fsspec.AbstractFileSystem: - """Return fsspec filesystem for advanced operations.""" - ... -``` - -#### Staged Insert Flow - -``` -┌─────────────────────────────────────────────────────────┐ -│ 1. Enter context: create StagedInsert with empty rec │ -├─────────────────────────────────────────────────────────┤ -│ 2. User sets primary key values in staged.rec │ -├─────────────────────────────────────────────────────────┤ -│ 3. User calls store()/open() to get storage handles │ -│ - Path reserved with random token on first call │ -│ - User writes data directly via fsspec │ -├─────────────────────────────────────────────────────────┤ -│ 4. User assigns object references to staged.rec │ -├─────────────────────────────────────────────────────────┤ -│ 5. On context exit (success): │ -│ - Build metadata (size/item_count optional, no hash) │ -│ - Execute database INSERT │ -├─────────────────────────────────────────────────────────┤ -│ 6. 
On context exit (exception): │ -│ - Delete any written data │ -│ - Re-raise exception │ -└─────────────────────────────────────────────────────────┘ -``` - -#### Zarr Example - -```python -import zarr -import numpy as np - -# Create a large Zarr array directly in object storage -with Recording.staged_insert1 as staged: - staged.rec['subject_id'] = 123 - staged.rec['session_id'] = 45 - - # Create Zarr hierarchy directly in object storage - # .zarr extension is optional but conventional - root = zarr.open(staged.store('neural_data', '.zarr'), mode='w') - root.create_dataset('timestamps', data=np.arange(1000000)) - root.create_dataset('waveforms', shape=(1000000, 82), chunks=(10000, 82)) - - # Write in chunks (streaming from acquisition) - for i, chunk in enumerate(data_stream): - root['waveforms'][i*10000:(i+1)*10000] = chunk - - # Assign to record - staged.rec['neural_data'] = root - -# Record automatically inserted with computed metadata -# Stored as: neural_data_kM3nP2qR.zarr -``` - -#### Multiple Object Fields - -```python -with Recording.staged_insert1 as staged: - staged.rec['subject_id'] = 123 - staged.rec['session_id'] = 45 - - # Write multiple object fields - extension optional - raw = zarr.open(staged.store('raw_data', '.zarr'), mode='w', shape=(1000, 1000)) - raw[:] = raw_array - - processed = zarr.open(staged.store('processed', '.zarr'), mode='w', shape=(100, 100)) - processed[:] = processed_array - - staged.rec['raw_data'] = raw - staged.rec['processed'] = processed - -# Stored as: raw_data_Ax7bQ2kM.zarr, processed_pL9nR4wE.zarr -``` - -#### Comparison: Copy vs Staged Insert - -| Aspect | Copy Insert | Staged Insert | -|--------|-------------|---------------| -| Data location | Must exist locally first | Written directly to storage | -| Efficiency | Copy overhead | No copy needed | -| Use case | Small files, existing data | Large arrays, streaming data | -| Cleanup on failure | Orphan possible | Cleaned up | -| API | `insert1({..., "field": path})` | `staged_insert1` context manager | -| Multi-row | Supported | Not supported (insert1 only) | - -## Transaction Handling - -Since storage backends don't support distributed transactions with MySQL, DataJoint uses a **copy-first** strategy. - -### Insert Transaction Flow - -``` -┌─────────────────────────────────────────────────────────┐ -│ 1. Validate input and generate storage path with token │ -├─────────────────────────────────────────────────────────┤ -│ 2. Copy file/folder to storage backend │ -│ └─ On failure: raise error, INSERT not attempted │ -├─────────────────────────────────────────────────────────┤ -│ 3. Compute hash (if requested) and build JSON metadata │ -├─────────────────────────────────────────────────────────┤ -│ 4. Execute database INSERT │ -│ └─ On failure: orphaned file remains (acceptable) │ -├─────────────────────────────────────────────────────────┤ -│ 5. Commit database transaction │ -│ └─ On failure: orphaned file remains (acceptable) │ -└─────────────────────────────────────────────────────────┘ -``` - -### Failure Scenarios - -| Scenario | Result | Orphaned File? 
| -|----------|--------|----------------| -| Copy fails | Clean failure, no INSERT | No | -| DB insert fails | Error raised | Yes (acceptable) | -| DB commit fails | Error raised | Yes (acceptable) | - -### Orphaned Files - -Orphaned files (files in storage without corresponding database records) may accumulate due to: -- Failed database inserts after successful copy -- Process crashes -- Network failures - -**This is acceptable** because: -- Random tokens prevent collisions with future inserts -- Orphaned files can be identified by comparing storage contents with database records -- A separate cleanup procedure removes orphaned files during maintenance - -### Orphan Cleanup Procedure - -Orphan cleanup is a **separate maintenance operation** provided via the `schema.object_storage` utility object. Cleanup operates **per-store**, iterating through all configured stores. - -```python -# Maintenance utility methods (not a hidden table) -schema.object_storage.find_orphaned(grace_period_minutes=30) # List orphaned files (all stores) -schema.object_storage.find_orphaned(store="public") # List orphaned files (specific store) -schema.object_storage.cleanup_orphaned(dry_run=True) # Delete orphaned files -schema.object_storage.verify_integrity() # Check all objects exist -schema.object_storage.stats() # Storage usage statistics -``` - -**Note**: `schema.object_storage` is a utility object, not a hidden table. Unlike `attach@store` which uses `~external_*` tables, the `object` type stores all metadata inline in JSON columns and has no hidden tables. - -**Efficient listing for Zarr and large stores:** - -For stores with Zarr arrays (potentially millions of chunk objects), cleanup uses **delimiter-based listing** to enumerate only root object names, not individual chunks: - -```python -# S3 API with delimiter - lists "directories" only -response = s3.list_objects_v2( - Bucket=bucket, - Prefix='project/schema/Table/objects/', - Delimiter='/' -) -# Returns: ['neural_data_kM3nP2qR.zarr/', 'raw_data_Ax7bQ2kM.dat'] -# NOT millions of individual chunk keys -``` - -Orphan deletion uses recursive delete to remove entire Zarr stores efficiently. - -**Grace period for in-flight inserts:** - -While random tokens prevent filename collisions, there's a race condition with in-flight inserts: - -1. Insert starts: file copied to storage with token `Ax7bQ2kM` -2. Orphan cleanup runs: lists storage, queries DB for references -3. File `Ax7bQ2kM` not yet in DB (INSERT not committed) -4. Cleanup identifies it as orphan and deletes it -5. Insert commits: DB now references deleted file! - -**Solution**: The `grace_period_minutes` parameter (default: 30) excludes files created within that window, assuming they are in-flight inserts. - -**Important considerations:** -- Cleanup enumerates all configured stores (default + named) -- Uses delimiter-based listing for efficiency with Zarr stores -- Grace period handles race conditions—cleanup is safe to run anytime -- `dry_run=True` previews deletions before execution -- Compares storage contents against JSON metadata in table columns - -## Fetch Behavior - -On fetch, the `object` type returns a **handle** (`ObjectRef` object) to the stored content. **The file is not copied** - all operations access the storage backend directly. 
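Conceptually, the handle is just the record's inline JSON metadata bound to the configured storage backend. The rough sketch below uses `_Handle` as a simplified stand-in for the `ObjectRef` class defined later in this document, ahead of the actual usage examples that follow:

```python
# Simplified stand-in for ObjectRef -- illustration only.
import json
from dataclasses import dataclass

import fsspec


@dataclass
class _Handle:
    url: str      # full URI recorded at insert time
    is_dir: bool

    def read(self) -> bytes:
        # Streams directly from the storage backend; nothing is copied to local disk.
        fs, path = fsspec.core.url_to_fs(self.url)
        return fs.cat_file(path)


# The JSON column value (abbreviated here) as fetched from the database:
meta = json.loads('{"store": null, "url": "s3://my-bucket/.../raw_data_Ax7bQ2kM.dat", "is_dir": false}')
handle = _Handle(url=meta["url"], is_dir=meta["is_dir"])
# handle.read() would stream the bytes (given credentials for the bucket).
```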
- -```python -record = Recording.fetch1() -file_ref = record["raw_data"] - -# Access metadata (no I/O) -print(file_ref.path) # Full storage path -print(file_ref.size) # File size in bytes -print(file_ref.hash) # Content hash (if computed) or None -print(file_ref.ext) # File extension (e.g., ".dat") or None -print(file_ref.is_dir) # True if stored content is a folder - -# Read content directly from storage backend -content = file_ref.read() # Returns bytes (files only) - -# Open as fsspec file object (files only) -with file_ref.open() as f: - data = f.read() - -# List contents (folders only) -contents = file_ref.listdir() # Returns list of relative paths - -# Access specific file within folder -with file_ref.open("subdir/file.dat") as f: - data = f.read() -``` - -### No Automatic Download - -Unlike `attach@store`, the `object` type does **not** automatically download content to a local path. Users access content directly through the `ObjectRef` handle, which streams from the storage backend. - -For local copies, users explicitly download: - -```python -# Download file to local destination -local_path = file_ref.download("/local/destination/") - -# Download specific file from folder -local_path = file_ref.download("/local/destination/", "subdir/file.dat") -``` - -## Implementation Components - -### 1. Settings Extension (`settings.py`) - -New `ObjectStorageSettings` class: - -```python -class ObjectStorageSettings(BaseSettings): - """Object storage configuration for object columns.""" - - model_config = SettingsConfigDict( - env_prefix="DJ_OBJECT_STORAGE_", - extra="forbid", - validate_assignment=True, - ) - - project_name: str | None = None # Must match store metadata - protocol: Literal["object", "s3", "gcs", "azure"] | None = None - location: str | None = None - bucket: str | None = None - endpoint: str | None = None - partition_pattern: str | None = None - token_length: int = Field(default=8, ge=4, le=16) - access_key: str | None = None - secret_key: SecretStr | None = None -``` - -Add to main `Config` class: - -```python -object_storage: ObjectStorageSettings = Field(default_factory=ObjectStorageSettings) -``` - -### 2. Storage Backend (`storage.py` - new module) - -- `StorageBackend` class wrapping `fsspec` -- Methods: `upload()`, `download()`, `open()`, `exists()`, `delete()` -- Path generation with partition support - -### 3. Type Declaration (`declare.py`) - -- Add `OBJECT` pattern: `object$` -- Add to `SPECIAL_TYPES` -- Substitute to `JSON` type in database - -### 4. Schema Integration (`schemas.py`) - -- Associate storage backend with schema -- Validate storage configuration on schema creation - -### 5. Insert Processing (`table.py`) - -- New `__process_file_attribute()` method -- Path generation using primary key and partition pattern -- Upload via storage backend - -### 6. Fetch Processing (`fetch.py`) - -- New `ObjectRef` class -- Lazy loading from storage backend -- Metadata access interface - -### 7. ObjectRef Class (`objectref.py` - new module) - -```python -@dataclass -class ObjectRef: - """Handle to a file or folder stored in the pipeline's storage backend.""" - - path: str - size: int - hash: str | None # content hash (if computed) or None - ext: str | None # file extension (e.g., ".dat") or None - is_dir: bool - timestamp: datetime - mime_type: str | None # files only, derived from ext - item_count: int | None # folders only - _backend: StorageBackend # internal reference - - # fsspec access (for Zarr, xarray, etc.) 
- @property - def fs(self) -> fsspec.AbstractFileSystem: - """Return fsspec filesystem for direct access.""" - ... - - @property - def store(self) -> fsspec.FSMap: - """Return FSMap suitable for Zarr/xarray.""" - ... - - @property - def full_path(self) -> str: - """Return full URI (e.g., 's3://bucket/path').""" - ... - - # File operations - def read(self) -> bytes: ... - def open(self, subpath: str | None = None, mode: str = "rb") -> IO: ... - - # Folder operations - def listdir(self, subpath: str = "") -> list[str]: ... - def walk(self) -> Iterator[tuple[str, list[str], list[str]]]: ... - - # Common operations - def download(self, destination: Path | str, subpath: str | None = None) -> Path: ... - def exists(self, subpath: str | None = None) -> bool: ... - - # Integrity verification - def verify(self) -> bool: - """ - Verify object integrity. - - For files: checks size matches, and hash if available. - For folders: validates manifest (all files exist with correct sizes). - - Returns True if valid, raises IntegrityError with details if not. - """ - ... -``` - -#### fsspec Integration - -The `ObjectRef` provides direct fsspec access for integration with array libraries: - -```python -import zarr -import xarray as xr - -record = Recording.fetch1() -obj_ref = record["raw_data"] - -# Direct Zarr access -z = zarr.open(obj_ref.store, mode='r') -print(z.shape) - -# Direct xarray access -ds = xr.open_zarr(obj_ref.store) - -# Use fsspec filesystem directly -fs = obj_ref.fs -files = fs.ls(obj_ref.full_path) -``` - -## Dependencies - -New dependency: `fsspec` with optional backend-specific packages: - -```toml -[project.dependencies] -fsspec = ">=2023.1.0" - -[project.optional-dependencies] -s3 = ["s3fs"] -gcs = ["gcsfs"] -azure = ["adlfs"] -``` - -### Storage Access Architecture - -The `object` type separates **data declaration** (the JSON metadata stored in the database) from **storage access** (the library used to read/write objects): - -- **Data declaration**: The JSON schema (path, size, hash, etc.) is a pure data structure with no library dependencies -- **Storage access**: Currently uses `fsspec` as the default accessor, but the architecture supports alternative backends - -**Why this matters**: While `fsspec` is a mature and widely-used library, alternatives like [`obstore`](https://github.com/developmentseed/obstore) offer performance advantages for certain workloads. By keeping the data model independent of the access library, future versions can support pluggable storage accessors without schema changes. - -**Current implementation**: The `ObjectRef` class provides fsspec-based accessors (`fs`, `store` properties). 
Future versions may add: -- Pluggable accessor interface -- Alternative backends (obstore, custom implementations) -- Backend selection per-operation or per-configuration - -## Comparison with Existing Types - -| Feature | `attach@store` | `filepath@store` | `object` | -|---------|----------------|------------------|--------| -| Store config | Per-attribute | Per-attribute | Per-pipeline | -| Path control | DataJoint | User-managed | DataJoint | -| DB column | binary(16) UUID | binary(16) UUID | JSON | -| Hidden tables | Yes (external) | Yes (external) | **No** | -| Backend | File/S3 only | File/S3 only | fsspec (any) | -| Partitioning | Hash-based | User path | Configurable | -| Metadata storage | External table | External table | Inline JSON | -| Deduplication | By content | By path | None | - -### No Hidden Tables - -A key architectural difference: the `object` type does **not** use hidden external tables. - -The legacy `attach@store` and `filepath@store` types store a UUID in the table column and maintain a separate hidden `~external_*` table containing: -- File paths/keys -- Checksums -- Size information -- Reference counts - -The `object` type eliminates this complexity by storing all metadata **inline** in the JSON column. This provides: -- **Simpler schema** - no hidden tables to manage or migrate -- **Self-contained records** - all information in one place -- **Easier debugging** - metadata visible directly in queries -- **No reference counting** - each record owns its object exclusively - -### Legacy Type Deprecation - -The existing `attach@store` and `filepath@store` types will be: -- **Maintained** for backward compatibility with existing pipelines -- **Deprecated** in future releases with migration warnings -- **Eventually removed** after a transition period - -New pipelines should use the `object` type exclusively. - -## Delete Behavior - -When a record with a `object` attribute is deleted: - -1. **Database delete executes first** (within transaction) -2. **File delete is attempted** after successful DB commit -3. **File delete is best-effort** - the delete transaction succeeds even if file deletion fails - -### Delete Transaction Flow - -``` -┌─────────────────────────────────────────────────────────┐ -│ 1. Execute database DELETE │ -├─────────────────────────────────────────────────────────┤ -│ 2. Commit database transaction │ -│ └─ On failure: rollback, files unchanged │ -├─────────────────────────────────────────────────────────┤ -│ 3. Issue delete command to storage backend │ -│ └─ On failure: log warning, transaction still OK │ -└─────────────────────────────────────────────────────────┘ -``` - -### Stale Files - -If file deletion fails (network error, permissions, etc.), **stale files** may remain in storage. This is acceptable because: -- The database record is already deleted (authoritative source) -- Random tokens prevent any collision with future inserts -- Stale files can be identified and cleaned via orphan detection utilities - -### No Reference Counting - -Each record owns its file exclusively. There is no deduplication or reference counting, simplifying delete logic. - -## Migration Path - -- Existing `attach@store` and `filepath@store` remain unchanged -- `object` type is additive - new tables only -- Future: Migration utilities to convert existing external storage - -## Zarr, TileDB, and Large Hierarchical Data - -The `object` type is designed with **chunk-based formats** like Zarr and TileDB in mind. 
These formats store each chunk as a separate object, which maps naturally to object storage. - -### Staged Insert Compatibility - -**Staged inserts work with formats that support chunk-based writes:** - -| Format | Staged Insert | Why | -|--------|---------------|-----| -| **Zarr** | ✅ Yes | Each chunk is a separate object | -| **TileDB** | ✅ Yes | Fragment-based storage maps to objects | -| **HDF5** | ❌ No | Single monolithic file requires random-access seek/write | - -**HDF5 limitation**: HDF5 files have internal B-tree structures that require random-access modifications. Object storage only supports full object PUT/GET operations, not partial updates. For HDF5, use **copy insert**: - -```python -# HDF5: Write locally, then copy to object storage -import h5py -import tempfile - -with tempfile.NamedTemporaryFile(suffix='.h5', delete=False) as f: - with h5py.File(f.name, 'w') as h5: - h5.create_dataset('data', data=large_array) - Recording.insert1({..., 'data_file': f.name}) -``` - -For cloud-native workflows with large arrays, **Zarr is recommended** over HDF5. - -### Recommended Workflow (Zarr) - -For large Zarr stores, use **staged insert** to write directly to object storage: - -```python -import zarr -import numpy as np - -with Recording.staged_insert1 as staged: - staged.rec['subject_id'] = 123 - staged.rec['session_id'] = 45 - - # Write Zarr directly to object storage - store = staged.store('neural_data', '.zarr') - root = zarr.open(store, mode='w') - root.create_dataset('spikes', shape=(1000000, 384), chunks=(10000, 384), dtype='f4') - - # Stream data without local intermediate copy - for i, chunk in enumerate(acquisition_stream): - root['spikes'][i*10000:(i+1)*10000] = chunk - - staged.rec['neural_data'] = root - -# Metadata recorded, no expensive size/hash computation -``` - -### JSON Metadata for Zarr - -For Zarr stores, the recommended JSON metadata omits expensive-to-compute fields: - -```json -{ - "path": "schema/Recording/objects/subject_id=123/session_id=45/neural_data_kM3nP2qR.zarr", - "size": null, - "hash": null, - "ext": ".zarr", - "is_dir": true, - "timestamp": "2025-01-15T10:30:00Z" -} -``` - -**Field notes for Zarr:** -- **`size`**: Set to `null` - computing total size requires listing all chunks -- **`hash`**: Always `null` for staged inserts - no merkle tree support currently -- **`ext`**: Set to `.zarr` as a conventional tooling hint -- **`is_dir`**: Set to `true` - Zarr stores are key prefixes (logical directories) -- **`item_count`**: Omitted - counting chunks is expensive and rarely useful -- **`mime_type`**: Omitted - Zarr contains mixed content types - -### Reading Zarr Data - -The `ObjectRef` provides direct access compatible with Zarr and xarray: - -```python -record = Recording.fetch1() -obj_ref = record['neural_data'] - -# Direct Zarr access -z = zarr.open(obj_ref.store, mode='r') -print(z['spikes'].shape) - -# xarray integration -ds = xr.open_zarr(obj_ref.store) - -# Dask integration (lazy loading) -import dask.array as da -arr = da.from_zarr(obj_ref.store, component='spikes') -``` - -### Performance Tips - -1. **Use chunked writes**: Write data in chunks that match your Zarr chunk size -2. **Avoid metadata computation**: Let `size` and `item_count` default to `null` -3. **Use appropriate chunk sizes**: Balance between too many small files (overhead) and too few large files (memory) -4. 
**Consider compression**: Configure Zarr compression (blosc, zstd) to reduce storage costs - -## Future Extensions - -- [ ] Compression options (gzip, lz4, zstd) -- [ ] Encryption at rest -- [ ] Versioning support -- [ ] Streaming upload for large files -- [ ] Checksum verification on fetch -- [ ] Cache layer for frequently accessed files -- [ ] Parallel upload/download for large folders -- [ ] Row-level object access control via signed URLs (project DB permissions onto object access) From 8271c86db1c8d984c72f4ab202993d8bddc05818 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Thu, 1 Jan 2026 18:17:31 -0600 Subject: [PATCH 27/32] Clean up dead code and outdated terminology MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Code cleanup: - Remove backward compatibility aliases (ObjectType, AttachType, etc.) - Remove misleading comments about non-existent DJBlobType/ContentType - Remove unused build_foreign_key_parser_old function - Remove unused feature switches (ADAPTED_TYPE_SWITCH, FILEPATH_FEATURE_SWITCH) - Remove unused os import from errors.py - Rename ADAPTED type category to CODEC Documentation fixes: - Update mkdocs.yaml nav: customtype.md → codecs.md - Fix dead links in attributes.md pointing to customtype.md Terminology updates: - Replace "AttributeType" with "Codec" in all comments - Replace "Adapter" with "Codec" in docstrings - Fix SHA256 → MD5 in content_registry.py docstring Version bump to 2.0.0a6 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- docs/mkdocs.yaml | 2 +- docs/src/design/tables/attributes.md | 4 +-- src/datajoint/builtin_codecs.py | 19 ------------- src/datajoint/content_registry.py | 4 +-- src/datajoint/declare.py | 30 ++++---------------- src/datajoint/errors.py | 42 ---------------------------- src/datajoint/heading.py | 4 +-- src/datajoint/migrate.py | 2 +- src/datajoint/table.py | 2 +- src/datajoint/version.py | 2 +- tests/conftest.py | 10 ++----- 11 files changed, 19 insertions(+), 102 deletions(-) diff --git a/docs/mkdocs.yaml b/docs/mkdocs.yaml index 4de4f58e1..03c10f69b 100644 --- a/docs/mkdocs.yaml +++ b/docs/mkdocs.yaml @@ -33,7 +33,7 @@ nav: - Blobs: design/tables/blobs.md - Attachments: design/tables/attach.md - Filepaths: design/tables/filepath.md - - Custom Datatypes: design/tables/customtype.md + - Custom Codecs: design/tables/codecs.md - Dependencies: design/tables/dependencies.md - Indexes: design/tables/indexes.md - Master-Part Relationships: design/tables/master-part.md diff --git a/docs/src/design/tables/attributes.md b/docs/src/design/tables/attributes.md index e122253ef..39a80ff67 100644 --- a/docs/src/design/tables/attributes.md +++ b/docs/src/design/tables/attributes.md @@ -96,7 +96,7 @@ The `@` character indicates **external storage** (object store vs database): - ``: DataJoint's native serialization format for Python objects. Supports NumPy arrays, dicts, lists, datetime objects, and nested structures. Stores in - database. Compatible with MATLAB. See [custom types](customtype.md) for details. + database. Compatible with MATLAB. See [custom codecs](codecs.md) for details. - `` / ``: Like `` but stores externally with hash- addressed deduplication. Use for large arrays that may be duplicated across rows. 
@@ -125,7 +125,7 @@ The `@` character indicates **external storage** (object store vs database): ### User-defined codecs -- ``: Define your own [custom codec](customtype.md) with +- ``: Define your own [custom codec](codecs.md) with bidirectional conversion between Python objects and database storage. Use for graphs, domain-specific objects, or custom data structures. diff --git a/src/datajoint/builtin_codecs.py b/src/datajoint/builtin_codecs.py index 56aef6779..a55494e82 100644 --- a/src/datajoint/builtin_codecs.py +++ b/src/datajoint/builtin_codecs.py @@ -119,9 +119,6 @@ def decode(self, stored: bytes, *, key: dict | None = None) -> Any: return blob.unpack(stored, squeeze=False) -# Note: DJBlobType is defined at end of file as DJBlobCodec (not BlobCodec) - - # ============================================================================= # Hash-Addressed Storage Codec # ============================================================================= @@ -203,9 +200,6 @@ def validate(self, value: Any) -> None: raise TypeError(f" expects bytes, got {type(value).__name__}") -# Note: ContentType is defined at end of file as ContentCodec (not HashCodec) - - # ============================================================================= # Path-Addressed Storage Codec (OAS - Object-Augmented Schema) # ============================================================================= @@ -419,10 +413,6 @@ def validate(self, value: Any) -> None: raise TypeError(f" expects bytes or path, got {type(value).__name__}") -# Backward compatibility alias -ObjectType = ObjectCodec - - # ============================================================================= # File Attachment Codecs # ============================================================================= @@ -544,11 +534,6 @@ def validate(self, value: Any) -> None: raise TypeError(f" expects a file path, got {type(value).__name__}") -# Backward compatibility aliases -AttachType = AttachCodec -XAttachType = AttachCodec # is now just AttachCodec with external storage - - # ============================================================================= # Filepath Reference Codec # ============================================================================= @@ -664,7 +649,3 @@ def validate(self, value: Any) -> None: if not isinstance(value, (str, Path)): raise TypeError(f" expects a path string or Path, got {type(value).__name__}") - - -# Backward compatibility alias -FilepathType = FilepathCodec diff --git a/src/datajoint/content_registry.py b/src/datajoint/content_registry.py index f9747cca7..abed955a0 100644 --- a/src/datajoint/content_registry.py +++ b/src/datajoint/content_registry.py @@ -2,8 +2,8 @@ Content-addressed storage registry for DataJoint. This module provides content-addressed storage with deduplication for the -AttributeType. Content is identified by its SHA256 hash and stored in a hierarchical -directory structure: _content/{hash[:2]}/{hash[2:4]}/{hash} +Codec. Content is identified by its MD5 hash and stored in a hierarchical +directory structure: _hash/{hash[:2]}/{hash[2:4]}/{hash} The ContentRegistry tracks stored content for garbage collection purposes. 
""" diff --git a/src/datajoint/declare.py b/src/datajoint/declare.py index 777136f0b..8b6bfda80 100644 --- a/src/datajoint/declare.py +++ b/src/datajoint/declare.py @@ -76,8 +76,8 @@ TEMPORAL=r"(time|timestamp|year)(\s*\(.+\))?$", # time, timestamp, year (not date/datetime) NATIVE_BLOB=r"(tiny|small|medium|long)blob$", # Specific blob variants NATIVE_TEXT=r"(tiny|small|medium|long)text$", # Text variants (use plain 'text' instead) - # AttributeTypes use angle brackets - ADAPTED=r"<.+>$", + # Codecs use angle brackets + CODEC=r"<.+>$", ).items() } @@ -85,7 +85,7 @@ CORE_TYPE_NAMES = {name.upper() for name in CORE_TYPES} # Special types that need comment storage (core types + adapted) -SPECIAL_TYPES = CORE_TYPE_NAMES | {"ADAPTED"} +SPECIAL_TYPES = CORE_TYPE_NAMES | {"CODEC"} # Native SQL types that pass through (with optional warning) NATIVE_TYPES = set(TYPE_PATTERN) - SPECIAL_TYPES @@ -104,23 +104,6 @@ def match_type(attribute_type): logger = logging.getLogger(__name__.split(".")[0]) -def build_foreign_key_parser_old(): - # old-style foreign key parser. Superseded by expression-based syntax. See issue #436 - # This will be deprecated in a future release. - left = pp.Literal("(").suppress() - right = pp.Literal(")").suppress() - attribute_name = pp.Word(pp.srange("[a-z]"), pp.srange("[a-z0-9_]")) - new_attrs = pp.Optional(left + pp.DelimitedList(attribute_name) + right).set_results_name("new_attrs") - arrow = pp.Literal("->").suppress() - lbracket = pp.Literal("[").suppress() - rbracket = pp.Literal("]").suppress() - option = pp.Word(pp.srange("[a-zA-Z]")) - options = pp.Optional(lbracket + pp.DelimitedList(option) + rbracket).set_results_name("options") - ref_table = pp.Word(pp.alphas, pp.alphanums + "._").set_results_name("ref_table") - ref_attrs = pp.Optional(left + pp.DelimitedList(attribute_name) + right).set_results_name("ref_attrs") - return new_attrs + arrow + options + ref_table + ref_attrs - - def build_foreign_key_parser(): arrow = pp.Literal("->").suppress() lbracket = pp.Literal("[").suppress() @@ -144,7 +127,6 @@ def build_attribute_parser(): return attribute_name + pp.Optional(default) + colon + data_type + comment -foreign_key_parser_old = build_foreign_key_parser_old() foreign_key_parser = build_foreign_key_parser() attribute_parser = build_attribute_parser() @@ -459,14 +441,14 @@ def substitute_special_type(match, category, foreign_key_sql, context): Special types are: - Core DataJoint types (float32 → float, uuid → binary(16), bytes → longblob, etc.) 
- - ADAPTED types (AttributeTypes in angle brackets) + - CODEC types (Codecs in angle brackets) :param match: dict containing with keys "type" and "comment" -- will be modified in place :param category: attribute type category from TYPE_PATTERN :param foreign_key_sql: list of foreign key declarations to add to :param context: context for looking up user-defined codecs (unused, kept for compatibility) """ - if category == "ADAPTED": + if category == "CODEC": # Codec - resolve to underlying dtype codec, store_name = lookup_codec(match["type"]) if store_name is not None: @@ -540,7 +522,7 @@ def compile_attribute(line, in_key, foreign_key_sql, context): category = match_type(match["type"]) if category in SPECIAL_TYPES: - # Core types and AttributeTypes are recorded in comment for reconstruction + # Core types and Codecs are recorded in comment for reconstruction match["comment"] = ":{type}:{comment}".format(**match) substitute_special_type(match, category, foreign_key_sql, context) elif category in NATIVE_TYPES: diff --git a/src/datajoint/errors.py b/src/datajoint/errors.py index 03555bf13..aadc74caf 100644 --- a/src/datajoint/errors.py +++ b/src/datajoint/errors.py @@ -2,8 +2,6 @@ Exception classes for the DataJoint library """ -import os - # --- Top Level --- class DataJointError(Exception): @@ -87,43 +85,3 @@ class BucketInaccessible(DataJointError): """ Error raised when a S3 bucket is inaccessible """ - - -# environment variables to control availability of experimental features - -ADAPTED_TYPE_SWITCH = "DJ_SUPPORT_ADAPTED_TYPES" -FILEPATH_FEATURE_SWITCH = "DJ_SUPPORT_FILEPATH_MANAGEMENT" - - -def _switch_adapted_types(on): - """ - Enable (on=True) or disable (on=False) support for AttributeAdapter - """ - if on: - os.environ[ADAPTED_TYPE_SWITCH] = "TRUE" - else: - del os.environ[ADAPTED_TYPE_SWITCH] - - -def _support_adapted_types(): - """ - check if support for AttributeAdapter is enabled - """ - return os.getenv(ADAPTED_TYPE_SWITCH, "FALSE").upper() == "TRUE" - - -def _switch_filepath_types(on): - """ - Enable (on=True) or disable (on=False) support for AttributeAdapter - """ - if on: - os.environ[FILEPATH_FEATURE_SWITCH] = "TRUE" - else: - del os.environ[FILEPATH_FEATURE_SWITCH] - - -def _support_filepath_types(): - """ - check if support for AttributeAdapter is enabled - """ - return os.getenv(FILEPATH_FEATURE_SWITCH, "FALSE").upper() == "TRUE" diff --git a/src/datajoint/heading.py b/src/datajoint/heading.py index c451089c0..bc555224c 100644 --- a/src/datajoint/heading.py +++ b/src/datajoint/heading.py @@ -307,8 +307,8 @@ def _init_from_database(self): # Store the original type name for display but keep db_type for SQL attr["original_type"] = special["type"] - # process Codecs (adapted types in angle brackets) - if special and TYPE_PATTERN["ADAPTED"].match(attr["type"]): + # process Codecs (types in angle brackets) + if special and TYPE_PATTERN["CODEC"].match(attr["type"]): # Context can be None for built-in types that are globally registered codec_spec = special["type"] try: diff --git a/src/datajoint/migrate.py b/src/datajoint/migrate.py index 0bfc355db..1948cbe06 100644 --- a/src/datajoint/migrate.py +++ b/src/datajoint/migrate.py @@ -2,7 +2,7 @@ Migration utilities for DataJoint schema updates. This module provides tools for migrating existing schemas to use the new -AttributeType system, particularly for upgrading blob columns to use +Codec system, particularly for upgrading blob columns to use explicit `` type declarations. 
""" diff --git a/src/datajoint/table.py b/src/datajoint/table.py index 9a3328a48..23648e1d7 100644 --- a/src/datajoint/table.py +++ b/src/datajoint/table.py @@ -732,7 +732,7 @@ def __make_placeholder(self, name, value, ignore_extra_fields=False, row=None): processing by mysql API. In the simplified type system: - - Adapters (AttributeTypes) handle all custom encoding via type chains + - Codecs handle all custom encoding via type chains - UUID values are converted to bytes - JSON values are serialized - Blob values pass through as bytes diff --git a/src/datajoint/version.py b/src/datajoint/version.py index 200fd9ba4..ae02042b2 100644 --- a/src/datajoint/version.py +++ b/src/datajoint/version.py @@ -1,4 +1,4 @@ # version bump auto managed by Github Actions: # label_prs.yaml(prep), release.yaml(bump), post_release.yaml(edit) # manually set this version will be eventually overwritten by the above actions -__version__ = "2.0.0a5" +__version__ = "2.0.0a6" diff --git a/tests/conftest.py b/tests/conftest.py index 18e5a539a..d7e814436 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -24,10 +24,7 @@ from packaging import version import datajoint as dj -from datajoint.errors import ( - FILEPATH_FEATURE_SWITCH, - DataJointError, -) +from datajoint.errors import DataJointError from . import schema, schema_advanced, schema_external, schema_object, schema_simple from . import schema_uuid as schema_uuid_module @@ -334,10 +331,9 @@ def enable_adapted_types(): @pytest.fixture -def enable_filepath_feature(monkeypatch): - monkeypatch.setenv(FILEPATH_FEATURE_SWITCH, "TRUE") +def enable_filepath_feature(): + """Legacy fixture - filepath feature is now always enabled.""" yield - monkeypatch.delenv(FILEPATH_FEATURE_SWITCH, raising=True) # --- Cleanup fixtures --- From 39cd993faf88a01e1942369b8c2a8ba50dce4e02 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Thu, 1 Jan 2026 18:20:17 -0600 Subject: [PATCH 28/32] Remove enable_filepath_feature fixture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Filepath feature is now always enabled; no feature flag needed. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- tests/conftest.py | 8 +------- tests/integration/test_adapted_attributes.py | 1 - tests/integration/test_update1.py | 8 ++++---- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index d7e814436..af1043927 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -330,12 +330,6 @@ def enable_adapted_types(): yield -@pytest.fixture -def enable_filepath_feature(): - """Legacy fixture - filepath feature is now always enabled.""" - yield - - # --- Cleanup fixtures --- @@ -550,7 +544,7 @@ def schema_adv(connection_test, prefix): @pytest.fixture -def schema_ext(connection_test, enable_filepath_feature, mock_stores, mock_cache, prefix): +def schema_ext(connection_test, mock_stores, mock_cache, prefix): schema = dj.Schema( prefix + "_extern", context=schema_external.LOCALS_EXTERNAL, diff --git a/tests/integration/test_adapted_attributes.py b/tests/integration/test_adapted_attributes.py index 3fe67a96a..81037fac0 100644 --- a/tests/integration/test_adapted_attributes.py +++ b/tests/integration/test_adapted_attributes.py @@ -23,7 +23,6 @@ def schema_name(prefix): @pytest.fixture def schema_ad( connection_test, - enable_filepath_feature, s3_creds, tmpdir, schema_name, diff --git a/tests/integration/test_update1.py b/tests/integration/test_update1.py index 92f68a8d4..eb525a6be 100644 --- a/tests/integration/test_update1.py +++ b/tests/integration/test_update1.py @@ -57,7 +57,7 @@ def schema_update1(connection_test, prefix): schema.drop() -def test_update1(tmpdir, enable_filepath_feature, schema_update1, mock_stores_update): +def test_update1(tmpdir, schema_update1, mock_stores_update): """Test normal updates""" # CHECK 1 -- initial insert key = dict(thing=1) @@ -128,19 +128,19 @@ def test_update1(tmpdir, enable_filepath_feature, schema_update1, mock_stores_up assert original_file_data == final_file_data -def test_update1_nonexistent(enable_filepath_feature, schema_update1, mock_stores_update): +def test_update1_nonexistent(schema_update1, mock_stores_update): with pytest.raises(DataJointError): # updating a non-existent entry Thing.update1(dict(thing=100, frac=0.5)) -def test_update1_noprimary(enable_filepath_feature, schema_update1, mock_stores_update): +def test_update1_noprimary(schema_update1, mock_stores_update): with pytest.raises(DataJointError): # missing primary key Thing.update1(dict(number=None)) -def test_update1_misspelled_attribute(enable_filepath_feature, schema_update1, mock_stores_update): +def test_update1_misspelled_attribute(schema_update1, mock_stores_update): key = dict(thing=17) Thing.insert1(dict(key, frac=1.5)) with pytest.raises(DataJointError): From 11e492ba46459aaa9c09d361819ffe2305256c9e Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Thu, 1 Jan 2026 18:24:47 -0600 Subject: [PATCH 29/32] Rename tests to use Codec terminology MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit File renames: - schema_adapted.py → schema_codecs.py - test_adapted_attributes.py → test_codecs.py - test_type_composition.py → test_codec_chaining.py Content updates: - LOCALS_ADAPTED → LOCALS_CODECS - GraphType → GraphCodec, LayoutToFilepathType → LayoutCodec - Test class names: TestTypeChain* → TestCodecChain* - Test function names: test_adapted_* → test_codec_* - Updated docstrings and comments 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- 
..._composition.py => test_codec_chaining.py} | 64 +++++++++---------- ...t_adapted_attributes.py => test_codecs.py} | 45 +++++++------ tests/{schema_adapted.py => schema_codecs.py} | 22 +++---- 3 files changed, 67 insertions(+), 64 deletions(-) rename tests/integration/{test_type_composition.py => test_codec_chaining.py} (87%) rename tests/integration/{test_adapted_attributes.py => test_codecs.py} (72%) rename tests/{schema_adapted.py => schema_codecs.py} (66%) diff --git a/tests/integration/test_type_composition.py b/tests/integration/test_codec_chaining.py similarity index 87% rename from tests/integration/test_type_composition.py rename to tests/integration/test_codec_chaining.py index 23ca927b0..defbd428f 100644 --- a/tests/integration/test_type_composition.py +++ b/tests/integration/test_codec_chaining.py @@ -1,8 +1,8 @@ """ -Tests for type composition (type chain encoding/decoding). +Tests for codec chaining (composition). This tests the → json composition pattern -and similar type chains. +and similar codec chains. """ from datajoint.codecs import ( @@ -12,23 +12,23 @@ ) -class TestTypeChainResolution: - """Tests for resolving type chains.""" +class TestCodecChainResolution: + """Tests for resolving codec chains.""" def setup_method(self): - """Clear test types from registry before each test.""" + """Clear test codecs from registry before each test.""" for name in list(_codec_registry.keys()): if name.startswith("test_"): del _codec_registry[name] def teardown_method(self): - """Clean up test types after each test.""" + """Clean up test codecs after each test.""" for name in list(_codec_registry.keys()): if name.startswith("test_"): del _codec_registry[name] - def test_single_type_chain(self): - """Test resolving a single-type chain.""" + def test_single_codec_chain(self): + """Test resolving a single-codec chain.""" class TestSingle(Codec): name = "test_single" @@ -49,8 +49,8 @@ def decode(self, stored, *, key=None): assert chain[0].name == "test_single" assert store is None - def test_two_type_chain(self): - """Test resolving a two-type chain.""" + def test_two_codec_chain(self): + """Test resolving a two-codec chain.""" class TestInner(Codec): name = "test_inner" @@ -83,8 +83,8 @@ def decode(self, stored, *, key=None): assert chain[0].name == "test_outer" assert chain[1].name == "test_inner" - def test_three_type_chain(self): - """Test resolving a three-type chain.""" + def test_three_codec_chain(self): + """Test resolving a three-codec chain.""" class TestBase(Codec): name = "test_base" @@ -131,17 +131,17 @@ def decode(self, stored, *, key=None): assert chain[2].name == "test_base" -class TestTypeChainEncodeDecode: - """Tests for encode/decode through type chains.""" +class TestCodecChainEncodeDecode: + """Tests for encode/decode through codec chains.""" def setup_method(self): - """Clear test types from registry before each test.""" + """Clear test codecs from registry before each test.""" for name in list(_codec_registry.keys()): if name.startswith("test_"): del _codec_registry[name] def teardown_method(self): - """Clean up test types after each test.""" + """Clean up test codecs after each test.""" for name in list(_codec_registry.keys()): if name.startswith("test_"): del _codec_registry[name] @@ -180,8 +180,8 @@ def decode(self, stored, *, key=None): # Apply encode in order: outer first, then inner value = b"start" - for attr_type in chain: - value = attr_type.encode(value) + for codec in chain: + value = codec.encode(value) assert encode_order == ["outer", "inner"] assert 
value == b"start_outer_inner" @@ -220,14 +220,14 @@ def decode(self, stored, *, key=None): # Apply decode in reverse order: inner first, then outer value = b"start_outer_inner" - for attr_type in reversed(chain): - value = attr_type.decode(value) + for codec in reversed(chain): + value = codec.decode(value) assert decode_order == ["inner", "outer"] assert value == b"start" def test_roundtrip(self): - """Test encode/decode roundtrip through a type chain.""" + """Test encode/decode roundtrip through a codec chain.""" class TestInnerRt(Codec): name = "test_inner_rt" @@ -264,21 +264,21 @@ def decode(self, stored, *, key=None): # Encode: outer → inner encoded = original - for attr_type in chain: - encoded = attr_type.encode(encoded) + for codec in chain: + encoded = codec.encode(encoded) assert encoded == b"COMPRESSED:test data" # Decode: inner → outer (reversed) decoded = encoded - for attr_type in reversed(chain): - decoded = attr_type.decode(decoded) + for codec in reversed(chain): + decoded = codec.decode(decoded) assert decoded == original -class TestBuiltinTypeComposition: - """Tests for built-in type composition.""" +class TestBuiltinCodecChains: + """Tests for built-in codec chains.""" def test_blob_internal_resolves_to_bytes(self): """Test that (internal) → bytes.""" @@ -345,17 +345,17 @@ def test_filepath_external_resolves_to_json(self): class TestStoreNameParsing: - """Tests for store name parsing in type specs.""" + """Tests for store name parsing in codec specs.""" - def test_type_with_store(self): - """Test parsing type with store name.""" + def test_codec_with_store(self): + """Test parsing codec with store name.""" final_dtype, chain, store = resolve_dtype("") assert final_dtype == "json" assert store == "mystore" - def test_type_without_store(self): - """Test parsing type without store name.""" + def test_codec_without_store(self): + """Test parsing codec without store name.""" final_dtype, chain, store = resolve_dtype("") assert store is None diff --git a/tests/integration/test_adapted_attributes.py b/tests/integration/test_codecs.py similarity index 72% rename from tests/integration/test_adapted_attributes.py rename to tests/integration/test_codecs.py index 81037fac0..05b8aabef 100644 --- a/tests/integration/test_adapted_attributes.py +++ b/tests/integration/test_codecs.py @@ -1,5 +1,5 @@ """ -Tests for adapted/custom attribute types. +Tests for custom codecs. These tests verify the Codec system for custom data types. 
""" @@ -11,50 +11,51 @@ import datajoint as dj -from tests import schema_adapted -from tests.schema_adapted import Connectivity, Layout +from tests import schema_codecs +from tests.schema_codecs import Connectivity, Layout @pytest.fixture def schema_name(prefix): - return prefix + "_test_custom_datatype" + return prefix + "_test_codecs" @pytest.fixture -def schema_ad( +def schema_codec( connection_test, s3_creds, tmpdir, schema_name, ): - dj.config["stores"] = {"repo-s3": dict(s3_creds, protocol="s3", location="adapted/repo", stage=str(tmpdir))} - # Codecs are auto-registered via __init_subclass__ in schema_adapted - context = {**schema_adapted.LOCALS_ADAPTED} + dj.config["stores"] = {"repo-s3": dict(s3_creds, protocol="s3", location="codecs/repo", stage=str(tmpdir))} + # Codecs are auto-registered via __init_subclass__ in schema_codecs + context = {**schema_codecs.LOCALS_CODECS} schema = dj.schema(schema_name, context=context, connection=connection_test) - schema(schema_adapted.Connectivity) - schema(schema_adapted.Layout) + schema(schema_codecs.Connectivity) + schema(schema_codecs.Layout) yield schema schema.drop() @pytest.fixture -def local_schema(schema_ad, schema_name): +def local_schema(schema_codec, schema_name): """Fixture for testing spawned classes""" - local_schema = dj.Schema(schema_name, connection=schema_ad.connection) + local_schema = dj.Schema(schema_name, connection=schema_codec.connection) local_schema.spawn_missing_classes() yield local_schema - # Don't drop - schema_ad fixture handles cleanup + # Don't drop - schema_codec fixture handles cleanup @pytest.fixture -def schema_virtual_module(schema_ad, schema_name): +def schema_virtual_module(schema_codec, schema_name): """Fixture for testing virtual modules""" - # Types are registered globally, no need to add_objects for codecs - schema_virtual_module = dj.VirtualModule("virtual_module", schema_name, connection=schema_ad.connection) + # Codecs are registered globally, no need to add_objects + schema_virtual_module = dj.VirtualModule("virtual_module", schema_name, connection=schema_codec.connection) return schema_virtual_module -def test_adapted_type(schema_ad): +def test_codec_graph(schema_codec): + """Test basic codec encode/decode with graph type.""" c = Connectivity() graphs = [ nx.lollipop_graph(4, 2), @@ -71,8 +72,8 @@ def test_adapted_type(schema_ad): c.delete() -def test_adapted_filepath_type(schema_ad, minio_client): - """https://github.com/datajoint/datajoint-python/issues/684""" +def test_codec_chained(schema_codec, minio_client): + """Test codec chaining (layout -> blob).""" c = Connectivity() c.delete() c.insert1((0, nx.lollipop_graph(4, 2))) @@ -88,7 +89,8 @@ def test_adapted_filepath_type(schema_ad, minio_client): c.delete() -def test_adapted_spawned(local_schema): +def test_codec_spawned(local_schema): + """Test codecs work with spawned classes.""" c = Connectivity() # a spawned class graphs = [ nx.lollipop_graph(4, 2), @@ -105,7 +107,8 @@ def test_adapted_spawned(local_schema): c.delete() -def test_adapted_virtual(schema_virtual_module): +def test_codec_virtual_module(schema_virtual_module): + """Test codecs work with virtual modules.""" c = schema_virtual_module.Connectivity() graphs = [ nx.lollipop_graph(4, 2), diff --git a/tests/schema_adapted.py b/tests/schema_codecs.py similarity index 66% rename from tests/schema_adapted.py rename to tests/schema_codecs.py index 8edf8d65d..6a8d478d4 100644 --- a/tests/schema_adapted.py +++ b/tests/schema_codecs.py @@ -5,13 +5,13 @@ import datajoint as dj -class 
GraphType(dj.Codec): +class GraphCodec(dj.Codec): """Custom codec for storing NetworkX graphs as edge lists.""" name = "graph" def get_dtype(self, is_external: bool) -> str: - """Chain to djblob for serialization.""" + """Chain to blob for serialization.""" return "" def encode(self, obj, *, key=None, store_name=None): @@ -24,22 +24,22 @@ def decode(self, stored, *, key=None): return nx.Graph(stored) -class LayoutToFilepathType(dj.Codec): - """Custom codec that saves a graph layout as serialized JSON blob.""" +class LayoutCodec(dj.Codec): + """Custom codec that saves a graph layout as serialized blob.""" - name = "layout_to_filepath" + name = "layout" def get_dtype(self, is_external: bool) -> str: - """Chain to djblob for serialization.""" + """Chain to blob for serialization.""" return "" def encode(self, layout, *, key=None, store_name=None): """Serialize layout dict.""" - return layout # djblob handles serialization + return layout # blob handles serialization def decode(self, stored, *, key=None): """Deserialize layout dict.""" - return stored # djblob handles deserialization + return stored # blob handles deserialization class Connectivity(dj.Manual): @@ -55,9 +55,9 @@ class Layout(dj.Manual): # stores graph layout -> Connectivity --- - layout: + layout: """ -LOCALS_ADAPTED = {k: v for k, v in locals().items() if inspect.isclass(v)} -__all__ = list(LOCALS_ADAPTED) +LOCALS_CODECS = {k: v for k, v in locals().items() if inspect.isclass(v)} +__all__ = list(LOCALS_CODECS) From d4560e64247016634c28f234409173c1a8f34b35 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Thu, 1 Jan 2026 19:14:20 -0600 Subject: [PATCH 30/32] Implement testcontainers for pytest-managed container lifecycle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tests now automatically start MySQL and MinIO containers via testcontainers. No manual `docker-compose up` required - just run `pytest tests/`. Changes: - conftest.py: Add mysql_container and minio_container fixtures that auto-start containers when tests run and stop them afterward - pyproject.toml: Add testcontainers[mysql,minio] dependency, update pixi tasks, remove pytest-env (no longer needed) - docker-compose.yaml: Update docs to clarify it's optional for tests - README.md: Comprehensive developer guide with clear instructions for running tests, pre-commit hooks, and PR submission checklist Usage: - Default: `pytest tests/` - testcontainers manages containers - External: `DJ_USE_EXTERNAL_CONTAINERS=1 pytest` - use docker-compose Benefits: - Zero setup for developers - just `pip install -e ".[test]" && pytest` - Dynamic ports (no conflicts with other services) - Automatic cleanup after tests - Simpler CI configuration Version bump to 2.0.0a7 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- README.md | 83 ++++++++--- docker-compose.yaml | 13 +- pyproject.toml | 28 +--- src/datajoint/version.py | 2 +- tests/conftest.py | 312 +++++++++++++++++++++++++-------------- 5 files changed, 289 insertions(+), 149 deletions(-) diff --git a/README.md b/README.md index 6466b9e7b..f4a6f8352 100644 --- a/README.md +++ b/README.md @@ -146,30 +146,57 @@ DataJoint (). 
### Prerequisites -- [Docker](https://docs.docker.com/get-docker/) for MySQL and MinIO services +- [Docker](https://docs.docker.com/get-docker/) (Docker daemon must be running) - Python 3.10+ -### Running Tests - -Tests are organized into `unit/` (no external services) and `integration/` (requires MySQL + MinIO): +### Quick Start ```bash -# Install dependencies +# Clone and install +git clone https://github.com/datajoint/datajoint-python.git +cd datajoint-python pip install -e ".[test]" -# Run unit tests only (fast, no Docker needed) -pytest tests/unit/ +# Run all tests (containers start automatically via testcontainers) +pytest tests/ -# Start MySQL and MinIO for integration tests -docker compose up -d db minio +# Install and run pre-commit hooks +pip install pre-commit +pre-commit install +pre-commit run --all-files +``` -# Run all tests +### Running Tests + +Tests use [testcontainers](https://testcontainers.com/) to automatically manage MySQL and MinIO containers. +**No manual `docker-compose up` required** - containers start when tests run and stop afterward. + +```bash +# Run all tests (recommended) pytest tests/ +# Run with coverage report +pytest --cov-report term-missing --cov=datajoint tests/ + # Run specific test file pytest tests/integration/test_blob.py -v -# Stop services when done +# Run only unit tests (no containers needed) +pytest tests/unit/ +``` + +### Alternative: External Containers + +For development/debugging, you may prefer persistent containers that survive test runs: + +```bash +# Start containers manually +docker compose up -d db minio + +# Run tests using external containers +DJ_USE_EXTERNAL_CONTAINERS=1 pytest tests/ + +# Stop containers when done docker compose down ``` @@ -183,24 +210,46 @@ docker compose --profile test up djtest --build ### Alternative: Using pixi -[pixi](https://pixi.sh) users can run tests with automatic service management: +[pixi](https://pixi.sh) users can run tests with: ```bash pixi install # First time setup -pixi run test # Starts services and runs tests -pixi run services-down # Stop services +pixi run test # Runs tests (testcontainers manages containers) ``` ### Pre-commit Hooks +Pre-commit hooks run automatically on `git commit` to check code quality. +**All hooks must pass before committing.** + ```bash -pre-commit install # Install hooks (first time) -pre-commit run --all-files # Run all checks +# Install hooks (first time only) +pip install pre-commit +pre-commit install + +# Run all checks manually +pre-commit run --all-files + +# Run specific hook +pre-commit run ruff --all-files +pre-commit run codespell --all-files ``` +Hooks include: +- **ruff**: Python linting and formatting +- **codespell**: Spell checking +- **YAML/JSON/TOML validation** +- **Large file detection** + +### Before Submitting a PR + +1. **Run all tests**: `pytest tests/` +2. **Run pre-commit**: `pre-commit run --all-files` +3. 
**Check coverage**: `pytest --cov-report term-missing --cov=datajoint tests/` + ### Environment Variables -Tests use these defaults (configured in `pyproject.toml`): +For external container mode (`DJ_USE_EXTERNAL_CONTAINERS=1`): | Variable | Default | Description | |----------|---------|-------------| diff --git a/docker-compose.yaml b/docker-compose.yaml index 98a16f165..2c48ffd10 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -1,10 +1,15 @@ # Development environment with MySQL and MinIO services # -# Quick start: -# docker compose up -d db minio # Start services -# pytest tests/ # Run tests (uses localhost defaults) +# NOTE: docker-compose is OPTIONAL for running tests. +# Tests use testcontainers to automatically manage containers. +# Just run: pytest tests/ # -# Full Docker testing: +# Use docker-compose for development/debugging when you want +# persistent containers that survive test runs: +# docker compose up -d db minio # Start services manually +# pytest tests/ # Tests will use these containers +# +# Full Docker testing (CI): # docker compose --profile test up djtest --build services: db: diff --git a/pyproject.toml b/pyproject.toml index 82cad39ea..6f6da8a3b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -84,9 +84,9 @@ datajoint = "datajoint.cli:cli" test = [ "pytest", "pytest-cov", - "pytest-env", "requests", "graphviz", + "testcontainers[mysql,minio]>=4.0", ] [project.optional-dependencies] @@ -96,9 +96,9 @@ azure = ["adlfs>=2023.1.0"] test = [ "pytest", "pytest-cov", - "pytest-env", "requests", "s3fs>=2023.1.0", + "testcontainers[mysql,minio]>=4.0", ] dev = [ "pre-commit", @@ -158,20 +158,7 @@ skip = ".git,*.pdf,*.svg,*.csv,*.ipynb,*.drawio" # astroid -- Python library name (not "asteroid") ignore-words-list = "rever,numer,astroid" -[tool.pytest_env] -# Default environment variables for tests (D: prefix = only set if not defined) -# These defaults work for local development with `docker compose up -d db minio` -# For devcontainer/docker: override DJ_HOST=db and S3_ENDPOINT=minio:9000 -"D:DJ_HOST" = "localhost" -"D:DJ_PORT" = "3306" -"D:DJ_USER" = "root" -"D:DJ_PASS" = "password" -"D:DJ_TEST_USER" = "datajoint" -"D:DJ_TEST_PASSWORD" = "datajoint" -"D:S3_ENDPOINT" = "localhost:9000" -"D:S3_ACCESS_KEY" = "datajoint" -"D:S3_SECRET_KEY" = "datajoint" -"D:S3_BUCKET" = "datajoint.test" +# pytest-env removed - testcontainers handles container lifecycle automatically [tool.pixi.workspace] @@ -187,12 +174,13 @@ dev = { features = ["dev"], solve-group = "default" } test = { features = ["test"], solve-group = "default" } [tool.pixi.tasks] -# Start required services (MySQL and MinIO) +# Tests use testcontainers - no manual setup required +test = "pytest tests/" +test-cov = "pytest --cov-report term-missing --cov=datajoint tests/" +# Optional: use external containers (docker-compose) instead of testcontainers services-up = "docker compose up -d db minio" services-down = "docker compose down" -# Run tests (requires services to be running, uses localhost defaults from pytest_env) -test = { cmd = "pytest tests/", depends-on = ["services-up"] } -test-cov = { cmd = "pytest --cov-report term-missing --cov=datajoint tests/", depends-on = ["services-up"] } +test-external = { cmd = "DJ_USE_EXTERNAL_CONTAINERS=1 pytest tests/", depends-on = ["services-up"] } [tool.pixi.dependencies] python = ">=3.10,<3.14" diff --git a/src/datajoint/version.py b/src/datajoint/version.py index ae02042b2..6075746ab 100644 --- a/src/datajoint/version.py +++ b/src/datajoint/version.py @@ -1,4 +1,4 @@ 
# version bump auto managed by Github Actions: # label_prs.yaml(prep), release.yaml(bump), post_release.yaml(edit) # manually set this version will be eventually overwritten by the above actions -__version__ = "2.0.0a6" +__version__ = "2.0.0a7" diff --git a/tests/conftest.py b/tests/conftest.py index af1043927..adbabbd97 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,16 +1,12 @@ """ Pytest configuration for DataJoint tests. -Expects MySQL and MinIO services to be running via docker-compose: - docker-compose up -d db minio - -Environment variables (with defaults from docker-compose.yaml): - DJ_HOST=db MySQL host - DJ_USER=root MySQL root user - DJ_PASS=password MySQL root password - S3_ENDPOINT=minio:9000 MinIO endpoint - S3_ACCESS_KEY=datajoint MinIO access key - S3_SECRET_KEY=datajoint MinIO secret key +Containers are automatically started via testcontainers - no manual setup required. +Just run: pytest tests/ + +To use external containers instead (e.g., docker-compose), set: + DJ_USE_EXTERNAL_CONTAINERS=1 + DJ_HOST=localhost DJ_PORT=3306 S3_ENDPOINT=localhost:9000 pytest """ import logging @@ -21,7 +17,6 @@ import certifi import pytest import urllib3 -from packaging import version import datajoint as dj from datajoint.errors import DataJointError @@ -33,7 +28,74 @@ logger = logging.getLogger(__name__) -# --- Database connection fixtures --- +# ============================================================================= +# Container Fixtures - Auto-start MySQL and MinIO via testcontainers +# ============================================================================= + +# Check if we should use external containers (for CI or manual docker-compose) +USE_EXTERNAL_CONTAINERS = os.environ.get("DJ_USE_EXTERNAL_CONTAINERS", "").lower() in ("1", "true", "yes") + + +@pytest.fixture(scope="session") +def mysql_container(): + """Start MySQL container for the test session (or use external).""" + if USE_EXTERNAL_CONTAINERS: + # Use external container - return None, credentials come from env + logger.info("Using external MySQL container") + yield None + return + + from testcontainers.mysql import MySqlContainer + + container = MySqlContainer( + image="mysql:8.0", + username="root", + password="password", + dbname="test", + ) + container.start() + + host = container.get_container_host_ip() + port = container.get_exposed_port(3306) + logger.info(f"MySQL container started at {host}:{port}") + + yield container + + container.stop() + logger.info("MySQL container stopped") + + +@pytest.fixture(scope="session") +def minio_container(): + """Start MinIO container for the test session (or use external).""" + if USE_EXTERNAL_CONTAINERS: + # Use external container - return None, credentials come from env + logger.info("Using external MinIO container") + yield None + return + + from testcontainers.minio import MinioContainer + + container = MinioContainer( + image="minio/minio:latest", + access_key="datajoint", + secret_key="datajoint", + ) + container.start() + + host = container.get_container_host_ip() + port = container.get_exposed_port(9000) + logger.info(f"MinIO container started at {host}:{port}") + + yield container + + container.stop() + logger.info("MinIO container stopped") + + +# ============================================================================= +# Credential Fixtures - Derived from containers or environment +# ============================================================================= @pytest.fixture(scope="session") @@ -42,45 +104,88 @@ def prefix(): 
@pytest.fixture(scope="session") -def db_creds_root() -> Dict: - """Root database credentials from environment.""" - host = os.environ.get("DJ_HOST", "db") - port = os.environ.get("DJ_PORT", "3306") - return dict( - host=f"{host}:{port}" if port else host, - user=os.environ.get("DJ_USER", "root"), - password=os.environ.get("DJ_PASS", "password"), - ) +def db_creds_root(mysql_container) -> Dict: + """Root database credentials from container or environment.""" + if mysql_container is not None: + # From testcontainer + host = mysql_container.get_container_host_ip() + port = mysql_container.get_exposed_port(3306) + return dict( + host=f"{host}:{port}", + user="root", + password="password", + ) + else: + # From environment (external container) + host = os.environ.get("DJ_HOST", "localhost") + port = os.environ.get("DJ_PORT", "3306") + return dict( + host=f"{host}:{port}" if port else host, + user=os.environ.get("DJ_USER", "root"), + password=os.environ.get("DJ_PASS", "password"), + ) @pytest.fixture(scope="session") -def db_creds_test() -> Dict: - """Test user database credentials from environment.""" - host = os.environ.get("DJ_HOST", "db") - port = os.environ.get("DJ_PORT", "3306") - return dict( - host=f"{host}:{port}" if port else host, - user=os.environ.get("DJ_TEST_USER", "datajoint"), - password=os.environ.get("DJ_TEST_PASSWORD", "datajoint"), - ) +def db_creds_test(mysql_container) -> Dict: + """Test user database credentials from container or environment.""" + if mysql_container is not None: + # From testcontainer + host = mysql_container.get_container_host_ip() + port = mysql_container.get_exposed_port(3306) + return dict( + host=f"{host}:{port}", + user="datajoint", + password="datajoint", + ) + else: + # From environment (external container) + host = os.environ.get("DJ_HOST", "localhost") + port = os.environ.get("DJ_PORT", "3306") + return dict( + host=f"{host}:{port}" if port else host, + user=os.environ.get("DJ_TEST_USER", "datajoint"), + password=os.environ.get("DJ_TEST_PASSWORD", "datajoint"), + ) @pytest.fixture(scope="session") -def s3_creds() -> Dict: - """S3/MinIO credentials from environment.""" - return dict( - endpoint=os.environ.get("S3_ENDPOINT", "minio:9000"), - access_key=os.environ.get("S3_ACCESS_KEY", "datajoint"), - secret_key=os.environ.get("S3_SECRET_KEY", "datajoint"), - bucket=os.environ.get("S3_BUCKET", "datajoint.test"), - ) +def s3_creds(minio_container) -> Dict: + """S3/MinIO credentials from container or environment.""" + if minio_container is not None: + # From testcontainer + host = minio_container.get_container_host_ip() + port = minio_container.get_exposed_port(9000) + return dict( + endpoint=f"{host}:{port}", + access_key="datajoint", + secret_key="datajoint", + bucket="datajoint.test", + ) + else: + # From environment (external container) + return dict( + endpoint=os.environ.get("S3_ENDPOINT", "localhost:9000"), + access_key=os.environ.get("S3_ACCESS_KEY", "datajoint"), + secret_key=os.environ.get("S3_SECRET_KEY", "datajoint"), + bucket=os.environ.get("S3_BUCKET", "datajoint.test"), + ) + + +# ============================================================================= +# DataJoint Configuration +# ============================================================================= @pytest.fixture(scope="session", autouse=True) def configure_datajoint(db_creds_root): - """Configure DataJoint to use docker-compose services.""" - host = os.environ.get("DJ_HOST", "db") - port = os.environ.get("DJ_PORT", "3306") + """Configure DataJoint to use test 
database.""" + # Parse host:port from credentials + host_port = db_creds_root["host"] + if ":" in host_port: + host, port = host_port.rsplit(":", 1) + else: + host, port = host_port, "3306" dj.config["database.host"] = host dj.config["database.port"] = int(port) @@ -89,6 +194,11 @@ def configure_datajoint(db_creds_root): logger.info(f"Configured DataJoint to use MySQL at {host}:{port}") +# ============================================================================= +# Connection Fixtures +# ============================================================================= + + @pytest.fixture(scope="session") def connection_root_bare(db_creds_root): """Bare root connection without user setup.""" @@ -101,45 +211,29 @@ def connection_root(connection_root_bare, prefix): """Root database connection with test users created.""" conn_root = connection_root_bare - # Create MySQL users - if version.parse(conn_root.query("select @@version;").fetchone()[0]) >= version.parse("8.0.0"): - conn_root.query( - """ - CREATE USER IF NOT EXISTS 'datajoint'@'%%' - IDENTIFIED BY 'datajoint'; - """ - ) - conn_root.query( - """ - CREATE USER IF NOT EXISTS 'djview'@'%%' - IDENTIFIED BY 'djview'; - """ - ) - conn_root.query( - """ - CREATE USER IF NOT EXISTS 'djssl'@'%%' - IDENTIFIED BY 'djssl' - REQUIRE SSL; - """ - ) - conn_root.query("GRANT ALL PRIVILEGES ON `djtest%%`.* TO 'datajoint'@'%%';") - conn_root.query("GRANT SELECT ON `djtest%%`.* TO 'djview'@'%%';") - conn_root.query("GRANT SELECT ON `djtest%%`.* TO 'djssl'@'%%';") - else: - conn_root.query( - """ - GRANT ALL PRIVILEGES ON `djtest%%`.* TO 'datajoint'@'%%' - IDENTIFIED BY 'datajoint'; - """ - ) - conn_root.query("GRANT SELECT ON `djtest%%`.* TO 'djview'@'%%' IDENTIFIED BY 'djview';") - conn_root.query( - """ - GRANT SELECT ON `djtest%%`.* TO 'djssl'@'%%' - IDENTIFIED BY 'djssl' - REQUIRE SSL; - """ - ) + # Create MySQL users (MySQL 8.0+ syntax - we only support 8.0+) + conn_root.query( + """ + CREATE USER IF NOT EXISTS 'datajoint'@'%%' + IDENTIFIED BY 'datajoint'; + """ + ) + conn_root.query( + """ + CREATE USER IF NOT EXISTS 'djview'@'%%' + IDENTIFIED BY 'djview'; + """ + ) + conn_root.query( + """ + CREATE USER IF NOT EXISTS 'djssl'@'%%' + IDENTIFIED BY 'djssl' + REQUIRE SSL; + """ + ) + conn_root.query("GRANT ALL PRIVILEGES ON `djtest%%`.* TO 'datajoint'@'%%';") + conn_root.query("GRANT SELECT ON `djtest%%`.* TO 'djview'@'%%';") + conn_root.query("GRANT SELECT ON `djtest%%`.* TO 'djssl'@'%%';") yield conn_root @@ -164,27 +258,19 @@ def connection_test(connection_root, prefix, db_creds_test): database = f"{prefix}%%" permission = "ALL PRIVILEGES" - if version.parse(connection_root.query("select @@version;").fetchone()[0]) >= version.parse("8.0.0"): - connection_root.query( - f""" - CREATE USER IF NOT EXISTS '{db_creds_test["user"]}'@'%%' - IDENTIFIED BY '{db_creds_test["password"]}'; - """ - ) - connection_root.query( - f""" - GRANT {permission} ON `{database}`.* - TO '{db_creds_test["user"]}'@'%%'; - """ - ) - else: - connection_root.query( - f""" - GRANT {permission} ON `{database}`.* - TO '{db_creds_test["user"]}'@'%%' - IDENTIFIED BY '{db_creds_test["password"]}'; - """ - ) + # MySQL 8.0+ syntax + connection_root.query( + f""" + CREATE USER IF NOT EXISTS '{db_creds_test["user"]}'@'%%' + IDENTIFIED BY '{db_creds_test["password"]}'; + """ + ) + connection_root.query( + f""" + GRANT {permission} ON `{database}`.* + TO '{db_creds_test["user"]}'@'%%'; + """ + ) connection = dj.Connection(**db_creds_test) yield connection @@ -192,7 +278,9 @@ def 
connection_test(connection_root, prefix, db_creds_test): connection.close() -# --- S3/MinIO fixtures --- +# ============================================================================= +# S3/MinIO Fixtures +# ============================================================================= @pytest.fixture(scope="session") @@ -309,7 +397,9 @@ def minio_client(s3_creds, s3fs_client, teardown=False): pass -# --- Utility fixtures --- +# ============================================================================= +# Utility Fixtures +# ============================================================================= @pytest.fixture(scope="session") @@ -330,7 +420,9 @@ def enable_adapted_types(): yield -# --- Cleanup fixtures --- +# ============================================================================= +# Cleanup Fixtures +# ============================================================================= @pytest.fixture @@ -364,7 +456,9 @@ def clean_test_tables(test, test_extra, test_no_extra): test_no_extra.delete() -# --- Schema fixtures --- +# ============================================================================= +# Schema Fixtures +# ============================================================================= @pytest.fixture(scope="module") @@ -591,7 +685,9 @@ def schema_type_aliases(connection_test, prefix): schema.drop() -# --- Table fixtures --- +# ============================================================================= +# Table Fixtures +# ============================================================================= @pytest.fixture @@ -667,7 +763,9 @@ def trash(schema_any): return schema.UberTrash() -# --- Object storage fixtures --- +# ============================================================================= +# Object Storage Fixtures +# ============================================================================= @pytest.fixture From 70f53d4ec3e430874eeadbfa8b12073695d1f961 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Thu, 1 Jan 2026 19:39:34 -0600 Subject: [PATCH 31/32] Fix test compatibility with testcontainers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Update settings tests to accept dynamic ports (testcontainers uses random ports instead of default 3306) - Fix test_top_restriction_with_keywords to use set comparison since dj.Top only guarantees which elements are selected, not their order - Bump version to 2.0.0a8 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/datajoint/version.py | 2 +- tests/integration/test_relational_operand.py | 56 ++++++++++++-------- tests/unit/test_settings.py | 8 ++- 3 files changed, 41 insertions(+), 25 deletions(-) diff --git a/src/datajoint/version.py b/src/datajoint/version.py index 6075746ab..2cb3465e2 100644 --- a/src/datajoint/version.py +++ b/src/datajoint/version.py @@ -1,4 +1,4 @@ # version bump auto managed by Github Actions: # label_prs.yaml(prep), release.yaml(bump), post_release.yaml(edit) # manually set this version will be eventually overwritten by the above actions -__version__ = "2.0.0a7" +__version__ = "2.0.0a8" diff --git a/tests/integration/test_relational_operand.py b/tests/integration/test_relational_operand.py index d6580ee8b..3f15a7319 100644 --- a/tests/integration/test_relational_operand.py +++ b/tests/integration/test_relational_operand.py @@ -561,30 +561,42 @@ def test_restrictions_by_top(self, schema_simp_pop): ] def test_top_restriction_with_keywords(self, schema_simp_pop): + # dj.Top only 
guarantees which elements are selected, not their order select = SelectPK() & dj.Top(limit=9, order_by=["select desc"]) key = KeyPK() & dj.Top(limit=9, order_by="key desc") - assert select.fetch(as_dict=True) == [ - {"id": 2, "select": 8}, - {"id": 2, "select": 6}, - {"id": 1, "select": 4}, - {"id": 2, "select": 4}, - {"id": 1, "select": 3}, - {"id": 1, "select": 2}, - {"id": 2, "select": 2}, - {"id": 1, "select": 1}, - {"id": 0, "select": 0}, - ] - assert key.fetch(as_dict=True) == [ - {"id": 2, "key": 6}, - {"id": 2, "key": 5}, - {"id": 1, "key": 5}, - {"id": 0, "key": 4}, - {"id": 1, "key": 4}, - {"id": 2, "key": 4}, - {"id": 0, "key": 3}, - {"id": 1, "key": 3}, - {"id": 2, "key": 3}, - ] + # Convert to sets of tuples for order-independent comparison + select_result = {tuple(sorted(d.items())) for d in select.fetch(as_dict=True)} + select_expected = { + tuple(sorted(d.items())) + for d in [ + {"id": 2, "select": 8}, + {"id": 2, "select": 6}, + {"id": 1, "select": 4}, + {"id": 2, "select": 4}, + {"id": 1, "select": 3}, + {"id": 1, "select": 2}, + {"id": 2, "select": 2}, + {"id": 1, "select": 1}, + {"id": 0, "select": 0}, + ] + } + assert select_result == select_expected + key_result = {tuple(sorted(d.items())) for d in key.fetch(as_dict=True)} + key_expected = { + tuple(sorted(d.items())) + for d in [ + {"id": 2, "key": 6}, + {"id": 2, "key": 5}, + {"id": 1, "key": 5}, + {"id": 0, "key": 4}, + {"id": 1, "key": 4}, + {"id": 2, "key": 4}, + {"id": 0, "key": 3}, + {"id": 1, "key": 3}, + {"id": 2, "key": 3}, + ] + } + assert key_result == key_expected def test_top_errors(self, schema_simp_pop): with pytest.raises(DataJointError) as err1: diff --git a/tests/unit/test_settings.py b/tests/unit/test_settings.py index d7122969a..66d817f0c 100644 --- a/tests/unit/test_settings.py +++ b/tests/unit/test_settings.py @@ -160,7 +160,9 @@ def test_attribute_access(self): # Host can be localhost or db (docker), just verify it's a string assert isinstance(dj.config.database.host, str) assert len(dj.config.database.host) > 0 - assert dj.config.database.port == 3306 + # Port may be 3306 (default) or a random port (testcontainers) + assert isinstance(dj.config.database.port, int) + assert 1 <= dj.config.database.port <= 65535 # safemode may be modified by conftest fixtures assert isinstance(dj.config.safemode, bool) @@ -169,7 +171,9 @@ def test_dict_style_access(self): # Host can be localhost or db (docker), just verify it's a string assert isinstance(dj.config["database.host"], str) assert len(dj.config["database.host"]) > 0 - assert dj.config["database.port"] == 3306 + # Port may be 3306 (default) or a random port (testcontainers) + assert isinstance(dj.config["database.port"], int) + assert 1 <= dj.config["database.port"] <= 65535 # safemode may be modified by conftest fixtures assert isinstance(dj.config["safemode"], bool) From fa47f474782fd5a35ec8116bff6bc123fb1bfddd Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Thu, 1 Jan 2026 19:46:59 -0600 Subject: [PATCH 32/32] Add pytest marks for test dependency management MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Register requires_mysql and requires_minio marks in pyproject.toml - Add pytest_collection_modifyitems hook to auto-mark tests based on fixture usage - Remove autouse=True from configure_datajoint fixture so containers only start when needed - Fix test_drop_unauthorized to use connection_test fixture Tests can now run without Docker: pytest -m "not requires_mysql" # Run 192 unit tests Full test 
suite still works: DJ_USE_EXTERNAL_CONTAINERS=1 pytest tests/ # 471 tests Bump version to 2.0.0a9 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- pyproject.toml | 6 ++- src/datajoint/version.py | 2 +- tests/conftest.py | 65 ++++++++++++++++++++++++++++++-- tests/integration/test_schema.py | 6 ++- 4 files changed, 71 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6f6da8a3b..154a40395 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -158,7 +158,11 @@ skip = ".git,*.pdf,*.svg,*.csv,*.ipynb,*.drawio" # astroid -- Python library name (not "asteroid") ignore-words-list = "rever,numer,astroid" -# pytest-env removed - testcontainers handles container lifecycle automatically +[tool.pytest.ini_options] +markers = [ + "requires_mysql: marks tests as requiring MySQL database (deselect with '-m \"not requires_mysql\"')", + "requires_minio: marks tests as requiring MinIO object storage (deselect with '-m \"not requires_minio\"')", +] [tool.pixi.workspace] diff --git a/src/datajoint/version.py b/src/datajoint/version.py index 2cb3465e2..4684015ad 100644 --- a/src/datajoint/version.py +++ b/src/datajoint/version.py @@ -1,4 +1,4 @@ # version bump auto managed by Github Actions: # label_prs.yaml(prep), release.yaml(bump), post_release.yaml(edit) # manually set this version will be eventually overwritten by the above actions -__version__ = "2.0.0a8" +__version__ = "2.0.0a9" diff --git a/tests/conftest.py b/tests/conftest.py index adbabbd97..14b848d4b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,12 +1,19 @@ """ Pytest configuration for DataJoint tests. -Containers are automatically started via testcontainers - no manual setup required. +Tests are organized by their dependencies: +- Unit tests: No external dependencies, run with `pytest -m "not requires_mysql"` +- Integration tests: Require MySQL/MinIO, marked with @pytest.mark.requires_mysql + +Containers are automatically started via testcontainers when needed. 
Just run: pytest tests/ To use external containers instead (e.g., docker-compose), set: DJ_USE_EXTERNAL_CONTAINERS=1 DJ_HOST=localhost DJ_PORT=3306 S3_ENDPOINT=localhost:9000 pytest + +To run only unit tests (no Docker required): + pytest -m "not requires_mysql" """ import logging @@ -28,6 +35,52 @@ logger = logging.getLogger(__name__) +# ============================================================================= +# Pytest Hooks +# ============================================================================= + + +def pytest_collection_modifyitems(config, items): + """Auto-mark integration tests based on their fixtures.""" + # Tests that use these fixtures require MySQL + mysql_fixtures = { + "connection_root", + "connection_root_bare", + "connection_test", + "schema_any", + "schema_any_fresh", + "schema_simp", + "schema_adv", + "schema_ext", + "schema_uuid", + "schema_type_aliases", + "schema_obj", + "db_creds_root", + "db_creds_test", + } + # Tests that use these fixtures require MinIO + minio_fixtures = { + "minio_client", + "s3fs_client", + "s3_creds", + "stores_config", + "mock_stores", + } + + for item in items: + # Get all fixtures this test uses (directly or indirectly) + try: + fixturenames = set(item.fixturenames) + except AttributeError: + continue + + # Auto-add marks based on fixture usage + if fixturenames & mysql_fixtures: + item.add_marker(pytest.mark.requires_mysql) + if fixturenames & minio_fixtures: + item.add_marker(pytest.mark.requires_minio) + + # ============================================================================= # Container Fixtures - Auto-start MySQL and MinIO via testcontainers # ============================================================================= @@ -177,9 +230,13 @@ def s3_creds(minio_container) -> Dict: # ============================================================================= -@pytest.fixture(scope="session", autouse=True) +@pytest.fixture(scope="session") def configure_datajoint(db_creds_root): - """Configure DataJoint to use test database.""" + """Configure DataJoint to use test database. + + This fixture is NOT autouse - it only runs when a test requests + a fixture that depends on it (e.g., connection_root_bare). + """ # Parse host:port from credentials host_port = db_creds_root["host"] if ":" in host_port: @@ -200,7 +257,7 @@ def configure_datajoint(db_creds_root): @pytest.fixture(scope="session") -def connection_root_bare(db_creds_root): +def connection_root_bare(db_creds_root, configure_datajoint): """Bare root connection without user setup.""" connection = dj.Connection(**db_creds_root) yield connection diff --git a/tests/integration/test_schema.py b/tests/integration/test_schema.py index 36a154935..d463ccf45 100644 --- a/tests/integration/test_schema.py +++ b/tests/integration/test_schema.py @@ -66,8 +66,10 @@ def test_schema_list(schema_any): assert schema_any.database in schemas -def test_drop_unauthorized(): - info_schema = dj.schema("information_schema") +@pytest.mark.requires_mysql +def test_drop_unauthorized(connection_test): + """Test that dropping information_schema raises AccessError.""" + info_schema = dj.schema("information_schema", connection=connection_test) with pytest.raises(dj.errors.AccessError): info_schema.drop()
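
As a rough illustration (not part of the patches themselves), a test module might use the new mark and fixtures along the following lines. The table class, schema suffix, and inserted values are hypothetical; `connection_test`, `prefix`, the `requires_mysql` mark registration, and the auto-marking hook come from the patched `tests/conftest.py` and `pyproject.toml` above.

```python
import datajoint as dj
import pytest


# The explicit mark is redundant here (pytest_collection_modifyitems adds it
# automatically because the test requests the MySQL-backed `connection_test`
# fixture), but spelling it out documents the dependency.
@pytest.mark.requires_mysql
def test_roundtrip_with_marks(connection_test, prefix):
    """Insert one row and read it back against the testcontainers-managed MySQL.

    Hypothetical example table and schema name; the schema prefix keeps the
    database inside the `djtest%` grant created by the root fixtures.
    """
    schema = dj.Schema(prefix + "_marks_demo", connection=connection_test)

    @schema
    class Sample(dj.Manual):
        definition = """
        sample_id : int32
        ---
        value : float64   # measured value (hypothetical)
        """

    Sample.insert1(dict(sample_id=1, value=3.5))
    assert (Sample & dict(sample_id=1)).fetch1("value") == 3.5
    schema.drop()
```

Running `pytest -m "not requires_mysql"` deselects such a test without touching Docker, while a plain `pytest tests/` run starts the MySQL testcontainer first (or reuses an external one when `DJ_USE_EXTERNAL_CONTAINERS=1`).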