From 23fa80997ac91a4015f2d4a84638dcdefcba1c62 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 4 Nov 2025 20:06:14 +0000 Subject: [PATCH 1/5] Initial plan From e135bb24deb573b3b07865c729391a10272464f2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 4 Nov 2025 20:21:39 +0000 Subject: [PATCH 2/5] Fix nested nullability in pyarrow_dtype for struct, list, and array columns Co-authored-by: borchero <22455425+borchero@users.noreply.github.com> --- dataframely/columns/array.py | 12 +++- dataframely/columns/list.py | 2 +- dataframely/columns/struct.py | 2 +- tests/columns/test_pyarrow.py | 120 ++++++++++++++++++++++++++++++++-- 4 files changed, 128 insertions(+), 8 deletions(-) diff --git a/dataframely/columns/array.py b/dataframely/columns/array.py index 2ff1b06..87c18dc 100644 --- a/dataframely/columns/array.py +++ b/dataframely/columns/array.py @@ -102,9 +102,17 @@ def sqlalchemy_dtype(self, dialect: sa.Dialect) -> sa_TypeEngine: def _pyarrow_dtype_of_shape(self, shape: Sequence[int]) -> pa.DataType: if shape: size, *rest = shape - return pa.list_(self._pyarrow_dtype_of_shape(rest), size) + if rest: + # Recursive case: build inner type and wrap with list + # The intermediate fields will have nullable=True by default + inner_type = self._pyarrow_dtype_of_shape(rest) + return pa.list_(inner_type, size) + else: + # Base case: use the inner field directly to preserve nullability + return pa.list_(self.inner.pyarrow_field("item"), size) else: - return self.inner.pyarrow_dtype + # Should not reach here if shape is always a non-empty tuple + return self.inner.pyarrow_field("item").type @property def pyarrow_dtype(self) -> pa.DataType: diff --git a/dataframely/columns/list.py b/dataframely/columns/list.py index 0804cf1..e1dd88b 100644 --- a/dataframely/columns/list.py +++ b/dataframely/columns/list.py @@ -145,7 +145,7 @@ def sqlalchemy_dtype(self, dialect: sa.Dialect) -> sa_TypeEngine: @property def pyarrow_dtype(self) -> pa.DataType: # NOTE: Polars uses `large_list`s by default. - return pa.large_list(self.inner.pyarrow_dtype) + return pa.large_list(self.inner.pyarrow_field("item")) def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series: # First, sample the number of items per list element diff --git a/dataframely/columns/struct.py b/dataframely/columns/struct.py index ea6a687..b8aecf9 100644 --- a/dataframely/columns/struct.py +++ b/dataframely/columns/struct.py @@ -112,7 +112,7 @@ def sqlalchemy_dtype(self, dialect: sa.Dialect) -> sa_TypeEngine: @property def pyarrow_dtype(self) -> pa.DataType: - return pa.struct({name: col.pyarrow_dtype for name, col in self.inner.items()}) + return pa.struct([col.pyarrow_field(name) for name, col in self.inner.items()]) def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series: series = ( diff --git a/tests/columns/test_pyarrow.py b/tests/columns/test_pyarrow.py index 5ae35ea..744f2a8 100644 --- a/tests/columns/test_pyarrow.py +++ b/tests/columns/test_pyarrow.py @@ -59,13 +59,13 @@ def test_equal_polars_schema_enum(categories: list[str]) -> None: @pytest.mark.parametrize( "inner", - [c() for c in ALL_COLUMN_TYPES] - + [dy.List(t()) for t in ALL_COLUMN_TYPES] + [_nullable(c) for c in ALL_COLUMN_TYPES] + + [dy.List(_nullable(t), nullable=True) for t in ALL_COLUMN_TYPES] + [ - dy.Array(t() if t == dy.Any else t(nullable=True), 1) + dy.Array(t() if t == dy.Any else t(nullable=True), 1, nullable=True) for t in NO_VALIDATION_COLUMN_TYPES ] - + [dy.Struct({"a": t()}) for t in ALL_COLUMN_TYPES], + + [dy.Struct({"a": _nullable(t)}, nullable=True) for t in ALL_COLUMN_TYPES], ) def test_equal_polars_schema_list(inner: Column) -> None: schema = create_schema("test", {"a": dy.List(inner, nullable=True)}) @@ -177,3 +177,115 @@ def test_datetime_time_unit(time_unit: TimeUnit) -> None: "test", {"a": dy.Datetime(time_unit=time_unit, nullable=True)} ) assert str(schema.to_pyarrow_schema()) == f"a: timestamp[{time_unit}]" + + +# Tests for nested nullability preservation +@pytest.mark.parametrize("nullable", [True, False]) +def test_struct_preserves_inner_nullability(nullable: bool) -> None: + """Test that struct fields preserve their nullability setting.""" + schema = create_schema( + "test", + { + "a": dy.Struct( + {"field": dy.String(nullable=nullable)}, + nullable=True, + ) + }, + ) + pa_schema = schema.to_pyarrow_schema() + struct_field = pa_schema.field("a") + inner_field = struct_field.type[0] + assert inner_field.nullable == nullable + + +@pytest.mark.parametrize("nullable", [True, False]) +def test_list_preserves_inner_nullability(nullable: bool) -> None: + """Test that list inner types preserve their nullability setting.""" + schema = create_schema( + "test", + {"a": dy.List(dy.String(nullable=nullable), nullable=True)}, + ) + pa_schema = schema.to_pyarrow_schema() + list_field = pa_schema.field("a") + assert list_field.type.value_field.nullable == nullable + + +def test_array_preserves_inner_nullability() -> None: + """Test that array inner types preserve their nullability setting.""" + # Arrays only support nullable inner types (nullable=False would require validation) + schema = create_schema( + "test", + {"a": dy.Array(dy.String(nullable=True), shape=3, nullable=True)}, + ) + pa_schema = schema.to_pyarrow_schema() + array_field = pa_schema.field("a") + assert array_field.type.value_field.nullable == True + + +def test_nested_struct_in_list_preserves_nullability() -> None: + """Test that nested struct fields in lists preserve nullability.""" + schema = create_schema( + "test", + { + "a": dy.List( + dy.Struct( + { + "required": dy.String(nullable=False), + "optional": dy.String(nullable=True), + }, + nullable=True, + ), + nullable=True, + ) + }, + ) + pa_schema = schema.to_pyarrow_schema() + list_field = pa_schema.field("a") + struct_type = list_field.type.value_field.type + assert struct_type[0].nullable == False # required field + assert struct_type[1].nullable == True # optional field + + +def test_nested_list_in_struct_preserves_nullability() -> None: + """Test that nested list fields in structs preserve nullability.""" + schema = create_schema( + "test", + { + "a": dy.Struct( + { + "list_field": dy.List(dy.String(nullable=False), nullable=True), + }, + nullable=True, + ) + }, + ) + pa_schema = schema.to_pyarrow_schema() + struct_field = pa_schema.field("a") + list_type = struct_field.type[0].type + assert list_type.value_field.nullable == False + + +def test_deeply_nested_nullability() -> None: + """Test that deeply nested structures preserve nullability.""" + schema = create_schema( + "test", + { + "a": dy.Struct( + { + "nested": dy.Struct( + { + "required": dy.String(nullable=False), + "optional": dy.String(nullable=True), + }, + nullable=True, + ), + }, + nullable=True, + ) + }, + ) + pa_schema = schema.to_pyarrow_schema() + outer_struct = pa_schema.field("a").type + inner_struct = outer_struct[0].type + assert inner_struct[0].nullable == False # required field + assert inner_struct[1].nullable == True # optional field From b4dab24b32e81a7d62c6cdae6f60c868cf163019 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 4 Nov 2025 20:25:17 +0000 Subject: [PATCH 3/5] Fix linting issues in test code Co-authored-by: borchero <22455425+borchero@users.noreply.github.com> --- tests/columns/test_pyarrow.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/columns/test_pyarrow.py b/tests/columns/test_pyarrow.py index 744f2a8..c7c21cb 100644 --- a/tests/columns/test_pyarrow.py +++ b/tests/columns/test_pyarrow.py @@ -219,7 +219,7 @@ def test_array_preserves_inner_nullability() -> None: ) pa_schema = schema.to_pyarrow_schema() array_field = pa_schema.field("a") - assert array_field.type.value_field.nullable == True + assert array_field.type.value_field.nullable def test_nested_struct_in_list_preserves_nullability() -> None: @@ -242,8 +242,8 @@ def test_nested_struct_in_list_preserves_nullability() -> None: pa_schema = schema.to_pyarrow_schema() list_field = pa_schema.field("a") struct_type = list_field.type.value_field.type - assert struct_type[0].nullable == False # required field - assert struct_type[1].nullable == True # optional field + assert not struct_type[0].nullable # required field + assert struct_type[1].nullable # optional field def test_nested_list_in_struct_preserves_nullability() -> None: @@ -262,7 +262,7 @@ def test_nested_list_in_struct_preserves_nullability() -> None: pa_schema = schema.to_pyarrow_schema() struct_field = pa_schema.field("a") list_type = struct_field.type[0].type - assert list_type.value_field.nullable == False + assert not list_type.value_field.nullable def test_deeply_nested_nullability() -> None: @@ -287,5 +287,5 @@ def test_deeply_nested_nullability() -> None: pa_schema = schema.to_pyarrow_schema() outer_struct = pa_schema.field("a").type inner_struct = outer_struct[0].type - assert inner_struct[0].nullable == False # required field - assert inner_struct[1].nullable == True # optional field + assert not inner_struct[0].nullable # required field + assert inner_struct[1].nullable # optional field From 332d60fe3f84301e5bdc5911c19e088250758122 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 4 Nov 2025 20:29:37 +0000 Subject: [PATCH 4/5] Address code review feedback: improve comments in array.py Co-authored-by: borchero <22455425+borchero@users.noreply.github.com> --- dataframely/columns/array.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/dataframely/columns/array.py b/dataframely/columns/array.py index 87c18dc..e5b5885 100644 --- a/dataframely/columns/array.py +++ b/dataframely/columns/array.py @@ -103,15 +103,19 @@ def _pyarrow_dtype_of_shape(self, shape: Sequence[int]) -> pa.DataType: if shape: size, *rest = shape if rest: - # Recursive case: build inner type and wrap with list - # The intermediate fields will have nullable=True by default + # Recursive case: build inner type and wrap with list. + # When pa.list_ is passed a DataType (not a Field), it creates + # an "item" field with nullable=True by default. inner_type = self._pyarrow_dtype_of_shape(rest) return pa.list_(inner_type, size) else: - # Base case: use the inner field directly to preserve nullability + # Base case: use the inner field directly to preserve nullability. + # Passing a Field to pa.list_ preserves its nullability. return pa.list_(self.inner.pyarrow_field("item"), size) else: - # Should not reach here if shape is always a non-empty tuple + # This branch handles the case where shape is an empty tuple. + # While shape is initialized as at least (size,) in __init__, + # this provides a safe fallback. return self.inner.pyarrow_field("item").type @property From f15f6fa970c11a572ea3aa038af8bd9581689139 Mon Sep 17 00:00:00 2001 From: Oliver Borchert Date: Wed, 5 Nov 2025 02:12:23 +0100 Subject: [PATCH 5/5] Update --- dataframely/columns/array.py | 21 ++----- tests/columns/test_pyarrow.py | 101 +++++++++++++--------------------- 2 files changed, 44 insertions(+), 78 deletions(-) diff --git a/dataframely/columns/array.py b/dataframely/columns/array.py index e5b5885..db232ba 100644 --- a/dataframely/columns/array.py +++ b/dataframely/columns/array.py @@ -99,28 +99,17 @@ def sqlalchemy_dtype(self, dialect: sa.Dialect) -> sa_TypeEngine: # NOTE: We might want to add support for PostgreSQL's ARRAY type or use JSON in the future. raise NotImplementedError("SQL column cannot have 'Array' type.") - def _pyarrow_dtype_of_shape(self, shape: Sequence[int]) -> pa.DataType: + def _pyarrow_field_of_shape(self, shape: Sequence[int]) -> pa.Field: if shape: size, *rest = shape - if rest: - # Recursive case: build inner type and wrap with list. - # When pa.list_ is passed a DataType (not a Field), it creates - # an "item" field with nullable=True by default. - inner_type = self._pyarrow_dtype_of_shape(rest) - return pa.list_(inner_type, size) - else: - # Base case: use the inner field directly to preserve nullability. - # Passing a Field to pa.list_ preserves its nullability. - return pa.list_(self.inner.pyarrow_field("item"), size) + inner_type = self._pyarrow_field_of_shape(rest) + return pa.field("item", pa.list_(inner_type, size), nullable=True) else: - # This branch handles the case where shape is an empty tuple. - # While shape is initialized as at least (size,) in __init__, - # this provides a safe fallback. - return self.inner.pyarrow_field("item").type + return self.inner.pyarrow_field("item") @property def pyarrow_dtype(self) -> pa.DataType: - return self._pyarrow_dtype_of_shape(self.shape) + return self._pyarrow_field_of_shape(self.shape).type def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series: # Sample the inner elements in a flat series diff --git a/tests/columns/test_pyarrow.py b/tests/columns/test_pyarrow.py index c7c21cb..9c485d2 100644 --- a/tests/columns/test_pyarrow.py +++ b/tests/columns/test_pyarrow.py @@ -61,10 +61,7 @@ def test_equal_polars_schema_enum(categories: list[str]) -> None: "inner", [_nullable(c) for c in ALL_COLUMN_TYPES] + [dy.List(_nullable(t), nullable=True) for t in ALL_COLUMN_TYPES] - + [ - dy.Array(t() if t == dy.Any else t(nullable=True), 1, nullable=True) - for t in NO_VALIDATION_COLUMN_TYPES - ] + + [dy.Array(_nullable(t), 1, nullable=True) for t in NO_VALIDATION_COLUMN_TYPES] + [dy.Struct({"a": _nullable(t)}, nullable=True) for t in ALL_COLUMN_TYPES], ) def test_equal_polars_schema_list(inner: Column) -> None: @@ -161,65 +158,30 @@ def test_nullability_information_struct(inner: Column, nullable: bool) -> None: assert ("not null" in str(schema.to_pyarrow_schema())) != nullable -def test_multiple_columns() -> None: - schema = create_schema( - "test", {"a": dy.Int32(nullable=False), "b": dy.Integer(nullable=True)} - ) - assert str(schema.to_pyarrow_schema()).split("\n") == [ - "a: int32 not null", - "b: int64", - ] - - -@pytest.mark.parametrize("time_unit", ["ns", "us", "ms"]) -def test_datetime_time_unit(time_unit: TimeUnit) -> None: - schema = create_schema( - "test", {"a": dy.Datetime(time_unit=time_unit, nullable=True)} - ) - assert str(schema.to_pyarrow_schema()) == f"a: timestamp[{time_unit}]" - - -# Tests for nested nullability preservation -@pytest.mark.parametrize("nullable", [True, False]) -def test_struct_preserves_inner_nullability(nullable: bool) -> None: - """Test that struct fields preserve their nullability setting.""" - schema = create_schema( - "test", - { - "a": dy.Struct( - {"field": dy.String(nullable=nullable)}, - nullable=True, - ) - }, - ) +@pytest.mark.parametrize("column_type", COLUMN_TYPES) +@pytest.mark.parametrize("inner_nullable", [True, False]) +def test_inner_nullability_struct( + column_type: type[Column], inner_nullable: bool +) -> None: + inner = column_type(nullable=inner_nullable) + schema = create_schema("test", {"a": dy.Struct({"a": inner})}) pa_schema = schema.to_pyarrow_schema() struct_field = pa_schema.field("a") inner_field = struct_field.type[0] - assert inner_field.nullable == nullable + assert inner_field.nullable == inner_nullable -@pytest.mark.parametrize("nullable", [True, False]) -def test_list_preserves_inner_nullability(nullable: bool) -> None: - """Test that list inner types preserve their nullability setting.""" - schema = create_schema( - "test", - {"a": dy.List(dy.String(nullable=nullable), nullable=True)}, - ) +@pytest.mark.parametrize("column_type", COLUMN_TYPES) +@pytest.mark.parametrize("inner_nullable", [True, False]) +def test_inner_nullability_list( + column_type: type[Column], inner_nullable: bool +) -> None: + inner = column_type(nullable=inner_nullable) + schema = create_schema("test", {"a": dy.List(inner)}) pa_schema = schema.to_pyarrow_schema() list_field = pa_schema.field("a") - assert list_field.type.value_field.nullable == nullable - - -def test_array_preserves_inner_nullability() -> None: - """Test that array inner types preserve their nullability setting.""" - # Arrays only support nullable inner types (nullable=False would require validation) - schema = create_schema( - "test", - {"a": dy.Array(dy.String(nullable=True), shape=3, nullable=True)}, - ) - pa_schema = schema.to_pyarrow_schema() - array_field = pa_schema.field("a") - assert array_field.type.value_field.nullable + inner_field = list_field.type.value_field + assert inner_field.nullable == inner_nullable def test_nested_struct_in_list_preserves_nullability() -> None: @@ -242,8 +204,8 @@ def test_nested_struct_in_list_preserves_nullability() -> None: pa_schema = schema.to_pyarrow_schema() list_field = pa_schema.field("a") struct_type = list_field.type.value_field.type - assert not struct_type[0].nullable # required field - assert struct_type[1].nullable # optional field + assert not struct_type[0].nullable + assert struct_type[1].nullable def test_nested_list_in_struct_preserves_nullability() -> None: @@ -252,9 +214,7 @@ def test_nested_list_in_struct_preserves_nullability() -> None: "test", { "a": dy.Struct( - { - "list_field": dy.List(dy.String(nullable=False), nullable=True), - }, + {"list_field": dy.List(dy.String(nullable=False), nullable=True)}, nullable=True, ) }, @@ -266,7 +226,6 @@ def test_nested_list_in_struct_preserves_nullability() -> None: def test_deeply_nested_nullability() -> None: - """Test that deeply nested structures preserve nullability.""" schema = create_schema( "test", { @@ -289,3 +248,21 @@ def test_deeply_nested_nullability() -> None: inner_struct = outer_struct[0].type assert not inner_struct[0].nullable # required field assert inner_struct[1].nullable # optional field + + +def test_multiple_columns() -> None: + schema = create_schema( + "test", {"a": dy.Int32(nullable=False), "b": dy.Integer(nullable=True)} + ) + assert str(schema.to_pyarrow_schema()).split("\n") == [ + "a: int32 not null", + "b: int64", + ] + + +@pytest.mark.parametrize("time_unit", ["ns", "us", "ms"]) +def test_datetime_time_unit(time_unit: TimeUnit) -> None: + schema = create_schema( + "test", {"a": dy.Datetime(time_unit=time_unit, nullable=True)} + ) + assert str(schema.to_pyarrow_schema()) == f"a: timestamp[{time_unit}]"