From f20cde5582cbb7376100aa2ffa2eef90a99c0275 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 29 Dec 2025 16:24:51 +0000 Subject: [PATCH 01/29] refactor(display): use CSS classes in HTML tables --- bigframes/display/html.py | 68 +++++++++++++++++---------------- tests/unit/display/test_html.py | 5 +-- 2 files changed, 37 insertions(+), 36 deletions(-) diff --git a/bigframes/display/html.py b/bigframes/display/html.py index 3f1667eb9c..912f1d7e3a 100644 --- a/bigframes/display/html.py +++ b/bigframes/display/html.py @@ -48,60 +48,62 @@ def render_html( orderable_columns: list[str] | None = None, ) -> str: """Render a pandas DataFrame to HTML with specific styling.""" - classes = "dataframe table table-striped table-hover" - table_html = [f''] - precision = options.display.precision orderable_columns = orderable_columns or [] + classes = "dataframe table table-striped table-hover" + table_html_parts = [f'
'] + table_html_parts.append(_render_table_header(dataframe, orderable_columns)) + table_html_parts.append(_render_table_body(dataframe)) + table_html_parts.append("
") + return "".join(table_html_parts) - # Render table head - table_html.append(" ") - table_html.append(' ') + +def _render_table_header(dataframe: pd.DataFrame, orderable_columns: list[str]) -> str: + """Render the header of the HTML table.""" + header_parts = [" ", " "] for col in dataframe.columns: th_classes = [] if col in orderable_columns: th_classes.append("sortable") class_str = f'class="{" ".join(th_classes)}"' if th_classes else "" - header_div = ( - '
' - f"{html.escape(str(col))}" - "
" - ) - table_html.append( - f' {header_div}' + header_parts.append( + f'
' + f"{html.escape(str(col))}
" ) - table_html.append(" ") - table_html.append(" ") + header_parts.extend([" ", " "]) + return "\n".join(header_parts) + + +def _render_table_body(dataframe: pd.DataFrame) -> str: + """Render the body of the HTML table.""" + body_parts = [" "] + precision = options.display.precision - # Render table body - table_html.append(" ") for i in range(len(dataframe)): - table_html.append(" ") + body_parts.append(" ") row = dataframe.iloc[i] for col_name, value in row.items(): dtype = dataframe.dtypes.loc[col_name] # type: ignore align = "right" if _is_dtype_numeric(dtype) else "left" - table_html.append( - ' '.format(align) - ) # TODO(b/438181139): Consider semi-exploding ARRAY/STRUCT columns # into multiple rows/columns like the BQ UI does. if pandas.api.types.is_scalar(value) and pd.isna(value): - table_html.append(' <NA>') + body_parts.append( + f' ' + '<NA>' + ) else: if isinstance(value, float): - formatted_value = f"{value:.{precision}f}" - table_html.append(f" {html.escape(formatted_value)}") + cell_content = f"{value:.{precision}f}" else: - table_html.append(f" {html.escape(str(value))}") - table_html.append(" ") - table_html.append(" ") - table_html.append(" ") - table_html.append("") - - return "\n".join(table_html) + cell_content = str(value) + body_parts.append( + f' ' + f"{html.escape(cell_content)}" + ) + body_parts.append(" ") + body_parts.append(" ") + return "\n".join(body_parts) def _obj_ref_rt_to_html(obj_ref_rt: str) -> str: diff --git a/tests/unit/display/test_html.py b/tests/unit/display/test_html.py index fcf1455362..0762a2fd8d 100644 --- a/tests/unit/display/test_html.py +++ b/tests/unit/display/test_html.py @@ -130,9 +130,8 @@ def test_render_html_alignment_and_precision( df = pd.DataFrame(data) html = bf_html.render_html(dataframe=df, table_id="test-table") - for _, align in expected_alignments.items(): - assert 'th style="text-align: left;"' in html - assert f' Date: Mon, 29 Dec 2025 16:24:51 +0000 Subject: [PATCH 02/29] refactor(display): use CSS classes in HTML tables --- bigframes/display/html.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/bigframes/display/html.py b/bigframes/display/html.py index 912f1d7e3a..70baa9364a 100644 --- a/bigframes/display/html.py +++ b/bigframes/display/html.py @@ -85,8 +85,6 @@ def _render_table_body(dataframe: pd.DataFrame) -> str: dtype = dataframe.dtypes.loc[col_name] # type: ignore align = "right" if _is_dtype_numeric(dtype) else "left" - # TODO(b/438181139): Consider semi-exploding ARRAY/STRUCT columns - # into multiple rows/columns like the BQ UI does. 
if pandas.api.types.is_scalar(value) and pd.isna(value): body_parts.append( f' ' From 4b6824305cc4410900ccef6aadc34afea288757a Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 29 Dec 2025 16:25:12 +0000 Subject: [PATCH 03/29] feat(display): support nested STRUCT and ARRAY data in interactive tables --- bigframes/display/_flatten.py | 287 ++++++++++++++++++++++ bigframes/display/html.py | 101 ++++++-- notebooks/dataframes/anywidget_mode.ipynb | 164 ++++++++++--- tests/js/table_widget.test.js | 125 ++++++++++ tests/system/small/test_anywidget.py | 224 ++++++----------- tests/unit/display/test_html.py | 50 +++- 6 files changed, 745 insertions(+), 206 deletions(-) create mode 100644 bigframes/display/_flatten.py diff --git a/bigframes/display/_flatten.py b/bigframes/display/_flatten.py new file mode 100644 index 0000000000..e7f63777ae --- /dev/null +++ b/bigframes/display/_flatten.py @@ -0,0 +1,287 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities for flattening nested data structures for display.""" + +from __future__ import annotations + +from typing import cast + +import pandas as pd +import pyarrow as pa + + +def flatten_nested_data( + dataframe: pd.DataFrame, +) -> tuple[pd.DataFrame, dict[str, list[int]], list[str], set[str]]: + """Flatten nested STRUCT and ARRAY columns for display.""" + if dataframe.empty: + return dataframe.copy(), {}, [], set() + + result_df = dataframe.copy() + + ( + struct_columns, + array_columns, + array_of_struct_columns, + clear_on_continuation_cols, + nested_originated_columns, + ) = _classify_columns(result_df) + + result_df, array_columns = _flatten_array_of_struct_columns( + result_df, array_of_struct_columns, array_columns, nested_originated_columns + ) + + result_df, clear_on_continuation_cols = _flatten_struct_columns( + result_df, struct_columns, clear_on_continuation_cols, nested_originated_columns + ) + + # Now handle ARRAY columns (including the newly created ones from ARRAY of STRUCT) + if not array_columns: + return ( + result_df, + {}, + clear_on_continuation_cols, + nested_originated_columns, + ) + + result_df, array_row_groups = _explode_array_columns(result_df, array_columns) + return ( + result_df, + array_row_groups, + clear_on_continuation_cols, + nested_originated_columns, + ) + + +def _classify_columns( + dataframe: pd.DataFrame, +) -> tuple[list[str], list[str], list[str], list[str], set[str]]: + """Identify all STRUCT and ARRAY columns.""" + initial_columns = list(dataframe.columns) + struct_columns: list[str] = [] + array_columns: list[str] = [] + array_of_struct_columns: list[str] = [] + clear_on_continuation_cols: list[str] = [] + nested_originated_columns: set[str] = set() + + for col_name_raw, col_data in dataframe.items(): + col_name = str(col_name_raw) + dtype = col_data.dtype + if isinstance(dtype, pd.ArrowDtype): + pa_type = dtype.pyarrow_dtype + if pa.types.is_struct(pa_type): + struct_columns.append(col_name) + nested_originated_columns.add(col_name) + elif 
pa.types.is_list(pa_type): + array_columns.append(col_name) + nested_originated_columns.add(col_name) + if hasattr(pa_type, "value_type") and ( + pa.types.is_struct(pa_type.value_type) + ): + array_of_struct_columns.append(col_name) + else: + clear_on_continuation_cols.append(col_name) + elif col_name in initial_columns: + clear_on_continuation_cols.append(col_name) + return ( + struct_columns, + array_columns, + array_of_struct_columns, + clear_on_continuation_cols, + nested_originated_columns, + ) + + +def _flatten_array_of_struct_columns( + dataframe: pd.DataFrame, + array_of_struct_columns: list[str], + array_columns: list[str], + nested_originated_columns: set[str], +) -> tuple[pd.DataFrame, list[str]]: + """Flatten ARRAY of STRUCT columns into separate array columns for each field.""" + result_df = dataframe.copy() + for col_name in array_of_struct_columns: + col_data = result_df[col_name] + pa_type = cast(pd.ArrowDtype, col_data.dtype).pyarrow_dtype + struct_type = pa_type.value_type + + # Use PyArrow to reshape the list into multiple list arrays + arrow_array = pa.array(col_data) + offsets = arrow_array.offsets + values = arrow_array.values # StructArray + flattened_fields = values.flatten() # List[Array] + + new_cols_to_add = {} + new_array_col_names = [] + + # Create new columns for each struct field + for field_idx in range(struct_type.num_fields): + field = struct_type.field(field_idx) + new_col_name = f"{col_name}.{field.name}" + nested_originated_columns.add(new_col_name) + new_array_col_names.append(new_col_name) + + # Reconstruct ListArray for this field + # Use mask=arrow_array.is_null() to preserve nulls from the original list + new_list_array = pa.ListArray.from_arrays( + offsets, flattened_fields[field_idx], mask=arrow_array.is_null() + ) + + new_cols_to_add[new_col_name] = pd.Series( + new_list_array.to_pylist(), + dtype=pd.ArrowDtype(pa.list_(field.type)), + index=result_df.index, + ) + + col_idx = result_df.columns.to_list().index(col_name) + new_cols_df = pd.DataFrame(new_cols_to_add, index=result_df.index) + + result_df = pd.concat( + [ + result_df.iloc[:, :col_idx], + new_cols_df, + result_df.iloc[:, col_idx + 1 :], + ], + axis=1, + ) + + # Update array_columns list + array_columns.remove(col_name) + # Add the new array columns + array_columns.extend(new_array_col_names) + return result_df, array_columns + + +def _explode_array_columns( + dataframe: pd.DataFrame, array_columns: list[str] +) -> tuple[pd.DataFrame, dict[str, list[int]]]: + """Explode array columns into new rows.""" + exploded_rows = [] + array_row_groups: dict[str, list[int]] = {} + non_array_columns = dataframe.columns.drop(array_columns).tolist() + non_array_df = dataframe[non_array_columns] + + for orig_idx in dataframe.index: + non_array_data = non_array_df.loc[orig_idx].to_dict() + array_values = {} + max_len_in_row = 0 + non_na_array_found = False + + for col_name in array_columns: + val = dataframe.loc[orig_idx, col_name] + if val is not None and not ( + isinstance(val, list) and len(val) == 1 and pd.isna(val[0]) + ): + array_values[col_name] = list(val) + max_len_in_row = max(max_len_in_row, len(val)) + non_na_array_found = True + else: + array_values[col_name] = [] + + if not non_na_array_found: + new_row = non_array_data.copy() + for col_name in array_columns: + new_row[f"{col_name}"] = pd.NA + exploded_rows.append(new_row) + orig_key = str(orig_idx) + if orig_key not in array_row_groups: + array_row_groups[orig_key] = [] + array_row_groups[orig_key].append(len(exploded_rows) - 1) + 
continue + + # Create one row per array element, up to max_len_in_row + for array_idx in range(max_len_in_row): + new_row = non_array_data.copy() + + # Add the specific array element for this index + for col_name in array_columns: + if array_idx < len(array_values.get(col_name, [])): + new_row[f"{col_name}"] = array_values[col_name][array_idx] + else: + new_row[f"{col_name}"] = pd.NA + + exploded_rows.append(new_row) + + # Track which rows belong to which original row + orig_key = str(orig_idx) + if orig_key not in array_row_groups: + array_row_groups[orig_key] = [] + array_row_groups[orig_key].append(len(exploded_rows) - 1) + + if exploded_rows: + # Reconstruct the DataFrame to maintain original column order + exploded_df = pd.DataFrame(exploded_rows)[dataframe.columns] + for col in exploded_df.columns: + # After explosion, object columns that are all-numeric (except for NAs) + # should be converted to a numeric dtype for proper alignment. + if exploded_df[col].dtype == "object": + try: + # Use nullable integer type to preserve integers + exploded_df[col] = exploded_df[col].astype(pd.Int64Dtype()) + except (ValueError, TypeError): + # Fallback for non-integer numerics + try: + exploded_df[col] = pd.to_numeric(exploded_df[col]) + except (ValueError, TypeError): + # Keep as object if not numeric + pass + return exploded_df, array_row_groups + else: + return dataframe, array_row_groups + + +def _flatten_struct_columns( + dataframe: pd.DataFrame, + struct_columns: list[str], + clear_on_continuation_cols: list[str], + nested_originated_columns: set[str], +) -> tuple[pd.DataFrame, list[str]]: + """Flatten regular STRUCT columns.""" + result_df = dataframe.copy() + for col_name in struct_columns: + col_data = result_df[col_name] + if isinstance(col_data.dtype, pd.ArrowDtype): + pa_type = cast(pd.ArrowDtype, col_data.dtype).pyarrow_dtype + + # Use PyArrow to flatten the struct column without row iteration + # combine_chunks() ensures we have a single array if it was chunked + arrow_array = pa.array(col_data) + flattened_fields = arrow_array.flatten() + + new_cols_to_add = {} + for field_idx in range(pa_type.num_fields): + field = pa_type.field(field_idx) + new_col_name = f"{col_name}.{field.name}" + nested_originated_columns.add(new_col_name) + clear_on_continuation_cols.append(new_col_name) + + # Create a new Series from the flattened array + new_cols_to_add[new_col_name] = pd.Series( + flattened_fields[field_idx].to_pylist(), + dtype=pd.ArrowDtype(field.type), + index=result_df.index, + ) + + col_idx = result_df.columns.to_list().index(col_name) + new_cols_df = pd.DataFrame(new_cols_to_add, index=result_df.index) + result_df = pd.concat( + [ + result_df.iloc[:, :col_idx], + new_cols_df, + result_df.iloc[:, col_idx + 1 :], + ], + axis=1, + ) + return result_df, clear_on_continuation_cols diff --git a/bigframes/display/html.py b/bigframes/display/html.py index 70baa9364a..39636a08e1 100644 --- a/bigframes/display/html.py +++ b/bigframes/display/html.py @@ -25,10 +25,11 @@ import pandas as pd import pandas.api.types +import pyarrow as pa import bigframes from bigframes._config import display_options, options -from bigframes.display import plaintext +from bigframes.display import _flatten, plaintext import bigframes.formatting_helpers as formatter if typing.TYPE_CHECKING: @@ -38,6 +39,11 @@ def _is_dtype_numeric(dtype: Any) -> bool: """Check if a dtype is numeric for alignment purposes.""" + # Arrays should always be left-aligned, even if they contain numeric elements + if isinstance(dtype, 
pd.ArrowDtype) and isinstance( + dtype.pyarrow_dtype, pa.ListType + ): + return False return pandas.api.types.is_numeric_dtype(dtype) @@ -47,12 +53,27 @@ def render_html( table_id: str, orderable_columns: list[str] | None = None, ) -> str: - """Render a pandas DataFrame to HTML with specific styling.""" + """Render a pandas DataFrame to HTML with specific styling and nested data support.""" + # Flatten nested data first + ( + flattened_df, + array_row_groups, + clear_on_continuation, + nested_originated_columns, + ) = _flatten.flatten_nested_data(dataframe) + orderable_columns = orderable_columns or [] classes = "dataframe table table-striped table-hover" table_html_parts = [f''] - table_html_parts.append(_render_table_header(dataframe, orderable_columns)) - table_html_parts.append(_render_table_body(dataframe)) + table_html_parts.append(_render_table_header(flattened_df, orderable_columns)) + table_html_parts.append( + _render_table_body( + flattened_df, + array_row_groups, + clear_on_continuation, + nested_originated_columns, + ) + ) table_html_parts.append("
") return "".join(table_html_parts) @@ -73,32 +94,69 @@ def _render_table_header(dataframe: pd.DataFrame, orderable_columns: list[str]) return "\n".join(header_parts) -def _render_table_body(dataframe: pd.DataFrame) -> str: +def _render_table_body( + dataframe: pd.DataFrame, + array_row_groups: dict[str, list[int]], + clear_on_continuation: list[str], + nested_originated_columns: set[str], +) -> str: """Render the body of the HTML table.""" body_parts = [" "] precision = options.display.precision for i in range(len(dataframe)): - body_parts.append(" ") + row_class = "" + orig_row_idx = None + is_continuation = False + for orig_key, row_indices in array_row_groups.items(): + if i in row_indices and row_indices[0] != i: + row_class = "array-continuation" + orig_row_idx = orig_key + is_continuation = True + break + + if row_class: + body_parts.append( + f' ' + ) + else: + body_parts.append(" ") + row = dataframe.iloc[i] for col_name, value in row.items(): + col_name_str = str(col_name) + if is_continuation and col_name_str in clear_on_continuation: + body_parts.append(" ") + continue dtype = dataframe.dtypes.loc[col_name] # type: ignore - align = "right" if _is_dtype_numeric(dtype) else "left" - if pandas.api.types.is_scalar(value) and pd.isna(value): - body_parts.append( - f' ' - '<NA>' - ) + if col_name_str in nested_originated_columns: + align = "left" else: - if isinstance(value, float): - cell_content = f"{value:.{precision}f}" + align = "right" if _is_dtype_numeric(dtype) else "left" + + cell_content = "" + if pandas.api.types.is_scalar(value) and pd.isna(value): + if is_continuation: + # For padding nulls in continuation rows, show empty cell + body_parts.append(f' ') else: - cell_content = str(value) - body_parts.append( - f' ' - f"{html.escape(cell_content)}" - ) + # For primary nulls, keep showing the indicator but maybe styled + body_parts.append( + f' ' + '<NA>' + ) + continue + elif isinstance(value, float): + cell_content = f"{value:.{precision}f}" + else: + cell_content = str(value) + + # Use classes for alignment + body_parts.append( + f' ' + f"{html.escape(cell_content)}" + ) body_parts.append(" ") body_parts.append(" ") return "\n".join(body_parts) @@ -292,8 +350,9 @@ def repr_mimebundle( exclude=None, ): """Custom display method for IPython/Jupyter environments.""" - # TODO(b/467647693): Anywidget integration has been tested in Jupyter, VS Code, and - # BQ Studio, but there is a known compatibility issue with Marimo that needs to be addressed. + # TODO(b/467647693): Anywidget integration has been tested in Jupyter, + # VS Code, and BQ Studio, but there is a known compatibility issue with + # Marimo that needs to be addressed. opts = options.display if opts.repr_mode == "deferred": diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index facefc6069..252dc78bf9 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -91,7 +91,7 @@ { "data": { "text/html": [ - "✅ Completed. " + "\u2705 Completed. " ], "text/plain": [ "" @@ -103,7 +103,7 @@ { "data": { "text/html": [ - "✅ Completed. \n", + "\u2705 Completed. \n", " Query processed 0 Bytes in a moment of slot time.\n", " " ], @@ -149,7 +149,7 @@ { "data": { "text/html": [ - "✅ Completed. " + "\u2705 Completed. " ], "text/plain": [ "" @@ -161,7 +161,7 @@ { "data": { "text/html": [ - "✅ Completed. " + "\u2705 Completed. " ], "text/plain": [ "" @@ -286,7 +286,7 @@ " \n", " \n", "\n", - "

10 rows × 5 columns

\n", + "

10 rows \u00d7 5 columns

\n", "[5552452 rows x 5 columns in total]" ], "text/plain": [ @@ -333,7 +333,7 @@ { "data": { "text/html": [ - "✅ Completed. " + "\u2705 Completed. " ], "text/plain": [ "" @@ -345,7 +345,7 @@ { "data": { "text/html": [ - "✅ Completed. " + "\u2705 Completed. " ], "text/plain": [ "" @@ -418,7 +418,7 @@ { "data": { "text/html": [ - "✅ Completed. " + "\u2705 Completed. " ], "text/plain": [ "" @@ -430,7 +430,7 @@ { "data": { "text/html": [ - "✅ Completed. " + "\u2705 Completed. " ], "text/plain": [ "" @@ -492,10 +492,10 @@ "### Sorting by Single-Column\n", "You can sort the table by clicking on the headers of columns that have orderable data types (like numbers, strings, and dates). Non-orderable columns (like arrays or structs) do not have sorting controls.\n", "\n", - "**Sorting indicators (▲, ▼) are always visible for sorted columns. The unsorted indicator (●) is only visible when you hover over an unsorted column header.** The sorting control cycles through three states:\n", - "- **Unsorted (no indicator by default, ● on hover):** The default state. Click the header to sort in ascending order.\n", - "- **Ascending (▲):** The data is sorted from smallest to largest. Click again to sort in descending order.\n", - "- **Descending (▼):** The data is sorted from largest to smallest. Click again to return to the unsorted state." + "**Sorting indicators (\u25b2, \u25bc) are always visible for sorted columns. The unsorted indicator (\u25cf) is only visible when you hover over an unsorted column header.** The sorting control cycles through three states:\n", + "- **Unsorted (no indicator by default, \u25cf on hover):** The default state. Click the header to sort in ascending order.\n", + "- **Ascending (\u25b2):** The data is sorted from smallest to largest. Click again to sort in descending order.\n", + "- **Descending (\u25bc):** The data is sorted from largest to smallest. Click again to return to the unsorted state." ] }, { @@ -533,7 +533,7 @@ { "data": { "text/html": [ - "✅ Completed. " + "\u2705 Completed. " ], "text/plain": [ "" @@ -545,7 +545,7 @@ { "data": { "text/html": [ - "✅ Completed. " + "\u2705 Completed. " ], "text/plain": [ "" @@ -644,7 +644,7 @@ { "data": { "text/html": [ - "✅ Completed. \n", + "\u2705 Completed. \n", " Query processed 171.4 MB in a moment of slot time.\n", " " ], @@ -658,7 +658,7 @@ { "data": { "text/html": [ - "✅ Completed. \n", + "\u2705 Completed. \n", " Query processed 0 Bytes in a moment of slot time.\n", " " ], @@ -727,7 +727,7 @@ { "data": { "text/html": [ - "✅ Completed. \n", + "\u2705 Completed. \n", " Query processed 85.9 kB in 24 seconds of slot time.\n", " " ], @@ -751,7 +751,7 @@ { "data": { "text/html": [ - "✅ Completed. " + "\u2705 Completed. " ], "text/plain": [ "" @@ -763,7 +763,7 @@ { "data": { "text/html": [ - "✅ Completed. " + "\u2705 Completed. 
" ], "text/plain": [ "" @@ -842,10 +842,10 @@ " 18157874.1\n", " 21.02.2018\n", " 22.02.2017\n", - " Liedtke & Partner Patentanw√§lte\n", + " Liedtke & Partner Patentanw\u221a\u00a7lte\n", " SHB Hebezeugbau GmbH\n", " VOLGER, Alexander\n", - " STEUERUNGSSYSTEM F√úR AUTOMATISCHE PARKH√ÑUSER\n", + " STEUERUNGSSYSTEM F\u221a\u00faR AUTOMATISCHE PARKH\u221a\u00d1USER\n", " EP 3 366 869 A1\n", " \n", " \n", @@ -896,10 +896,10 @@ " 18171005.4\n", " 05.02.2015\n", " 05.02.2014\n", - " Stork Bamberger Patentanw√§lte\n", + " Stork Bamberger Patentanw\u221a\u00a7lte\n", " Linco Food Systems A/S\n", " Thrane, Uffe\n", - " MASTH√ÑHNCHENCONTAINER ALS BESTANDTEIL EINER E...\n", + " MASTH\u221a\u00d1HNCHENCONTAINER ALS BESTANDTEIL EINER E...\n", " EP 3 381 276 A1\n", " \n", " \n", @@ -915,14 +915,14 @@ " 03.04.2018\n", " 30.03.2017\n", " <NA>\n", - " BSH Hausger√§te GmbH\n", + " BSH Hausger\u221a\u00a7te GmbH\n", " Acero Acero, Jesus\n", - " VORRICHTUNG ZUR INDUKTIVEN ENERGIE√úBERTRAGUNG\n", + " VORRICHTUNG ZUR INDUKTIVEN ENERGIE\u221a\u00faBERTRAGUNG\n", " EP 3 383 141 A2\n", " \n", " \n", "\n", - "

5 rows × 15 columns

\n", + "

5 rows \u00d7 15 columns

\n", "[5 rows x 15 columns in total]" ], "text/plain": [ @@ -948,10 +948,10 @@ "4 03.10.2018 H05B 6/12 18165514.3 \n", "\n", " filing_date priority_date_eu representative_line_1_eu \\\n", - "0 21.02.2018 22.02.2017 Liedtke & Partner Patentanwälte \n", + "0 21.02.2018 22.02.2017 Liedtke & Partner Patentanw\u221a\u00a7lte \n", "1 16.02.2016 Scheider, Sascha et al \n", "2 19.02.2018 31.03.2017 Hoffmann Eitle \n", - "3 05.02.2015 05.02.2014 Stork Bamberger Patentanwälte \n", + "3 05.02.2015 05.02.2014 Stork Bamberger Patentanw\u221a\u00a7lte \n", "4 03.04.2018 30.03.2017 \n", "\n", " applicant_line_1 inventor_line_1 \\\n", @@ -959,14 +959,14 @@ "1 EV Group E. Thallner GmbH Kurz, Florian \n", "2 FUJITSU LIMITED Kukihara, Kensuke \n", "3 Linco Food Systems A/S Thrane, Uffe \n", - "4 BSH Hausgeräte GmbH Acero Acero, Jesus \n", + "4 BSH Hausger\u221a\u00a7te GmbH Acero Acero, Jesus \n", "\n", " title_line_1 number \n", - "0 STEUERUNGSSYSTEM FÜR AUTOMATISCHE PARKHÄUSER EP 3 366 869 A1 \n", + "0 STEUERUNGSSYSTEM F\u221a\u00faR AUTOMATISCHE PARKH\u221a\u00d1USER EP 3 366 869 A1 \n", "1 VORRICHTUNG ZUM BONDEN VON SUBSTRATEN EP 3 382 744 A1 \n", "2 METHOD EXECUTED BY A COMPUTER, INFORMATION PRO... EP 3 382 553 A1 \n", - "3 MASTHÄHNCHENCONTAINER ALS BESTANDTEIL EINER E... EP 3 381 276 A1 \n", - "4 VORRICHTUNG ZUR INDUKTIVEN ENERGIEÜBERTRAGUNG EP 3 383 141 A2 \n", + "3 MASTH\u221a\u00d1HNCHENCONTAINER ALS BESTANDTEIL EINER E... EP 3 381 276 A1 \n", + "4 VORRICHTUNG ZUR INDUKTIVEN ENERGIE\u221a\u00faBERTRAGUNG EP 3 383 141 A2 \n", "\n", "[5 rows x 15 columns]" ] @@ -988,6 +988,104 @@ " LIMIT 5;\n", "\"\"\")" ] + }, + { + "cell_type": "markdown", + "id": "nested_markdown", + "metadata": {}, + "source": [ + "### Displaying Nested Data (STRUCTs and ARRAYs)\n", + "BigQuery DataFrames automatically flattens nested STRUCT and ARRAY columns into separate, more manageable columns when displayed in `anywidget` mode. 
This approach simplifies interaction and readability, as it avoids deeply nested or collapsible elements.\n", + "\n", + "This flattening ensures that all data is directly visible and sortable, enhancing the interactive table experience.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "nested_code", + "metadata": {}, + "outputs": [], + "source": [ + "sql_nested_data = \"\"\"\n", + "SELECT\n", + " 1 AS id,\n", + " STRUCT('Alice' AS name, 30 AS age) AS struct_col,\n", + " [10, 20, 30] AS array_col,\n", + " [STRUCT('A' AS item, 100 AS value), STRUCT('B' AS item, 200 AS value)] AS nested_struct_array\n", + "UNION ALL\n", + "SELECT\n", + " 2 AS id,\n", + " STRUCT('Bob' AS name, 25 AS age) AS struct_col,\n", + " [40, 50] AS array_col,\n", + " [STRUCT('C' AS item, 300 AS value)] AS nested_struct_array\n", + "UNION ALL\n", + "SELECT\n", + " 3 AS id,\n", + " STRUCT('Charlie' AS name, 35 AS age) AS struct_col,\n", + " [60, 70, 80] AS array_col,\n", + " [STRUCT('D' AS item, 400 AS value), STRUCT('E' AS item, 500 AS value)] AS nested_struct_array\n", + "UNION ALL\n", + "SELECT\n", + " 4 AS id,\n", + " STRUCT('David' AS name, 40 AS age) AS struct_col,\n", + " [90, 100, 110] AS array_col,\n", + " [STRUCT('F' AS item, 600 AS value), STRUCT('G' AS item, 700 AS value)] AS nested_struct_array\n", + "UNION ALL\n", + "SELECT\n", + " 5 AS id,\n", + " STRUCT('Eve' AS name, 45 AS age) AS struct_col,\n", + " [120, 130, 140] AS array_col,\n", + " [STRUCT('H' AS item, 800 AS value), STRUCT('I' AS item, 900 AS value)] AS nested_struct_array\n", + "UNION ALL\n", + "SELECT\n", + " 6 AS id,\n", + " STRUCT('Frank' AS name, 50 AS age) AS struct_col,\n", + " [150, 160, 170] AS array_col,\n", + " [STRUCT('J' AS item, 1000 AS value), STRUCT('K' AS item, 1100 AS value)] AS nested_struct_array\n", + "UNION ALL\n", + "SELECT\n", + " 7 AS id,\n", + " STRUCT('Grace' AS name, 55 AS age) AS struct_col,\n", + " [180, 190] AS array_col,\n", + " [STRUCT('L' AS item, 1200 AS value)] AS nested_struct_array\n", + "UNION ALL\n", + "SELECT\n", + " 8 AS id,\n", + " STRUCT('Heidi' AS name, 60 AS age) AS struct_col,\n", + " [200, 210, 220] AS array_col,\n", + " [STRUCT('M' AS item, 1300 AS value), STRUCT('N' AS item, 1400 AS value)] AS nested_struct_array\n", + "UNION ALL\n", + "SELECT\n", + " 9 AS id,\n", + " STRUCT('Ivan' AS name, 65 AS age) AS struct_col,\n", + " [230, 240, 250, 260] AS array_col,\n", + " [STRUCT('O' AS item, 1500 AS value), STRUCT('P' AS item, 1600 AS value)] AS nested_struct_array\n", + "UNION ALL\n", + "SELECT\n", + " 10 AS id,\n", + " STRUCT('Judy' AS name, 70 AS age) AS struct_col,\n", + " [270, 280] AS array_col,\n", + " [STRUCT('Q' AS item, 1700 AS value)] AS nested_struct_array\n", + "UNION ALL\n", + "SELECT\n", + " 11 AS id,\n", + " STRUCT('Kevin' AS name, 75 AS age) AS struct_col,\n", + " [290, 300, 310] AS array_col,\n", + " [STRUCT('R' AS item, 1800 AS value), STRUCT('S' AS item, 1900 AS value), STRUCT('T' AS item, 2000 AS value), STRUCT('U' AS item, 2100 AS value)] AS nested_struct_array\n", + "UNION ALL\n", + "SELECT\n", + " 12 AS id,\n", + " STRUCT('Laura' AS name, 80 AS age) AS struct_col,\n", + " [320] AS array_col,\n", + " [STRUCT('V' AS item, 2200 AS value), STRUCT('W' AS item, 2300 AS value), STRUCT('X' AS item, 2400 AS value)] AS nested_struct_array\n", + "\"\"\"\n", + "\n", + "df_from_sql = bpd.read_gbq(sql_nested_data)\n", + "\n", + "# Display this DataFrame. 
The nested fields will be rendered as flattened elements.\n", + "df_from_sql" + ] } ], "metadata": { diff --git a/tests/js/table_widget.test.js b/tests/js/table_widget.test.js index 6b5dda48d1..a4eebbd4af 100644 --- a/tests/js/table_widget.test.js +++ b/tests/js/table_widget.test.js @@ -259,4 +259,129 @@ describe("TableWidget", () => { expect(headers[0].textContent).toBe(""); expect(headers[1].textContent).toBe("value"); }); + + it("should highlight all rows in a group when hovering over a nested data row", () => { + // Mock HTML with nested data structure (flattened rows) + model.get.mockImplementation((property) => { + if (property === "table_html") { + return ` + + + +
Row 1 Part A
Row 1 Part B
Row 2
`; + } + if (property === "orderable_columns") { + return []; + } + return null; + }); + + render({ model, el }); + + // Manually trigger the table_html change handler + const tableHtmlChangeHandler = model.on.mock.calls.find( + (call) => call[0] === "change:table_html", + )[1]; + tableHtmlChangeHandler(); + + const firstRowCell = el.querySelector('tr[data-orig-row="0"] td'); + const rowsInGroup = el.querySelectorAll('tr[data-orig-row="0"] td'); + + // Simulate mouseover + const mouseOverEvent = new MouseEvent("mouseover", { + bubbles: true, + cancelable: true, + }); + firstRowCell.dispatchEvent(mouseOverEvent); + + // Check if row-hover class is added to all cells in the group + + rowsInGroup.forEach((cell) => { + expect(cell.classList.contains("row-hover")).toBe(true); + }); + + // Simulate mouseout + const mouseOutEvent = new MouseEvent("mouseout", { + bubbles: true, + cancelable: true, + }); + firstRowCell.dispatchEvent(mouseOutEvent); + + // Check if row-hover class is removed + + rowsInGroup.forEach((cell) => { + expect(cell.classList.contains("row-hover")).toBe(false); + }); + }); + + it("should not highlight unrelated rows when hovering over a nested data row", () => { + // Mock HTML with nested data structure + model.get.mockImplementation((property) => { + if (property === "table_html") { + return ` + + + +
Row 1 Part A
Row 1 Part B
Row 2
`; + } + if (property === "orderable_columns") { + return []; + } + return null; + }); + + render({ model, el }); + + const tableHtmlChangeHandler = model.on.mock.calls.find( + (call) => call[0] === "change:table_html", + )[1]; + tableHtmlChangeHandler(); + + const row1Cell = el.querySelector('tr[data-orig-row="0"] td'); + const row2Cell = el.querySelector('tr[data-orig-row="1"] td'); + + const mouseOverEvent = new MouseEvent("mouseover", { + bubbles: true, + cancelable: true, + }); + row1Cell.dispatchEvent(mouseOverEvent); + + // Row 2 should NOT have the hover class + expect(row2Cell.classList.contains("row-hover")).toBe(false); + }); + + it("should not highlight other rows when hovering over a non-nested row", () => { + // Mock HTML with mixed data structure + model.get.mockImplementation((property) => { + if (property === "table_html") { + return ` + + +
Standard Row
Nested Row
`; + } + if (property === "orderable_columns") { + return []; + } + return null; + }); + + render({ model, el }); + + const tableHtmlChangeHandler = model.on.mock.calls.find( + (call) => call[0] === "change:table_html", + )[1]; + tableHtmlChangeHandler(); + + const standardCell = el.querySelector("tr:not([data-orig-row]) td"); + const nestedCell = el.querySelector('tr[data-orig-row="0"] td'); + + const mouseOverEvent = new MouseEvent("mouseover", { + bubbles: true, + cancelable: true, + }); + standardCell.dispatchEvent(mouseOverEvent); + + // The nested row should NOT have the hover class + expect(nestedCell.classList.contains("row-hover")).toBe(false); + }); }); diff --git a/tests/system/small/test_anywidget.py b/tests/system/small/test_anywidget.py index 8d4fcc8e89..576707e25f 100644 --- a/tests/system/small/test_anywidget.py +++ b/tests/system/small/test_anywidget.py @@ -998,178 +998,102 @@ def test_dataframe_repr_mimebundle_should_return_widget_with_metadata_in_anywidg assert "colab" in metadata["application/vnd.jupyter.widget-view+json"] -@pytest.fixture(scope="module") -def custom_index_pandas_df() -> pd.DataFrame: - """Create a DataFrame with a custom named index for testing.""" - test_data = pd.DataFrame( +@pytest.fixture +def nested_data_df(): + """Fixture to provide a pandas DataFrame with nested data (STRUCT and ARRAY) using ArrowDtype.""" + import pyarrow as pa + + # Struct column + struct_type = pa.struct([("name", pa.string()), ("age", pa.int64())]) + struct_data = [{"name": "Alice", "age": 30}, {"name": "Bob", "age": 25}] + struct_arr = pa.array(struct_data, type=struct_type) + + # Array column + array_type = pa.list_(pa.int64()) + array_data = [[10, 20, 30], [40, 50]] + array_arr = pa.array(array_data, type=array_type) + + # Array of Struct column + nested_struct_type = pa.struct([("item", pa.string()), ("value", pa.int64())]) + nested_array_type = pa.list_(nested_struct_type) + nested_data = [ + [{"item": "A", "value": 100}, {"item": "B", "value": 200}], + [{"item": "C", "value": 300}], + ] + nested_arr = pa.array(nested_data, type=nested_array_type) + + df = pd.DataFrame( { - "value_a": [10, 20, 30, 40, 50, 60], - "value_b": ["a", "b", "c", "d", "e", "f"], + "id": [1, 2], + "struct_col": pd.Series(struct_arr, dtype=pd.ArrowDtype(struct_type)), + "array_col": pd.Series(array_arr, dtype=pd.ArrowDtype(array_type)), + "nested_struct_array": pd.Series( + nested_arr, dtype=pd.ArrowDtype(nested_array_type) + ), } ) - test_data.index = pd.Index( - ["row_1", "row_2", "row_3", "row_4", "row_5", "row_6"], name="custom_idx" - ) - return test_data + return df -@pytest.fixture(scope="module") -def custom_index_bf_df( - session: bf.Session, custom_index_pandas_df: pd.DataFrame -) -> bf.dataframe.DataFrame: - return session.read_pandas(custom_index_pandas_df) +@pytest.fixture +def different_lengths_arrays_df(): + """Fixture to provide a DataFrame with arrays of different lengths using ArrowDtype.""" + import pyarrow as pa + array_type = pa.list_(pa.int64()) + array_col1 = pa.array([[10, 20, 30]], type=array_type) + array_col2 = pa.array([[100, 200]], type=array_type) -@pytest.fixture(scope="module") -def multiindex_pandas_df() -> pd.DataFrame: - """Create a DataFrame with MultiIndex for testing.""" - test_data = pd.DataFrame( + df = pd.DataFrame( { - "value": [100, 200, 300, 400, 500, 600], - "category": ["X", "Y", "Z", "X", "Y", "Z"], + "id": [1], + "array_col1": pd.Series(array_col1, dtype=pd.ArrowDtype(array_type)), + "array_col2": pd.Series(array_col2, 
dtype=pd.ArrowDtype(array_type)),
         }
     )
-    test_data.index = pd.MultiIndex.from_arrays(
-        [
-            ["group_A", "group_A", "group_A", "group_B", "group_B", "group_B"],
-            [1, 2, 3, 1, 2, 3],
-        ],
-        names=["group", "item"],
-    )
-    return test_data
-
+    return df
 
-@pytest.fixture(scope="module")
-def multiindex_bf_df(
-    session: bf.Session, multiindex_pandas_df: pd.DataFrame
-) -> bf.dataframe.DataFrame:
-    return session.read_pandas(multiindex_pandas_df)
 
+def test_render_html_with_nested_data(nested_data_df: pd.DataFrame):
+    """Verify that render_html correctly flattens nested STRUCT and ARRAY columns.
 
-def test_widget_with_default_index_should_display_index_column_with_empty_header(
-    paginated_bf_df: bf.dataframe.DataFrame,
-):
-    """
-    Given a DataFrame with a default index, when the TableWidget is rendered,
-    then an index column should be visible with an empty header.
+    Updated to expect CSS alignment classes.
     """
-    import re
-
-    from bigframes.display.anywidget import TableWidget
-
-    with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2):
-        widget = TableWidget(paginated_bf_df)
-        html = widget.table_html
-
-        # The header for the index should be present but empty, matching the
-        # internal rendering logic.
-        thead = html.split("<thead>")[1].split("</thead>")[0]
-        # Find the first header cell and check that its content div is empty.
-        match = re.search(r"<th[^>]*><div[^>]*>([^<]*)", thead)
-        assert match is not None, "Could not find table header cell in output."
-        assert (
-            match.group(1) == ""
-        ), f"Expected empty index header, but found: {match.group(1)}"
+    from bigframes.display import html
 
+    result_html = html.render_html(dataframe=nested_data_df, table_id="test-table")
 
-def test_widget_with_custom_index_should_display_index_column(
-    custom_index_bf_df: bf.dataframe.DataFrame,
-):
-    """
-    Given a DataFrame with a custom named index, when rendered,
-    then the index column and first page of rows should be visible.
-    """
-    from bigframes.display.anywidget import TableWidget
-
-    with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2):
-        widget = TableWidget(custom_index_bf_df)
-        html = widget.table_html
+    # Check that Alice's data is not repeated on the second row
+    assert 'class="cell-align-right">1' in result_html
+    assert 'class="cell-align-left">Alice' in result_html
+    assert 'class="cell-align-left">30' in result_html
+    assert 'class="cell-align-left">10' in result_html
 
-    assert "custom_idx" in html
-    assert "row_1" in html
-    assert "row_2" in html
-    assert "row_3" not in html  # Verify pagination is working
-    assert "row_4" not in html
+    # Check continuation row
+    assert 'class="array-continuation" data-orig-row="0">' in result_html
 
+    # In continuation rows, non-array cells are empty
+    assert "" in result_html
 
-def test_widget_with_custom_index_pagination_preserves_index(
-    custom_index_bf_df: bf.dataframe.DataFrame,
+
+def test_render_html_with_arrays_of_different_lengths(
+    different_lengths_arrays_df: pd.DataFrame,
 ):
-    """
-    Given a DataFrame with a custom index, when navigating to the second page,
-    then the second page's index values should be visible.
-    """
-    from bigframes.display.anywidget import TableWidget
-
-    with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2):
-        widget = TableWidget(custom_index_bf_df)
-
-        widget.page = 1  # Navigate to page 2
-        html = widget.table_html
-
-    assert "row_3" in html
-    assert "row_4" in html
-    assert "row_1" not in html  # Verify page 1 content is gone
-    assert "row_2" not in html
-
+    """Verify that render_html handles arrays of different lengths correctly.
 
-def test_widget_with_custom_index_matches_pandas_output(
-    custom_index_bf_df: bf.dataframe.DataFrame,
-):
-    """
-    Given a DataFrame with a custom index and max_rows=3, the widget's HTML
-    output should contain the first three index values.
+    Updated to expect CSS alignment classes.
     """
-    from bigframes.display.anywidget import TableWidget
-
-    with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 3):
-        widget = TableWidget(custom_index_bf_df)
-        html = widget.table_html
-
-    assert "row_1" in html
-    assert "row_2" in html
-    assert "row_3" in html
-    assert "row_4" not in html  # Verify it respects max_rows
-
-
-# TODO(b/438181139): Add tests for custom multiindex
-# This may not be necessary for the SQL Cell use case but should be
-# considered for completeness.
+    from bigframes.display import html
 
-
-def test_series_anywidget_integration_with_notebook_display(
-    paginated_bf_df: bf.dataframe.DataFrame,
-):
-    """Test Series display integration in Jupyter-like environment."""
-    pytest.importorskip("anywidget")
-
-    with bf.option_context("display.repr_mode", "anywidget"):
-        series = paginated_bf_df["value"]
-
-        # Test the full display pipeline
-        from IPython.display import display as ipython_display
-
-        # This should work without errors
-        ipython_display(series)
-
-
-def test_series_different_data_types_anywidget(session: bf.Session):
-    """Test Series with different data types in anywidget mode."""
-    pytest.importorskip("anywidget")
-
-    # Create Series with different types
-    test_data = pd.DataFrame(
-        {
-            "string_col": ["a", "b", "c"],
-            "int_col": [1, 2, 3],
-            "float_col": [1.1, 2.2, 3.3],
-            "bool_col": [True, False, True],
-        }
+    result_html = html.render_html(
+        dataframe=different_lengths_arrays_df, table_id="test-table"
     )
-    bf_df = session.read_pandas(test_data)
 
-    with bf.option_context("display.repr_mode", "anywidget"):
-        for col_name in test_data.columns:
-            series = bf_df[col_name]
-            widget = bigframes.display.TableWidget(series.to_frame())
-            assert widget.row_count == 3
+
+    # The first row should contain the first element of both arrays
+    assert 'class="cell-align-right">1' in result_html
+    assert 'class="cell-align-left">10' in result_html
+    assert 'class="cell-align-left">100' in result_html
+
+    # The second row should contain the second element of both arrays
+    assert 'class="array-continuation" data-orig-row="0">' in result_html
+    assert 'class="cell-align-left">20' in result_html
+    assert 'class="cell-align-left">200' in result_html
diff --git a/tests/unit/display/test_html.py b/tests/unit/display/test_html.py
index 0762a2fd8d..c9bee32296 100644
--- a/tests/unit/display/test_html.py
+++ b/tests/unit/display/test_html.py
@@ -19,6 +19,7 @@
 import pytest
 
 import bigframes as bf
+from bigframes.display._flatten import flatten_nested_data
 import bigframes.display.html as bf_html
 
 
@@ -106,7 +107,7 @@
             {
                 "array_col": "left",
             },
-            ["[1, 2, 3]", "[4, 5, 6]", "[7, 8, 9]"],
+            ["1", "2", "3", "4", "5", "6", "7", "8", "9"],
             id="array",
         ),
         pytest.param(
@@ -119,7 +120,7 @@
             {
                 "struct_col": "left",
            },
-            ["{'v': 1}", 
"{'v': 2}", "{'v': 3}"], + ["1", "2", "3"], id="struct", ), ], @@ -148,3 +149,48 @@ def test_render_html_precision(): # Make sure we reset to default html = bf_html.render_html(dataframe=df, table_id="test-table") assert "3.141593" in html + + +def test_flatten_nested_data_flattens_structs(): + """Verify that flatten_nested_data correctly flattens STRUCT columns.""" + struct_data = pd.DataFrame( + { + "id": [1, 2], + "struct_col": pd.Series( + [{"name": "Alice", "age": 30}, {"name": "Bob", "age": 25}], + dtype=pd.ArrowDtype( + pa.struct([("name", pa.string()), ("age", pa.int64())]) + ), + ), + } + ) + + flattened, _, _, nested_originated_columns = flatten_nested_data(struct_data) + + assert "struct_col.name" in flattened.columns + assert "struct_col.age" in flattened.columns + assert flattened["struct_col.name"].tolist() == ["Alice", "Bob"] + assert "struct_col" in nested_originated_columns + assert "struct_col.name" in nested_originated_columns + assert "struct_col.age" in nested_originated_columns + + +def test_flatten_nested_data_explodes_arrays(): + """Verify that flatten_nested_data correctly explodes ARRAY columns.""" + array_data = pd.DataFrame( + { + "id": [1, 2], + "array_col": pd.Series( + [[10, 20, 30], [40, 50]], dtype=pd.ArrowDtype(pa.list_(pa.int64())) + ), + } + ) + + flattened, groups, _, nested_originated_columns = flatten_nested_data(array_data) + + assert len(flattened) == 5 # 3 + 2 array elements + assert "0" in groups # First original row + assert len(groups["0"]) == 3 # Three array elements + assert "1" in groups + assert len(groups["1"]) == 2 + assert "array_col" in nested_originated_columns From ceca74dc92c78a6ef8a3562eb58ede22b831ecf9 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 29 Dec 2025 18:09:35 +0000 Subject: [PATCH 04/29] chore: remove unreached code --- bigframes/display/html.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/bigframes/display/html.py b/bigframes/display/html.py index 39636a08e1..099693ffad 100644 --- a/bigframes/display/html.py +++ b/bigframes/display/html.py @@ -25,7 +25,6 @@ import pandas as pd import pandas.api.types -import pyarrow as pa import bigframes from bigframes._config import display_options, options @@ -39,11 +38,6 @@ def _is_dtype_numeric(dtype: Any) -> bool: """Check if a dtype is numeric for alignment purposes.""" - # Arrays should always be left-aligned, even if they contain numeric elements - if isinstance(dtype, pd.ArrowDtype) and isinstance( - dtype.pyarrow_dtype, pa.ListType - ): - return False return pandas.api.types.is_numeric_dtype(dtype) @@ -350,9 +344,8 @@ def repr_mimebundle( exclude=None, ): """Custom display method for IPython/Jupyter environments.""" - # TODO(b/467647693): Anywidget integration has been tested in Jupyter, - # VS Code, and BQ Studio, but there is a known compatibility issue with - # Marimo that needs to be addressed. + # TODO(b/467647693): Anywidget integration has been tested in Jupyter, VS Code, and + # BQ Studio, but there is a known compatibility issue with Marimo that needs to be addressed. 
opts = options.display if opts.repr_mode == "deferred": From 63e4a3c68df58817c8ee13604c418a31f50d3dea Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 29 Dec 2025 18:26:07 +0000 Subject: [PATCH 05/29] refactor: code refactor --- bigframes/display/_flatten.py | 144 +++--- notebooks/dataframes/anywidget_mode.ipynb | 509 +++++++++++++++------- 2 files changed, 413 insertions(+), 240 deletions(-) diff --git a/bigframes/display/_flatten.py b/bigframes/display/_flatten.py index e7f63777ae..07c993a925 100644 --- a/bigframes/display/_flatten.py +++ b/bigframes/display/_flatten.py @@ -16,7 +16,7 @@ from __future__ import annotations -from typing import cast +from typing import Callable, cast import pandas as pd import pyarrow as pa @@ -39,12 +39,54 @@ def flatten_nested_data( nested_originated_columns, ) = _classify_columns(result_df) - result_df, array_columns = _flatten_array_of_struct_columns( - result_df, array_of_struct_columns, array_columns, nested_originated_columns + # Flatten ARRAY of STRUCT columns + def update_array_columns(col_name: str, new_col_names: list[str]) -> None: + array_columns.remove(col_name) + array_columns.extend(new_col_names) + + def create_list_series( + original_arr: pa.Array, field_arr: pa.Array, index: pd.Index, field: pa.Field + ) -> pd.Series: + new_list_array = pa.ListArray.from_arrays( + original_arr.offsets, field_arr, mask=original_arr.is_null() + ) + return pd.Series( + new_list_array.to_pylist(), + dtype=pd.ArrowDtype(pa.list_(field.type)), + index=index, + ) + + result_df = _flatten_and_replace_columns( + result_df, + array_of_struct_columns, + nested_originated_columns, + get_struct_type=lambda t: t.value_type, + get_field_values=lambda arr: arr.values.flatten(), + create_series=create_list_series, + update_metadata=update_array_columns, ) - result_df, clear_on_continuation_cols = _flatten_struct_columns( - result_df, struct_columns, clear_on_continuation_cols, nested_originated_columns + # Flatten regular STRUCT columns + def update_clear_on_continuation(col_name: str, new_col_names: list[str]) -> None: + clear_on_continuation_cols.extend(new_col_names) + + def create_struct_series( + original_arr: pa.Array, field_arr: pa.Array, index: pd.Index, field: pa.Field + ) -> pd.Series: + return pd.Series( + field_arr.to_pylist(), + dtype=pd.ArrowDtype(field.type), + index=index, + ) + + result_df = _flatten_and_replace_columns( + result_df, + struct_columns, + nested_originated_columns, + get_struct_type=lambda t: t, + get_field_values=lambda arr: arr.flatten(), + create_series=create_struct_series, + update_metadata=update_clear_on_continuation, ) # Now handle ARRAY columns (including the newly created ones from ARRAY of STRUCT) @@ -104,45 +146,36 @@ def _classify_columns( ) -def _flatten_array_of_struct_columns( +def _flatten_and_replace_columns( dataframe: pd.DataFrame, - array_of_struct_columns: list[str], - array_columns: list[str], + columns: list[str], nested_originated_columns: set[str], -) -> tuple[pd.DataFrame, list[str]]: - """Flatten ARRAY of STRUCT columns into separate array columns for each field.""" + get_struct_type: Callable[[pa.DataType], pa.DataType], + get_field_values: Callable[[pa.Array], list[pa.Array]], + create_series: Callable[[pa.Array, pa.Array, pd.Index, pa.Field], pd.Series], + update_metadata: Callable[[str, list[str]], None], +) -> pd.DataFrame: + """Generic helper to flatten structure-like columns and replace them in the DataFrame.""" result_df = dataframe.copy() - for col_name in array_of_struct_columns: + for col_name in 
columns: col_data = result_df[col_name] pa_type = cast(pd.ArrowDtype, col_data.dtype).pyarrow_dtype - struct_type = pa_type.value_type + struct_type = get_struct_type(pa_type) - # Use PyArrow to reshape the list into multiple list arrays arrow_array = pa.array(col_data) - offsets = arrow_array.offsets - values = arrow_array.values # StructArray - flattened_fields = values.flatten() # List[Array] + flattened_fields = get_field_values(arrow_array) new_cols_to_add = {} - new_array_col_names = [] + new_col_names = [] - # Create new columns for each struct field for field_idx in range(struct_type.num_fields): field = struct_type.field(field_idx) new_col_name = f"{col_name}.{field.name}" nested_originated_columns.add(new_col_name) - new_array_col_names.append(new_col_name) + new_col_names.append(new_col_name) - # Reconstruct ListArray for this field - # Use mask=arrow_array.is_null() to preserve nulls from the original list - new_list_array = pa.ListArray.from_arrays( - offsets, flattened_fields[field_idx], mask=arrow_array.is_null() - ) - - new_cols_to_add[new_col_name] = pd.Series( - new_list_array.to_pylist(), - dtype=pd.ArrowDtype(pa.list_(field.type)), - index=result_df.index, + new_cols_to_add[new_col_name] = create_series( + arrow_array, flattened_fields[field_idx], result_df.index, field ) col_idx = result_df.columns.to_list().index(col_name) @@ -157,11 +190,9 @@ def _flatten_array_of_struct_columns( axis=1, ) - # Update array_columns list - array_columns.remove(col_name) - # Add the new array columns - array_columns.extend(new_array_col_names) - return result_df, array_columns + update_metadata(col_name, new_col_names) + + return result_df def _explode_array_columns( @@ -240,48 +271,3 @@ def _explode_array_columns( return exploded_df, array_row_groups else: return dataframe, array_row_groups - - -def _flatten_struct_columns( - dataframe: pd.DataFrame, - struct_columns: list[str], - clear_on_continuation_cols: list[str], - nested_originated_columns: set[str], -) -> tuple[pd.DataFrame, list[str]]: - """Flatten regular STRUCT columns.""" - result_df = dataframe.copy() - for col_name in struct_columns: - col_data = result_df[col_name] - if isinstance(col_data.dtype, pd.ArrowDtype): - pa_type = cast(pd.ArrowDtype, col_data.dtype).pyarrow_dtype - - # Use PyArrow to flatten the struct column without row iteration - # combine_chunks() ensures we have a single array if it was chunked - arrow_array = pa.array(col_data) - flattened_fields = arrow_array.flatten() - - new_cols_to_add = {} - for field_idx in range(pa_type.num_fields): - field = pa_type.field(field_idx) - new_col_name = f"{col_name}.{field.name}" - nested_originated_columns.add(new_col_name) - clear_on_continuation_cols.append(new_col_name) - - # Create a new Series from the flattened array - new_cols_to_add[new_col_name] = pd.Series( - flattened_fields[field_idx].to_pylist(), - dtype=pd.ArrowDtype(field.type), - index=result_df.index, - ) - - col_idx = result_df.columns.to_list().index(col_name) - new_cols_df = pd.DataFrame(new_cols_to_add, index=result_df.index) - result_df = pd.concat( - [ - result_df.iloc[:, :col_idx], - new_cols_df, - result_df.iloc[:, col_idx + 1 :], - ], - axis=1, - ) - return result_df, clear_on_continuation_cols diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index 252dc78bf9..9ff923b346 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -91,7 +91,7 @@ { "data": { "text/html": [ - "\u2705 Completed. 
" + "✅ Completed. " ], "text/plain": [ "" @@ -103,7 +103,7 @@ { "data": { "text/html": [ - "\u2705 Completed. \n", + "✅ Completed. \n", " Query processed 0 Bytes in a moment of slot time.\n", " " ], @@ -118,17 +118,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "state gender year name number\n", - " AL F 1910 Vera 71\n", - " AR F 1910 Viola 37\n", - " AR F 1910 Alice 57\n", - " AR F 1910 Edna 95\n", - " AR F 1910 Ollie 40\n", - " CA F 1910 Beatrice 37\n", - " CT F 1910 Marion 36\n", - " CT F 1910 Marie 36\n", - " FL F 1910 Alice 53\n", - " GA F 1910 Thelma 133\n", + "state gender year name number\n", + " AL F 1910 Hazel 51\n", + " AL F 1910 Lucy 76\n", + " AR F 1910 Nellie 39\n", + " AR F 1910 Lena 40\n", + " CO F 1910 Thelma 36\n", + " CO F 1910 Ruth 68\n", + " CT F 1910 Elizabeth 86\n", + " DC F 1910 Mary 80\n", + " FL F 1910 Annie 101\n", + " FL F 1910 Alma 39\n", "...\n", "\n", "[5552452 rows x 5 columns]\n" @@ -149,7 +149,7 @@ { "data": { "text/html": [ - "\u2705 Completed. " + "✅ Completed. " ], "text/plain": [ "" @@ -161,7 +161,7 @@ { "data": { "text/html": [ - "\u2705 Completed. " + "✅ Completed. " ], "text/plain": [ "" @@ -173,7 +173,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "424cfa14088641518224b137b5444d58", + "model_id": "d281fbe99c9747ee9187057bdac9a33c", "version_major": 2, "version_minor": 1 }, @@ -209,98 +209,98 @@ " AL\n", " F\n", " 1910\n", - " Vera\n", - " 71\n", + " Annie\n", + " 482\n", " \n", " \n", " 1\n", - " AR\n", + " AL\n", " F\n", " 1910\n", - " Viola\n", - " 37\n", + " Myrtle\n", + " 104\n", " \n", " \n", " 2\n", " AR\n", " F\n", " 1910\n", - " Alice\n", - " 57\n", + " Lillian\n", + " 56\n", " \n", " \n", " 3\n", - " AR\n", + " CT\n", " F\n", " 1910\n", - " Edna\n", - " 95\n", + " Anne\n", + " 38\n", " \n", " \n", " 4\n", - " AR\n", + " CT\n", " F\n", " 1910\n", - " Ollie\n", - " 40\n", + " Frances\n", + " 45\n", " \n", " \n", " 5\n", - " CA\n", + " FL\n", " F\n", " 1910\n", - " Beatrice\n", - " 37\n", + " Margaret\n", + " 53\n", " \n", " \n", " 6\n", - " CT\n", + " GA\n", " F\n", " 1910\n", - " Marion\n", - " 36\n", + " Mae\n", + " 73\n", " \n", " \n", " 7\n", - " CT\n", + " GA\n", " F\n", " 1910\n", - " Marie\n", - " 36\n", + " Beatrice\n", + " 96\n", " \n", " \n", " 8\n", - " FL\n", + " GA\n", " F\n", " 1910\n", - " Alice\n", - " 53\n", + " Lola\n", + " 47\n", " \n", " \n", " 9\n", - " GA\n", + " IA\n", " F\n", " 1910\n", - " Thelma\n", - " 133\n", + " Viola\n", + " 49\n", " \n", " \n", "\n", - "

10 rows \u00d7 5 columns

\n", + "

10 rows × 5 columns

\n", "[5552452 rows x 5 columns in total]" ], "text/plain": [ "state gender year name number\n", - " AL F 1910 Vera 71\n", - " AR F 1910 Viola 37\n", - " AR F 1910 Alice 57\n", - " AR F 1910 Edna 95\n", - " AR F 1910 Ollie 40\n", - " CA F 1910 Beatrice 37\n", - " CT F 1910 Marion 36\n", - " CT F 1910 Marie 36\n", - " FL F 1910 Alice 53\n", - " GA F 1910 Thelma 133\n", + " AL F 1910 Annie 482\n", + " AL F 1910 Myrtle 104\n", + " AR F 1910 Lillian 56\n", + " CT F 1910 Anne 38\n", + " CT F 1910 Frances 45\n", + " FL F 1910 Margaret 53\n", + " GA F 1910 Mae 73\n", + " GA F 1910 Beatrice 96\n", + " GA F 1910 Lola 47\n", + " IA F 1910 Viola 49\n", "...\n", "\n", "[5552452 rows x 5 columns]" @@ -326,14 +326,16 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 14, "id": "42bb02ab", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "\u2705 Completed. " + "✅ Completed. \n", + " Query processed 215.9 MB in 54 seconds of slot time. [Job bigframes-dev:US.bf668ba0-3b44-4e6a-8e62-ae9e1a518994 details]\n", + " " ], "text/plain": [ "" @@ -345,7 +347,9 @@ { "data": { "text/html": [ - "\u2705 Completed. " + "✅ Completed. \n", + " Query processed 88.8 MB in a moment of slot time.\n", + " " ], "text/plain": [ "" @@ -356,49 +360,41 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "3904868f71114a0c95c8c133a6c29d0b", - "version_major": 2, - "version_minor": 1 - }, "text/html": [ - "
0    1910\n",
-       "1    1910\n",
-       "2    1910\n",
-       "3    1910\n",
-       "4    1910\n",
-       "5    1910\n",
-       "6    1910\n",
-       "7    1910\n",
-       "8    1910\n",
-       "9    1910
[5552452 rows]" + "✅ Completed. " ], "text/plain": [ - "1910\n", - "1910\n", - "1910\n", - "1910\n", - "1910\n", - "1910\n", - "1910\n", - "1910\n", - "1910\n", - "1910\n", - "Name: year, dtype: Int64\n", - "...\n", - "\n", - "[5552452 rows]" + "" ] }, - "execution_count": 6, "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1910\n", + "1910\n", + "1910\n", + "1910\n", + "1910\n", + "1910\n", + "1910\n", + "1910\n", + "1910\n", + "1910\n", + "Name: year, dtype: Int64\n", + "...\n", + "\n", + "[5552452 rows]\n" + ] } ], "source": [ "test_series = df[\"year\"]\n", "# Displaying the series triggers the interactive widget\n", - "test_series" + "print(test_series)" ] }, { @@ -411,14 +407,23 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 15, "id": "da23e0f3", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "\u2705 Completed. " + "\n", + " Query started with request ID bigframes-dev:US.dad903a4-20b1-419f-913f-af083fe054cc.
SQL
SELECT\n",
+       "`year` AS `year`\n",
+       "FROM\n",
+       "(SELECT\n",
+       "  `t0`.`year`,\n",
+       "  `t0`.`bfuid_col_7` AS `bfuid_col_11`\n",
+       "FROM `bigframes-dev._8b037bfb7316dddf9d92b12dcf93e008906bfe52._e20e6e5e_d279_4a57_8eda_5df7d62bffae_bqdf_b6aa24ab-7544-4d8f-8418-b12c0ff9f902` AS `t0`)\n",
+       "ORDER BY `bfuid_col_11` ASC NULLS LAST
\n", + " " ], "text/plain": [ "" @@ -430,7 +435,9 @@ { "data": { "text/html": [ - "\u2705 Completed. " + "✅ Completed. \n", + " Query processed 88.8 MB in 4 seconds of slot time. [Job bigframes-dev:US.job_NmHwc2dPU4mfNGVh01R71WPTivjp details]\n", + " " ], "text/plain": [ "" @@ -442,7 +449,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "0fd0bd56db2348a68d5755a045652001", + "model_id": "8f93406a94744a17ba97a5aeb88cb1af", "version_major": 2, "version_minor": 1 }, @@ -456,7 +463,7 @@ "6 1910\n", "7 1910\n", "8 1910\n", - "9 1910[5552452 rows]" + "9 1910

[5552452 rows]

" ], "text/plain": [ "1910\n", @@ -475,7 +482,7 @@ "[5552452 rows]" ] }, - "execution_count": 7, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -492,10 +499,10 @@ "### Sorting by Single-Column\n", "You can sort the table by clicking on the headers of columns that have orderable data types (like numbers, strings, and dates). Non-orderable columns (like arrays or structs) do not have sorting controls.\n", "\n", - "**Sorting indicators (\u25b2, \u25bc) are always visible for sorted columns. The unsorted indicator (\u25cf) is only visible when you hover over an unsorted column header.** The sorting control cycles through three states:\n", - "- **Unsorted (no indicator by default, \u25cf on hover):** The default state. Click the header to sort in ascending order.\n", - "- **Ascending (\u25b2):** The data is sorted from smallest to largest. Click again to sort in descending order.\n", - "- **Descending (\u25bc):** The data is sorted from largest to smallest. Click again to return to the unsorted state." + "**Sorting indicators (▲, ▼) are always visible for sorted columns. The unsorted indicator (●) is only visible when you hover over an unsorted column header.** The sorting control cycles through three states:\n", + "- **Unsorted (no indicator by default, ● on hover):** The default state. Click the header to sort in ascending order.\n", + "- **Ascending (▲):** The data is sorted from smallest to largest. Click again to sort in descending order.\n", + "- **Descending (▼):** The data is sorted from largest to smallest. Click again to return to the unsorted state." ] }, { @@ -533,7 +540,7 @@ { "data": { "text/html": [ - "\u2705 Completed. " + "✅ Completed. " ], "text/plain": [ "" @@ -545,7 +552,7 @@ { "data": { "text/html": [ - "\u2705 Completed. " + "✅ Completed. " ], "text/plain": [ "" @@ -564,12 +571,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "13b063f7ea74473eb18de270c48c6417", + "model_id": "95aa6f1c585d49aea2e577c04aa1404b", "version_major": 2, "version_minor": 1 }, "text/plain": [ - "" + "" ] }, "execution_count": 8, @@ -644,7 +651,7 @@ { "data": { "text/html": [ - "\u2705 Completed. \n", + "✅ Completed. \n", " Query processed 171.4 MB in a moment of slot time.\n", " " ], @@ -658,7 +665,7 @@ { "data": { "text/html": [ - "\u2705 Completed. \n", + "✅ Completed. \n", " Query processed 0 Bytes in a moment of slot time.\n", " " ], @@ -679,12 +686,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "0918149d2d734296afb3243f283eb2d3", + "model_id": "e385c162fcf24f05a0e993009cbd8d04", "version_major": 2, "version_minor": 1 }, "text/plain": [ - "" + "" ] }, "execution_count": 10, @@ -727,8 +734,8 @@ { "data": { "text/html": [ - "\u2705 Completed. \n", - " Query processed 85.9 kB in 24 seconds of slot time.\n", + "✅ Completed. \n", + " Query processed 85.9 kB in 13 seconds of slot time.\n", " " ], "text/plain": [ @@ -751,7 +758,7 @@ { "data": { "text/html": [ - "\u2705 Completed. " + "✅ Completed. " ], "text/plain": [ "" @@ -763,7 +770,7 @@ { "data": { "text/html": [ - "\u2705 Completed. " + "✅ Completed. 
" ], "text/plain": [ "" @@ -789,7 +796,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "9543a0ef6eb744f480e49d4876c31b84", + "model_id": "050763bb5227403bb729ebacd1888193", "version_major": 2, "version_minor": 1 }, @@ -836,20 +843,38 @@ " gs://gcs-public-data--labeled-patents/espacene...\n", " EU\n", " DE\n", + " 03.10.2018\n", + " H05B 6/12\n", + " <NA>\n", + " 18165514.3\n", + " 03.04.2018\n", + " 30.03.2017\n", + " <NA>\n", + " BSH Hausger√§te GmbH\n", + " Acero Acero, Jesus\n", + " VORRICHTUNG ZUR INDUKTIVEN ENERGIE√úBERTRAGUNG\n", + " EP 3 383 141 A2\n", + " \n", + " \n", + " 1\n", + " {'application_number': None, 'class_internatio...\n", + " gs://gcs-public-data--labeled-patents/espacene...\n", + " EU\n", + " DE\n", " 29.08.018\n", " E04H 6/12\n", " <NA>\n", " 18157874.1\n", " 21.02.2018\n", " 22.02.2017\n", - " Liedtke & Partner Patentanw\u221a\u00a7lte\n", + " Liedtke & Partner Patentanw√§lte\n", " SHB Hebezeugbau GmbH\n", " VOLGER, Alexander\n", - " STEUERUNGSSYSTEM F\u221a\u00faR AUTOMATISCHE PARKH\u221a\u00d1USER\n", + " STEUERUNGSSYSTEM F√úR AUTOMATISCHE PARKH√ÑUSER\n", " EP 3 366 869 A1\n", " \n", " \n", - " 1\n", + " 2\n", " {'application_number': None, 'class_internatio...\n", " gs://gcs-public-data--labeled-patents/espacene...\n", " EU\n", @@ -867,7 +892,7 @@ " EP 3 382 744 A1\n", " \n", " \n", - " 2\n", + " 3\n", " {'application_number': None, 'class_internatio...\n", " gs://gcs-public-data--labeled-patents/espacene...\n", " EU\n", @@ -885,7 +910,7 @@ " EP 3 382 553 A1\n", " \n", " \n", - " 3\n", + " 4\n", " {'application_number': None, 'class_internatio...\n", " gs://gcs-public-data--labeled-patents/espacene...\n", " EU\n", @@ -896,33 +921,15 @@ " 18171005.4\n", " 05.02.2015\n", " 05.02.2014\n", - " Stork Bamberger Patentanw\u221a\u00a7lte\n", + " Stork Bamberger Patentanw√§lte\n", " Linco Food Systems A/S\n", " Thrane, Uffe\n", - " MASTH\u221a\u00d1HNCHENCONTAINER ALS BESTANDTEIL EINER E...\n", + " MASTH√ÑHNCHENCONTAINER ALS BESTANDTEIL EINER E...\n", " EP 3 381 276 A1\n", " \n", - " \n", - " 4\n", - " {'application_number': None, 'class_internatio...\n", - " gs://gcs-public-data--labeled-patents/espacene...\n", - " EU\n", - " DE\n", - " 03.10.2018\n", - " H05B 6/12\n", - " <NA>\n", - " 18165514.3\n", - " 03.04.2018\n", - " 30.03.2017\n", - " <NA>\n", - " BSH Hausger\u221a\u00a7te GmbH\n", - " Acero Acero, Jesus\n", - " VORRICHTUNG ZUR INDUKTIVEN ENERGIE\u221a\u00faBERTRAGUNG\n", - " EP 3 383 141 A2\n", - " \n", " \n", "\n", - "

5 rows \u00d7 15 columns

\n", + "

5 rows × 15 columns

\n", "[5 rows x 15 columns in total]" ], "text/plain": [ @@ -941,32 +948,32 @@ "4 gs://gcs-public-data--labeled-patents/espacene... EU DE \n", "\n", " publication_date class_international class_us application_number \\\n", - "0 29.08.018 E04H 6/12 18157874.1 \n", - "1 03.10.2018 H01L 21/20 18166536.5 \n", - "2 03.10.2018 G06F 11/30 18157347.8 \n", - "3 03.10.2018 A01K 31/00 18171005.4 \n", - "4 03.10.2018 H05B 6/12 18165514.3 \n", + "0 03.10.2018 H05B 6/12 18165514.3 \n", + "1 29.08.018 E04H 6/12 18157874.1 \n", + "2 03.10.2018 H01L 21/20 18166536.5 \n", + "3 03.10.2018 G06F 11/30 18157347.8 \n", + "4 03.10.2018 A01K 31/00 18171005.4 \n", "\n", " filing_date priority_date_eu representative_line_1_eu \\\n", - "0 21.02.2018 22.02.2017 Liedtke & Partner Patentanw\u221a\u00a7lte \n", - "1 16.02.2016 Scheider, Sascha et al \n", - "2 19.02.2018 31.03.2017 Hoffmann Eitle \n", - "3 05.02.2015 05.02.2014 Stork Bamberger Patentanw\u221a\u00a7lte \n", - "4 03.04.2018 30.03.2017 \n", + "0 03.04.2018 30.03.2017 \n", + "1 21.02.2018 22.02.2017 Liedtke & Partner Patentanw√§lte \n", + "2 16.02.2016 Scheider, Sascha et al \n", + "3 19.02.2018 31.03.2017 Hoffmann Eitle \n", + "4 05.02.2015 05.02.2014 Stork Bamberger Patentanw√§lte \n", "\n", " applicant_line_1 inventor_line_1 \\\n", - "0 SHB Hebezeugbau GmbH VOLGER, Alexander \n", - "1 EV Group E. Thallner GmbH Kurz, Florian \n", - "2 FUJITSU LIMITED Kukihara, Kensuke \n", - "3 Linco Food Systems A/S Thrane, Uffe \n", - "4 BSH Hausger\u221a\u00a7te GmbH Acero Acero, Jesus \n", + "0 BSH Hausger√§te GmbH Acero Acero, Jesus \n", + "1 SHB Hebezeugbau GmbH VOLGER, Alexander \n", + "2 EV Group E. Thallner GmbH Kurz, Florian \n", + "3 FUJITSU LIMITED Kukihara, Kensuke \n", + "4 Linco Food Systems A/S Thrane, Uffe \n", "\n", " title_line_1 number \n", - "0 STEUERUNGSSYSTEM F\u221a\u00faR AUTOMATISCHE PARKH\u221a\u00d1USER EP 3 366 869 A1 \n", - "1 VORRICHTUNG ZUM BONDEN VON SUBSTRATEN EP 3 382 744 A1 \n", - "2 METHOD EXECUTED BY A COMPUTER, INFORMATION PRO... EP 3 382 553 A1 \n", - "3 MASTH\u221a\u00d1HNCHENCONTAINER ALS BESTANDTEIL EINER E... EP 3 381 276 A1 \n", - "4 VORRICHTUNG ZUR INDUKTIVEN ENERGIE\u221a\u00faBERTRAGUNG EP 3 383 141 A2 \n", + "0 VORRICHTUNG ZUR INDUKTIVEN ENERGIE√úBERTRAGUNG EP 3 383 141 A2 \n", + "1 STEUERUNGSSYSTEM F√úR AUTOMATISCHE PARKH√ÑUSER EP 3 366 869 A1 \n", + "2 VORRICHTUNG ZUM BONDEN VON SUBSTRATEN EP 3 382 744 A1 \n", + "3 METHOD EXECUTED BY A COMPUTER, INFORMATION PRO... EP 3 382 553 A1 \n", + "4 MASTH√ÑHNCHENCONTAINER ALS BESTANDTEIL EINER E... EP 3 381 276 A1 \n", "\n", "[5 rows x 15 columns]" ] @@ -1002,10 +1009,190 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "nested_code", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "✅ Completed. \n", + " Query processed 0 Bytes in a moment of slot time.\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "✅ Completed. " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "✅ Completed. " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "45fc82b556e046baadd065991330ad7f", + "version_major": 2, + "version_minor": 1 + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idstruct_colarray_colnested_struct_array
01{'name': 'Alice', 'age': 30}[10 20 30][{'item': 'A', 'value': 100} {'item': 'B', 'va...
12{'name': 'Bob', 'age': 25}[40 50][{'item': 'C', 'value': 300}]
23{'name': 'Charlie', 'age': 35}[60 70 80][{'item': 'D', 'value': 400} {'item': 'E', 'va...
34{'name': 'David', 'age': 40}[ 90 100 110][{'item': 'F', 'value': 600} {'item': 'G', 'va...
45{'name': 'Eve', 'age': 45}[120 130 140][{'item': 'H', 'value': 800} {'item': 'I', 'va...
56{'name': 'Frank', 'age': 50}[150 160 170][{'item': 'J', 'value': 1000} {'item': 'K', 'v...
67{'name': 'Grace', 'age': 55}[180 190][{'item': 'L', 'value': 1200}]
78{'name': 'Heidi', 'age': 60}[200 210 220][{'item': 'M', 'value': 1300} {'item': 'N', 'v...
89{'name': 'Ivan', 'age': 65}[230 240 250 260][{'item': 'O', 'value': 1500} {'item': 'P', 'v...
910{'name': 'Judy', 'age': 70}[270 280][{'item': 'Q', 'value': 1700}]
\n", + "

10 rows × 4 columns

\n", + "
[12 rows x 4 columns in total]" + ], + "text/plain": [ + " id struct_col array_col \\\n", + "0 1 {'name': 'Alice', 'age': 30} [10 20 30] \n", + "1 2 {'name': 'Bob', 'age': 25} [40 50] \n", + "2 3 {'name': 'Charlie', 'age': 35} [60 70 80] \n", + "3 4 {'name': 'David', 'age': 40} [ 90 100 110] \n", + "4 5 {'name': 'Eve', 'age': 45} [120 130 140] \n", + "5 6 {'name': 'Frank', 'age': 50} [150 160 170] \n", + "6 7 {'name': 'Grace', 'age': 55} [180 190] \n", + "7 8 {'name': 'Heidi', 'age': 60} [200 210 220] \n", + "8 9 {'name': 'Ivan', 'age': 65} [230 240 250 260] \n", + "9 10 {'name': 'Judy', 'age': 70} [270 280] \n", + "\n", + " nested_struct_array \n", + "0 [{'item': 'A', 'value': 100} {'item': 'B', 'va... \n", + "1 [{'item': 'C', 'value': 300}] \n", + "2 [{'item': 'D', 'value': 400} {'item': 'E', 'va... \n", + "3 [{'item': 'F', 'value': 600} {'item': 'G', 'va... \n", + "4 [{'item': 'H', 'value': 800} {'item': 'I', 'va... \n", + "5 [{'item': 'J', 'value': 1000} {'item': 'K', 'v... \n", + "6 [{'item': 'L', 'value': 1200}] \n", + "7 [{'item': 'M', 'value': 1300} {'item': 'N', 'v... \n", + "8 [{'item': 'O', 'value': 1500} {'item': 'P', 'v... \n", + "9 [{'item': 'Q', 'value': 1700}] \n", + "...\n", + "\n", + "[12 rows x 4 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "sql_nested_data = \"\"\"\n", "SELECT\n", From 3affd924c084ee956746de1043e8eb3da02dbbed Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 29 Dec 2025 22:06:13 +0000 Subject: [PATCH 06/29] refactor: resue pandas struct.explode() --- bigframes/display/_flatten.py | 135 ++++++------- notebooks/dataframes/anywidget_mode.ipynb | 225 +++++++++++----------- 2 files changed, 179 insertions(+), 181 deletions(-) diff --git a/bigframes/display/_flatten.py b/bigframes/display/_flatten.py index 07c993a925..004a16c687 100644 --- a/bigframes/display/_flatten.py +++ b/bigframes/display/_flatten.py @@ -16,7 +16,7 @@ from __future__ import annotations -from typing import Callable, cast +from typing import cast import pandas as pd import pyarrow as pa @@ -39,54 +39,12 @@ def flatten_nested_data( nested_originated_columns, ) = _classify_columns(result_df) - # Flatten ARRAY of STRUCT columns - def update_array_columns(col_name: str, new_col_names: list[str]) -> None: - array_columns.remove(col_name) - array_columns.extend(new_col_names) - - def create_list_series( - original_arr: pa.Array, field_arr: pa.Array, index: pd.Index, field: pa.Field - ) -> pd.Series: - new_list_array = pa.ListArray.from_arrays( - original_arr.offsets, field_arr, mask=original_arr.is_null() - ) - return pd.Series( - new_list_array.to_pylist(), - dtype=pd.ArrowDtype(pa.list_(field.type)), - index=index, - ) - - result_df = _flatten_and_replace_columns( - result_df, - array_of_struct_columns, - nested_originated_columns, - get_struct_type=lambda t: t.value_type, - get_field_values=lambda arr: arr.values.flatten(), - create_series=create_list_series, - update_metadata=update_array_columns, + result_df, array_columns = _flatten_array_of_struct_columns( + result_df, array_of_struct_columns, array_columns, nested_originated_columns ) - # Flatten regular STRUCT columns - def update_clear_on_continuation(col_name: str, new_col_names: list[str]) -> None: - clear_on_continuation_cols.extend(new_col_names) - - def create_struct_series( - original_arr: pa.Array, field_arr: pa.Array, index: pd.Index, field: pa.Field - ) -> pd.Series: - return pd.Series( - field_arr.to_pylist(), - dtype=pd.ArrowDtype(field.type), - 
index=index, - ) - - result_df = _flatten_and_replace_columns( - result_df, - struct_columns, - nested_originated_columns, - get_struct_type=lambda t: t, - get_field_values=lambda arr: arr.flatten(), - create_series=create_struct_series, - update_metadata=update_clear_on_continuation, + result_df, clear_on_continuation_cols = _flatten_struct_columns( + result_df, struct_columns, clear_on_continuation_cols, nested_originated_columns ) # Now handle ARRAY columns (including the newly created ones from ARRAY of STRUCT) @@ -146,36 +104,45 @@ def _classify_columns( ) -def _flatten_and_replace_columns( +def _flatten_array_of_struct_columns( dataframe: pd.DataFrame, - columns: list[str], + array_of_struct_columns: list[str], + array_columns: list[str], nested_originated_columns: set[str], - get_struct_type: Callable[[pa.DataType], pa.DataType], - get_field_values: Callable[[pa.Array], list[pa.Array]], - create_series: Callable[[pa.Array, pa.Array, pd.Index, pa.Field], pd.Series], - update_metadata: Callable[[str, list[str]], None], -) -> pd.DataFrame: - """Generic helper to flatten structure-like columns and replace them in the DataFrame.""" +) -> tuple[pd.DataFrame, list[str]]: + """Flatten ARRAY of STRUCT columns into separate array columns for each field.""" result_df = dataframe.copy() - for col_name in columns: + for col_name in array_of_struct_columns: col_data = result_df[col_name] pa_type = cast(pd.ArrowDtype, col_data.dtype).pyarrow_dtype - struct_type = get_struct_type(pa_type) + struct_type = pa_type.value_type + # Use PyArrow to reshape the list into multiple list arrays arrow_array = pa.array(col_data) - flattened_fields = get_field_values(arrow_array) + offsets = arrow_array.offsets + values = arrow_array.values # StructArray + flattened_fields = values.flatten() # List[Array] new_cols_to_add = {} - new_col_names = [] + new_array_col_names = [] + # Create new columns for each struct field for field_idx in range(struct_type.num_fields): field = struct_type.field(field_idx) new_col_name = f"{col_name}.{field.name}" nested_originated_columns.add(new_col_name) - new_col_names.append(new_col_name) + new_array_col_names.append(new_col_name) - new_cols_to_add[new_col_name] = create_series( - arrow_array, flattened_fields[field_idx], result_df.index, field + # Reconstruct ListArray for this field + # Use mask=arrow_array.is_null() to preserve nulls from the original list + new_list_array = pa.ListArray.from_arrays( + offsets, flattened_fields[field_idx], mask=arrow_array.is_null() + ) + + new_cols_to_add[new_col_name] = pd.Series( + new_list_array.to_pylist(), + dtype=pd.ArrowDtype(pa.list_(field.type)), + index=result_df.index, ) col_idx = result_df.columns.to_list().index(col_name) @@ -190,9 +157,11 @@ def _flatten_and_replace_columns( axis=1, ) - update_metadata(col_name, new_col_names) - - return result_df + # Update array_columns list + array_columns.remove(col_name) + # Add the new array columns + array_columns.extend(new_array_col_names) + return result_df, array_columns def _explode_array_columns( @@ -271,3 +240,39 @@ def _explode_array_columns( return exploded_df, array_row_groups else: return dataframe, array_row_groups + + +def _flatten_struct_columns( + dataframe: pd.DataFrame, + struct_columns: list[str], + clear_on_continuation_cols: list[str], + nested_originated_columns: set[str], +) -> tuple[pd.DataFrame, list[str]]: + """Flatten regular STRUCT columns using pandas accessor.""" + result_df = dataframe.copy() + for col_name in struct_columns: + # Use pandas struct accessor to 
explode the struct column into a DataFrame of its fields + exploded_struct = result_df[col_name].struct.explode() + + # Rename columns to 'parent.child' format + exploded_struct.columns = [ + f"{col_name}.{sub_col}" for sub_col in exploded_struct.columns + ] + + # Update metadata + for new_col in exploded_struct.columns: + nested_originated_columns.add(new_col) + clear_on_continuation_cols.append(new_col) + + # Replace the original struct column with the new field columns + col_idx = result_df.columns.to_list().index(col_name) + result_df = pd.concat( + [ + result_df.iloc[:, :col_idx], + exploded_struct, + result_df.iloc[:, col_idx + 1 :], + ], + axis=1, + ) + + return result_df, clear_on_continuation_cols diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index 9ff923b346..2fcc5d09b1 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -118,17 +118,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "state gender year name number\n", - " AL F 1910 Hazel 51\n", - " AL F 1910 Lucy 76\n", - " AR F 1910 Nellie 39\n", - " AR F 1910 Lena 40\n", - " CO F 1910 Thelma 36\n", - " CO F 1910 Ruth 68\n", - " CT F 1910 Elizabeth 86\n", - " DC F 1910 Mary 80\n", - " FL F 1910 Annie 101\n", - " FL F 1910 Alma 39\n", + "state gender year name number\n", + " AL F 1910 Sadie 40\n", + " AL F 1910 Mary 875\n", + " AR F 1910 Vera 39\n", + " AR F 1910 Marie 78\n", + " AR F 1910 Lucille 66\n", + " CA F 1910 Virginia 101\n", + " DC F 1910 Margaret 72\n", + " GA F 1910 Mildred 133\n", + " GA F 1910 Vera 51\n", + " GA F 1910 Sallie 92\n", "...\n", "\n", "[5552452 rows x 5 columns]\n" @@ -173,7 +173,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "d281fbe99c9747ee9187057bdac9a33c", + "model_id": "3a5ef3e2dfa64533ba0252bc29b9ce7b", "version_major": 2, "version_minor": 1 }, @@ -209,80 +209,80 @@ " AL\n", " F\n", " 1910\n", - " Annie\n", - " 482\n", + " Hazel\n", + " 51\n", " \n", " \n", " 1\n", " AL\n", " F\n", " 1910\n", - " Myrtle\n", - " 104\n", + " Lucy\n", + " 76\n", " \n", " \n", " 2\n", " AR\n", " F\n", " 1910\n", - " Lillian\n", - " 56\n", + " Nellie\n", + " 39\n", " \n", " \n", " 3\n", - " CT\n", + " AR\n", " F\n", " 1910\n", - " Anne\n", - " 38\n", + " Lena\n", + " 40\n", " \n", " \n", " 4\n", - " CT\n", + " CO\n", " F\n", " 1910\n", - " Frances\n", - " 45\n", + " Thelma\n", + " 36\n", " \n", " \n", " 5\n", - " FL\n", + " CO\n", " F\n", " 1910\n", - " Margaret\n", - " 53\n", + " Ruth\n", + " 68\n", " \n", " \n", " 6\n", - " GA\n", + " CT\n", " F\n", " 1910\n", - " Mae\n", - " 73\n", + " Elizabeth\n", + " 86\n", " \n", " \n", " 7\n", - " GA\n", + " DC\n", " F\n", " 1910\n", - " Beatrice\n", - " 96\n", + " Mary\n", + " 80\n", " \n", " \n", " 8\n", - " GA\n", + " FL\n", " F\n", " 1910\n", - " Lola\n", - " 47\n", + " Annie\n", + " 101\n", " \n", " \n", " 9\n", - " IA\n", + " FL\n", " F\n", " 1910\n", - " Viola\n", - " 49\n", + " Alma\n", + " 39\n", " \n", " \n", "\n", @@ -290,17 +290,17 @@ "[5552452 rows x 5 columns in total]" ], "text/plain": [ - "state gender year name number\n", - " AL F 1910 Annie 482\n", - " AL F 1910 Myrtle 104\n", - " AR F 1910 Lillian 56\n", - " CT F 1910 Anne 38\n", - " CT F 1910 Frances 45\n", - " FL F 1910 Margaret 53\n", - " GA F 1910 Mae 73\n", - " GA F 1910 Beatrice 96\n", - " GA F 1910 Lola 47\n", - " IA F 1910 Viola 49\n", + "state gender year name number\n", + " AL F 1910 Hazel 51\n", + " AL F 1910 Lucy 76\n", + " AR F 1910 Nellie 39\n", + " 
AR F 1910 Lena 40\n", + " CO F 1910 Thelma 36\n", + " CO F 1910 Ruth 68\n", + " CT F 1910 Elizabeth 86\n", + " DC F 1910 Mary 80\n", + " FL F 1910 Annie 101\n", + " FL F 1910 Alma 39\n", "...\n", "\n", "[5552452 rows x 5 columns]" @@ -326,7 +326,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 6, "id": "42bb02ab", "metadata": {}, "outputs": [ @@ -334,7 +334,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 215.9 MB in 54 seconds of slot time. [Job bigframes-dev:US.bf668ba0-3b44-4e6a-8e62-ae9e1a518994 details]\n", + " Query processed 171.4 MB in 34 seconds of slot time. [Job bigframes-dev:US.6b7e5580-9e47-4cbb-9289-87c78f9b0cc9 details]\n", " " ], "text/plain": [ @@ -407,22 +407,15 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 7, "id": "da23e0f3", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "\n", - " Query started with request ID bigframes-dev:US.dad903a4-20b1-419f-913f-af083fe054cc.
SQL
SELECT\n",
-       "`year` AS `year`\n",
-       "FROM\n",
-       "(SELECT\n",
-       "  `t0`.`year`,\n",
-       "  `t0`.`bfuid_col_7` AS `bfuid_col_11`\n",
-       "FROM `bigframes-dev._8b037bfb7316dddf9d92b12dcf93e008906bfe52._e20e6e5e_d279_4a57_8eda_5df7d62bffae_bqdf_b6aa24ab-7544-4d8f-8418-b12c0ff9f902` AS `t0`)\n",
-       "ORDER BY `bfuid_col_11` ASC NULLS LAST
\n", + "✅ Completed. \n", + " Query processed 88.8 MB in 2 seconds of slot time. [Job bigframes-dev:US.job_RYykRY9utr_s5uce1hFnIux2igsA details]\n", " " ], "text/plain": [ @@ -436,7 +429,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 88.8 MB in 4 seconds of slot time. [Job bigframes-dev:US.job_NmHwc2dPU4mfNGVh01R71WPTivjp details]\n", + " Query processed 88.8 MB in 2 seconds of slot time. [Job bigframes-dev:US.job_IRQ3km4r-Rm222hBk5RKPoFNmydR details]\n", " " ], "text/plain": [ @@ -449,7 +442,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "8f93406a94744a17ba97a5aeb88cb1af", + "model_id": "62884c278ad742bd930d92c414cfa7bd", "version_major": 2, "version_minor": 1 }, @@ -482,7 +475,7 @@ "[5552452 rows]" ] }, - "execution_count": 15, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -540,7 +533,9 @@ { "data": { "text/html": [ - "✅ Completed. " + "✅ Completed. \n", + " Query processed 215.9 MB in 9 seconds of slot time. [Job bigframes-dev:US.job_mMuMY4SFeUPkt8T09zT5Ok5ndF_y details]\n", + " " ], "text/plain": [ "" @@ -552,7 +547,9 @@ { "data": { "text/html": [ - "✅ Completed. " + "✅ Completed. \n", + " Query processed 215.9 MB in 12 seconds of slot time. [Job bigframes-dev:US.job_xdJHd_JzPSvDWtDZ8b89jfhvX-LF details]\n", + " " ], "text/plain": [ "" @@ -571,12 +568,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "95aa6f1c585d49aea2e577c04aa1404b", + "model_id": "3aaa2d812bd243d28703e2a626e01446", "version_major": 2, "version_minor": 1 }, "text/plain": [ - "" + "" ] }, "execution_count": 8, @@ -652,7 +649,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 171.4 MB in a moment of slot time.\n", + " Query processed 215.9 MB in a moment of slot time.\n", " " ], "text/plain": [ @@ -666,7 +663,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 0 Bytes in a moment of slot time.\n", + " Query processed 215.9 MB in a moment of slot time.\n", " " ], "text/plain": [ @@ -686,12 +683,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e385c162fcf24f05a0e993009cbd8d04", + "model_id": "afa2f7df47de4dabab331e76904f3f4e", "version_major": 2, "version_minor": 1 }, "text/plain": [ - "" + "" ] }, "execution_count": 10, @@ -735,7 +732,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 85.9 kB in 13 seconds of slot time.\n", + " Query processed 85.9 kB in 57 seconds of slot time. 
[Job bigframes-dev:US.job_3HjkRk3xV696BuVfOuIv6d3LDoo4 details]\n", " " ], "text/plain": [ @@ -783,10 +780,6 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:987: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:987: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", @@ -796,7 +789,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "050763bb5227403bb729ebacd1888193", + "model_id": "7761fee72f124a3788fa732714fa2925", "version_major": 2, "version_minor": 1 }, @@ -933,47 +926,47 @@ "[5 rows x 15 columns in total]" ], "text/plain": [ - " result \\\n", - "0 {'application_number': None, 'class_internatio... \n", - "1 {'application_number': None, 'class_internatio... \n", - "2 {'application_number': None, 'class_internatio... \n", - "3 {'application_number': None, 'class_internatio... \n", - "4 {'application_number': None, 'class_internatio... \n", + " result \\\n", + "{'application_number': None, 'class_internation... \n", + "{'application_number': None, 'class_internation... \n", + "{'application_number': None, 'class_internation... \n", + "{'application_number': None, 'class_internation... \n", + "{'application_number': None, 'class_internation... \n", "\n", - " gcs_path issuer language \\\n", - "0 gs://gcs-public-data--labeled-patents/espacene... EU DE \n", - "1 gs://gcs-public-data--labeled-patents/espacene... EU DE \n", - "2 gs://gcs-public-data--labeled-patents/espacene... EU DE \n", - "3 gs://gcs-public-data--labeled-patents/espacene... EU DE \n", - "4 gs://gcs-public-data--labeled-patents/espacene... EU DE \n", + " gcs_path issuer language \\\n", + "gs://gcs-public-data--labeled-patents/espacenet... EU DE \n", + "gs://gcs-public-data--labeled-patents/espacenet... EU DE \n", + "gs://gcs-public-data--labeled-patents/espacenet... EU DE \n", + "gs://gcs-public-data--labeled-patents/espacenet... EU DE \n", + "gs://gcs-public-data--labeled-patents/espacenet... 
EU DE \n", "\n", - " publication_date class_international class_us application_number \\\n", - "0 03.10.2018 H05B 6/12 18165514.3 \n", - "1 29.08.018 E04H 6/12 18157874.1 \n", - "2 03.10.2018 H01L 21/20 18166536.5 \n", - "3 03.10.2018 G06F 11/30 18157347.8 \n", - "4 03.10.2018 A01K 31/00 18171005.4 \n", + "publication_date class_international class_us application_number filing_date \\\n", + " 03.10.2018 H05B 6/12 18165514.3 03.04.2018 \n", + " 29.08.018 E04H 6/12 18157874.1 21.02.2018 \n", + " 03.10.2018 H01L 21/20 18166536.5 16.02.2016 \n", + " 03.10.2018 G06F 11/30 18157347.8 19.02.2018 \n", + " 03.10.2018 A01K 31/00 18171005.4 05.02.2015 \n", "\n", - " filing_date priority_date_eu representative_line_1_eu \\\n", - "0 03.04.2018 30.03.2017 \n", - "1 21.02.2018 22.02.2017 Liedtke & Partner Patentanw√§lte \n", - "2 16.02.2016 Scheider, Sascha et al \n", - "3 19.02.2018 31.03.2017 Hoffmann Eitle \n", - "4 05.02.2015 05.02.2014 Stork Bamberger Patentanw√§lte \n", + "priority_date_eu representative_line_1_eu applicant_line_1 \\\n", + " 30.03.2017 BSH Hausger√§te GmbH \n", + " 22.02.2017 Liedtke & Partner Patentanw√§lte SHB Hebezeugbau GmbH \n", + " Scheider, Sascha et al EV Group E. Thallner GmbH \n", + " 31.03.2017 Hoffmann Eitle FUJITSU LIMITED \n", + " 05.02.2014 Stork Bamberger Patentanw√§lte Linco Food Systems A/S \n", "\n", - " applicant_line_1 inventor_line_1 \\\n", - "0 BSH Hausger√§te GmbH Acero Acero, Jesus \n", - "1 SHB Hebezeugbau GmbH VOLGER, Alexander \n", - "2 EV Group E. Thallner GmbH Kurz, Florian \n", - "3 FUJITSU LIMITED Kukihara, Kensuke \n", - "4 Linco Food Systems A/S Thrane, Uffe \n", + " inventor_line_1 title_line_1 \\\n", + "Acero Acero, Jesus VORRICHTUNG ZUR INDUKTIVEN ENERGIE√úBERTRAGUNG \n", + " VOLGER, Alexander STEUERUNGSSYSTEM F√úR AUTOMATISCHE PARKH√ÑUSER \n", + " Kurz, Florian VORRICHTUNG ZUM BONDEN VON SUBSTRATEN \n", + " Kukihara, Kensuke METHOD EXECUTED BY A COMPUTER, INFORMATION PROC... \n", + " Thrane, Uffe MASTH√ÑHNCHENCONTAINER ALS BESTANDTEIL EINER EI... \n", "\n", - " title_line_1 number \n", - "0 VORRICHTUNG ZUR INDUKTIVEN ENERGIE√úBERTRAGUNG EP 3 383 141 A2 \n", - "1 STEUERUNGSSYSTEM F√úR AUTOMATISCHE PARKH√ÑUSER EP 3 366 869 A1 \n", - "2 VORRICHTUNG ZUM BONDEN VON SUBSTRATEN EP 3 382 744 A1 \n", - "3 METHOD EXECUTED BY A COMPUTER, INFORMATION PRO... EP 3 382 553 A1 \n", - "4 MASTH√ÑHNCHENCONTAINER ALS BESTANDTEIL EINER E... 
EP 3 381 276 A1 \n", + " number \n", + "EP 3 383 141 A2 \n", + "EP 3 366 869 A1 \n", + "EP 3 382 744 A1 \n", + "EP 3 382 553 A1 \n", + "EP 3 381 276 A1 \n", "\n", "[5 rows x 15 columns]" ] @@ -1054,7 +1047,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "45fc82b556e046baadd065991330ad7f", + "model_id": "7e22609554864964b0167fdeccb414e3", "version_major": 2, "version_minor": 1 }, From c53da80727a51efa296b71cdd8772336a75a2804 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 29 Dec 2025 22:17:37 +0000 Subject: [PATCH 07/29] refactor: revert the refactor --- bigframes/display/_flatten.py | 37 ++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/bigframes/display/_flatten.py b/bigframes/display/_flatten.py index 004a16c687..e7f63777ae 100644 --- a/bigframes/display/_flatten.py +++ b/bigframes/display/_flatten.py @@ -248,31 +248,40 @@ def _flatten_struct_columns( clear_on_continuation_cols: list[str], nested_originated_columns: set[str], ) -> tuple[pd.DataFrame, list[str]]: - """Flatten regular STRUCT columns using pandas accessor.""" + """Flatten regular STRUCT columns.""" result_df = dataframe.copy() for col_name in struct_columns: - # Use pandas struct accessor to explode the struct column into a DataFrame of its fields - exploded_struct = result_df[col_name].struct.explode() + col_data = result_df[col_name] + if isinstance(col_data.dtype, pd.ArrowDtype): + pa_type = cast(pd.ArrowDtype, col_data.dtype).pyarrow_dtype + + # Use PyArrow to flatten the struct column without row iteration + # combine_chunks() ensures we have a single array if it was chunked + arrow_array = pa.array(col_data) + flattened_fields = arrow_array.flatten() - # Rename columns to 'parent.child' format - exploded_struct.columns = [ - f"{col_name}.{sub_col}" for sub_col in exploded_struct.columns - ] + new_cols_to_add = {} + for field_idx in range(pa_type.num_fields): + field = pa_type.field(field_idx) + new_col_name = f"{col_name}.{field.name}" + nested_originated_columns.add(new_col_name) + clear_on_continuation_cols.append(new_col_name) - # Update metadata - for new_col in exploded_struct.columns: - nested_originated_columns.add(new_col) - clear_on_continuation_cols.append(new_col) + # Create a new Series from the flattened array + new_cols_to_add[new_col_name] = pd.Series( + flattened_fields[field_idx].to_pylist(), + dtype=pd.ArrowDtype(field.type), + index=result_df.index, + ) - # Replace the original struct column with the new field columns col_idx = result_df.columns.to_list().index(col_name) + new_cols_df = pd.DataFrame(new_cols_to_add, index=result_df.index) result_df = pd.concat( [ result_df.iloc[:, :col_idx], - exploded_struct, + new_cols_df, result_df.iloc[:, col_idx + 1 :], ], axis=1, ) - return result_df, clear_on_continuation_cols From 60785f3a185db4bfd009a3c2967caf9b42986f05 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 2 Jan 2026 21:26:47 +0000 Subject: [PATCH 08/29] test: merge notebook --- notebooks/dataframes/anywidget_mode.ipynb | 382 ++++++++++------------ 1 file changed, 181 insertions(+), 201 deletions(-) diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index a57a82784a..65a378f9f7 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -35,7 +35,15 @@ "execution_count": 2, "id": "ca22f059", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Unable to set 
__version__, run `pip install -e .` or `python setup.py develop` first.\n" + ] + } + ], "source": [ "import bigframes.pandas as bpd" ] @@ -118,17 +126,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "state gender year name number\n", - " AL F 1910 Sadie 40\n", - " AL F 1910 Mary 875\n", - " AR F 1910 Vera 39\n", - " AR F 1910 Marie 78\n", - " AR F 1910 Lucille 66\n", - " CA F 1910 Virginia 101\n", - " DC F 1910 Margaret 72\n", - " GA F 1910 Mildred 133\n", - " GA F 1910 Vera 51\n", - " GA F 1910 Sallie 92\n", + "state gender year name number\n", + " AL F 1910 Lillian 99\n", + " AL F 1910 Ruby 204\n", + " AL F 1910 Helen 76\n", + " AL F 1910 Eunice 41\n", + " AR F 1910 Dora 42\n", + " CA F 1910 Edna 62\n", + " CA F 1910 Helen 239\n", + " CO F 1910 Alice 46\n", + " FL F 1910 Willie 71\n", + " FL F 1910 Thelma 65\n", "...\n", "\n", "[5552452 rows x 5 columns]\n" @@ -173,7 +181,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "3a5ef3e2dfa64533ba0252bc29b9ce7b", + "model_id": "fdbca03dd2ff41bcb6c8c962a8a24aed", "version_major": 2, "version_minor": 1 }, @@ -209,80 +217,80 @@ " AL\n", " F\n", " 1910\n", - " Hazel\n", - " 51\n", + " Annie\n", + " 482\n", " \n", " \n", " 1\n", " AL\n", " F\n", " 1910\n", - " Lucy\n", - " 76\n", + " Myrtle\n", + " 104\n", " \n", " \n", " 2\n", " AR\n", " F\n", " 1910\n", - " Nellie\n", - " 39\n", + " Lillian\n", + " 56\n", " \n", " \n", " 3\n", - " AR\n", + " CT\n", " F\n", " 1910\n", - " Lena\n", - " 40\n", + " Anne\n", + " 38\n", " \n", " \n", " 4\n", - " CO\n", + " CT\n", " F\n", " 1910\n", - " Thelma\n", - " 36\n", + " Frances\n", + " 45\n", " \n", " \n", " 5\n", - " CO\n", + " FL\n", " F\n", " 1910\n", - " Ruth\n", - " 68\n", + " Margaret\n", + " 53\n", " \n", " \n", " 6\n", - " CT\n", + " GA\n", " F\n", " 1910\n", - " Elizabeth\n", - " 86\n", + " Mae\n", + " 73\n", " \n", " \n", " 7\n", - " DC\n", + " GA\n", " F\n", " 1910\n", - " Mary\n", - " 80\n", + " Beatrice\n", + " 96\n", " \n", " \n", " 8\n", - " FL\n", + " GA\n", " F\n", " 1910\n", - " Annie\n", - " 101\n", + " Lola\n", + " 47\n", " \n", " \n", " 9\n", - " FL\n", + " IA\n", " F\n", " 1910\n", - " Alma\n", - " 39\n", + " Viola\n", + " 49\n", " \n", " \n", "\n", @@ -290,17 +298,17 @@ "[5552452 rows x 5 columns in total]" ], "text/plain": [ - "state gender year name number\n", - " AL F 1910 Hazel 51\n", - " AL F 1910 Lucy 76\n", - " AR F 1910 Nellie 39\n", - " AR F 1910 Lena 40\n", - " CO F 1910 Thelma 36\n", - " CO F 1910 Ruth 68\n", - " CT F 1910 Elizabeth 86\n", - " DC F 1910 Mary 80\n", - " FL F 1910 Annie 101\n", - " FL F 1910 Alma 39\n", + "state gender year name number\n", + " AL F 1910 Annie 482\n", + " AL F 1910 Myrtle 104\n", + " AR F 1910 Lillian 56\n", + " CT F 1910 Anne 38\n", + " CT F 1910 Frances 45\n", + " FL F 1910 Margaret 53\n", + " GA F 1910 Mae 73\n", + " GA F 1910 Beatrice 96\n", + " GA F 1910 Lola 47\n", + " IA F 1910 Viola 49\n", "...\n", "\n", "[5552452 rows x 5 columns]" @@ -326,14 +334,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "42bb02ab", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "✅ Completed. " + "✅ Completed. \n", + " Query processed 171.4 MB in 28 seconds of slot time. [Job bigframes-dev:US.814d7716-440e-4df5-8c4d-9da6cc033b23 details]\n", + " " ], "text/plain": [ "" @@ -345,7 +355,9 @@ { "data": { "text/html": [ - "✅ Completed. " + "✅ Completed. 
\n", + " Query processed 88.8 MB in a moment of slot time.\n", + " " ], "text/plain": [ "" @@ -356,43 +368,35 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "004beca7d4034b498add8f9edd55027b", - "version_major": 2, - "version_minor": 1 - }, "text/html": [ - "
0    1910\n",
-       "1    1910\n",
-       "2    1910\n",
-       "3    1910\n",
-       "4    1910\n",
-       "5    1910\n",
-       "6    1910\n",
-       "7    1910\n",
-       "8    1910\n",
-       "9    1910

[5552452 rows]

" + "✅ Completed. " ], "text/plain": [ - "1910\n", - "1910\n", - "1910\n", - "1910\n", - "1910\n", - "1910\n", - "1910\n", - "1910\n", - "1910\n", - "1910\n", - "Name: year, dtype: Int64\n", - "...\n", - "\n", - "[5552452 rows]" + "" ] }, - "execution_count": 6, "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1910\n", + "1910\n", + "1910\n", + "1910\n", + "1910\n", + "1910\n", + "1910\n", + "1910\n", + "1910\n", + "1910\n", + "Name: year, dtype: Int64\n", + "...\n", + "\n", + "[5552452 rows]\n" + ] } ], "source": [ @@ -419,7 +423,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 88.8 MB in 2 seconds of slot time. [Job bigframes-dev:US.job_RYykRY9utr_s5uce1hFnIux2igsA details]\n", + " Query processed 88.8 MB in 2 seconds of slot time. [Job bigframes-dev:US.job_DI8qDij7y2WqsPLVZ418ThOjVOTe details]\n", " " ], "text/plain": [ @@ -433,7 +437,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 88.8 MB in 2 seconds of slot time. [Job bigframes-dev:US.job_IRQ3km4r-Rm222hBk5RKPoFNmydR details]\n", + " Query processed 88.8 MB in 2 seconds of slot time. [Job bigframes-dev:US.job_gETUk0MDyTx3-BEil5H1iayTXH4I details]\n", " " ], "text/plain": [ @@ -446,7 +450,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "1251df51c4ba44d0b93af07917888511", + "model_id": "4a652ba2aab8430da175895fd4d4e279", "version_major": 2, "version_minor": 1 }, @@ -545,7 +549,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 215.9 MB in 9 seconds of slot time. [Job bigframes-dev:US.job_mMuMY4SFeUPkt8T09zT5Ok5ndF_y details]\n", + " Query processed 215.9 MB in 8 seconds of slot time. [Job bigframes-dev:US.job_XxI3HVIuSXKhKHqxS-GYZ_Fse2w_ details]\n", " " ], "text/plain": [ @@ -559,7 +563,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 215.9 MB in 12 seconds of slot time. [Job bigframes-dev:US.job_xdJHd_JzPSvDWtDZ8b89jfhvX-LF details]\n", + " Query processed 215.9 MB in 7 seconds of slot time. [Job bigframes-dev:US.job_FBd28IEWqZEFT1lvmS0lOUCBtOhj details]\n", " " ], "text/plain": [ @@ -579,12 +583,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "45a462a3a42a445bb06d89132b7d0331", + "model_id": "cbcdaeca25634c7d93d2aed9fb7cc74b", "version_major": 2, "version_minor": 1 }, "text/plain": [ - "" + "" ] }, "execution_count": 8, @@ -659,27 +663,8 @@ { "data": { "text/html": [ - "\n", - " Query started with request ID bigframes-dev:US.a9f6b054-3709-49d6-8109-c325ffe07679.
SQL
SELECT\n",
-       "`state` AS `state`,\n",
-       "`gender` AS `gender`,\n",
-       "`year` AS `year`,\n",
-       "`name` AS `name`,\n",
-       "`number` AS `number`\n",
-       "FROM\n",
-       "(SELECT\n",
-       "  *\n",
-       "FROM (\n",
-       "  SELECT\n",
-       "    `state`,\n",
-       "    `gender`,\n",
-       "    `year`,\n",
-       "    `name`,\n",
-       "    `number`\n",
-       "  FROM `bigquery-public-data.usa_names.usa_1910_2013` FOR SYSTEM_TIME AS OF TIMESTAMP('2025-12-29T22:47:29.748716+00:00')\n",
-       ") AS `t0`)\n",
-       "ORDER BY `name` ASC NULLS LAST ,`year` ASC NULLS LAST ,`state` ASC NULLS LAST\n",
-       "LIMIT 5
\n", + "✅ Completed. \n", + " Query processed 215.9 MB in a moment of slot time.\n", " " ], "text/plain": [ @@ -713,12 +698,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "219f91f2341d42b8b96da795a79fc3e8", + "model_id": "e5ee622cc3ee447cb75e7698de50a53c", "version_major": 2, "version_minor": 1 }, "text/plain": [ - "" + "" ] }, "execution_count": 10, @@ -754,24 +739,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "added-cell-1", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "\n", - " Query started with request ID bigframes-dev:US.8819b8bd-6697-4c65-a8bc-c7a95a06fe8e.
SQL
\n",
-       "  SELECT\n",
-       "    AI.GENERATE(\n",
-       "      prompt=>("Extract the values.", OBJ.GET_ACCESS_URL(OBJ.FETCH_METADATA(OBJ.MAKE_REF(gcs_path, "us.conn")), "r")),\n",
-       "      connection_id=>"bigframes-dev.us.bigframes-default-connection",\n",
-       "      output_schema=>"publication_date string, class_international string, application_number string, filing_date string") AS result,\n",
-       "    *\n",
-       "  FROM `bigquery-public-data.labeled_patents.extracted_data`\n",
-       "  LIMIT 5;\n",
-       "
\n", + "✅ Completed. \n", + " Query processed 85.9 kB in 27 seconds of slot time.\n", " " ], "text/plain": [ @@ -819,6 +795,10 @@ "name": "stderr", "output_type": "stream", "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:987: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:987: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", @@ -828,7 +808,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "7761fee72f124a3788fa732714fa2925", + "model_id": "12b8bb97a94a495aabf36bb39386c527", "version_major": 2, "version_minor": 1 }, @@ -876,6 +856,24 @@ " EU\n", " DE\n", " 03.10.2018\n", + " G06F 11/30\n", + " <NA>\n", + " 18157347.8\n", + " 19.02.2018\n", + " 31.03.2017\n", + " Hoffmann Eitle\n", + " FUJITSU LIMITED\n", + " Kukihara, Kensuke\n", + " METHOD EXECUTED BY A COMPUTER, INFORMATION PRO...\n", + " EP 3 382 553 A1\n", + " \n", + " \n", + " 1\n", + " {'application_number': None, 'class_internatio...\n", + " gs://gcs-public-data--labeled-patents/espacene...\n", + " EU\n", + " DE\n", + " 03.10.2018\n", " H05B 6/12\n", " <NA>\n", " 18165514.3\n", @@ -888,24 +886,6 @@ " EP 3 383 141 A2\n", " \n", " \n", - " 1\n", - " {'application_number': None, 'class_internatio...\n", - " gs://gcs-public-data--labeled-patents/espacene...\n", - " EU\n", - " DE\n", - " 29.08.018\n", - " E04H 6/12\n", - " <NA>\n", - " 18157874.1\n", - " 21.02.2018\n", - " 22.02.2017\n", - " Liedtke & Partner Patentanw√§lte\n", - " SHB Hebezeugbau GmbH\n", - " VOLGER, Alexander\n", - " STEUERUNGSSYSTEM F√úR AUTOMATISCHE PARKH√ÑUSER\n", - " EP 3 366 869 A1\n", - " \n", - " \n", " 2\n", " {'application_number': None, 'class_internatio...\n", " gs://gcs-public-data--labeled-patents/espacene...\n", @@ -930,24 +910,6 @@ " EU\n", " DE\n", " 03.10.2018\n", - " G06F 11/30\n", - " <NA>\n", - " 18157347.8\n", - " 19.02.2018\n", - " 31.03.2017\n", - " Hoffmann Eitle\n", - " FUJITSU LIMITED\n", - " Kukihara, Kensuke\n", - " METHOD EXECUTED BY A COMPUTER, INFORMATION PRO...\n", - " EP 3 382 553 A1\n", - " \n", - " \n", - " 4\n", - " {'application_number': None, 'class_internatio...\n", - " gs://gcs-public-data--labeled-patents/espacene...\n", - " EU\n", - " DE\n", - " 03.10.2018\n", " A01K 31/00\n", " <NA>\n", " 18171005.4\n", @@ -959,53 +921,71 @@ " MASTH√ÑHNCHENCONTAINER ALS BESTANDTEIL EINER E...\n", " EP 3 381 276 A1\n", " \n", + " \n", + " 4\n", + " {'application_number': None, 'class_internatio...\n", + " gs://gcs-public-data--labeled-patents/espacene...\n", + " EU\n", + " DE\n", + " 29.08.018\n", + " E04H 6/12\n", + " <NA>\n", + " 18157874.1\n", + " 21.02.2018\n", + " 22.02.2017\n", + " Liedtke & Partner Patentanw√§lte\n", + " SHB Hebezeugbau GmbH\n", + " VOLGER, Alexander\n", + " STEUERUNGSSYSTEM F√úR AUTOMATISCHE PARKH√ÑUSER\n", + " EP 3 366 869 A1\n", + " \n", " \n", "\n", "

5 rows × 15 columns

\n", "[5 rows x 15 columns in total]" ], "text/plain": [ - " result \\\n", - "{'application_number': None, 'class_internation... \n", - "{'application_number': None, 'class_internation... \n", - "{'application_number': None, 'class_internation... \n", - "{'application_number': None, 'class_internation... \n", - "{'application_number': None, 'class_internation... \n", + " result \\\n", + "0 {'application_number': None, 'class_internatio... \n", + "1 {'application_number': None, 'class_internatio... \n", + "2 {'application_number': None, 'class_internatio... \n", + "3 {'application_number': None, 'class_internatio... \n", + "4 {'application_number': None, 'class_internatio... \n", "\n", - " gcs_path issuer language \\\n", - "gs://gcs-public-data--labeled-patents/espacenet... EU DE \n", - "gs://gcs-public-data--labeled-patents/espacenet... EU DE \n", - "gs://gcs-public-data--labeled-patents/espacenet... EU DE \n", - "gs://gcs-public-data--labeled-patents/espacenet... EU DE \n", - "gs://gcs-public-data--labeled-patents/espacenet... EU DE \n", + " gcs_path issuer language \\\n", + "0 gs://gcs-public-data--labeled-patents/espacene... EU DE \n", + "1 gs://gcs-public-data--labeled-patents/espacene... EU DE \n", + "2 gs://gcs-public-data--labeled-patents/espacene... EU DE \n", + "3 gs://gcs-public-data--labeled-patents/espacene... EU DE \n", + "4 gs://gcs-public-data--labeled-patents/espacene... EU DE \n", "\n", - "publication_date class_international class_us application_number filing_date \\\n", - " 03.10.2018 H05B 6/12 18165514.3 03.04.2018 \n", - " 29.08.018 E04H 6/12 18157874.1 21.02.2018 \n", - " 03.10.2018 H01L 21/20 18166536.5 16.02.2016 \n", - " 03.10.2018 G06F 11/30 18157347.8 19.02.2018 \n", - " 03.10.2018 A01K 31/00 18171005.4 05.02.2015 \n", + " publication_date class_international class_us application_number \\\n", + "0 03.10.2018 G06F 11/30 18157347.8 \n", + "1 03.10.2018 H05B 6/12 18165514.3 \n", + "2 03.10.2018 H01L 21/20 18166536.5 \n", + "3 03.10.2018 A01K 31/00 18171005.4 \n", + "4 29.08.018 E04H 6/12 18157874.1 \n", "\n", - "priority_date_eu representative_line_1_eu applicant_line_1 \\\n", - " 30.03.2017 BSH Hausgeräte GmbH \n", - " 22.02.2017 Liedtke & Partner Patentanwälte SHB Hebezeugbau GmbH \n", - " Scheider, Sascha et al EV Group E. Thallner GmbH \n", - " 31.03.2017 Hoffmann Eitle FUJITSU LIMITED \n", - " 05.02.2014 Stork Bamberger Patentanwälte Linco Food Systems A/S \n", + " filing_date priority_date_eu representative_line_1_eu \\\n", + "0 19.02.2018 31.03.2017 Hoffmann Eitle \n", + "1 03.04.2018 30.03.2017 \n", + "2 16.02.2016 Scheider, Sascha et al \n", + "3 05.02.2015 05.02.2014 Stork Bamberger Patentanwälte \n", + "4 21.02.2018 22.02.2017 Liedtke & Partner Patentanwälte \n", "\n", - " inventor_line_1 title_line_1 \\\n", - "Acero Acero, Jesus VORRICHTUNG ZUR INDUKTIVEN ENERGIEÜBERTRAGUNG \n", - " VOLGER, Alexander STEUERUNGSSYSTEM FÜR AUTOMATISCHE PARKHÄUSER \n", - " Kurz, Florian VORRICHTUNG ZUM BONDEN VON SUBSTRATEN \n", - " Kukihara, Kensuke METHOD EXECUTED BY A COMPUTER, INFORMATION PROC... \n", - " Thrane, Uffe MASTHÄHNCHENCONTAINER ALS BESTANDTEIL EINER EI... \n", + " applicant_line_1 inventor_line_1 \\\n", + "0 FUJITSU LIMITED Kukihara, Kensuke \n", + "1 BSH Hausgeräte GmbH Acero Acero, Jesus \n", + "2 EV Group E. 
Thallner GmbH Kurz, Florian \n", + "3 Linco Food Systems A/S Thrane, Uffe \n", + "4 SHB Hebezeugbau GmbH VOLGER, Alexander \n", "\n", - " number \n", - "EP 3 383 141 A2 \n", - "EP 3 366 869 A1 \n", - "EP 3 382 744 A1 \n", - "EP 3 382 553 A1 \n", - "EP 3 381 276 A1 \n", + " title_line_1 number \n", + "0 METHOD EXECUTED BY A COMPUTER, INFORMATION PRO... EP 3 382 553 A1 \n", + "1 VORRICHTUNG ZUR INDUKTIVEN ENERGIEÜBERTRAGUNG EP 3 383 141 A2 \n", + "2 VORRICHTUNG ZUM BONDEN VON SUBSTRATEN EP 3 382 744 A1 \n", + "3 MASTHÄHNCHENCONTAINER ALS BESTANDTEIL EINER E... EP 3 381 276 A1 \n", + "4 STEUERUNGSSYSTEM FÜR AUTOMATISCHE PARKHÄUSER EP 3 366 869 A1 \n", "\n", "[5 rows x 15 columns]" ] @@ -1086,7 +1066,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "7e22609554864964b0167fdeccb414e3", + "model_id": "76b3ef8ead0949d782106e5a255d6e3c", "version_major": 2, "version_minor": 1 }, From f32a53fa0bbb69d57795b2935c6635be82de0308 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 6 Jan 2026 02:18:23 +0000 Subject: [PATCH 09/29] feat: use dataclass for flatten_nested_data --- bigframes/display/_flatten.py | 50 +++++++++++++++++++++++++-------- bigframes/display/html.py | 19 ++++++------- tests/unit/display/test_html.py | 9 ++++-- 3 files changed, 53 insertions(+), 25 deletions(-) diff --git a/bigframes/display/_flatten.py b/bigframes/display/_flatten.py index e7f63777ae..e25fc1d74a 100644 --- a/bigframes/display/_flatten.py +++ b/bigframes/display/_flatten.py @@ -16,18 +16,44 @@ from __future__ import annotations +import dataclasses from typing import cast import pandas as pd import pyarrow as pa +@dataclasses.dataclass(frozen=True) +class FlattenResult: + """The result of flattening a DataFrame.""" + + dataframe: pd.DataFrame + """The flattened DataFrame.""" + + row_groups: dict[str, list[int]] + """ + A mapping from original row index to the new row indices that were created + from it. 
+ """ + + cleared_on_continuation: list[str] + """A list of column names that should be cleared on continuation rows.""" + + nested_columns: set[str] + """A set of column names that were created from nested data.""" + + def flatten_nested_data( dataframe: pd.DataFrame, -) -> tuple[pd.DataFrame, dict[str, list[int]], list[str], set[str]]: +) -> FlattenResult: """Flatten nested STRUCT and ARRAY columns for display.""" if dataframe.empty: - return dataframe.copy(), {}, [], set() + return FlattenResult( + dataframe=dataframe.copy(), + row_groups={}, + cleared_on_continuation=[], + nested_columns=set(), + ) result_df = dataframe.copy() @@ -49,19 +75,19 @@ def flatten_nested_data( # Now handle ARRAY columns (including the newly created ones from ARRAY of STRUCT) if not array_columns: - return ( - result_df, - {}, - clear_on_continuation_cols, - nested_originated_columns, + return FlattenResult( + dataframe=result_df, + row_groups={}, + cleared_on_continuation=clear_on_continuation_cols, + nested_columns=nested_originated_columns, ) result_df, array_row_groups = _explode_array_columns(result_df, array_columns) - return ( - result_df, - array_row_groups, - clear_on_continuation_cols, - nested_originated_columns, + return FlattenResult( + dataframe=result_df, + row_groups=array_row_groups, + cleared_on_continuation=clear_on_continuation_cols, + nested_columns=nested_originated_columns, ) diff --git a/bigframes/display/html.py b/bigframes/display/html.py index 099693ffad..a92118f37f 100644 --- a/bigframes/display/html.py +++ b/bigframes/display/html.py @@ -49,23 +49,20 @@ def render_html( ) -> str: """Render a pandas DataFrame to HTML with specific styling and nested data support.""" # Flatten nested data first - ( - flattened_df, - array_row_groups, - clear_on_continuation, - nested_originated_columns, - ) = _flatten.flatten_nested_data(dataframe) + flatten_result = _flatten.flatten_nested_data(dataframe) orderable_columns = orderable_columns or [] classes = "dataframe table table-striped table-hover" table_html_parts = [f''] - table_html_parts.append(_render_table_header(flattened_df, orderable_columns)) + table_html_parts.append( + _render_table_header(flatten_result.dataframe, orderable_columns) + ) table_html_parts.append( _render_table_body( - flattened_df, - array_row_groups, - clear_on_continuation, - nested_originated_columns, + flatten_result.dataframe, + flatten_result.row_groups, + flatten_result.cleared_on_continuation, + flatten_result.nested_columns, ) ) table_html_parts.append("
") diff --git a/tests/unit/display/test_html.py b/tests/unit/display/test_html.py index c9bee32296..4ea880d9ba 100644 --- a/tests/unit/display/test_html.py +++ b/tests/unit/display/test_html.py @@ -165,7 +165,9 @@ def test_flatten_nested_data_flattens_structs(): } ) - flattened, _, _, nested_originated_columns = flatten_nested_data(struct_data) + result = flatten_nested_data(struct_data) + flattened = result.dataframe + nested_originated_columns = result.nested_columns assert "struct_col.name" in flattened.columns assert "struct_col.age" in flattened.columns @@ -186,7 +188,10 @@ def test_flatten_nested_data_explodes_arrays(): } ) - flattened, groups, _, nested_originated_columns = flatten_nested_data(array_data) + result = flatten_nested_data(array_data) + flattened = result.dataframe + groups = result.row_groups + nested_originated_columns = result.nested_columns assert len(flattened) == 5 # 3 + 2 array elements assert "0" in groups # First original row From 3944249f2cbeabc7a2a51daa2caea093d00d0207 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 6 Jan 2026 03:40:23 +0000 Subject: [PATCH 10/29] feat: Refactor HTML rendering and document JS tests --- bigframes/display/html.py | 77 ++-- tests/js/table_widget.test.js | 772 ++++++++++++++++++---------------- 2 files changed, 447 insertions(+), 402 deletions(-) diff --git a/bigframes/display/html.py b/bigframes/display/html.py index a92118f37f..2f347e6ed0 100644 --- a/bigframes/display/html.py +++ b/bigframes/display/html.py @@ -115,44 +115,59 @@ def _render_table_body( row = dataframe.iloc[i] for col_name, value in row.items(): - col_name_str = str(col_name) - if is_continuation and col_name_str in clear_on_continuation: - body_parts.append(" ") - continue dtype = dataframe.dtypes.loc[col_name] # type: ignore - - if col_name_str in nested_originated_columns: - align = "left" - else: - align = "right" if _is_dtype_numeric(dtype) else "left" - - cell_content = "" - if pandas.api.types.is_scalar(value) and pd.isna(value): - if is_continuation: - # For padding nulls in continuation rows, show empty cell - body_parts.append(f' ') - else: - # For primary nulls, keep showing the indicator but maybe styled - body_parts.append( - f' ' - '<NA>' - ) - continue - elif isinstance(value, float): - cell_content = f"{value:.{precision}f}" - else: - cell_content = str(value) - - # Use classes for alignment - body_parts.append( - f' ' - f"{html.escape(cell_content)}" + cell_html = _render_cell( + value, + dtype, + is_continuation, + str(col_name), + clear_on_continuation, + nested_originated_columns, + precision, ) + body_parts.append(cell_html) body_parts.append(" ") body_parts.append(" ") return "\n".join(body_parts) +def _render_cell( + value: Any, + dtype: Any, + is_continuation: bool, + col_name_str: str, + clear_on_continuation: list[str], + nested_originated_columns: set[str], + precision: int, +) -> str: + """Render a single cell of the HTML table.""" + if is_continuation and col_name_str in clear_on_continuation: + return " " + + if col_name_str in nested_originated_columns: + align = "left" + else: + align = "right" if _is_dtype_numeric(dtype) else "left" + + if pandas.api.types.is_scalar(value) and pd.isna(value): + if is_continuation: + # For padding nulls in continuation rows, show empty cell + return f' ' + else: + # For primary nulls, keep showing the indicator but maybe styled + return ( + f' ' + '<NA>' + ) + + if isinstance(value, float): + cell_content = f"{value:.{precision}f}" + else: + cell_content = str(value) + + return f' ' 
f"{html.escape(cell_content)}" + + def _obj_ref_rt_to_html(obj_ref_rt: str) -> str: obj_ref_rt_json = json.loads(obj_ref_rt) obj_ref_details = obj_ref_rt_json["objectref"]["details"] diff --git a/tests/js/table_widget.test.js b/tests/js/table_widget.test.js index b444a2d14e..136c00de6a 100644 --- a/tests/js/table_widget.test.js +++ b/tests/js/table_widget.test.js @@ -1,3 +1,7 @@ +/** + * @fileoverview Tests for the anywidget-based table widget. + */ + /* * Copyright 2025 Google LLC * @@ -14,238 +18,259 @@ * limitations under the License. */ -import { jest } from "@jest/globals"; -import { JSDOM } from "jsdom"; - -describe("TableWidget", () => { - let model; - let el; - let render; - - beforeEach(async () => { - jest.resetModules(); - document.body.innerHTML = "
"; - el = document.body.querySelector("div"); - - const tableWidget = ( - await import("../../bigframes/display/table_widget.js") - ).default; - render = tableWidget.render; - - model = { - get: jest.fn(), - set: jest.fn(), - save_changes: jest.fn(), - on: jest.fn(), - }; - }); - - it("should have a render function", () => { - expect(render).toBeDefined(); - }); - - describe("render", () => { - it("should create the basic structure", () => { - // Mock the initial state - model.get.mockImplementation((property) => { - if (property === "table_html") { - return ""; - } - if (property === "row_count") { - return 100; - } - if (property === "error_message") { - return null; - } - if (property === "page_size") { - return 10; - } - if (property === "page") { - return 0; - } - return null; - }); - - render({ model, el }); - - expect(el.classList.contains("bigframes-widget")).toBe(true); - expect(el.querySelector(".error-message")).not.toBeNull(); - expect(el.querySelector("div")).not.toBeNull(); - expect(el.querySelector("div:nth-child(3)")).not.toBeNull(); - }); - - it("should sort when a sortable column is clicked", () => { - // Mock the initial state - model.get.mockImplementation((property) => { - if (property === "table_html") { - return "
<table><thead><tr><th>col1</th></tr></thead></table>
"; - } - if (property === "orderable_columns") { - return ["col1"]; - } - if (property === "sort_context") { - return []; - } - return null; - }); - - render({ model, el }); - - // Manually trigger the table_html change handler - const tableHtmlChangeHandler = model.on.mock.calls.find( - (call) => call[0] === "change:table_html", - )[1]; - tableHtmlChangeHandler(); - - const header = el.querySelector("th"); - header.click(); - - expect(model.set).toHaveBeenCalledWith("sort_context", [ - { column: "col1", ascending: true }, - ]); - expect(model.save_changes).toHaveBeenCalled(); - }); - - it("should reverse sort direction when a sorted column is clicked", () => { - // Mock the initial state - model.get.mockImplementation((property) => { - if (property === "table_html") { - return "
<table><thead><tr><th>col1</th></tr></thead></table>
"; - } - if (property === "orderable_columns") { - return ["col1"]; - } - if (property === "sort_context") { - return [{ column: "col1", ascending: true }]; - } - return null; - }); - - render({ model, el }); - - // Manually trigger the table_html change handler - const tableHtmlChangeHandler = model.on.mock.calls.find( - (call) => call[0] === "change:table_html", - )[1]; - tableHtmlChangeHandler(); - - const header = el.querySelector("th"); - header.click(); - - expect(model.set).toHaveBeenCalledWith("sort_context", [ - { column: "col1", ascending: false }, - ]); - expect(model.save_changes).toHaveBeenCalled(); - }); - - it("should clear sort when a descending sorted column is clicked", () => { - // Mock the initial state - model.get.mockImplementation((property) => { - if (property === "table_html") { - return "
<table><thead><tr><th>col1</th></tr></thead></table>
"; - } - if (property === "orderable_columns") { - return ["col1"]; - } - if (property === "sort_context") { - return [{ column: "col1", ascending: false }]; - } - return null; - }); - - render({ model, el }); - - // Manually trigger the table_html change handler - const tableHtmlChangeHandler = model.on.mock.calls.find( - (call) => call[0] === "change:table_html", - )[1]; - tableHtmlChangeHandler(); - - const header = el.querySelector("th"); - header.click(); - - expect(model.set).toHaveBeenCalledWith("sort_context", []); - expect(model.save_changes).toHaveBeenCalled(); - }); - - it("should display the correct sort indicator", () => { - // Mock the initial state - model.get.mockImplementation((property) => { - if (property === "table_html") { - return "
<table><thead><tr><th>col1</th><th>col2</th></tr></thead></table>
"; - } - if (property === "orderable_columns") { - return ["col1", "col2"]; - } - if (property === "sort_context") { - return [{ column: "col1", ascending: true }]; - } - return null; - }); - - render({ model, el }); - - // Manually trigger the table_html change handler - const tableHtmlChangeHandler = model.on.mock.calls.find( - (call) => call[0] === "change:table_html", - )[1]; - tableHtmlChangeHandler(); - - const headers = el.querySelectorAll("th"); - const indicator1 = headers[0].querySelector(".sort-indicator"); - const indicator2 = headers[1].querySelector(".sort-indicator"); - - expect(indicator1.textContent).toBe("▲"); - expect(indicator2.textContent).toBe("●"); - }); - - it("should add a column to sort when Shift+Click is used", () => { - // Mock the initial state: already sorted by col1 asc - model.get.mockImplementation((property) => { - if (property === "table_html") { - return "
<table><thead><tr><th>col1</th><th>col2</th></tr></thead></table>
"; - } - if (property === "orderable_columns") { - return ["col1", "col2"]; - } - if (property === "sort_context") { - return [{ column: "col1", ascending: true }]; - } - return null; - }); - - render({ model, el }); - - // Manually trigger the table_html change handler - const tableHtmlChangeHandler = model.on.mock.calls.find( - (call) => call[0] === "change:table_html", - )[1]; - tableHtmlChangeHandler(); - - const headers = el.querySelectorAll("th"); - const header2 = headers[1]; // col2 - - // Simulate Shift+Click - const clickEvent = new MouseEvent("click", { - bubbles: true, - cancelable: true, - shiftKey: true, - }); - header2.dispatchEvent(clickEvent); - - expect(model.set).toHaveBeenCalledWith("sort_context", [ - { column: "col1", ascending: true }, - { column: "col2", ascending: true }, - ]); - expect(model.save_changes).toHaveBeenCalled(); - }); - }); - - it("should render the series as a table with an index and one value column", () => { - // Mock the initial state - model.get.mockImplementation((property) => { - if (property === "table_html") { - return ` +import { jest } from '@jest/globals'; + +/** + * Test suite for the TableWidget frontend component. + */ +describe('TableWidget', () => { + /** @type {any} */ + let model; + /** @type {HTMLElement} */ + let el; + /** @type {Function} */ + let render; + + /** + * Sets up the test environment before each test. + * This includes resetting modules, creating a DOM element, + * and mocking the widget model. + */ + beforeEach(async () => { + jest.resetModules(); + document.body.innerHTML = '
'; + el = document.body.querySelector('div'); + + const tableWidget = ( + await import('../../bigframes/display/table_widget.js') + ).default; + render = tableWidget.render; + + model = { + get: jest.fn(), + set: jest.fn(), + save_changes: jest.fn(), + on: jest.fn(), + }; + }); + + it('should have a render function', () => { + expect(render).toBeDefined(); + }); + + /** + * Tests for the render function of the widget. + */ + describe('render', () => { + it('should create the basic structure', () => { + // Mock the initial state + model.get.mockImplementation((property) => { + if (property === 'table_html') { + return ''; + } + if (property === 'row_count') { + return 100; + } + if (property === 'error_message') { + return null; + } + if (property === 'page_size') { + return 10; + } + if (property === 'page') { + return 0; + } + return null; + }); + + render({ model, el }); + + expect(el.classList.contains('bigframes-widget')).toBe(true); + expect(el.querySelector('.error-message')).not.toBeNull(); + expect(el.querySelector('div')).not.toBeNull(); + expect(el.querySelector('div:nth-child(3)')).not.toBeNull(); + }); + + /** + * Verifies that clicking a sortable column header triggers a sort action + * with the correct parameters. + */ + it('should sort when a sortable column is clicked', () => { + // Mock the initial state + model.get.mockImplementation((property) => { + if (property === 'table_html') { + return '
<table><thead><tr><th>col1</th></tr></thead></table>
'; + } + if (property === 'orderable_columns') { + return ['col1']; + } + if (property === 'sort_context') { + return []; + } + return null; + }); + + render({ model, el }); + + // Manually trigger the table_html change handler + const tableHtmlChangeHandler = model.on.mock.calls.find( + (call) => call[0] === 'change:table_html', + )[1]; + tableHtmlChangeHandler(); + + const header = el.querySelector('th'); + header.click(); + + expect(model.set).toHaveBeenCalledWith('sort_context', [ + { column: 'col1', ascending: true }, + ]); + expect(model.save_changes).toHaveBeenCalled(); + }); + + it('should reverse sort direction when a sorted column is clicked', () => { + // Mock the initial state + model.get.mockImplementation((property) => { + if (property === 'table_html') { + return '
<table><thead><tr><th>col1</th></tr></thead></table>
'; + } + if (property === 'orderable_columns') { + return ['col1']; + } + if (property === 'sort_context') { + return [{ column: 'col1', ascending: true }]; + } + return null; + }); + + render({ model, el }); + + // Manually trigger the table_html change handler + const tableHtmlChangeHandler = model.on.mock.calls.find( + (call) => call[0] === 'change:table_html', + )[1]; + tableHtmlChangeHandler(); + + const header = el.querySelector('th'); + header.click(); + + expect(model.set).toHaveBeenCalledWith('sort_context', [ + { column: 'col1', ascending: false }, + ]); + expect(model.save_changes).toHaveBeenCalled(); + }); + + it('should clear sort when a descending sorted column is clicked', () => { + // Mock the initial state + model.get.mockImplementation((property) => { + if (property === 'table_html') { + return '
<table><thead><tr><th>col1</th></tr></thead></table>
'; + } + if (property === 'orderable_columns') { + return ['col1']; + } + if (property === 'sort_context') { + return [{ column: 'col1', ascending: false }]; + } + return null; + }); + + render({ model, el }); + + // Manually trigger the table_html change handler + const tableHtmlChangeHandler = model.on.mock.calls.find( + (call) => call[0] === 'change:table_html', + )[1]; + tableHtmlChangeHandler(); + + const header = el.querySelector('th'); + header.click(); + + expect(model.set).toHaveBeenCalledWith('sort_context', []); + expect(model.save_changes).toHaveBeenCalled(); + }); + + it('should display the correct sort indicator', () => { + // Mock the initial state + model.get.mockImplementation((property) => { + if (property === 'table_html') { + return '
<table><thead><tr><th>col1</th><th>col2</th></tr></thead></table>
'; + } + if (property === 'orderable_columns') { + return ['col1', 'col2']; + } + if (property === 'sort_context') { + return [{ column: 'col1', ascending: true }]; + } + return null; + }); + + render({ model, el }); + + // Manually trigger the table_html change handler + const tableHtmlChangeHandler = model.on.mock.calls.find( + (call) => call[0] === 'change:table_html', + )[1]; + tableHtmlChangeHandler(); + + const headers = el.querySelectorAll('th'); + const indicator1 = headers[0].querySelector('.sort-indicator'); + const indicator2 = headers[1].querySelector('.sort-indicator'); + + expect(indicator1.textContent).toBe('▲'); + expect(indicator2.textContent).toBe('●'); + }); + + /** + * Tests that holding the Shift key while clicking a column header + * adds the new column to the existing sort context for multi-column sorting. + */ + it('should add a column to sort when Shift+Click is used', () => { + // Mock the initial state: already sorted by col1 asc + model.get.mockImplementation((property) => { + if (property === 'table_html') { + return '
<table><thead><tr><th>col1</th><th>col2</th></tr></thead></table>
'; + } + if (property === 'orderable_columns') { + return ['col1', 'col2']; + } + if (property === 'sort_context') { + return [{ column: 'col1', ascending: true }]; + } + return null; + }); + + render({ model, el }); + + // Manually trigger the table_html change handler + const tableHtmlChangeHandler = model.on.mock.calls.find( + (call) => call[0] === 'change:table_html', + )[1]; + tableHtmlChangeHandler(); + + const headers = el.querySelectorAll('th'); + const header2 = headers[1]; // col2 + + // Simulate Shift+Click + const clickEvent = new MouseEvent('click', { + bubbles: true, + cancelable: true, + shiftKey: true, + }); + header2.dispatchEvent(clickEvent); + + expect(model.set).toHaveBeenCalledWith('sort_context', [ + { column: 'col1', ascending: true }, + { column: 'col2', ascending: true }, + ]); + expect(model.save_changes).toHaveBeenCalled(); + }); + }); + + it('should render the series as a table with an index and one value column', () => { + // Mock the initial state + model.get.mockImplementation((property) => { + if (property === 'table_html') { + return `
@@ -268,154 +293,159 @@ describe("TableWidget", () => {
`; - } - if (property === "orderable_columns") { - return []; - } - return null; - }); - - render({ model, el }); - - // Manually trigger the table_html change handler - const tableHtmlChangeHandler = model.on.mock.calls.find( - (call) => call[0] === "change:table_html", - )[1]; - tableHtmlChangeHandler(); - - // Check that the table has two columns - const headers = el.querySelectorAll( - ".paginated-table-container .col-header-name", - ); - expect(headers).toHaveLength(2); - - // Check that the headers are an empty string (for the index) and "value" - expect(headers[0].textContent).toBe(""); - expect(headers[1].textContent).toBe("value"); - }); - - it("should highlight all rows in a group when hovering over a nested data row", () => { - // Mock HTML with nested data structure (flattened rows) - model.get.mockImplementation((property) => { - if (property === "table_html") { - return ` + } + if (property === 'orderable_columns') { + return []; + } + return null; + }); + + render({ model, el }); + + // Manually trigger the table_html change handler + const tableHtmlChangeHandler = model.on.mock.calls.find( + (call) => call[0] === 'change:table_html', + )[1]; + tableHtmlChangeHandler(); + + // Check that the table has two columns + const headers = el.querySelectorAll( + '.paginated-table-container .col-header-name', + ); + expect(headers).toHaveLength(2); + + // Check that the headers are an empty string (for the index) and "value" + expect(headers[0].textContent).toBe(''); + expect(headers[1].textContent).toBe('value'); + }); + + /** + * Verifies that hovering over a cell in a group of flattened rows + * (i.e., rows originating from the same nested data structure) + * adds a hover class to all cells in that group. + */ + it('should highlight all rows in a group when hovering over a nested data row', () => { + // Mock HTML with nested data structure (flattened rows) + model.get.mockImplementation((property) => { + if (property === 'table_html') { + return `
+          <table>
+            <tbody>
+              <tr data-orig-row="0"><td>Row 1 Part A</td></tr>
+              <tr data-orig-row="0"><td>Row 1 Part B</td></tr>
+              <tr data-orig-row="1"><td>Row 2</td></tr>
+            </tbody>
+          </table>
`; - } - if (property === "orderable_columns") { - return []; - } - return null; - }); - - render({ model, el }); - - // Manually trigger the table_html change handler - const tableHtmlChangeHandler = model.on.mock.calls.find( - (call) => call[0] === "change:table_html", - )[1]; - tableHtmlChangeHandler(); - - const firstRowCell = el.querySelector('tr[data-orig-row="0"] td'); - const rowsInGroup = el.querySelectorAll('tr[data-orig-row="0"] td'); - - // Simulate mouseover - const mouseOverEvent = new MouseEvent("mouseover", { - bubbles: true, - cancelable: true, - }); - firstRowCell.dispatchEvent(mouseOverEvent); - - // Check if row-hover class is added to all cells in the group - - rowsInGroup.forEach((cell) => { - expect(cell.classList.contains("row-hover")).toBe(true); - }); - - // Simulate mouseout - const mouseOutEvent = new MouseEvent("mouseout", { - bubbles: true, - cancelable: true, - }); - firstRowCell.dispatchEvent(mouseOutEvent); - - // Check if row-hover class is removed - - rowsInGroup.forEach((cell) => { - expect(cell.classList.contains("row-hover")).toBe(false); - }); - }); - - it("should not highlight unrelated rows when hovering over a nested data row", () => { - // Mock HTML with nested data structure - model.get.mockImplementation((property) => { - if (property === "table_html") { - return ` + } + if (property === 'orderable_columns') { + return []; + } + return null; + }); + + render({ model, el }); + + // Manually trigger the table_html change handler + const tableHtmlChangeHandler = model.on.mock.calls.find( + (call) => call[0] === 'change:table_html', + )[1]; + tableHtmlChangeHandler(); + + const firstRowCell = el.querySelector('tr[data-orig-row="0"] td'); + const rowsInGroup = el.querySelectorAll('tr[data-orig-row="0"] td'); + + // Simulate mouseover + const mouseOverEvent = new MouseEvent('mouseover', { + bubbles: true, + cancelable: true, + }); + firstRowCell.dispatchEvent(mouseOverEvent); + + // Check if row-hover class is added to all cells in the group + + rowsInGroup.forEach((cell) => { + expect(cell.classList.contains('row-hover')).toBe(true); + }); + + // Simulate mouseout + const mouseOutEvent = new MouseEvent('mouseout', { + bubbles: true, + cancelable: true, + }); + firstRowCell.dispatchEvent(mouseOutEvent); + + // Check if row-hover class is removed + + rowsInGroup.forEach((cell) => { + expect(cell.classList.contains('row-hover')).toBe(false); + }); + }); + + it('should not highlight unrelated rows when hovering over a nested data row', () => { + // Mock HTML with nested data structure + model.get.mockImplementation((property) => { + if (property === 'table_html') { + return `
+          <table>
+            <tbody>
+              <tr data-orig-row="0"><td>Row 1 Part A</td></tr>
+              <tr data-orig-row="0"><td>Row 1 Part B</td></tr>
+              <tr data-orig-row="1"><td>Row 2</td></tr>
+            </tbody>
+          </table>
`; - } - if (property === "orderable_columns") { - return []; - } - return null; - }); - - render({ model, el }); - - const tableHtmlChangeHandler = model.on.mock.calls.find( - (call) => call[0] === "change:table_html", - )[1]; - tableHtmlChangeHandler(); - - const row1Cell = el.querySelector('tr[data-orig-row="0"] td'); - const row2Cell = el.querySelector('tr[data-orig-row="1"] td'); - - const mouseOverEvent = new MouseEvent("mouseover", { - bubbles: true, - cancelable: true, - }); - row1Cell.dispatchEvent(mouseOverEvent); - - // Row 2 should NOT have the hover class - expect(row2Cell.classList.contains("row-hover")).toBe(false); - }); - - it("should not highlight other rows when hovering over a non-nested row", () => { - // Mock HTML with mixed data structure - model.get.mockImplementation((property) => { - if (property === "table_html") { - return ` + } + if (property === 'orderable_columns') { + return []; + } + return null; + }); + + render({ model, el }); + + const tableHtmlChangeHandler = model.on.mock.calls.find( + (call) => call[0] === 'change:table_html', + )[1]; + tableHtmlChangeHandler(); + + const row1Cell = el.querySelector('tr[data-orig-row="0"] td'); + const row2Cell = el.querySelector('tr[data-orig-row="1"] td'); + + const mouseOverEvent = new MouseEvent('mouseover', { + bubbles: true, + cancelable: true, + }); + row1Cell.dispatchEvent(mouseOverEvent); + + // Row 2 should NOT have the hover class + expect(row2Cell.classList.contains('row-hover')).toBe(false); + }); + + it('should not highlight other rows when hovering over a non-nested row', () => { + // Mock HTML with mixed data structure + model.get.mockImplementation((property) => { + if (property === 'table_html') { + return `
+          <table>
+            <tbody>
+              <tr><td>Standard Row</td></tr>
+              <tr data-orig-row="0"><td>Nested Row</td></tr>
+            </tbody>
+          </table>
`; - } - if (property === "orderable_columns") { - return []; - } - return null; - }); - - render({ model, el }); - - const tableHtmlChangeHandler = model.on.mock.calls.find( - (call) => call[0] === "change:table_html", - )[1]; - tableHtmlChangeHandler(); - - const standardCell = el.querySelector("tr:not([data-orig-row]) td"); - const nestedCell = el.querySelector('tr[data-orig-row="0"] td'); - - const mouseOverEvent = new MouseEvent("mouseover", { - bubbles: true, - cancelable: true, - }); - standardCell.dispatchEvent(mouseOverEvent); - - // The nested row should NOT have the hover class - expect(nestedCell.classList.contains("row-hover")).toBe(false); - }); + } + if (property === 'orderable_columns') { + return []; + } + return null; + }); + + render({ model, el }); + + const tableHtmlChangeHandler = model.on.mock.calls.find( + (call) => call[0] === 'change:table_html', + )[1]; + tableHtmlChangeHandler(); + + const standardCell = el.querySelector('tr:not([data-orig-row]) td'); + const nestedCell = el.querySelector('tr[data-orig-row="0"] td'); + + const mouseOverEvent = new MouseEvent('mouseover', { + bubbles: true, + cancelable: true, + }); + standardCell.dispatchEvent(mouseOverEvent); + + // The nested row should NOT have the hover class + expect(nestedCell.classList.contains('row-hover')).toBe(false); + }); }); From 41df7b3a387c7eff435ffbdedf05169d76fc1c5e Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 7 Jan 2026 00:35:30 +0000 Subject: [PATCH 11/29] Fix: Improve performance of nested data flattening --- bigframes/display/_flatten.py | 149 +++++++++++++++++----------------- 1 file changed, 75 insertions(+), 74 deletions(-) diff --git a/bigframes/display/_flatten.py b/bigframes/display/_flatten.py index e25fc1d74a..7a3f5b07df 100644 --- a/bigframes/display/_flatten.py +++ b/bigframes/display/_flatten.py @@ -159,14 +159,14 @@ def _flatten_array_of_struct_columns( nested_originated_columns.add(new_col_name) new_array_col_names.append(new_col_name) - # Reconstruct ListArray for this field - # Use mask=arrow_array.is_null() to preserve nulls from the original list + # Reconstruct ListArray for this field. This transforms the + # array> into separate array and array columns. 
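+        # (That is, an array<struct<...>> column becomes one array<field>
+        # column per struct field.) A sketch with hypothetical values:
+        # offsets [0, 2, 3] and the flattened child values [1, 2, 3] of an
+        # int64 field "a" recombine via pa.ListArray.from_arrays into
+        # [[1, 2], [3]], surfaced as the new "col.a" column.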
new_list_array = pa.ListArray.from_arrays( offsets, flattened_fields[field_idx], mask=arrow_array.is_null() ) new_cols_to_add[new_col_name] = pd.Series( - new_list_array.to_pylist(), + new_list_array, dtype=pd.ArrowDtype(pa.list_(field.type)), index=result_df.index, ) @@ -194,78 +194,79 @@ def _explode_array_columns( dataframe: pd.DataFrame, array_columns: list[str] ) -> tuple[pd.DataFrame, dict[str, list[int]]]: """Explode array columns into new rows.""" - exploded_rows = [] - array_row_groups: dict[str, list[int]] = {} + if not array_columns: + return dataframe, {} + non_array_columns = dataframe.columns.drop(array_columns).tolist() - non_array_df = dataframe[non_array_columns] - - for orig_idx in dataframe.index: - non_array_data = non_array_df.loc[orig_idx].to_dict() - array_values = {} - max_len_in_row = 0 - non_na_array_found = False - - for col_name in array_columns: - val = dataframe.loc[orig_idx, col_name] - if val is not None and not ( - isinstance(val, list) and len(val) == 1 and pd.isna(val[0]) - ): - array_values[col_name] = list(val) - max_len_in_row = max(max_len_in_row, len(val)) - non_na_array_found = True - else: - array_values[col_name] = [] - - if not non_na_array_found: - new_row = non_array_data.copy() - for col_name in array_columns: - new_row[f"{col_name}"] = pd.NA - exploded_rows.append(new_row) - orig_key = str(orig_idx) - if orig_key not in array_row_groups: - array_row_groups[orig_key] = [] - array_row_groups[orig_key].append(len(exploded_rows) - 1) - continue - - # Create one row per array element, up to max_len_in_row - for array_idx in range(max_len_in_row): - new_row = non_array_data.copy() - - # Add the specific array element for this index - for col_name in array_columns: - if array_idx < len(array_values.get(col_name, [])): - new_row[f"{col_name}"] = array_values[col_name][array_idx] - else: - new_row[f"{col_name}"] = pd.NA - - exploded_rows.append(new_row) - - # Track which rows belong to which original row - orig_key = str(orig_idx) - if orig_key not in array_row_groups: - array_row_groups[orig_key] = [] - array_row_groups[orig_key].append(len(exploded_rows) - 1) - - if exploded_rows: - # Reconstruct the DataFrame to maintain original column order - exploded_df = pd.DataFrame(exploded_rows)[dataframe.columns] - for col in exploded_df.columns: - # After explosion, object columns that are all-numeric (except for NAs) - # should be converted to a numeric dtype for proper alignment. 
- if exploded_df[col].dtype == "object": - try: - # Use nullable integer type to preserve integers - exploded_df[col] = exploded_df[col].astype(pd.Int64Dtype()) - except (ValueError, TypeError): - # Fallback for non-integer numerics - try: - exploded_df[col] = pd.to_numeric(exploded_df[col]) - except (ValueError, TypeError): - # Keep as object if not numeric - pass - return exploded_df, array_row_groups + if not non_array_columns: + # Add a temporary column to allow grouping if all columns are arrays + non_array_columns = ["_temp_grouping_col"] + dataframe["_temp_grouping_col"] = range(len(dataframe)) + + # Preserve original index + if dataframe.index.name: + original_index_name = dataframe.index.name + dataframe = dataframe.reset_index() + non_array_columns.append(original_index_name) + else: + original_index_name = None + dataframe = dataframe.reset_index(names=["_original_index"]) + non_array_columns.append("_original_index") + + exploded_dfs = [] + for col in array_columns: + # Explode each array column individually + exploded = dataframe[non_array_columns + [col]].explode(col) + exploded["_row_num"] = exploded.groupby(non_array_columns).cumcount() + exploded_dfs.append(exploded) + + if not exploded_dfs: + return dataframe, {} + + # Merge the exploded columns + merged_df = exploded_dfs[0] + for i in range(1, len(exploded_dfs)): + merged_df = pd.merge( + merged_df, + exploded_dfs[i], + on=non_array_columns + ["_row_num"], + how="outer", + ) + + # Restore original column order and sort + final_cols = dataframe.columns.tolist() + ["_row_num"] + merged_df = merged_df.sort_values(non_array_columns + ["_row_num"]).reset_index( + drop=True + ) + + # Create row groups + array_row_groups = {} + if "_original_index" in merged_df.columns: + grouping_col = "_original_index" + elif original_index_name: + grouping_col = original_index_name else: - return dataframe, array_row_groups + # Fallback if no clear grouping column is identified + grouping_col = non_array_columns[0] + + for orig_idx, group in merged_df.groupby(grouping_col): + array_row_groups[str(orig_idx)] = group.index.tolist() + + # Clean up temporary columns + if "_temp_grouping_col" in merged_df.columns: + merged_df = merged_df.drop(columns=["_temp_grouping_col"]) + final_cols.remove("_temp_grouping_col") + if "_original_index" in merged_df.columns: + merged_df = merged_df.drop(columns=["_original_index"]) + final_cols.remove("_original_index") + if original_index_name: + merged_df = merged_df.set_index(original_index_name) + final_cols.remove(original_index_name) + + final_cols.remove("_row_num") + merged_df = merged_df[final_cols] + + return merged_df, array_row_groups def _flatten_struct_columns( @@ -295,7 +296,7 @@ def _flatten_struct_columns( # Create a new Series from the flattened array new_cols_to_add[new_col_name] = pd.Series( - flattened_fields[field_idx].to_pylist(), + flattened_fields[field_idx], dtype=pd.ArrowDtype(field.type), index=result_df.index, ) From e36467468fb60514e4c9ea6a06b1ae9dfe7be95f Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 7 Jan 2026 00:52:27 +0000 Subject: [PATCH 12/29] Fix: Correct bug in nested data flattening --- bigframes/display/_flatten.py | 56 +++++++++++++++-------------------- 1 file changed, 24 insertions(+), 32 deletions(-) diff --git a/bigframes/display/_flatten.py b/bigframes/display/_flatten.py index 7a3f5b07df..8a7c5b7135 100644 --- a/bigframes/display/_flatten.py +++ b/bigframes/display/_flatten.py @@ -197,30 +197,35 @@ def _explode_array_columns( if not array_columns: return 
dataframe, {} - non_array_columns = dataframe.columns.drop(array_columns).tolist() + original_cols = dataframe.columns.tolist() + work_df = dataframe + + non_array_columns = work_df.columns.drop(array_columns).tolist() if not non_array_columns: + work_df = work_df.copy() # Avoid modifying input # Add a temporary column to allow grouping if all columns are arrays non_array_columns = ["_temp_grouping_col"] - dataframe["_temp_grouping_col"] = range(len(dataframe)) + work_df["_temp_grouping_col"] = range(len(work_df)) # Preserve original index - if dataframe.index.name: - original_index_name = dataframe.index.name - dataframe = dataframe.reset_index() + if work_df.index.name: + original_index_name = work_df.index.name + work_df = work_df.reset_index() non_array_columns.append(original_index_name) else: original_index_name = None - dataframe = dataframe.reset_index(names=["_original_index"]) + work_df = work_df.reset_index(names=["_original_index"]) non_array_columns.append("_original_index") exploded_dfs = [] for col in array_columns: # Explode each array column individually - exploded = dataframe[non_array_columns + [col]].explode(col) + exploded = work_df[non_array_columns + [col]].explode(col) exploded["_row_num"] = exploded.groupby(non_array_columns).cumcount() exploded_dfs.append(exploded) if not exploded_dfs: + # This should not be reached if array_columns is not empty return dataframe, {} # Merge the exploded columns @@ -234,39 +239,26 @@ def _explode_array_columns( ) # Restore original column order and sort - final_cols = dataframe.columns.tolist() + ["_row_num"] merged_df = merged_df.sort_values(non_array_columns + ["_row_num"]).reset_index( drop=True ) # Create row groups array_row_groups = {} - if "_original_index" in merged_df.columns: - grouping_col = "_original_index" - elif original_index_name: - grouping_col = original_index_name - else: - # Fallback if no clear grouping column is identified - grouping_col = non_array_columns[0] - - for orig_idx, group in merged_df.groupby(grouping_col): - array_row_groups[str(orig_idx)] = group.index.tolist() - - # Clean up temporary columns - if "_temp_grouping_col" in merged_df.columns: - merged_df = merged_df.drop(columns=["_temp_grouping_col"]) - final_cols.remove("_temp_grouping_col") - if "_original_index" in merged_df.columns: - merged_df = merged_df.drop(columns=["_original_index"]) - final_cols.remove("_original_index") - if original_index_name: - merged_df = merged_df.set_index(original_index_name) - final_cols.remove(original_index_name) + grouping_col_name = ( + "_original_index" if original_index_name is None else original_index_name + ) + if grouping_col_name in merged_df.columns: + for orig_idx, group in merged_df.groupby(grouping_col_name): + array_row_groups[str(orig_idx)] = group.index.tolist() + + # Restore original columns + result_df = merged_df[original_cols] - final_cols.remove("_row_num") - merged_df = merged_df[final_cols] + if original_index_name: + result_df = result_df.set_index(original_index_name) - return merged_df, array_row_groups + return result_df, array_row_groups def _flatten_struct_columns( From 68b7fbb0841a6c6f26527d89961ca06fa4f0cbe5 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 8 Jan 2026 00:02:04 +0000 Subject: [PATCH 13/29] fix(display): fix explode on Arrow list columns in flatten_nested_data --- bigframes/display/_flatten.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/bigframes/display/_flatten.py b/bigframes/display/_flatten.py index 
8a7c5b7135..02359d8fb4 100644 --- a/bigframes/display/_flatten.py +++ b/bigframes/display/_flatten.py @@ -220,7 +220,24 @@ def _explode_array_columns( exploded_dfs = [] for col in array_columns: # Explode each array column individually - exploded = work_df[non_array_columns + [col]].explode(col) + col_series = work_df[col] + target_dtype = None + if isinstance(col_series.dtype, pd.ArrowDtype): + pa_type = col_series.dtype.pyarrow_dtype + if pa.types.is_list(pa_type): + target_dtype = pd.ArrowDtype(pa_type.value_type) + # Use to_list() to avoid pandas attempting to create a 2D numpy + # array if the list elements have the same length. + col_series = pd.Series( + col_series.to_list(), index=col_series.index, dtype=object + ) + + exploded = work_df[non_array_columns].assign(**{col: col_series}).explode(col) + + if target_dtype is not None: + # Re-cast to arrow dtype if possible + exploded[col] = exploded[col].astype(target_dtype) + exploded["_row_num"] = exploded.groupby(non_array_columns).cumcount() exploded_dfs.append(exploded) From 0b73c0adaa3b92d2c31b33a5ea884c3dfe8a7949 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 8 Jan 2026 01:18:23 +0000 Subject: [PATCH 14/29] perf(display): optimize nested data flattening and fix js style --- bigframes/display/_flatten.py | 39 ++++++++++++++++++--------------- bigframes/display/html.py | 23 ++++++++++--------- tests/js/table_widget.test.js | 12 +++++----- tests/unit/display/test_html.py | 9 ++++---- 4 files changed, 44 insertions(+), 39 deletions(-) diff --git a/bigframes/display/_flatten.py b/bigframes/display/_flatten.py index 02359d8fb4..b9f88f66c0 100644 --- a/bigframes/display/_flatten.py +++ b/bigframes/display/_flatten.py @@ -30,11 +30,11 @@ class FlattenResult: dataframe: pd.DataFrame """The flattened DataFrame.""" - row_groups: dict[str, list[int]] - """ - A mapping from original row index to the new row indices that were created - from it. 
- """ + row_labels: list[str] | None + """A list of original row labels for each row in the flattened DataFrame.""" + + continuation_rows: set[int] | None + """A set of row indices that are continuation rows.""" cleared_on_continuation: list[str] """A list of column names that should be cleared on continuation rows.""" @@ -50,7 +50,8 @@ def flatten_nested_data( if dataframe.empty: return FlattenResult( dataframe=dataframe.copy(), - row_groups={}, + row_labels=None, + continuation_rows=None, cleared_on_continuation=[], nested_columns=set(), ) @@ -77,15 +78,19 @@ def flatten_nested_data( if not array_columns: return FlattenResult( dataframe=result_df, - row_groups={}, + row_labels=None, + continuation_rows=None, cleared_on_continuation=clear_on_continuation_cols, nested_columns=nested_originated_columns, ) - result_df, array_row_groups = _explode_array_columns(result_df, array_columns) + result_df, row_labels, continuation_rows = _explode_array_columns( + result_df, array_columns + ) return FlattenResult( dataframe=result_df, - row_groups=array_row_groups, + row_labels=row_labels, + continuation_rows=continuation_rows, cleared_on_continuation=clear_on_continuation_cols, nested_columns=nested_originated_columns, ) @@ -192,10 +197,10 @@ def _flatten_array_of_struct_columns( def _explode_array_columns( dataframe: pd.DataFrame, array_columns: list[str] -) -> tuple[pd.DataFrame, dict[str, list[int]]]: +) -> tuple[pd.DataFrame, list[str], set[int]]: """Explode array columns into new rows.""" if not array_columns: - return dataframe, {} + return dataframe, [], set() original_cols = dataframe.columns.tolist() work_df = dataframe @@ -243,7 +248,7 @@ def _explode_array_columns( if not exploded_dfs: # This should not be reached if array_columns is not empty - return dataframe, {} + return dataframe, [], set() # Merge the exploded columns merged_df = exploded_dfs[0] @@ -260,14 +265,12 @@ def _explode_array_columns( drop=True ) - # Create row groups - array_row_groups = {} + # Generate row labels and continuation mask efficiently grouping_col_name = ( "_original_index" if original_index_name is None else original_index_name ) - if grouping_col_name in merged_df.columns: - for orig_idx, group in merged_df.groupby(grouping_col_name): - array_row_groups[str(orig_idx)] = group.index.tolist() + row_labels = merged_df[grouping_col_name].astype(str).tolist() + continuation_rows = set(merged_df.index[merged_df["_row_num"] > 0]) # Restore original columns result_df = merged_df[original_cols] @@ -275,7 +278,7 @@ def _explode_array_columns( if original_index_name: result_df = result_df.set_index(original_index_name) - return result_df, array_row_groups + return result_df, row_labels, continuation_rows def _flatten_struct_columns( diff --git a/bigframes/display/html.py b/bigframes/display/html.py index 2f347e6ed0..81592daabc 100644 --- a/bigframes/display/html.py +++ b/bigframes/display/html.py @@ -60,7 +60,8 @@ def render_html( table_html_parts.append( _render_table_body( flatten_result.dataframe, - flatten_result.row_groups, + flatten_result.row_labels, + flatten_result.continuation_rows, flatten_result.cleared_on_continuation, flatten_result.nested_columns, ) @@ -87,7 +88,8 @@ def _render_table_header(dataframe: pd.DataFrame, orderable_columns: list[str]) def _render_table_body( dataframe: pd.DataFrame, - array_row_groups: dict[str, list[int]], + row_labels: list[str] | None, + continuation_rows: set[int] | None, clear_on_continuation: list[str], nested_originated_columns: set[str], ) -> str: @@ -99,14 +101,15 @@ 
def _render_table_body( row_class = "" orig_row_idx = None is_continuation = False - for orig_key, row_indices in array_row_groups.items(): - if i in row_indices and row_indices[0] != i: - row_class = "array-continuation" - orig_row_idx = orig_key - is_continuation = True - break - - if row_class: + + if row_labels: + orig_row_idx = row_labels[i] + + if continuation_rows and i in continuation_rows: + is_continuation = True + row_class = "array-continuation" + + if orig_row_idx is not None: body_parts.append( f' ' ) diff --git a/tests/js/table_widget.test.js b/tests/js/table_widget.test.js index 8bebb42619..9436f8b3d6 100644 --- a/tests/js/table_widget.test.js +++ b/tests/js/table_widget.test.js @@ -20,7 +20,7 @@ import { jest } from '@jest/globals'; -/** +/* * Test suite for the TableWidget frontend component. */ describe('TableWidget', () => { @@ -31,7 +31,7 @@ describe('TableWidget', () => { /** @type {Function} */ let render; - /** + /* * Sets up the test environment before each test. * This includes resetting modules, creating a DOM element, * and mocking the widget model. @@ -58,7 +58,7 @@ describe('TableWidget', () => { expect(render).toBeDefined(); }); - /** + /* * Tests for the render function of the widget. */ describe('render', () => { @@ -91,7 +91,7 @@ describe('TableWidget', () => { expect(el.querySelector('div:nth-child(3)')).not.toBeNull(); }); - /** + /* * Verifies that clicking a sortable column header triggers a sort action * with the correct parameters. */ @@ -220,7 +220,7 @@ describe('TableWidget', () => { expect(indicator2.textContent).toBe('●'); }); - /** + /* * Tests that holding the Shift key while clicking a column header * adds the new column to the existing sort context for multi-column sorting. */ @@ -362,7 +362,7 @@ describe('TableWidget', () => { expect(headers[1].textContent).toBe('value'); }); - /** + /* * Verifies that hovering over a cell in a group of flattened rows * (i.e., rows originating from the same nested data structure) * adds a hover class to all cells in that group. 
diff --git a/tests/unit/display/test_html.py b/tests/unit/display/test_html.py index 4ea880d9ba..08a89df65b 100644 --- a/tests/unit/display/test_html.py +++ b/tests/unit/display/test_html.py @@ -190,12 +190,11 @@ def test_flatten_nested_data_explodes_arrays(): result = flatten_nested_data(array_data) flattened = result.dataframe - groups = result.row_groups + row_labels = result.row_labels + continuation_rows = result.continuation_rows nested_originated_columns = result.nested_columns assert len(flattened) == 5 # 3 + 2 array elements - assert "0" in groups # First original row - assert len(groups["0"]) == 3 # Three array elements - assert "1" in groups - assert len(groups["1"]) == 2 + assert row_labels == ["0", "0", "0", "1", "1"] + assert continuation_rows == {1, 2, 4} assert "array_col" in nested_originated_columns From 21a5d5c82bc222a781248314d8e775054c21c128 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 8 Jan 2026 01:32:41 +0000 Subject: [PATCH 15/29] test: rerun notebook --- notebooks/dataframes/anywidget_mode.ipynb | 244 ++++++++++++---------- 1 file changed, 137 insertions(+), 107 deletions(-) diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index 65a378f9f7..064aa70b2d 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -35,15 +35,7 @@ "execution_count": 2, "id": "ca22f059", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Unable to set __version__, run `pip install -e .` or `python setup.py develop` first.\n" - ] - } - ], + "outputs": [], "source": [ "import bigframes.pandas as bpd" ] @@ -181,7 +173,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "fdbca03dd2ff41bcb6c8c962a8a24aed", + "model_id": "f51152e46134483da9e186ae9c21b219", "version_major": 2, "version_minor": 1 }, @@ -217,80 +209,80 @@ " AL\n", " F\n", " 1910\n", - " Annie\n", - " 482\n", + " Lillian\n", + " 99\n", " \n", " \n", " 1\n", " AL\n", " F\n", " 1910\n", - " Myrtle\n", - " 104\n", + " Ruby\n", + " 204\n", " \n", " \n", " 2\n", - " AR\n", + " AL\n", " F\n", " 1910\n", - " Lillian\n", - " 56\n", + " Helen\n", + " 76\n", " \n", " \n", " 3\n", - " CT\n", + " AL\n", " F\n", " 1910\n", - " Anne\n", - " 38\n", + " Eunice\n", + " 41\n", " \n", " \n", " 4\n", - " CT\n", + " AR\n", " F\n", " 1910\n", - " Frances\n", - " 45\n", + " Dora\n", + " 42\n", " \n", " \n", " 5\n", - " FL\n", + " CA\n", " F\n", " 1910\n", - " Margaret\n", - " 53\n", + " Edna\n", + " 62\n", " \n", " \n", " 6\n", - " GA\n", + " CA\n", " F\n", " 1910\n", - " Mae\n", - " 73\n", + " Helen\n", + " 239\n", " \n", " \n", " 7\n", - " GA\n", + " CO\n", " F\n", " 1910\n", - " Beatrice\n", - " 96\n", + " Alice\n", + " 46\n", " \n", " \n", " 8\n", - " GA\n", + " FL\n", " F\n", " 1910\n", - " Lola\n", - " 47\n", + " Willie\n", + " 71\n", " \n", " \n", " 9\n", - " IA\n", + " FL\n", " F\n", " 1910\n", - " Viola\n", - " 49\n", + " Thelma\n", + " 65\n", " \n", " \n", "\n", @@ -298,17 +290,17 @@ "[5552452 rows x 5 columns in total]" ], "text/plain": [ - "state gender year name number\n", - " AL F 1910 Annie 482\n", - " AL F 1910 Myrtle 104\n", - " AR F 1910 Lillian 56\n", - " CT F 1910 Anne 38\n", - " CT F 1910 Frances 45\n", - " FL F 1910 Margaret 53\n", - " GA F 1910 Mae 73\n", - " GA F 1910 Beatrice 96\n", - " GA F 1910 Lola 47\n", - " IA F 1910 Viola 49\n", + "state gender year name number\n", + " AL F 1910 Lillian 99\n", + " AL F 1910 Ruby 204\n", + " AL F 1910 Helen 76\n", + " 
AL F 1910 Eunice 41\n", + " AR F 1910 Dora 42\n", + " CA F 1910 Edna 62\n", + " CA F 1910 Helen 239\n", + " CO F 1910 Alice 46\n", + " FL F 1910 Willie 71\n", + " FL F 1910 Thelma 65\n", "...\n", "\n", "[5552452 rows x 5 columns]" @@ -342,7 +334,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 171.4 MB in 28 seconds of slot time. [Job bigframes-dev:US.814d7716-440e-4df5-8c4d-9da6cc033b23 details]\n", + " Query processed 171.4 MB in 36 seconds of slot time. [Job bigframes-dev:US.7885d9f3-ddfa-41cb-ad0e-580119390ab6 details]\n", " " ], "text/plain": [ @@ -422,8 +414,15 @@ { "data": { "text/html": [ - "✅ Completed. \n", - " Query processed 88.8 MB in 2 seconds of slot time. [Job bigframes-dev:US.job_DI8qDij7y2WqsPLVZ418ThOjVOTe details]\n", + "\n", + " Query started with request ID bigframes-dev:US.2f9f26b3-ff3e-431d-a676-d5daf5e52796.
SQL
SELECT\n",
+       "`year` AS `year`\n",
+       "FROM\n",
+       "(SELECT\n",
+       "  `t0`.`year`,\n",
+       "  `t0`.`bfuid_col_2` AS `bfuid_col_5`\n",
+       "FROM `bigframes-dev._8b037bfb7316dddf9d92b12dcf93e008906bfe52._bf9622a9_188b_48af_a40a_4407e14ca5c8_bqdf_3028ad51-121b-42f3-86e6-ce710e604254` AS `t0`)\n",
+       "ORDER BY `bfuid_col_5` ASC NULLS LAST
\n", " " ], "text/plain": [ @@ -437,7 +436,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 88.8 MB in 2 seconds of slot time. [Job bigframes-dev:US.job_gETUk0MDyTx3-BEil5H1iayTXH4I details]\n", + " Query processed 88.8 MB in 2 seconds of slot time. [Job bigframes-dev:US.job_nIGVzX-38kg4za0Qwpm1hmmI_50_ details]\n", " " ], "text/plain": [ @@ -450,7 +449,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "4a652ba2aab8430da175895fd4d4e279", + "model_id": "148a49927f4d4a3e9d7bbd404cbdaa2a", "version_major": 2, "version_minor": 1 }, @@ -548,8 +547,23 @@ { "data": { "text/html": [ - "✅ Completed. \n", - " Query processed 215.9 MB in 8 seconds of slot time. [Job bigframes-dev:US.job_XxI3HVIuSXKhKHqxS-GYZ_Fse2w_ details]\n", + "\n", + " Query started with request ID bigframes-dev:US.9b3ddeec-c764-4860-b15a-1ce48045e438.
SQL
SELECT\n",
+       "`state` AS `state`,\n",
+       "`gender` AS `gender`,\n",
+       "`year` AS `year`,\n",
+       "`name` AS `name`,\n",
+       "`number` AS `number`\n",
+       "FROM\n",
+       "(SELECT\n",
+       "  `t0`.`state`,\n",
+       "  `t0`.`gender`,\n",
+       "  `t0`.`year`,\n",
+       "  `t0`.`name`,\n",
+       "  `t0`.`number`,\n",
+       "  `t0`.`bfuid_col_2` AS `bfuid_col_7`\n",
+       "FROM `bigframes-dev._8b037bfb7316dddf9d92b12dcf93e008906bfe52._bf9622a9_188b_48af_a40a_4407e14ca5c8_bqdf_3028ad51-121b-42f3-86e6-ce710e604254` AS `t0`)\n",
+       "ORDER BY `bfuid_col_7` ASC NULLS LAST
\n", " " ], "text/plain": [ @@ -563,7 +577,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 215.9 MB in 7 seconds of slot time. [Job bigframes-dev:US.job_FBd28IEWqZEFT1lvmS0lOUCBtOhj details]\n", + " Query processed 215.9 MB in 11 seconds of slot time. [Job bigframes-dev:US.job_nJW2lE25m_vpfd-BYf4L-tJZUxvf details]\n", " " ], "text/plain": [ @@ -583,12 +597,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "cbcdaeca25634c7d93d2aed9fb7cc74b", + "model_id": "b32b55e7345e44d7803e43de84007cbe", "version_major": 2, "version_minor": 1 }, "text/plain": [ - "" + "" ] }, "execution_count": 8, @@ -663,8 +677,24 @@ { "data": { "text/html": [ - "✅ Completed. \n", - " Query processed 215.9 MB in a moment of slot time.\n", + "\n", + " Query started with request ID bigframes-dev:US.e96b418c-c635-411f-b8e7-fb53617cae78.
SQL
SELECT\n",
+       "`state` AS `state`,\n",
+       "`gender` AS `gender`,\n",
+       "`year` AS `year`,\n",
+       "`name` AS `name`,\n",
+       "`number` AS `number`\n",
+       "FROM\n",
+       "(SELECT\n",
+       "  `t0`.`state`,\n",
+       "  `t0`.`gender`,\n",
+       "  `t0`.`year`,\n",
+       "  `t0`.`name`,\n",
+       "  `t0`.`number`,\n",
+       "  `t0`.`bfuid_col_2` AS `bfuid_col_9`\n",
+       "FROM `bigframes-dev._8b037bfb7316dddf9d92b12dcf93e008906bfe52._bf9622a9_188b_48af_a40a_4407e14ca5c8_bqdf_3028ad51-121b-42f3-86e6-ce710e604254` AS `t0`)\n",
+       "ORDER BY `name` ASC NULLS LAST ,`year` ASC NULLS LAST ,`state` ASC NULLS LAST ,`bfuid_col_9` ASC NULLS LAST\n",
+       "LIMIT 5
\n", " " ], "text/plain": [ @@ -698,12 +728,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e5ee622cc3ee447cb75e7698de50a53c", + "model_id": "0f234155ce4a45ae96938d3845f204de", "version_major": 2, "version_minor": 1 }, "text/plain": [ - "" + "" ] }, "execution_count": 10, @@ -808,7 +838,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "12b8bb97a94a495aabf36bb39386c527", + "model_id": "1aa385620a5f4daebc44dd890ba75381", "version_major": 2, "version_minor": 1 }, @@ -855,6 +885,24 @@ " gs://gcs-public-data--labeled-patents/espacene...\n", " EU\n", " DE\n", + " 29.08.018\n", + " E04H 6/12\n", + " <NA>\n", + " 18157874.1\n", + " 21.02.2018\n", + " 22.02.2017\n", + " Liedtke & Partner Patentanwälte\n", + " SHB Hebezeugbau GmbH\n", + " VOLGER, Alexander\n", + " STEUERUNGSSYSTEM FÜR AUTOMATISCHE PARKHÄUSER\n", + " EP 3 366 869 A1\n", + " \n", + " \n", + " 1\n", + " {'application_number': None, 'class_internatio...\n", + " gs://gcs-public-data--labeled-patents/espacene...\n", + " EU\n", + " DE\n", " 03.10.2018\n", " G06F 11/30\n", " <NA>\n", @@ -868,7 +916,7 @@ " EP 3 382 553 A1\n", " \n", " \n", - " 1\n", + " 2\n", " {'application_number': None, 'class_internatio...\n", " gs://gcs-public-data--labeled-patents/espacene...\n", " EU\n", @@ -886,7 +934,7 @@ " EP 3 383 141 A2\n", " \n", " \n", - " 2\n", + " 3\n", " {'application_number': None, 'class_internatio...\n", " gs://gcs-public-data--labeled-patents/espacene...\n", " EU\n", @@ -904,7 +952,7 @@ " EP 3 382 744 A1\n", " \n", " \n", - " 3\n", + " 4\n", " {'application_number': None, 'class_internatio...\n", " gs://gcs-public-data--labeled-patents/espacene...\n", " EU\n", @@ -921,24 +969,6 @@ " MASTHÄHNCHENCONTAINER ALS BESTANDTEIL EINER E...\n", " EP 3 381 276 A1\n", " \n", - " \n", - " 4\n", - " {'application_number': None, 'class_internatio...\n", - " gs://gcs-public-data--labeled-patents/espacene...\n", - " EU\n", - " DE\n", - " 29.08.018\n", - " E04H 6/12\n", - " <NA>\n", - " 18157874.1\n", - " 21.02.2018\n", - " 22.02.2017\n", - " Liedtke & Partner Patentanwälte\n", - " SHB Hebezeugbau GmbH\n", - " VOLGER, Alexander\n", - " STEUERUNGSSYSTEM FÜR AUTOMATISCHE PARKHÄUSER\n", - " EP 3 366 869 A1\n", - " \n", " \n", "\n", "

5 rows × 15 columns

\n", @@ -960,32 +990,32 @@ "4 gs://gcs-public-data--labeled-patents/espacene... EU DE \n", "\n", " publication_date class_international class_us application_number \\\n", - "0 03.10.2018 G06F 11/30 18157347.8 \n", - "1 03.10.2018 H05B 6/12 18165514.3 \n", - "2 03.10.2018 H01L 21/20 18166536.5 \n", - "3 03.10.2018 A01K 31/00 18171005.4 \n", - "4 29.08.018 E04H 6/12 18157874.1 \n", + "0 29.08.018 E04H 6/12 18157874.1 \n", + "1 03.10.2018 G06F 11/30 18157347.8 \n", + "2 03.10.2018 H05B 6/12 18165514.3 \n", + "3 03.10.2018 H01L 21/20 18166536.5 \n", + "4 03.10.2018 A01K 31/00 18171005.4 \n", "\n", " filing_date priority_date_eu representative_line_1_eu \\\n", - "0 19.02.2018 31.03.2017 Hoffmann Eitle \n", - "1 03.04.2018 30.03.2017 \n", - "2 16.02.2016 Scheider, Sascha et al \n", - "3 05.02.2015 05.02.2014 Stork Bamberger Patentanwälte \n", - "4 21.02.2018 22.02.2017 Liedtke & Partner Patentanwälte \n", + "0 21.02.2018 22.02.2017 Liedtke & Partner Patentanwälte \n", + "1 19.02.2018 31.03.2017 Hoffmann Eitle \n", + "2 03.04.2018 30.03.2017 \n", + "3 16.02.2016 Scheider, Sascha et al \n", + "4 05.02.2015 05.02.2014 Stork Bamberger Patentanwälte \n", "\n", " applicant_line_1 inventor_line_1 \\\n", - "0 FUJITSU LIMITED Kukihara, Kensuke \n", - "1 BSH Hausgeräte GmbH Acero Acero, Jesus \n", - "2 EV Group E. Thallner GmbH Kurz, Florian \n", - "3 Linco Food Systems A/S Thrane, Uffe \n", - "4 SHB Hebezeugbau GmbH VOLGER, Alexander \n", + "0 SHB Hebezeugbau GmbH VOLGER, Alexander \n", + "1 FUJITSU LIMITED Kukihara, Kensuke \n", + "2 BSH Hausgeräte GmbH Acero Acero, Jesus \n", + "3 EV Group E. Thallner GmbH Kurz, Florian \n", + "4 Linco Food Systems A/S Thrane, Uffe \n", "\n", " title_line_1 number \n", - "0 METHOD EXECUTED BY A COMPUTER, INFORMATION PRO... EP 3 382 553 A1 \n", - "1 VORRICHTUNG ZUR INDUKTIVEN ENERGIEÜBERTRAGUNG EP 3 383 141 A2 \n", - "2 VORRICHTUNG ZUM BONDEN VON SUBSTRATEN EP 3 382 744 A1 \n", - "3 MASTHÄHNCHENCONTAINER ALS BESTANDTEIL EINER E... EP 3 381 276 A1 \n", - "4 STEUERUNGSSYSTEM FÜR AUTOMATISCHE PARKHÄUSER EP 3 366 869 A1 \n", + "0 STEUERUNGSSYSTEM FÜR AUTOMATISCHE PARKHÄUSER EP 3 366 869 A1 \n", + "1 METHOD EXECUTED BY A COMPUTER, INFORMATION PRO... EP 3 382 553 A1 \n", + "2 VORRICHTUNG ZUR INDUKTIVEN ENERGIEÜBERTRAGUNG EP 3 383 141 A2 \n", + "3 VORRICHTUNG ZUM BONDEN VON SUBSTRATEN EP 3 382 744 A1 \n", + "4 MASTHÄHNCHENCONTAINER ALS BESTANDTEIL EINER E... 
EP 3 381 276 A1 \n", "\n", "[5 rows x 15 columns]" ] @@ -1066,7 +1096,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "76b3ef8ead0949d782106e5a255d6e3c", + "model_id": "4037a24ad291479da4bba4393ce97a52", "version_major": 2, "version_minor": 1 }, @@ -1303,7 +1333,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.0" + "version": "3.10.15" } }, "nbformat": 4, From 36a9a375534f70033c7efedefc4471cb6f1770ac Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 8 Jan 2026 01:43:04 +0000 Subject: [PATCH 16/29] fix(display): add row hover effect for nested data rows --- bigframes/display/table_widget.js | 32 +++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/bigframes/display/table_widget.js b/bigframes/display/table_widget.js index 40a027a8bc..203f837222 100644 --- a/bigframes/display/table_widget.js +++ b/bigframes/display/table_widget.js @@ -237,6 +237,38 @@ function render({ model, el }) { } }); + // Add hover effect for flattened rows + const rows = tableContainer.querySelectorAll('tbody tr'); + rows.forEach((row) => { + row.addEventListener('mouseover', () => { + const origRow = row.getAttribute('data-orig-row'); + if (origRow !== null) { + const groupRows = tableContainer.querySelectorAll( + `tr[data-orig-row="${origRow}"]`, + ); + groupRows.forEach((r) => { + r.querySelectorAll('td').forEach((cell) => { + cell.classList.add('row-hover'); + }); + }); + } + }); + + row.addEventListener('mouseout', () => { + const origRow = row.getAttribute('data-orig-row'); + if (origRow !== null) { + const groupRows = tableContainer.querySelectorAll( + `tr[data-orig-row="${origRow}"]`, + ); + groupRows.forEach((r) => { + r.querySelectorAll('td').forEach((cell) => { + cell.classList.remove('row-hover'); + }); + }); + } + }); + }); + updateButtonStates(); } From 4d46e3cce437f468c65be4d1020f7ceaf6051a2a Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 8 Jan 2026 22:39:13 +0000 Subject: [PATCH 17/29] refactor: code refactor --- bigframes/display/_flatten.py | 83 +++++++++++++++++++++--------- bigframes/display/table_widget.css | 8 +++ bigframes/display/table_widget.js | 2 +- tests/js/table_widget.test.js | 4 +- 4 files changed, 69 insertions(+), 28 deletions(-) diff --git a/bigframes/display/_flatten.py b/bigframes/display/_flatten.py index b9f88f66c0..64cd679832 100644 --- a/bigframes/display/_flatten.py +++ b/bigframes/display/_flatten.py @@ -43,6 +43,40 @@ class FlattenResult: """A set of column names that were created from nested data.""" +@dataclasses.dataclass(frozen=True) +class ColumnClassification: + """The result of classifying columns.""" + + struct_columns: list[str] + """Columns that are STRUCTs.""" + + array_columns: list[str] + """Columns that are ARRAYs.""" + + array_of_struct_columns: list[str] + """Columns that are ARRAYs of STRUCTs.""" + + clear_on_continuation_cols: list[str] + """Columns that should be cleared on continuation rows.""" + + nested_originated_columns: set[str] + """Columns that were created from nested data.""" + + +@dataclasses.dataclass(frozen=True) +class ExplodeResult: + """The result of exploding array columns.""" + + dataframe: pd.DataFrame + """The exploded DataFrame.""" + + row_labels: list[str] + """Labels for the rows.""" + + continuation_rows: set[int] + """Indices of continuation rows.""" + + def flatten_nested_data( dataframe: pd.DataFrame, ) -> FlattenResult: @@ -58,13 +92,16 @@ def flatten_nested_data( result_df = dataframe.copy() - ( - struct_columns, 
- array_columns, - array_of_struct_columns, - clear_on_continuation_cols, - nested_originated_columns, - ) = _classify_columns(result_df) + classification = _classify_columns(result_df) + # Extract lists to allow modification + # TODO(b/469966526): The modification of these lists in place by subsequent functions + # (e.g. _flatten_array_of_struct_columns removing items from array_columns) suggests + # that the data flow here could be cleaner, but keeping it as is for now. + struct_columns = classification.struct_columns + array_columns = classification.array_columns + array_of_struct_columns = classification.array_of_struct_columns + clear_on_continuation_cols = classification.clear_on_continuation_cols + nested_originated_columns = classification.nested_originated_columns result_df, array_columns = _flatten_array_of_struct_columns( result_df, array_of_struct_columns, array_columns, nested_originated_columns @@ -84,13 +121,11 @@ def flatten_nested_data( nested_columns=nested_originated_columns, ) - result_df, row_labels, continuation_rows = _explode_array_columns( - result_df, array_columns - ) + explode_result = _explode_array_columns(result_df, array_columns) return FlattenResult( - dataframe=result_df, - row_labels=row_labels, - continuation_rows=continuation_rows, + dataframe=explode_result.dataframe, + row_labels=explode_result.row_labels, + continuation_rows=explode_result.continuation_rows, cleared_on_continuation=clear_on_continuation_cols, nested_columns=nested_originated_columns, ) @@ -98,7 +133,7 @@ def flatten_nested_data( def _classify_columns( dataframe: pd.DataFrame, -) -> tuple[list[str], list[str], list[str], list[str], set[str]]: +) -> ColumnClassification: """Identify all STRUCT and ARRAY columns.""" initial_columns = list(dataframe.columns) struct_columns: list[str] = [] @@ -126,12 +161,12 @@ def _classify_columns( clear_on_continuation_cols.append(col_name) elif col_name in initial_columns: clear_on_continuation_cols.append(col_name) - return ( - struct_columns, - array_columns, - array_of_struct_columns, - clear_on_continuation_cols, - nested_originated_columns, + return ColumnClassification( + struct_columns=struct_columns, + array_columns=array_columns, + array_of_struct_columns=array_of_struct_columns, + clear_on_continuation_cols=clear_on_continuation_cols, + nested_originated_columns=nested_originated_columns, ) @@ -197,10 +232,10 @@ def _flatten_array_of_struct_columns( def _explode_array_columns( dataframe: pd.DataFrame, array_columns: list[str] -) -> tuple[pd.DataFrame, list[str], set[int]]: +) -> ExplodeResult: """Explode array columns into new rows.""" if not array_columns: - return dataframe, [], set() + return ExplodeResult(dataframe, [], set()) original_cols = dataframe.columns.tolist() work_df = dataframe @@ -248,7 +283,7 @@ def _explode_array_columns( if not exploded_dfs: # This should not be reached if array_columns is not empty - return dataframe, [], set() + return ExplodeResult(dataframe, [], set()) # Merge the exploded columns merged_df = exploded_dfs[0] @@ -278,7 +313,7 @@ def _explode_array_columns( if original_index_name: result_df = result_df.set_index(original_index_name) - return result_df, row_labels, continuation_rows + return ExplodeResult(result_df, row_labels, continuation_rows) def _flatten_struct_columns( diff --git a/bigframes/display/table_widget.css b/bigframes/display/table_widget.css index b02caa004e..ee7057e24a 100644 --- a/bigframes/display/table_widget.css +++ b/bigframes/display/table_widget.css @@ -26,6 +26,7 @@ 
--bf-header-bg: #f5f5f5; --bf-null-fg: gray; --bf-row-even-bg: #f5f5f5; + --bf-row-hover-bg: #e8eaed; --bf-row-odd-bg: white; background-color: var(--bf-bg); @@ -59,6 +60,7 @@ --bf-header-bg: var(--vscode-editor-background, black); --bf-null-fg: #aaa; --bf-row-even-bg: #202124; + --bf-row-hover-bg: #4c4c4c; --bf-row-odd-bg: #383838; } } @@ -75,6 +77,7 @@ body[data-theme='dark'] .bigframes-widget.bigframes-widget { --bf-header-bg: var(--vscode-editor-background, black); --bf-null-fg: #aaa; --bf-row-even-bg: #202124; + --bf-row-hover-bg: #4c4c4c; --bf-row-odd-bg: #383838; } @@ -236,3 +239,8 @@ body[data-theme='dark'] .bigframes-widget.bigframes-widget { .bigframes-widget .debug-info { border-top: 1px solid var(--bf-border-color); } + +.bigframes-widget table tbody tr:hover td, +.bigframes-widget table tbody tr td.row-hover { + background-color: var(--bf-row-hover-bg); +} diff --git a/bigframes/display/table_widget.js b/bigframes/display/table_widget.js index 203f837222..3a406bcc0b 100644 --- a/bigframes/display/table_widget.js +++ b/bigframes/display/table_widget.js @@ -316,4 +316,4 @@ function render({ model, el }) { handleErrorMessageChange(); } -export default { render }; +export { render }; diff --git a/tests/js/table_widget.test.js b/tests/js/table_widget.test.js index 9436f8b3d6..280bd642a8 100644 --- a/tests/js/table_widget.test.js +++ b/tests/js/table_widget.test.js @@ -41,9 +41,7 @@ describe('TableWidget', () => { document.body.innerHTML = '
        ';
     el = document.body.querySelector('div');
-    const tableWidget = (
-      await import('../../bigframes/display/table_widget.js')
-    ).default;
+    const tableWidget = await import('../../bigframes/display/table_widget.js');
     render = tableWidget.render;
 
     model = {

From 0f48f822e224fa4be1fa361f449520cd2bf8d9b3 Mon Sep 17 00:00:00 2001
From: Shuowei Li
Date: Thu, 8 Jan 2026 22:46:03 +0000
Subject: [PATCH 18/29] refactor: improve _flatten readability and table widget styles

---
 bigframes/display/_flatten.py | 154 ++++++++++++++++++++++++++--------
 1 file changed, 117 insertions(+), 37 deletions(-)

diff --git a/bigframes/display/_flatten.py b/bigframes/display/_flatten.py
index 64cd679832..03611b3ff3 100644
--- a/bigframes/display/_flatten.py
+++ b/bigframes/display/_flatten.py
@@ -12,7 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Utilities for flattening nested data structures for display."""
+"""Utilities for flattening nested data structures for display.
+
+This module provides functionality to flatten BigQuery STRUCT and ARRAY columns
+in a pandas DataFrame into a format suitable for display in a 2D table widget.
+It handles nested structures by:
+1. Expanding STRUCT fields into separate columns (e.g., "struct.field").
+2. Exploding ARRAY elements into multiple rows, replicating other columns.
+3. Generating metadata for grouping rows and handling continuation values.
+"""
 
 from __future__ import annotations
 
@@ -25,62 +33,75 @@
 
 @dataclasses.dataclass(frozen=True)
 class FlattenResult:
-    """The result of flattening a DataFrame."""
+    """The result of flattening a DataFrame.
 
-    dataframe: pd.DataFrame
-    """The flattened DataFrame."""
+    Attributes:
+        dataframe: The flattened DataFrame.
+        row_labels: A list of original row labels for each row in the flattened DataFrame.
+        continuation_rows: A set of row indices that are continuation rows.
+        cleared_on_continuation: A list of column names that should be cleared on continuation rows.
+        nested_columns: A set of column names that were created from nested data.
+    """
 
+    dataframe: pd.DataFrame
     row_labels: list[str] | None
-    """A list of original row labels for each row in the flattened DataFrame."""
-
     continuation_rows: set[int] | None
-    """A set of row indices that are continuation rows."""
-
     cleared_on_continuation: list[str]
-    """A list of column names that should be cleared on continuation rows."""
-
     nested_columns: set[str]
-    """A set of column names that were created from nested data."""
 
 
 @dataclasses.dataclass(frozen=True)
 class ColumnClassification:
-    """The result of classifying columns."""
+    """The result of classifying columns.
 
-    struct_columns: list[str]
-    """Columns that are STRUCTs."""
+    Attributes:
+        struct_columns: Columns that are STRUCTs.
+        array_columns: Columns that are ARRAYs.
+        array_of_struct_columns: Columns that are ARRAYs of STRUCTs.
+        clear_on_continuation_cols: Columns that should be cleared on continuation rows.
+        nested_originated_columns: Columns that were created from nested data.
+ """ + struct_columns: list[str] array_columns: list[str] - """Columns that are ARRAYs.""" - array_of_struct_columns: list[str] - """Columns that are ARRAYs of STRUCTs.""" - clear_on_continuation_cols: list[str] - """Columns that should be cleared on continuation rows.""" - nested_originated_columns: set[str] - """Columns that were created from nested data.""" @dataclasses.dataclass(frozen=True) class ExplodeResult: - """The result of exploding array columns.""" + """The result of exploding array columns. - dataframe: pd.DataFrame - """The exploded DataFrame.""" + Attributes: + dataframe: The exploded DataFrame. + row_labels: Labels for the rows. + continuation_rows: Indices of continuation rows. + """ + dataframe: pd.DataFrame row_labels: list[str] - """Labels for the rows.""" - continuation_rows: set[int] - """Indices of continuation rows.""" def flatten_nested_data( dataframe: pd.DataFrame, ) -> FlattenResult: - """Flatten nested STRUCT and ARRAY columns for display.""" + """Flatten nested STRUCT and ARRAY columns for display. + + This function coordinates the flattening process: + 1. Classifies columns into STRUCT, ARRAY, ARRAY-of-STRUCT, and standard types. + 2. Flattens ARRAY-of-STRUCT columns into multiple ARRAY columns (one per struct field). + This simplifies the subsequent explosion step. + 3. Flattens top-level STRUCT columns into separate columns. + 4. Explodes all ARRAY columns (original and those from step 2) into multiple rows. + + Args: + dataframe: The input DataFrame containing potential nested structures. + + Returns: + A FlattenResult containing the flattened DataFrame and metadata for display. + """ if dataframe.empty: return FlattenResult( dataframe=dataframe.copy(), @@ -93,10 +114,9 @@ def flatten_nested_data( result_df = dataframe.copy() classification = _classify_columns(result_df) - # Extract lists to allow modification - # TODO(b/469966526): The modification of these lists in place by subsequent functions - # (e.g. _flatten_array_of_struct_columns removing items from array_columns) suggests - # that the data flow here could be cleaner, but keeping it as is for now. + # Extract lists to allow modification by subsequent steps. + # _flatten_array_of_struct_columns will modify array_columns to replace + # the original array-of-struct column with the new flattened array columns. struct_columns = classification.struct_columns array_columns = classification.array_columns array_of_struct_columns = classification.array_of_struct_columns @@ -134,7 +154,17 @@ def flatten_nested_data( def _classify_columns( dataframe: pd.DataFrame, ) -> ColumnClassification: - """Identify all STRUCT and ARRAY columns.""" + """Identify all STRUCT and ARRAY columns in the DataFrame. + + It inspects the PyArrow dtype of each column to determine if it is a + STRUCT, LIST (Array), or LIST of STRUCTs. + + Args: + dataframe: The DataFrame to inspect. + + Returns: + A ColumnClassification object containing lists of column names for each category. + """ initial_columns = list(dataframe.columns) struct_columns: list[str] = [] array_columns: list[str] = [] @@ -176,7 +206,21 @@ def _flatten_array_of_struct_columns( array_columns: list[str], nested_originated_columns: set[str], ) -> tuple[pd.DataFrame, list[str]]: - """Flatten ARRAY of STRUCT columns into separate array columns for each field.""" + """Flatten ARRAY of STRUCT columns into separate ARRAY columns for each field. 
+
+    For example, an ARRAY<STRUCT<a INT64, b STRING>> column named 'items' will be
+    converted into two ARRAY columns: 'items.a' (ARRAY<INT64>) and 'items.b' (ARRAY<STRING>).
+    This allows us to treat them as standard ARRAY columns for the subsequent explosion step.
+
+    Args:
+        dataframe: The DataFrame to process.
+        array_of_struct_columns: List of column names that are ARRAYs of STRUCTs.
+        array_columns: The main list of ARRAY columns to be updated.
+        nested_originated_columns: Set of columns tracked as originating from nested data.
+
+    Returns:
+        A tuple containing the modified DataFrame and the updated list of array columns.
+    """
     result_df = dataframe.copy()
     for col_name in array_of_struct_columns:
         col_data = result_df[col_name]
@@ -233,7 +277,26 @@
 def _explode_array_columns(
     dataframe: pd.DataFrame, array_columns: list[str]
 ) -> ExplodeResult:
-    """Explode array columns into new rows."""
+    """Explode array columns into new rows.
+
+    This function performs the "flattening" of 1D arrays by exploding them.
+    It handles multiple array columns by ensuring they are exploded in sync
+    relative to the other columns.
+
+    Design details:
+    - We group by all non-array columns to maintain context.
+    - `_row_num` is used to track the index within the exploded array, effectively
+      synchronizing multiple arrays if they belong to the same row.
+    - Continuation rows (index > 0 in the explosion) are tracked so we can clear
+      repeated values in the display.
+
+    Args:
+        dataframe: The DataFrame to explode.
+        array_columns: List of array columns to explode.
+
+    Returns:
+        An ExplodeResult containing the new DataFrame and row metadata.
+ """ result_df = dataframe.copy() for col_name in struct_columns: col_data = result_df[col_name] From a8a39dc5e269dd55c9495dcac0cca0f5097bbb87 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 8 Jan 2026 22:51:26 +0000 Subject: [PATCH 19/29] docs: move implementation details from docstrings to block comments --- bigframes/display/_flatten.py | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/bigframes/display/_flatten.py b/bigframes/display/_flatten.py index 03611b3ff3..40cf417908 100644 --- a/bigframes/display/_flatten.py +++ b/bigframes/display/_flatten.py @@ -89,13 +89,6 @@ def flatten_nested_data( ) -> FlattenResult: """Flatten nested STRUCT and ARRAY columns for display. - This function coordinates the flattening process: - 1. Classifies columns into STRUCT, ARRAY, ARRAY-of-STRUCT, and standard types. - 2. Flattens ARRAY-of-STRUCT columns into multiple ARRAY columns (one per struct field). - This simplifies the subsequent explosion step. - 3. Flattens top-level STRUCT columns into separate columns. - 4. Explodes all ARRAY columns (original and those from step 2) into multiple rows. - Args: dataframe: The input DataFrame containing potential nested structures. @@ -111,6 +104,12 @@ def flatten_nested_data( nested_columns=set(), ) + # Coordinates the flattening process: + # 1. Classifies columns into STRUCT, ARRAY, ARRAY-of-STRUCT, and standard types. + # 2. Flattens ARRAY-of-STRUCT columns into multiple ARRAY columns (one per struct field). + # This simplifies the subsequent explosion step. + # 3. Flattens top-level STRUCT columns into separate columns. + # 4. Explodes all ARRAY columns (original and those from step 2) into multiple rows. result_df = dataframe.copy() classification = _classify_columns(result_df) @@ -156,15 +155,14 @@ def _classify_columns( ) -> ColumnClassification: """Identify all STRUCT and ARRAY columns in the DataFrame. - It inspects the PyArrow dtype of each column to determine if it is a - STRUCT, LIST (Array), or LIST of STRUCTs. - Args: dataframe: The DataFrame to inspect. Returns: A ColumnClassification object containing lists of column names for each category. """ + # Inspects the PyArrow dtype of each column to determine if it is a + # STRUCT, LIST (Array), or LIST of STRUCTs. initial_columns = list(dataframe.columns) struct_columns: list[str] = [] array_columns: list[str] = [] @@ -283,13 +281,6 @@ def _explode_array_columns( It handles multiple array columns by ensuring they are exploded in sync relative to the other columns. - Design details: - - We group by all non-array columns to maintain context. - - `_row_num` is used to track the index within the exploded array, effectively - synchronizing multiple arrays if they belong to the same row. - - Continuation rows (index > 0 in the explosion) are tracked so we can clear - repeated values in the display. - Args: dataframe: The DataFrame to explode. array_columns: List of array columns to explode. @@ -300,6 +291,12 @@ def _explode_array_columns( if not array_columns: return ExplodeResult(dataframe, [], set()) + # Implementation details: + # - We group by all non-array columns to maintain context. + # - `_row_num` is used to track the index within the exploded array, effectively + # synchronizing multiple arrays if they belong to the same row. + # - Continuation rows (index > 0 in the explosion) are tracked so we can clear + # repeated values in the display. 
original_cols = dataframe.columns.tolist() work_df = dataframe From dfe5fec829bc451f8cfa7f62f2d72b82712ca3d0 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 8 Jan 2026 22:59:03 +0000 Subject: [PATCH 20/29] docs: remove redundant comments in _flatten.py --- bigframes/display/_flatten.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/bigframes/display/_flatten.py b/bigframes/display/_flatten.py index 40cf417908..097b744b2c 100644 --- a/bigframes/display/_flatten.py +++ b/bigframes/display/_flatten.py @@ -104,12 +104,6 @@ def flatten_nested_data( nested_columns=set(), ) - # Coordinates the flattening process: - # 1. Classifies columns into STRUCT, ARRAY, ARRAY-of-STRUCT, and standard types. - # 2. Flattens ARRAY-of-STRUCT columns into multiple ARRAY columns (one per struct field). - # This simplifies the subsequent explosion step. - # 3. Flattens top-level STRUCT columns into separate columns. - # 4. Explodes all ARRAY columns (original and those from step 2) into multiple rows. result_df = dataframe.copy() classification = _classify_columns(result_df) @@ -161,8 +155,6 @@ def _classify_columns( Returns: A ColumnClassification object containing lists of column names for each category. """ - # Inspects the PyArrow dtype of each column to determine if it is a - # STRUCT, LIST (Array), or LIST of STRUCTs. initial_columns = list(dataframe.columns) struct_columns: list[str] = [] array_columns: list[str] = [] @@ -291,12 +283,9 @@ def _explode_array_columns( if not array_columns: return ExplodeResult(dataframe, [], set()) - # Implementation details: - # - We group by all non-array columns to maintain context. - # - `_row_num` is used to track the index within the exploded array, effectively - # synchronizing multiple arrays if they belong to the same row. - # - Continuation rows (index > 0 in the explosion) are tracked so we can clear - # repeated values in the display. + # Group by all non-array columns to maintain context. + # _row_num tracks the index within the exploded array to synchronize multiple + # arrays. Continuation rows (index > 0) are tracked for display clearing. original_cols = dataframe.columns.tolist() work_df = dataframe From 15bdf54f29370cb13e92dc01bb5a389b2e444783 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 8 Jan 2026 23:07:07 +0000 Subject: [PATCH 21/29] refactor: simplify flattening logic in _flatten.py --- bigframes/display/_flatten.py | 173 +++++++++++++++++++--------------- 1 file changed, 99 insertions(+), 74 deletions(-) diff --git a/bigframes/display/_flatten.py b/bigframes/display/_flatten.py index 097b744b2c..331d1b7476 100644 --- a/bigframes/display/_flatten.py +++ b/bigframes/display/_flatten.py @@ -155,38 +155,37 @@ def _classify_columns( Returns: A ColumnClassification object containing lists of column names for each category. 
""" - initial_columns = list(dataframe.columns) - struct_columns: list[str] = [] - array_columns: list[str] = [] - array_of_struct_columns: list[str] = [] - clear_on_continuation_cols: list[str] = [] - nested_originated_columns: set[str] = set() - - for col_name_raw, col_data in dataframe.items(): - col_name = str(col_name_raw) - dtype = col_data.dtype - if isinstance(dtype, pd.ArrowDtype): - pa_type = dtype.pyarrow_dtype - if pa.types.is_struct(pa_type): - struct_columns.append(col_name) - nested_originated_columns.add(col_name) - elif pa.types.is_list(pa_type): - array_columns.append(col_name) - nested_originated_columns.add(col_name) - if hasattr(pa_type, "value_type") and ( - pa.types.is_struct(pa_type.value_type) - ): - array_of_struct_columns.append(col_name) - else: - clear_on_continuation_cols.append(col_name) - elif col_name in initial_columns: - clear_on_continuation_cols.append(col_name) + # Maps column names to their structural category to simplify list building. + categories: dict[str, str] = {} + + for col, dtype in dataframe.dtypes.items(): + col_name = str(col) + pa_type = getattr(dtype, "pyarrow_dtype", None) + + if not pa_type: + categories[col_name] = "clear" + elif pa.types.is_struct(pa_type): + categories[col_name] = "struct" + elif pa.types.is_list(pa_type): + is_struct_array = pa.types.is_struct(pa_type.value_type) + categories[col_name] = "array_of_struct" if is_struct_array else "array" + else: + categories[col_name] = "clear" + return ColumnClassification( - struct_columns=struct_columns, - array_columns=array_columns, - array_of_struct_columns=array_of_struct_columns, - clear_on_continuation_cols=clear_on_continuation_cols, - nested_originated_columns=nested_originated_columns, + struct_columns=[c for c, cat in categories.items() if cat == "struct"], + array_columns=[ + c for c, cat in categories.items() if cat in ("array", "array_of_struct") + ], + array_of_struct_columns=[ + c for c, cat in categories.items() if cat == "array_of_struct" + ], + clear_on_continuation_cols=[ + c for c, cat in categories.items() if cat == "clear" + ], + nested_originated_columns={ + c for c, cat in categories.items() if cat != "clear" + }, ) @@ -198,10 +197,6 @@ def _flatten_array_of_struct_columns( ) -> tuple[pd.DataFrame, list[str]]: """Flatten ARRAY of STRUCT columns into separate ARRAY columns for each field. - For example, an ARRAY> column named 'items' will be - converted into two ARRAY columns: 'items.a' (ARRAY) and 'items.b' (ARRAY). - This allows us to treat them as standard ARRAY columns for the subsequent explosion step. - Args: dataframe: The DataFrame to process. array_of_struct_columns: List of column names that are ARRAYs of STRUCTs. 
@@ -214,56 +209,86 @@
     result_df = dataframe.copy()
     for col_name in array_of_struct_columns:
         col_data = result_df[col_name]
-        pa_type = cast(pd.ArrowDtype, col_data.dtype).pyarrow_dtype
-        struct_type = pa_type.value_type
-
-        # Use PyArrow to reshape the list into multiple list arrays
+        # Ensure we have a PyArrow array (pa.array handles pandas Series conversion)
         arrow_array = pa.array(col_data)
-        offsets = arrow_array.offsets
-        values = arrow_array.values  # StructArray
-        flattened_fields = values.flatten()  # List[Array]
-
-        new_cols_to_add = {}
-        new_array_col_names = []
-        # Create new columns for each struct field
-        for field_idx in range(struct_type.num_fields):
-            field = struct_type.field(field_idx)
-            new_col_name = f"{col_name}.{field.name}"
-            nested_originated_columns.add(new_col_name)
-            new_array_col_names.append(new_col_name)
+        # Transpose List<Struct<...>> to {field: List<...>}
+        new_arrays = _transpose_list_of_structs(arrow_array)
 
-            # Reconstruct ListArray for this field. This transforms the
-            # array<struct<x, y>> into separate array<x> and array<y> columns.
-            new_list_array = pa.ListArray.from_arrays(
-                offsets, flattened_fields[field_idx], mask=arrow_array.is_null()
-            )
-
-            new_cols_to_add[new_col_name] = pd.Series(
-                new_list_array,
-                dtype=pd.ArrowDtype(pa.list_(field.type)),
-                index=result_df.index,
-            )
+        new_cols_df = pd.DataFrame(
+            {
+                f"{col_name}.{field_name}": pd.Series(
+                    arr, dtype=pd.ArrowDtype(arr.type), index=result_df.index
+                )
+                for field_name, arr in new_arrays.items()
+            }
+        )
 
-        col_idx = result_df.columns.to_list().index(col_name)
-        new_cols_df = pd.DataFrame(new_cols_to_add, index=result_df.index)
+        # Track the new columns
+        for new_col in new_cols_df.columns:
+            nested_originated_columns.add(new_col)
 
-        result_df = pd.concat(
-            [
-                result_df.iloc[:, :col_idx],
-                new_cols_df,
-                result_df.iloc[:, col_idx + 1 :],
-            ],
-            axis=1,
-        )
+        # Update the DataFrame
+        result_df = _replace_column_in_df(result_df, col_name, new_cols_df)
 
         # Update array_columns list
         array_columns.remove(col_name)
-        # Add the new array columns
-        array_columns.extend(new_array_col_names)
+        array_columns.extend(new_cols_df.columns.tolist())
+
     return result_df, array_columns
 
 
+def _transpose_list_of_structs(arrow_array: pa.ListArray) -> dict[str, pa.ListArray]:
+    """Transposes a ListArray of Structs into multiple ListArrays of fields.
+
+    Args:
+        arrow_array: A PyArrow ListArray where the value type is a Struct.
+
+    Returns:
+        A dictionary mapping field names to new ListArrays (one for each field in the struct).
+    """
+    struct_type = arrow_array.type.value_type
+    offsets = arrow_array.offsets
+    # arrow_array.values is the underlying StructArray.
+    # Flattening it gives us the arrays for each field, effectively "removing" the struct layer.
+    flattened_fields = arrow_array.values.flatten()
+    validity = arrow_array.is_null()
+
+    transposed = {}
+    for i in range(struct_type.num_fields):
+        field = struct_type.field(i)
+        # Reconstruct ListArray for each field using original offsets and validity.
+        # This transforms List<Struct<a, b>> into List<a> and List<b>.
+        transposed[field.name] = pa.ListArray.from_arrays(
+            offsets, flattened_fields[i], mask=validity
+        )
+    return transposed
+
+
+def _replace_column_in_df(
+    dataframe: pd.DataFrame, col_name: str, new_cols: pd.DataFrame
+) -> pd.DataFrame:
+    """Replaces a column in a DataFrame with a set of new columns at the same position.
+
+    Args:
+        dataframe: The original DataFrame.
+        col_name: The name of the column to replace.
+ new_cols: A DataFrame containing the new columns to insert. + + Returns: + A new DataFrame with the substitution made. + """ + col_idx = dataframe.columns.to_list().index(col_name) + return pd.concat( + [ + dataframe.iloc[:, :col_idx], + new_cols, + dataframe.iloc[:, col_idx + 1 :], + ], + axis=1, + ) + + def _explode_array_columns( dataframe: pd.DataFrame, array_columns: list[str] ) -> ExplodeResult: From 59c3a2ab870d5594d69fafa486e80946d9e2d254 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 9 Jan 2026 02:53:04 +0000 Subject: [PATCH 22/29] refactor: use mutable ColumnClassification object in _flatten.py --- bigframes/display/_flatten.py | 41 ++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/bigframes/display/_flatten.py b/bigframes/display/_flatten.py index 331d1b7476..d728261d59 100644 --- a/bigframes/display/_flatten.py +++ b/bigframes/display/_flatten.py @@ -50,7 +50,7 @@ class FlattenResult: nested_columns: set[str] -@dataclasses.dataclass(frozen=True) +@dataclasses.dataclass class ColumnClassification: """The result of classifying columns. @@ -107,40 +107,41 @@ def flatten_nested_data( result_df = dataframe.copy() classification = _classify_columns(result_df) - # Extract lists to allow modification by subsequent steps. - # _flatten_array_of_struct_columns will modify array_columns to replace - # the original array-of-struct column with the new flattened array columns. - struct_columns = classification.struct_columns - array_columns = classification.array_columns - array_of_struct_columns = classification.array_of_struct_columns - clear_on_continuation_cols = classification.clear_on_continuation_cols - nested_originated_columns = classification.nested_originated_columns - - result_df, array_columns = _flatten_array_of_struct_columns( - result_df, array_of_struct_columns, array_columns, nested_originated_columns + # Create a mutable structure to track column changes during flattening. + # _flatten_array_of_struct_columns modifies the array_columns list. 
+ columns_info = dataclasses.replace(classification) + + result_df, columns_info.array_columns = _flatten_array_of_struct_columns( + result_df, + columns_info.array_of_struct_columns, + columns_info.array_columns, + columns_info.nested_originated_columns, ) - result_df, clear_on_continuation_cols = _flatten_struct_columns( - result_df, struct_columns, clear_on_continuation_cols, nested_originated_columns + result_df, columns_info.clear_on_continuation_cols = _flatten_struct_columns( + result_df, + columns_info.struct_columns, + columns_info.clear_on_continuation_cols, + columns_info.nested_originated_columns, ) # Now handle ARRAY columns (including the newly created ones from ARRAY of STRUCT) - if not array_columns: + if not columns_info.array_columns: return FlattenResult( dataframe=result_df, row_labels=None, continuation_rows=None, - cleared_on_continuation=clear_on_continuation_cols, - nested_columns=nested_originated_columns, + cleared_on_continuation=columns_info.clear_on_continuation_cols, + nested_columns=columns_info.nested_originated_columns, ) - explode_result = _explode_array_columns(result_df, array_columns) + explode_result = _explode_array_columns(result_df, columns_info.array_columns) return FlattenResult( dataframe=explode_result.dataframe, row_labels=explode_result.row_labels, continuation_rows=explode_result.continuation_rows, - cleared_on_continuation=clear_on_continuation_cols, - nested_columns=nested_originated_columns, + cleared_on_continuation=columns_info.clear_on_continuation_cols, + nested_columns=columns_info.nested_originated_columns, ) From 6d28d28dd59e3c687583cba28735366690a66e76 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 9 Jan 2026 03:01:37 +0000 Subject: [PATCH 23/29] fix: resolve bug in _classify_columns logic and enable functional updates --- bigframes/display/_flatten.py | 146 +++++++++++++++++----------------- 1 file changed, 72 insertions(+), 74 deletions(-) diff --git a/bigframes/display/_flatten.py b/bigframes/display/_flatten.py index d728261d59..a6d8b23852 100644 --- a/bigframes/display/_flatten.py +++ b/bigframes/display/_flatten.py @@ -50,7 +50,7 @@ class FlattenResult: nested_columns: set[str] -@dataclasses.dataclass +@dataclasses.dataclass(frozen=True) class ColumnClassification: """The result of classifying columns. @@ -62,11 +62,11 @@ class ColumnClassification: nested_originated_columns: Columns that were created from nested data. """ - struct_columns: list[str] - array_columns: list[str] - array_of_struct_columns: list[str] - clear_on_continuation_cols: list[str] - nested_originated_columns: set[str] + struct_columns: tuple[str, ...] + array_columns: tuple[str, ...] + array_of_struct_columns: tuple[str, ...] + clear_on_continuation_cols: tuple[str, ...] + nested_originated_columns: frozenset[str] @dataclasses.dataclass(frozen=True) @@ -107,41 +107,50 @@ def flatten_nested_data( result_df = dataframe.copy() classification = _classify_columns(result_df) - # Create a mutable structure to track column changes during flattening. - # _flatten_array_of_struct_columns modifies the array_columns list. - columns_info = dataclasses.replace(classification) - result_df, columns_info.array_columns = _flatten_array_of_struct_columns( + # Process ARRAY-of-STRUCT columns into multiple ARRAY columns (one per struct field). 
+ result_df, array_cols, nested_cols = _flatten_array_of_struct_columns( result_df, - columns_info.array_of_struct_columns, - columns_info.array_columns, - columns_info.nested_originated_columns, + classification.array_of_struct_columns, + classification.array_columns, + classification.nested_originated_columns, + ) + classification = dataclasses.replace( + classification, array_columns=array_cols, nested_originated_columns=nested_cols ) - result_df, columns_info.clear_on_continuation_cols = _flatten_struct_columns( + # Flatten top-level STRUCT columns into separate columns. + result_df, clear_cols, nested_cols = _flatten_struct_columns( result_df, - columns_info.struct_columns, - columns_info.clear_on_continuation_cols, - columns_info.nested_originated_columns, + classification.struct_columns, + classification.clear_on_continuation_cols, + classification.nested_originated_columns, + ) + classification = dataclasses.replace( + classification, + clear_on_continuation_cols=clear_cols, + nested_originated_columns=nested_cols, ) # Now handle ARRAY columns (including the newly created ones from ARRAY of STRUCT) - if not columns_info.array_columns: + if not classification.array_columns: return FlattenResult( dataframe=result_df, row_labels=None, continuation_rows=None, - cleared_on_continuation=columns_info.clear_on_continuation_cols, - nested_columns=columns_info.nested_originated_columns, + cleared_on_continuation=list(classification.clear_on_continuation_cols), + nested_columns=set(classification.nested_originated_columns), ) - explode_result = _explode_array_columns(result_df, columns_info.array_columns) + explode_result = _explode_array_columns( + result_df, list(classification.array_columns) + ) return FlattenResult( dataframe=explode_result.dataframe, row_labels=explode_result.row_labels, continuation_rows=explode_result.continuation_rows, - cleared_on_continuation=columns_info.clear_on_continuation_cols, - nested_columns=columns_info.nested_originated_columns, + cleared_on_continuation=list(classification.clear_on_continuation_cols), + nested_columns=set(classification.nested_originated_columns), ) @@ -174,40 +183,43 @@ def _classify_columns( categories[col_name] = "clear" return ColumnClassification( - struct_columns=[c for c, cat in categories.items() if cat == "struct"], - array_columns=[ + struct_columns=tuple(c for c, cat in categories.items() if cat == "struct"), + array_columns=tuple( c for c, cat in categories.items() if cat in ("array", "array_of_struct") - ], - array_of_struct_columns=[ + ), + array_of_struct_columns=tuple( c for c, cat in categories.items() if cat == "array_of_struct" - ], - clear_on_continuation_cols=[ + ), + clear_on_continuation_cols=tuple( c for c, cat in categories.items() if cat == "clear" - ], - nested_originated_columns={ + ), + nested_originated_columns=frozenset( c for c, cat in categories.items() if cat != "clear" - }, + ), ) def _flatten_array_of_struct_columns( dataframe: pd.DataFrame, - array_of_struct_columns: list[str], - array_columns: list[str], - nested_originated_columns: set[str], -) -> tuple[pd.DataFrame, list[str]]: + array_of_struct_columns: tuple[str, ...], + array_columns: tuple[str, ...], + nested_originated_columns: frozenset[str], +) -> tuple[pd.DataFrame, tuple[str, ...], frozenset[str]]: """Flatten ARRAY of STRUCT columns into separate ARRAY columns for each field. Args: dataframe: The DataFrame to process. - array_of_struct_columns: List of column names that are ARRAYs of STRUCTs. 
- array_columns: The main list of ARRAY columns to be updated. - nested_originated_columns: Set of columns tracked as originating from nested data. + array_of_struct_columns: Column names that are ARRAYs of STRUCTs. + array_columns: The main sequence of ARRAY columns to be updated. + nested_originated_columns: Columns tracked as originating from nested data. Returns: - A tuple containing the modified DataFrame and the updated list of array columns. + A tuple containing the modified DataFrame, updated array columns, and updated nested columns. """ result_df = dataframe.copy() + current_array_columns = list(array_columns) + current_nested_columns = set(nested_originated_columns) + for col_name in array_of_struct_columns: col_data = result_df[col_name] # Ensure we have a PyArrow array (pa.array handles pandas Series conversion) @@ -225,18 +237,13 @@ def _flatten_array_of_struct_columns( } ) - # Track the new columns - for new_col in new_cols_df.columns: - nested_originated_columns.add(new_col) - - # Update the DataFrame + current_nested_columns.update(new_cols_df.columns) result_df = _replace_column_in_df(result_df, col_name, new_cols_df) - # Update array_columns list - array_columns.remove(col_name) - array_columns.extend(new_cols_df.columns.tolist()) + current_array_columns.remove(col_name) + current_array_columns.extend(new_cols_df.columns.tolist()) - return result_df, array_columns + return result_df, tuple(current_array_columns), frozenset(current_nested_columns) def _transpose_list_of_structs(arrow_array: pa.ListArray) -> dict[str, pa.ListArray]: @@ -395,27 +402,25 @@ def _explode_array_columns( def _flatten_struct_columns( dataframe: pd.DataFrame, - struct_columns: list[str], - clear_on_continuation_cols: list[str], - nested_originated_columns: set[str], -) -> tuple[pd.DataFrame, list[str]]: + struct_columns: tuple[str, ...], + clear_on_continuation_cols: tuple[str, ...], + nested_originated_columns: frozenset[str], +) -> tuple[pd.DataFrame, tuple[str, ...], frozenset[str]]: """Flatten regular STRUCT columns into separate columns. - A STRUCT column 'user' with fields 'name' and 'age' becomes 'user.name' - and 'user.age'. - Args: dataframe: The DataFrame to process. - struct_columns: List of STRUCT columns to flatten. - clear_on_continuation_cols: List of columns to clear on continuation, - which will be updated with the new flattened columns. - nested_originated_columns: Set of columns tracked as originating from nested data. + struct_columns: STRUCT columns to flatten. + clear_on_continuation_cols: Columns to clear on continuation. + nested_originated_columns: Columns tracked as originating from nested data. Returns: - A tuple containing the modified DataFrame and the updated list of - columns to clear on continuation. + A tuple containing the modified DataFrame, updated clear columns, and updated nested columns. 
""" result_df = dataframe.copy() + current_clear_cols = list(clear_on_continuation_cols) + current_nested_cols = set(nested_originated_columns) + for col_name in struct_columns: col_data = result_df[col_name] if isinstance(col_data.dtype, pd.ArrowDtype): @@ -430,8 +435,8 @@ def _flatten_struct_columns( for field_idx in range(pa_type.num_fields): field = pa_type.field(field_idx) new_col_name = f"{col_name}.{field.name}" - nested_originated_columns.add(new_col_name) - clear_on_continuation_cols.append(new_col_name) + current_nested_cols.add(new_col_name) + current_clear_cols.append(new_col_name) # Create a new Series from the flattened array new_cols_to_add[new_col_name] = pd.Series( @@ -440,14 +445,7 @@ def _flatten_struct_columns( index=result_df.index, ) - col_idx = result_df.columns.to_list().index(col_name) new_cols_df = pd.DataFrame(new_cols_to_add, index=result_df.index) - result_df = pd.concat( - [ - result_df.iloc[:, :col_idx], - new_cols_df, - result_df.iloc[:, col_idx + 1 :], - ], - axis=1, - ) - return result_df, clear_on_continuation_cols + result_df = _replace_column_in_df(result_df, col_name, new_cols_df) + + return result_df, tuple(current_clear_cols), frozenset(current_nested_cols) From 09635e6578f9571a6a5173a7d5a086864060b5e6 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 9 Jan 2026 03:04:55 +0000 Subject: [PATCH 24/29] refactor: simplify _classify_columns logic in _flatten.py --- bigframes/display/_flatten.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/bigframes/display/_flatten.py b/bigframes/display/_flatten.py index a6d8b23852..42e16c65cd 100644 --- a/bigframes/display/_flatten.py +++ b/bigframes/display/_flatten.py @@ -165,22 +165,24 @@ def _classify_columns( Returns: A ColumnClassification object containing lists of column names for each category. """ - # Maps column names to their structural category to simplify list building. - categories: dict[str, str] = {} - for col, dtype in dataframe.dtypes.items(): - col_name = str(col) + def get_category(dtype: pd.api.extensions.ExtensionDtype) -> str: pa_type = getattr(dtype, "pyarrow_dtype", None) + if pa_type: + if pa.types.is_struct(pa_type): + return "struct" + if pa.types.is_list(pa_type): + return ( + "array_of_struct" + if pa.types.is_struct(pa_type.value_type) + else "array" + ) + return "clear" - if not pa_type: - categories[col_name] = "clear" - elif pa.types.is_struct(pa_type): - categories[col_name] = "struct" - elif pa.types.is_list(pa_type): - is_struct_array = pa.types.is_struct(pa_type.value_type) - categories[col_name] = "array_of_struct" if is_struct_array else "array" - else: - categories[col_name] = "clear" + # Maps column names to their structural category to simplify list building. 
+ categories = { + str(col): get_category(dtype) for col, dtype in dataframe.dtypes.items() + } return ColumnClassification( struct_columns=tuple(c for c, cat in categories.items() if cat == "struct"), From 2de5a3c10e0ff9b5a7d59d3ee486ca61700a750a Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 9 Jan 2026 03:08:29 +0000 Subject: [PATCH 25/29] fix: resolve NameError for ExplodeResult and formatting --- bigframes/display/_flatten.py | 78 +++++++++++++++++++++++++++-------- 1 file changed, 60 insertions(+), 18 deletions(-) diff --git a/bigframes/display/_flatten.py b/bigframes/display/_flatten.py index 42e16c65cd..0d40e4c0d1 100644 --- a/bigframes/display/_flatten.py +++ b/bigframes/display/_flatten.py @@ -70,18 +70,33 @@ class ColumnClassification: @dataclasses.dataclass(frozen=True) -class ExplodeResult: - """The result of exploding array columns. +class FlattenArrayOfStructsResult: + """The result of flattening array-of-struct columns. Attributes: - dataframe: The exploded DataFrame. - row_labels: Labels for the rows. - continuation_rows: Indices of continuation rows. + dataframe: The flattened DataFrame. + array_columns: The updated list of array columns. + nested_originated_columns: The updated set of columns created from nested data. """ dataframe: pd.DataFrame - row_labels: list[str] - continuation_rows: set[int] + array_columns: tuple[str, ...] + nested_originated_columns: frozenset[str] + + +@dataclasses.dataclass(frozen=True) +class FlattenStructsResult: + """The result of flattening struct columns. + + Attributes: + dataframe: The flattened DataFrame. + clear_on_continuation_cols: The updated list of columns to clear on continuation. + nested_originated_columns: The updated set of columns created from nested data. + """ + + dataframe: pd.DataFrame + clear_on_continuation_cols: tuple[str, ...] + nested_originated_columns: frozenset[str] def flatten_nested_data( @@ -109,27 +124,31 @@ def flatten_nested_data( classification = _classify_columns(result_df) # Process ARRAY-of-STRUCT columns into multiple ARRAY columns (one per struct field). - result_df, array_cols, nested_cols = _flatten_array_of_struct_columns( + flatten_array_structs_result = _flatten_array_of_struct_columns( result_df, classification.array_of_struct_columns, classification.array_columns, classification.nested_originated_columns, ) + result_df = flatten_array_structs_result.dataframe classification = dataclasses.replace( - classification, array_columns=array_cols, nested_originated_columns=nested_cols + classification, + array_columns=flatten_array_structs_result.array_columns, + nested_originated_columns=flatten_array_structs_result.nested_originated_columns, ) # Flatten top-level STRUCT columns into separate columns. 
- result_df, clear_cols, nested_cols = _flatten_struct_columns( + flatten_structs_result = _flatten_struct_columns( result_df, classification.struct_columns, classification.clear_on_continuation_cols, classification.nested_originated_columns, ) + result_df = flatten_structs_result.dataframe classification = dataclasses.replace( classification, - clear_on_continuation_cols=clear_cols, - nested_originated_columns=nested_cols, + clear_on_continuation_cols=flatten_structs_result.clear_on_continuation_cols, + nested_originated_columns=flatten_structs_result.nested_originated_columns, ) # Now handle ARRAY columns (including the newly created ones from ARRAY of STRUCT) @@ -206,7 +225,7 @@ def _flatten_array_of_struct_columns( array_of_struct_columns: tuple[str, ...], array_columns: tuple[str, ...], nested_originated_columns: frozenset[str], -) -> tuple[pd.DataFrame, tuple[str, ...], frozenset[str]]: +) -> FlattenArrayOfStructsResult: """Flatten ARRAY of STRUCT columns into separate ARRAY columns for each field. Args: @@ -216,7 +235,7 @@ def _flatten_array_of_struct_columns( nested_originated_columns: Columns tracked as originating from nested data. Returns: - A tuple containing the modified DataFrame, updated array columns, and updated nested columns. + A FlattenArrayOfStructsResult containing the updated DataFrame and columns. """ result_df = dataframe.copy() current_array_columns = list(array_columns) @@ -245,7 +264,11 @@ def _flatten_array_of_struct_columns( current_array_columns.remove(col_name) current_array_columns.extend(new_cols_df.columns.tolist()) - return result_df, tuple(current_array_columns), frozenset(current_nested_columns) + return FlattenArrayOfStructsResult( + dataframe=result_df, + array_columns=tuple(current_array_columns), + nested_originated_columns=frozenset(current_nested_columns), + ) def _transpose_list_of_structs(arrow_array: pa.ListArray) -> dict[str, pa.ListArray]: @@ -299,6 +322,21 @@ def _replace_column_in_df( ) +@dataclasses.dataclass(frozen=True) +class ExplodeResult: + """The result of exploding array columns. + + Attributes: + dataframe: The exploded DataFrame. + row_labels: Labels for the rows. + continuation_rows: Indices of continuation rows. + """ + + dataframe: pd.DataFrame + row_labels: list[str] + continuation_rows: set[int] + + def _explode_array_columns( dataframe: pd.DataFrame, array_columns: list[str] ) -> ExplodeResult: @@ -407,7 +445,7 @@ def _flatten_struct_columns( struct_columns: tuple[str, ...], clear_on_continuation_cols: tuple[str, ...], nested_originated_columns: frozenset[str], -) -> tuple[pd.DataFrame, tuple[str, ...], frozenset[str]]: +) -> FlattenStructsResult: """Flatten regular STRUCT columns into separate columns. Args: @@ -417,7 +455,7 @@ def _flatten_struct_columns( nested_originated_columns: Columns tracked as originating from nested data. Returns: - A tuple containing the modified DataFrame, updated clear columns, and updated nested columns. + A FlattenStructsResult containing the updated DataFrame and columns. 
""" result_df = dataframe.copy() current_clear_cols = list(clear_on_continuation_cols) @@ -450,4 +488,8 @@ def _flatten_struct_columns( new_cols_df = pd.DataFrame(new_cols_to_add, index=result_df.index) result_df = _replace_column_in_df(result_df, col_name, new_cols_df) - return result_df, tuple(current_clear_cols), frozenset(current_nested_cols) + return FlattenStructsResult( + dataframe=result_df, + clear_on_continuation_cols=tuple(current_clear_cols), + nested_originated_columns=frozenset(current_nested_cols), + ) From fc122a536c0a6b2463a21fa32cf8aadd0074c665 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 9 Jan 2026 03:51:02 +0000 Subject: [PATCH 26/29] refactor(anywidget): optimize and style cleanup for flatten logic - Replaced Python-based row explosion with optimized PyArrow computation for nested arrays. - Cleaned up comments in to strictly adhere to Google Python Style Guide (focused on 'why', removed redundant 'what'). - Renamed variable to for clarity. - Verified changes with Python unit tests and JavaScript frontend tests. --- bigframes/display/_flatten.py | 155 +++++++++++++++++++--------------- 1 file changed, 89 insertions(+), 66 deletions(-) diff --git a/bigframes/display/_flatten.py b/bigframes/display/_flatten.py index 0d40e4c0d1..184de3a484 100644 --- a/bigframes/display/_flatten.py +++ b/bigframes/display/_flatten.py @@ -27,8 +27,10 @@ import dataclasses from typing import cast +import numpy as np import pandas as pd import pyarrow as pa +import pyarrow.compute as pc @dataclasses.dataclass(frozen=True) @@ -356,83 +358,82 @@ def _explode_array_columns( if not array_columns: return ExplodeResult(dataframe, [], set()) - # Group by all non-array columns to maintain context. - # _row_num tracks the index within the exploded array to synchronize multiple - # arrays. Continuation rows (index > 0) are tracked for display clearing. - original_cols = dataframe.columns.tolist() - work_df = dataframe + work_df, non_array_columns, original_index_name = _prepare_explosion_dataframe( + dataframe, array_columns + ) - non_array_columns = work_df.columns.drop(array_columns).tolist() - if not non_array_columns: - work_df = work_df.copy() # Avoid modifying input - # Add a temporary column to allow grouping if all columns are arrays. - # This ensures we can still group by "original row" even if there are no scalar columns. - non_array_columns = ["_temp_grouping_col"] - work_df["_temp_grouping_col"] = range(len(work_df)) + if work_df.empty: + return ExplodeResult(dataframe, [], set()) - # Preserve original index - if work_df.index.name: - original_index_name = work_df.index.name - work_df = work_df.reset_index() - non_array_columns.append(original_index_name) - else: - original_index_name = None - work_df = work_df.reset_index(names=["_original_index"]) - non_array_columns.append("_original_index") + table = pa.Table.from_pandas(work_df) + arrays = [table.column(col).combine_chunks() for col in array_columns] + lengths = [] + for arr in arrays: + row_lengths = pc.list_value_length(arr) + # Treat null lists as length 1 to match pandas explode behavior for scalars. 
+ row_lengths = pc.if_else( + pc.is_null(row_lengths, nan_is_null=True), 1, row_lengths + ) + lengths.append(row_lengths) - exploded_dfs = [] - for col in array_columns: - # Explode each array column individually - col_series = work_df[col] - target_dtype = None - if isinstance(col_series.dtype, pd.ArrowDtype): - pa_type = col_series.dtype.pyarrow_dtype - if pa.types.is_list(pa_type): - target_dtype = pd.ArrowDtype(pa_type.value_type) - # Use to_list() to avoid pandas attempting to create a 2D numpy - # array if the list elements have the same length. - col_series = pd.Series( - col_series.to_list(), index=col_series.index, dtype=object - ) + if not lengths: + return ExplodeResult(dataframe, [], set()) - exploded = work_df[non_array_columns].assign(**{col: col_series}).explode(col) + max_lens = lengths[0] if len(lengths) == 1 else pc.max_element_wise(*lengths) + max_lens = max_lens.cast(pa.int64()) + current_offsets = pc.cumulative_sum(max_lens) + target_offsets = pa.concat_arrays([pa.array([0], type=pa.int64()), current_offsets]) - if target_dtype is not None: - # Re-cast to arrow dtype if possible - exploded[col] = exploded[col].astype(target_dtype) + total_rows = target_offsets[-1].as_py() + if total_rows == 0: + empty_df = pd.DataFrame(columns=dataframe.columns) + if original_index_name: + empty_df.index.name = original_index_name + return ExplodeResult(empty_df, [], set()) - # Track position in the array for alignment - exploded["_row_num"] = exploded.groupby(non_array_columns).cumcount() - exploded_dfs.append(exploded) + # parent_indices maps each result row to its original row index. + dummy_values = pa.nulls(total_rows, type=pa.null()) + dummy_list_array = pa.ListArray.from_arrays(target_offsets, dummy_values) + parent_indices = pc.list_parent_indices(dummy_list_array) - if not exploded_dfs: - # This should not be reached if array_columns is not empty - return ExplodeResult(dataframe, [], set()) + range_k = pa.array(range(total_rows)) + starts = target_offsets.take(parent_indices) + row_nums = pc.subtract(range_k, starts) - # Merge the exploded columns - merged_df = exploded_dfs[0] - for i in range(1, len(exploded_dfs)): - merged_df = pd.merge( - merged_df, - exploded_dfs[i], - on=non_array_columns + ["_row_num"], - how="outer", - ) + new_columns = {} + for col_name in non_array_columns: + new_columns[col_name] = table.column(col_name).take(parent_indices) - # Restore original column order and sort - merged_df = merged_df.sort_values(non_array_columns + ["_row_num"]).reset_index( - drop=True - ) + for col_name, arr in zip(array_columns, arrays): + actual_lens_scattered = pc.list_value_length(arr).take(parent_indices) + valid_mask = pc.less(row_nums, actual_lens_scattered) + starts_scattered = arr.offsets.take(parent_indices) + + # safe_mask ensures we don't access out of bounds even if masked out. 
+ safe_mask = pc.fill_null(valid_mask, False) + candidate_indices = pc.add(starts_scattered, row_nums) + safe_indices = pc.if_else(safe_mask, candidate_indices, 0) + + if len(arr.values) == 0: + final_values = pa.nulls(total_rows, type=arr.type.value_type) + else: + taken_values = arr.values.take(safe_indices) + final_values = pc.if_else(safe_mask, taken_values, None) + + new_columns[col_name] = final_values + + result_df = pa.Table.from_pydict(new_columns).to_pandas() - # Generate row labels and continuation mask efficiently grouping_col_name = ( "_original_index" if original_index_name is None else original_index_name ) - row_labels = merged_df[grouping_col_name].astype(str).tolist() - continuation_rows = set(merged_df.index[merged_df["_row_num"] > 0]) + row_labels = result_df[grouping_col_name].astype(str).tolist() - # Restore original columns - result_df = merged_df[original_cols] + # The continuation_mask is a boolean mask where row_num > 0. + continuation_mask = pc.greater(row_nums, 0).to_numpy(zero_copy_only=False) + continuation_rows = set(np.flatnonzero(continuation_mask)) + + result_df = result_df[dataframe.columns.tolist()] if original_index_name: result_df = result_df.set_index(original_index_name) @@ -440,6 +441,31 @@ def _explode_array_columns( return ExplodeResult(result_df, row_labels, continuation_rows) +def _prepare_explosion_dataframe( + dataframe: pd.DataFrame, array_columns: list[str] +) -> tuple[pd.DataFrame, list[str], str | None]: + """Prepares the DataFrame for explosion by ensuring grouping columns exist.""" + work_df = dataframe + non_array_columns = work_df.columns.drop(array_columns).tolist() + + if not non_array_columns: + work_df = work_df.copy() # Avoid modifying input + # Add a temporary column to allow grouping if all columns are arrays. 
+ non_array_columns = ["_temp_grouping_col"] + work_df["_temp_grouping_col"] = range(len(work_df)) + + original_index_name = None + if work_df.index.name: + original_index_name = work_df.index.name + work_df = work_df.reset_index() + non_array_columns.append(original_index_name) + else: + work_df = work_df.reset_index(names=["_original_index"]) + non_array_columns.append("_original_index") + + return work_df, non_array_columns, original_index_name + + def _flatten_struct_columns( dataframe: pd.DataFrame, struct_columns: tuple[str, ...], @@ -466,8 +492,6 @@ def _flatten_struct_columns( if isinstance(col_data.dtype, pd.ArrowDtype): pa_type = cast(pd.ArrowDtype, col_data.dtype).pyarrow_dtype - # Use PyArrow to flatten the struct column without row iteration - # combine_chunks() ensures we have a single array if it was chunked arrow_array = pa.array(col_data) flattened_fields = arrow_array.flatten() @@ -478,7 +502,6 @@ def _flatten_struct_columns( current_nested_cols.add(new_col_name) current_clear_cols.append(new_col_name) - # Create a new Series from the flattened array new_cols_to_add[new_col_name] = pd.Series( flattened_fields[field_idx], dtype=pd.ArrowDtype(field.type), From 9a199665df0b9c06bf6ccfe8709f1f8471647103 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 9 Jan 2026 18:41:03 +0000 Subject: [PATCH 27/29] refactor(anywidget): optimize array flattening using pyarrow --- bigframes/display/_flatten.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bigframes/display/_flatten.py b/bigframes/display/_flatten.py index 184de3a484..be72a73cc7 100644 --- a/bigframes/display/_flatten.py +++ b/bigframes/display/_flatten.py @@ -422,7 +422,9 @@ def _explode_array_columns( new_columns[col_name] = final_values - result_df = pa.Table.from_pydict(new_columns).to_pandas() + # Convert back to pandas; this is efficient since we have pyarrow arrays. 
+ result_table = pa.Table.from_pydict(new_columns) + result_df = result_table.to_pandas(types_mapper=pd.ArrowDtype) grouping_col_name = ( "_original_index" if original_index_name is None else original_index_name From 9886e5fb27b0e05013b1a3c621c7b9061723e64f Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 9 Jan 2026 18:48:25 +0000 Subject: [PATCH 28/29] test: rerun notebook --- notebooks/dataframes/anywidget_mode.ipynb | 319 ++++++++++------------ 1 file changed, 144 insertions(+), 175 deletions(-) diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index 064aa70b2d..331209833e 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -118,17 +118,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "state gender year name number\n", - " AL F 1910 Lillian 99\n", - " AL F 1910 Ruby 204\n", - " AL F 1910 Helen 76\n", - " AL F 1910 Eunice 41\n", - " AR F 1910 Dora 42\n", - " CA F 1910 Edna 62\n", - " CA F 1910 Helen 239\n", - " CO F 1910 Alice 46\n", - " FL F 1910 Willie 71\n", - " FL F 1910 Thelma 65\n", + "state gender year name number\n", + " AL F 1910 Hazel 51\n", + " AL F 1910 Lucy 76\n", + " AR F 1910 Nellie 39\n", + " AR F 1910 Lena 40\n", + " CO F 1910 Thelma 36\n", + " CO F 1910 Ruth 68\n", + " CT F 1910 Elizabeth 86\n", + " DC F 1910 Mary 80\n", + " FL F 1910 Annie 101\n", + " FL F 1910 Alma 39\n", "...\n", "\n", "[5552452 rows x 5 columns]\n" @@ -173,7 +173,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f51152e46134483da9e186ae9c21b219", + "model_id": "4b98d465a76449ec86b1289518cc130d", "version_major": 2, "version_minor": 1 }, @@ -209,80 +209,80 @@ " AL\n", " F\n", " 1910\n", - " Lillian\n", - " 99\n", + " Annie\n", + " 482\n", " \n", " \n", " 1\n", " AL\n", " F\n", " 1910\n", - " Ruby\n", - " 204\n", + " Myrtle\n", + " 104\n", " \n", " \n", " 2\n", - " AL\n", + " AR\n", " F\n", " 1910\n", - " Helen\n", - " 76\n", + " Lillian\n", + " 56\n", " \n", " \n", " 3\n", - " AL\n", + " CT\n", " F\n", " 1910\n", - " Eunice\n", - " 41\n", + " Anne\n", + " 38\n", " \n", " \n", " 4\n", - " AR\n", + " CT\n", " F\n", " 1910\n", - " Dora\n", - " 42\n", + " Frances\n", + " 45\n", " \n", " \n", " 5\n", - " CA\n", + " FL\n", " F\n", " 1910\n", - " Edna\n", - " 62\n", + " Margaret\n", + " 53\n", " \n", " \n", " 6\n", - " CA\n", + " GA\n", " F\n", " 1910\n", - " Helen\n", - " 239\n", + " Mae\n", + " 73\n", " \n", " \n", " 7\n", - " CO\n", + " GA\n", " F\n", " 1910\n", - " Alice\n", - " 46\n", + " Beatrice\n", + " 96\n", " \n", " \n", " 8\n", - " FL\n", + " GA\n", " F\n", " 1910\n", - " Willie\n", - " 71\n", + " Lola\n", + " 47\n", " \n", " \n", " 9\n", - " FL\n", + " IA\n", " F\n", " 1910\n", - " Thelma\n", - " 65\n", + " Viola\n", + " 49\n", " \n", " \n", "\n", @@ -290,17 +290,17 @@ "[5552452 rows x 5 columns in total]" ], "text/plain": [ - "state gender year name number\n", - " AL F 1910 Lillian 99\n", - " AL F 1910 Ruby 204\n", - " AL F 1910 Helen 76\n", - " AL F 1910 Eunice 41\n", - " AR F 1910 Dora 42\n", - " CA F 1910 Edna 62\n", - " CA F 1910 Helen 239\n", - " CO F 1910 Alice 46\n", - " FL F 1910 Willie 71\n", - " FL F 1910 Thelma 65\n", + "state gender year name number\n", + " AL F 1910 Annie 482\n", + " AL F 1910 Myrtle 104\n", + " AR F 1910 Lillian 56\n", + " CT F 1910 Anne 38\n", + " CT F 1910 Frances 45\n", + " FL F 1910 Margaret 53\n", + " GA F 1910 Mae 73\n", + " GA F 1910 Beatrice 96\n", + " GA F 1910 Lola 47\n", + " IA F 1910 Viola 49\n", "...\n", 
"\n", "[5552452 rows x 5 columns]" @@ -334,7 +334,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 171.4 MB in 36 seconds of slot time. [Job bigframes-dev:US.7885d9f3-ddfa-41cb-ad0e-580119390ab6 details]\n", + " Query processed 171.4 MB in 50 seconds of slot time. [Job bigframes-dev:US.d46fca41-30ed-4fe8-81db-029d37df13a8 details]\n", " " ], "text/plain": [ @@ -374,16 +374,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "1910\n", - "1910\n", - "1910\n", - "1910\n", - "1910\n", - "1910\n", - "1910\n", - "1910\n", - "1910\n", - "1910\n", + "1913\n", + "1913\n", + "1915\n", + "1917\n", + "1917\n", + "1919\n", + "1922\n", + "1923\n", + "1924\n", + "1924\n", "Name: year, dtype: Int64\n", "...\n", "\n", @@ -415,13 +415,13 @@ "data": { "text/html": [ "\n", - " Query started with request ID bigframes-dev:US.2f9f26b3-ff3e-431d-a676-d5daf5e52796.
SQL
SELECT\n",
+       "    Query started with request ID bigframes-dev:US.20a9eaeb-b23f-479e-acf0-e621563cf4d0.
SQL
SELECT\n",
        "`year` AS `year`\n",
        "FROM\n",
        "(SELECT\n",
        "  `t0`.`year`,\n",
        "  `t0`.`bfuid_col_2` AS `bfuid_col_5`\n",
-       "FROM `bigframes-dev._8b037bfb7316dddf9d92b12dcf93e008906bfe52._bf9622a9_188b_48af_a40a_4407e14ca5c8_bqdf_3028ad51-121b-42f3-86e6-ce710e604254` AS `t0`)\n",
+       "FROM `bigframes-dev._8b037bfb7316dddf9d92b12dcf93e008906bfe52._5cbbc449_a246_4e9d_9cdf_2941a8013fc0_bqdf_74c30588-acfe-48fd-84d6-a0c0159fa95c` AS `t0`)\n",
        "ORDER BY `bfuid_col_5` ASC NULLS LAST
\n", " " ], @@ -436,7 +436,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 88.8 MB in 2 seconds of slot time. [Job bigframes-dev:US.job_nIGVzX-38kg4za0Qwpm1hmmI_50_ details]\n", + " Query processed 88.8 MB in 3 seconds of slot time. [Job bigframes-dev:US.job_KOFvcpsRpmwuCfHOa-kXNhxJOeY- details]\n", " " ], "text/plain": [ @@ -449,33 +449,33 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "148a49927f4d4a3e9d7bbd404cbdaa2a", + "model_id": "6230f92412e1425a9d5938d4166b4f9c", "version_major": 2, "version_minor": 1 }, "text/html": [ - "
0    1910\n",
-       "1    1910\n",
-       "2    1910\n",
-       "3    1910\n",
-       "4    1910\n",
-       "5    1910\n",
-       "6    1910\n",
-       "7    1910\n",
-       "8    1910\n",
-       "9    1910

[5552452 rows]

" + "
0    1913\n",
+       "1    1913\n",
+       "2    1915\n",
+       "3    1917\n",
+       "4    1917\n",
+       "5    1919\n",
+       "6    1922\n",
+       "7    1923\n",
+       "8    1924\n",
+       "9    1924

[5552452 rows]

" ], "text/plain": [ - "1910\n", - "1910\n", - "1910\n", - "1910\n", - "1910\n", - "1910\n", - "1910\n", - "1910\n", - "1910\n", - "1910\n", + "1913\n", + "1913\n", + "1915\n", + "1917\n", + "1917\n", + "1919\n", + "1922\n", + "1923\n", + "1924\n", + "1924\n", "Name: year, dtype: Int64\n", "...\n", "\n", @@ -547,23 +547,8 @@ { "data": { "text/html": [ - "\n", - " Query started with request ID bigframes-dev:US.9b3ddeec-c764-4860-b15a-1ce48045e438.
SQL
SELECT\n",
-       "`state` AS `state`,\n",
-       "`gender` AS `gender`,\n",
-       "`year` AS `year`,\n",
-       "`name` AS `name`,\n",
-       "`number` AS `number`\n",
-       "FROM\n",
-       "(SELECT\n",
-       "  `t0`.`state`,\n",
-       "  `t0`.`gender`,\n",
-       "  `t0`.`year`,\n",
-       "  `t0`.`name`,\n",
-       "  `t0`.`number`,\n",
-       "  `t0`.`bfuid_col_2` AS `bfuid_col_7`\n",
-       "FROM `bigframes-dev._8b037bfb7316dddf9d92b12dcf93e008906bfe52._bf9622a9_188b_48af_a40a_4407e14ca5c8_bqdf_3028ad51-121b-42f3-86e6-ce710e604254` AS `t0`)\n",
-       "ORDER BY `bfuid_col_7` ASC NULLS LAST
\n", + "✅ Completed. \n", + " Query processed 215.9 MB in 8 seconds of slot time. [Job bigframes-dev:US.job_2Pf6d9tXnCYyG6EkFhmL7wF20KCe details]\n", " " ], "text/plain": [ @@ -577,7 +562,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 215.9 MB in 11 seconds of slot time. [Job bigframes-dev:US.job_nJW2lE25m_vpfd-BYf4L-tJZUxvf details]\n", + " Query processed 215.9 MB in 9 seconds of slot time. [Job bigframes-dev:US.job_ILlDgVfpwQ40fLDkGoz0mvDIhn6U details]\n", " " ], "text/plain": [ @@ -597,12 +582,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "b32b55e7345e44d7803e43de84007cbe", + "model_id": "8bbf6508ddd74f5782f0234ea62656e2", "version_major": 2, "version_minor": 1 }, "text/plain": [ - "" + "" ] }, "execution_count": 8, @@ -677,24 +662,8 @@ { "data": { "text/html": [ - "\n", - " Query started with request ID bigframes-dev:US.e96b418c-c635-411f-b8e7-fb53617cae78.
SQL
SELECT\n",
-       "`state` AS `state`,\n",
-       "`gender` AS `gender`,\n",
-       "`year` AS `year`,\n",
-       "`name` AS `name`,\n",
-       "`number` AS `number`\n",
-       "FROM\n",
-       "(SELECT\n",
-       "  `t0`.`state`,\n",
-       "  `t0`.`gender`,\n",
-       "  `t0`.`year`,\n",
-       "  `t0`.`name`,\n",
-       "  `t0`.`number`,\n",
-       "  `t0`.`bfuid_col_2` AS `bfuid_col_9`\n",
-       "FROM `bigframes-dev._8b037bfb7316dddf9d92b12dcf93e008906bfe52._bf9622a9_188b_48af_a40a_4407e14ca5c8_bqdf_3028ad51-121b-42f3-86e6-ce710e604254` AS `t0`)\n",
-       "ORDER BY `name` ASC NULLS LAST ,`year` ASC NULLS LAST ,`state` ASC NULLS LAST ,`bfuid_col_9` ASC NULLS LAST\n",
-       "LIMIT 5
\n", + "✅ Completed. \n", + " Query processed 215.9 MB in a moment of slot time.\n", " " ], "text/plain": [ @@ -728,12 +697,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "0f234155ce4a45ae96938d3845f204de", + "model_id": "0daafc202fee436a93dbd3847f58528a", "version_major": 2, "version_minor": 1 }, "text/plain": [ - "" + "" ] }, "execution_count": 10, @@ -777,7 +746,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 85.9 kB in 27 seconds of slot time.\n", + " Query processed 85.9 kB in 19 seconds of slot time.\n", " " ], "text/plain": [ @@ -838,7 +807,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "1aa385620a5f4daebc44dd890ba75381", + "model_id": "0f0fa2a6c893404f901a0195e13b00d3", "version_major": 2, "version_minor": 1 }, @@ -904,24 +873,6 @@ " EU\n", " DE\n", " 03.10.2018\n", - " G06F 11/30\n", - " <NA>\n", - " 18157347.8\n", - " 19.02.2018\n", - " 31.03.2017\n", - " Hoffmann Eitle\n", - " FUJITSU LIMITED\n", - " Kukihara, Kensuke\n", - " METHOD EXECUTED BY A COMPUTER, INFORMATION PRO...\n", - " EP 3 382 553 A1\n", - " \n", - " \n", - " 2\n", - " {'application_number': None, 'class_internatio...\n", - " gs://gcs-public-data--labeled-patents/espacene...\n", - " EU\n", - " DE\n", - " 03.10.2018\n", " H05B 6/12\n", " <NA>\n", " 18165514.3\n", @@ -934,25 +885,25 @@ " EP 3 383 141 A2\n", " \n", " \n", - " 3\n", + " 2\n", " {'application_number': None, 'class_internatio...\n", " gs://gcs-public-data--labeled-patents/espacene...\n", " EU\n", " DE\n", " 03.10.2018\n", - " H01L 21/20\n", - " <NA>\n", - " 18166536.5\n", - " 16.02.2016\n", + " G06F 11/30\n", " <NA>\n", - " Scheider, Sascha et al\n", - " EV Group E. Thallner GmbH\n", - " Kurz, Florian\n", - " VORRICHTUNG ZUM BONDEN VON SUBSTRATEN\n", - " EP 3 382 744 A1\n", + " 18157347.8\n", + " 19.02.2018\n", + " 31.03.2017\n", + " Hoffmann Eitle\n", + " FUJITSU LIMITED\n", + " Kukihara, Kensuke\n", + " METHOD EXECUTED BY A COMPUTER, INFORMATION PRO...\n", + " EP 3 382 553 A1\n", " \n", " \n", - " 4\n", + " 3\n", " {'application_number': None, 'class_internatio...\n", " gs://gcs-public-data--labeled-patents/espacene...\n", " EU\n", @@ -969,6 +920,24 @@ " MASTH√ÑHNCHENCONTAINER ALS BESTANDTEIL EINER E...\n", " EP 3 381 276 A1\n", " \n", + " \n", + " 4\n", + " {'application_number': None, 'class_internatio...\n", + " gs://gcs-public-data--labeled-patents/espacene...\n", + " EU\n", + " DE\n", + " 03.10.2018\n", + " H01L 21/20\n", + " <NA>\n", + " 18166536.5\n", + " 16.02.2016\n", + " <NA>\n", + " Scheider, Sascha et al\n", + " EV Group E. Thallner GmbH\n", + " Kurz, Florian\n", + " VORRICHTUNG ZUM BONDEN VON SUBSTRATEN\n", + " EP 3 382 744 A1\n", + " \n", " \n", "\n", "

5 rows × 15 columns

\n", @@ -991,31 +960,31 @@ "\n", " publication_date class_international class_us application_number \\\n", "0 29.08.018 E04H 6/12 18157874.1 \n", - "1 03.10.2018 G06F 11/30 18157347.8 \n", - "2 03.10.2018 H05B 6/12 18165514.3 \n", - "3 03.10.2018 H01L 21/20 18166536.5 \n", - "4 03.10.2018 A01K 31/00 18171005.4 \n", + "1 03.10.2018 H05B 6/12 18165514.3 \n", + "2 03.10.2018 G06F 11/30 18157347.8 \n", + "3 03.10.2018 A01K 31/00 18171005.4 \n", + "4 03.10.2018 H01L 21/20 18166536.5 \n", "\n", " filing_date priority_date_eu representative_line_1_eu \\\n", "0 21.02.2018 22.02.2017 Liedtke & Partner Patentanwälte \n", - "1 19.02.2018 31.03.2017 Hoffmann Eitle \n", - "2 03.04.2018 30.03.2017 \n", - "3 16.02.2016 Scheider, Sascha et al \n", - "4 05.02.2015 05.02.2014 Stork Bamberger Patentanwälte \n", + "1 03.04.2018 30.03.2017 \n", + "2 19.02.2018 31.03.2017 Hoffmann Eitle \n", + "3 05.02.2015 05.02.2014 Stork Bamberger Patentanwälte \n", + "4 16.02.2016 Scheider, Sascha et al \n", "\n", " applicant_line_1 inventor_line_1 \\\n", "0 SHB Hebezeugbau GmbH VOLGER, Alexander \n", - "1 FUJITSU LIMITED Kukihara, Kensuke \n", - "2 BSH Hausgeräte GmbH Acero Acero, Jesus \n", - "3 EV Group E. Thallner GmbH Kurz, Florian \n", - "4 Linco Food Systems A/S Thrane, Uffe \n", + "1 BSH Hausgeräte GmbH Acero Acero, Jesus \n", + "2 FUJITSU LIMITED Kukihara, Kensuke \n", + "3 Linco Food Systems A/S Thrane, Uffe \n", + "4 EV Group E. Thallner GmbH Kurz, Florian \n", "\n", " title_line_1 number \n", "0 STEUERUNGSSYSTEM FÜR AUTOMATISCHE PARKHÄUSER EP 3 366 869 A1 \n", - "1 METHOD EXECUTED BY A COMPUTER, INFORMATION PRO... EP 3 382 553 A1 \n", - "2 VORRICHTUNG ZUR INDUKTIVEN ENERGIEÜBERTRAGUNG EP 3 383 141 A2 \n", - "3 VORRICHTUNG ZUM BONDEN VON SUBSTRATEN EP 3 382 744 A1 \n", - "4 MASTHÄHNCHENCONTAINER ALS BESTANDTEIL EINER E... EP 3 381 276 A1 \n", + "1 VORRICHTUNG ZUR INDUKTIVEN ENERGIEÜBERTRAGUNG EP 3 383 141 A2 \n", + "2 METHOD EXECUTED BY A COMPUTER, INFORMATION PRO... EP 3 382 553 A1 \n", + "3 MASTHÄHNCHENCONTAINER ALS BESTANDTEIL EINER E... EP 3 381 276 A1 \n", + "4 VORRICHTUNG ZUM BONDEN VON SUBSTRATEN EP 3 382 744 A1 \n", "\n", "[5 rows x 15 columns]" ] @@ -1096,7 +1065,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "4037a24ad291479da4bba4393ce97a52", + "model_id": "7e9b96e71df94ccc868a16dfa98fc7e2", "version_major": 2, "version_minor": 1 }, @@ -1333,7 +1302,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.15" + "version": "3.13.0" } }, "nbformat": 4, From b2166ed9277052bb159002120f73065d1fd0c3bb Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 9 Jan 2026 20:11:46 +0000 Subject: [PATCH 29/29] refactor: remove nested loop --- bigframes/display/_flatten.py | 60 ++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/bigframes/display/_flatten.py b/bigframes/display/_flatten.py index be72a73cc7..f2055cd41f 100644 --- a/bigframes/display/_flatten.py +++ b/bigframes/display/_flatten.py @@ -25,12 +25,11 @@ from __future__ import annotations import dataclasses -from typing import cast import numpy as np import pandas as pd import pyarrow as pa -import pyarrow.compute as pc +import pyarrow.compute as pc # type: ignore @dataclasses.dataclass(frozen=True) @@ -431,9 +430,8 @@ def _explode_array_columns( ) row_labels = result_df[grouping_col_name].astype(str).tolist() - # The continuation_mask is a boolean mask where row_num > 0. 
continuation_mask = pc.greater(row_nums, 0).to_numpy(zero_copy_only=False) - continuation_rows = set(np.flatnonzero(continuation_mask)) + continuation_rows = set(np.flatnonzero(continuation_mask).tolist()) result_df = result_df[dataframe.columns.tolist()] @@ -485,33 +483,43 @@ def _flatten_struct_columns( Returns: A FlattenStructsResult containing the updated DataFrame and columns. """ - result_df = dataframe.copy() + if not struct_columns: + return FlattenStructsResult( + dataframe=dataframe.copy(), + clear_on_continuation_cols=clear_on_continuation_cols, + nested_originated_columns=nested_originated_columns, + ) + + # Convert to PyArrow table for efficient flattening + table = pa.Table.from_pandas(dataframe, preserve_index=False) + current_clear_cols = list(clear_on_continuation_cols) current_nested_cols = set(nested_originated_columns) + # Identify new columns that will be created to update metadata for col_name in struct_columns: - col_data = result_df[col_name] - if isinstance(col_data.dtype, pd.ArrowDtype): - pa_type = cast(pd.ArrowDtype, col_data.dtype).pyarrow_dtype + idx = table.schema.get_field_index(col_name) + if idx == -1: + continue + + field = table.schema.field(idx) + if pa.types.is_struct(field.type): + for i in range(field.type.num_fields): + child_field = field.type.field(i) + new_col_name = f"{col_name}.{child_field.name}" + current_nested_cols.add(new_col_name) + current_clear_cols.append(new_col_name) + + # Expand all struct columns into "parent.child" columns. + flattened_table = table.flatten() + + # Convert back to pandas, using ArrowDtype to preserve types and ignoring metadata + # to avoid issues with stale struct type info. + result_df = flattened_table.to_pandas( + types_mapper=pd.ArrowDtype, ignore_metadata=True + ) - arrow_array = pa.array(col_data) - flattened_fields = arrow_array.flatten() - - new_cols_to_add = {} - for field_idx in range(pa_type.num_fields): - field = pa_type.field(field_idx) - new_col_name = f"{col_name}.{field.name}" - current_nested_cols.add(new_col_name) - current_clear_cols.append(new_col_name) - - new_cols_to_add[new_col_name] = pd.Series( - flattened_fields[field_idx], - dtype=pd.ArrowDtype(field.type), - index=result_df.index, - ) - - new_cols_df = pd.DataFrame(new_cols_to_add, index=result_df.index) - result_df = _replace_column_in_df(result_df, col_name, new_cols_df) + result_df.index = dataframe.index return FlattenStructsResult( dataframe=result_df,