
Commit 5fc1913

Merge branch 'main' into df_local_tests
2 parents 504abf9 + 1cfbb47 commit 5fc1913

File tree: 11 files changed, +324 −81 lines

bigframes/constants.py
Lines changed: 4 additions & 0 deletions

@@ -128,4 +128,8 @@
 # BigQuery default is 10000, leave 100 for overhead
 MAX_COLUMNS = 9900

+# BigQuery has 1 MB query size limit. Don't want to take up more than a few % of that inlining a table.
+# Also must assume that text encoding as literals is much less efficient than in-memory representation.
+MAX_INLINE_BYTES = 5000
+
 SUGGEST_PEEK_PREVIEW = "Use .peek(n) to preview n arbitrary rows."
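
A rough sketch of how a byte threshold like this typically gates inlining versus loading; the `pick_write_engine` helper below is illustrative only and not part of this commit:

import pandas as pd

MAX_INLINE_BYTES = 5000  # mirrors the new constant

def pick_write_engine(df: pd.DataFrame) -> str:
    # Hypothetical helper: inline small frames as SQL literals, load larger ones.
    mem_usage = df.memory_usage(deep=True).sum()
    return "bigquery_inline" if mem_usage <= MAX_INLINE_BYTES else "bigquery_load"

print(pick_write_engine(pd.DataFrame({"a": range(10)})))  # "bigquery_inline"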

bigframes/core/array_value.py
Lines changed: 10 additions & 1 deletion

@@ -133,8 +133,17 @@ def from_table(
             ordering=ordering,
             n_rows=n_rows,
         )
+        return cls.from_bq_data_source(source_def, scan_list, session)
+
+    @classmethod
+    def from_bq_data_source(
+        cls,
+        source: nodes.BigqueryDataSource,
+        scan_list: nodes.ScanList,
+        session: Session,
+    ):
         node = nodes.ReadTableNode(
-            source=source_def,
+            source=source,
             scan_list=scan_list,
             table_session=session,
         )
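
This refactor splits node construction out of `from_table` so callers that already hold a `BigqueryDataSource` (for example, one produced by uploading local data) can wrap it directly. A hedged usage sketch; the variable names are illustrative:

# Assuming `uploaded_source` is a nodes.BigqueryDataSource (e.g. returned by the
# loader after staging local data) and `scan_list` maps its physical columns to
# bfet column ids, an ArrayValue can now be built without re-reading table metadata:
array_value = ArrayValue.from_bq_data_source(
    source=uploaded_source,
    scan_list=scan_list,
    session=session,
)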

bigframes/core/nodes.py
Lines changed: 19 additions & 1 deletion

@@ -578,6 +578,9 @@ class ScanItem(typing.NamedTuple):
     def with_id(self, id: identifiers.ColumnId) -> ScanItem:
         return ScanItem(id, self.dtype, self.source_id)

+    def with_source_id(self, source_id: str) -> ScanItem:
+        return ScanItem(self.id, self.dtype, source_id)
+

 @dataclasses.dataclass(frozen=True)
 class ScanList:

@@ -614,16 +617,31 @@ def project(
         result = ScanList((self.items[:1]))
         return result

+    def remap_source_ids(
+        self,
+        mapping: Mapping[str, str],
+    ) -> ScanList:
+        items = tuple(
+            item.with_source_id(mapping.get(item.source_id, item.source_id))
+            for item in self.items
+        )
+        return ScanList(items)
+
+    def append(
+        self, source_id: str, dtype: bigframes.dtypes.Dtype, id: identifiers.ColumnId
+    ) -> ScanList:
+        return ScanList((*self.items, ScanItem(id, dtype, source_id)))
+

 @dataclasses.dataclass(frozen=True, eq=False)
 class ReadLocalNode(LeafNode):
     # TODO: Track nullability for local data
     local_data_source: local_data.ManagedArrowTable
     # Mapping of local ids to bfet id.
     scan_list: ScanList
+    session: bigframes.session.Session
     # Offsets are generated only if this is non-null
     offsets_col: Optional[identifiers.ColumnId] = None
-    session: typing.Optional[bigframes.session.Session] = None

     @property
     def fields(self) -> Sequence[Field]:
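
A short, hedged example of how `remap_source_ids` behaves, with hypothetical column names standing in for the local ids and the names assigned during upload:

# Suppose the local columns "col_a" and "col_b" were renamed when staged to BigQuery.
mapping = {"col_a": "bqdf_0", "col_b": "bqdf_1"}
remapped = scan_list.remap_source_ids(mapping)
# Each ScanItem keeps its ColumnId and dtype; only source_id changes, and ids
# missing from the mapping fall back to themselves via mapping.get(id, id).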

bigframes/session/__init__.py
Lines changed: 26 additions & 17 deletions

@@ -60,7 +60,8 @@
 from bigframes import version
 import bigframes._config.bigquery_options as bigquery_options
 import bigframes.clients
-from bigframes.core import blocks, log_adapter
+import bigframes.constants
+from bigframes.core import blocks, log_adapter, utils
 import bigframes.core.pyformat

 # Even though the ibis.backends.bigquery import is unused, it's needed

@@ -248,13 +249,6 @@ def __init__(
         self._temp_storage_manager = (
             self._session_resource_manager or self._anon_dataset_manager
         )
-        self._executor: executor.Executor = bq_caching_executor.BigQueryCachingExecutor(
-            bqclient=self._clients_provider.bqclient,
-            bqstoragereadclient=self._clients_provider.bqstoragereadclient,
-            storage_manager=self._temp_storage_manager,
-            strictly_ordered=self._strictly_ordered,
-            metrics=self._metrics,
-        )
         self._loader = bigframes.session.loader.GbqDataLoader(
             session=self,
             bqclient=self._clients_provider.bqclient,

@@ -265,6 +259,14 @@ def __init__(
             force_total_order=self._strictly_ordered,
             metrics=self._metrics,
         )
+        self._executor: executor.Executor = bq_caching_executor.BigQueryCachingExecutor(
+            bqclient=self._clients_provider.bqclient,
+            bqstoragereadclient=self._clients_provider.bqstoragereadclient,
+            loader=self._loader,
+            storage_manager=self._temp_storage_manager,
+            strictly_ordered=self._strictly_ordered,
+            metrics=self._metrics,
+        )

     def __del__(self):
         """Automatic cleanup of internal resources."""

@@ -937,15 +939,15 @@ def _read_pandas(
         if write_engine == "default":
             write_engine = (
                 "bigquery_load"
-                if mem_usage > MAX_INLINE_DF_BYTES
+                if mem_usage > bigframes.constants.MAX_INLINE_BYTES
                 else "bigquery_inline"
             )

         if write_engine == "bigquery_inline":
-            if mem_usage > MAX_INLINE_DF_BYTES:
+            if mem_usage > bigframes.constants.MAX_INLINE_BYTES:
                 raise ValueError(
                     f"DataFrame size ({mem_usage} bytes) exceeds the maximum allowed "
-                    f"for inline data ({MAX_INLINE_DF_BYTES} bytes)."
+                    f"for inline data ({bigframes.constants.MAX_INLINE_BYTES} bytes)."
                 )
             return self._read_pandas_inline(pandas_dataframe)
         elif write_engine == "bigquery_load":

@@ -954,6 +956,10 @@ def _read_pandas(
             return self._loader.read_pandas(pandas_dataframe, method="stream")
         elif write_engine == "bigquery_write":
             return self._loader.read_pandas(pandas_dataframe, method="write")
+        elif write_engine == "_deferred":
+            import bigframes.dataframe as dataframe
+
+            return dataframe.DataFrame(blocks.Block.from_local(pandas_dataframe, self))
         else:
             raise ValueError(f"Got unexpected write_engine '{write_engine}'")

@@ -1102,11 +1108,8 @@ def _read_csv_w_bigquery_engine(
         native CSV loading capabilities, making it suitable for large datasets
         that may not fit into local memory.
         """
-        if dtype is not None:
-            raise NotImplementedError(
-                f"BigQuery engine does not support the `dtype` argument."
-                f"{constants.FEEDBACK_LINK}"
-            )
+        if dtype is not None and not utils.is_dict_like(dtype):
+            raise ValueError("dtype should be a dict-like object.")

         if names is not None:
             if len(names) != len(set(names)):

@@ -1161,10 +1164,16 @@ def _read_csv_w_bigquery_engine(
             job_config.skip_leading_rows = header + 1

         table_id = self._loader.load_file(filepath_or_buffer, job_config=job_config)
-        return self._loader.read_gbq_table(
+        df = self._loader.read_gbq_table(
             table_id, index_col=index_col, columns=columns, names=names
         )

+        if dtype is not None:
+            for column, dtype in dtype.items():
+                if column in df.columns:
+                    df[column] = df[column].astype(dtype)
+        return df
+
     def read_pickle(
         self,
         filepath_or_buffer: FilePath | ReadPickleBuffer,
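
With this change, the BigQuery engine accepts a dict-like `dtype` and applies it column by column via `.astype()` after the load, instead of raising NotImplementedError. A hedged usage sketch (the file path and column names below are illustrative):

import bigframes.pandas as bpd

# dtype must be dict-like when engine="bigquery"; listed columns are cast after loading.
df = bpd.read_csv(
    "gs://my-bucket/events.csv",  # hypothetical path
    engine="bigquery",
    dtype={"user_id": "Int64", "score": "Float64"},
)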

bigframes/session/bq_caching_executor.py
Lines changed: 81 additions & 4 deletions

@@ -17,6 +17,7 @@
 import dataclasses
 import math
 import os
+import threading
 from typing import cast, Literal, Mapping, Optional, Sequence, Tuple, Union
 import warnings
 import weakref

@@ -27,8 +28,9 @@
 import google.cloud.bigquery.table as bq_table
 import google.cloud.bigquery_storage_v1

+import bigframes.constants
 import bigframes.core
-from bigframes.core import compile, rewrite
+from bigframes.core import compile, local_data, rewrite
 import bigframes.core.compile.sqlglot.sqlglot_ir as sqlglot_ir
 import bigframes.core.guid
 import bigframes.core.nodes as nodes

@@ -38,7 +40,7 @@
 import bigframes.dtypes
 import bigframes.exceptions as bfe
 import bigframes.features
-from bigframes.session import executor, local_scan_executor, read_api_execution
+from bigframes.session import executor, loader, local_scan_executor, read_api_execution
 import bigframes.session._io.bigquery as bq_io
 import bigframes.session.metrics
 import bigframes.session.planner

@@ -67,12 +69,19 @@ def _get_default_output_spec() -> OutputSpec:
     )


+SourceIdMapping = Mapping[str, str]
+
+
 class ExecutionCache:
     def __init__(self):
         # current assumption is only 1 cache of a given node
         # in future, might have multiple caches, with different layout, localities
         self._cached_executions: weakref.WeakKeyDictionary[
-            nodes.BigFrameNode, nodes.BigFrameNode
+            nodes.BigFrameNode, nodes.CachedTableNode
+        ] = weakref.WeakKeyDictionary()
+        self._uploaded_local_data: weakref.WeakKeyDictionary[
+            local_data.ManagedArrowTable,
+            tuple[nodes.BigqueryDataSource, SourceIdMapping],
         ] = weakref.WeakKeyDictionary()

     @property

@@ -105,6 +114,19 @@ def cache_results_table(
         assert original_root.schema == cached_replacement.schema
         self._cached_executions[original_root] = cached_replacement

+    def cache_remote_replacement(
+        self,
+        local_data: local_data.ManagedArrowTable,
+        bq_data: nodes.BigqueryDataSource,
+    ):
+        # bq table has one extra column for offsets, those are implicit for local data
+        assert len(local_data.schema.items) + 1 == len(bq_data.table.physical_schema)
+        mapping = {
+            local_data.schema.items[i].column: bq_data.table.physical_schema[i].name
+            for i in range(len(local_data.schema))
+        }
+        self._uploaded_local_data[local_data] = (bq_data, mapping)
+

 class BigQueryCachingExecutor(executor.Executor):
     """Computes BigFrames values using BigQuery Engine.
@@ -120,6 +142,7 @@ def __init__(
         bqclient: bigquery.Client,
         storage_manager: bigframes.session.temporary_storage.TemporaryStorageManager,
         bqstoragereadclient: google.cloud.bigquery_storage_v1.BigQueryReadClient,
+        loader: loader.GbqDataLoader,
         *,
         strictly_ordered: bool = True,
         metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None,

@@ -129,6 +152,7 @@
         self.strictly_ordered: bool = strictly_ordered
         self.cache: ExecutionCache = ExecutionCache()
         self.metrics = metrics
+        self.loader = loader
         self.bqstoragereadclient = bqstoragereadclient
         # Simple left-to-right precedence for now
         self._semi_executors = (

@@ -138,6 +162,7 @@
             ),
             local_scan_executor.LocalScanExecutor(),
         )
+        self._upload_lock = threading.Lock()

     def to_sql(
         self,

@@ -149,6 +174,7 @@ def to_sql(
         if offset_column:
             array_value, _ = array_value.promote_offsets()
         node = self.logical_plan(array_value.node) if enable_cache else array_value.node
+        node = self._substitute_large_local_sources(node)
         compiled = compile.compile_sql(compile.CompileRequest(node, sort_rows=ordered))
         return compiled.sql

@@ -402,6 +428,7 @@ def _cache_with_cluster_cols(
     ):
         """Executes the query and uses the resulting table to rewrite future executions."""
         plan = self.logical_plan(array_value.node)
+        plan = self._substitute_large_local_sources(plan)
         compiled = compile.compile_sql(
             compile.CompileRequest(
                 plan, sort_rows=False, materialize_all_order_keys=True

@@ -422,7 +449,7 @@ def _cache_with_offsets(self, array_value: bigframes.core.ArrayValue):
         w_offsets, offset_column = array_value.promote_offsets()
         compiled = compile.compile_sql(
             compile.CompileRequest(
-                self.logical_plan(w_offsets.node),
+                self.logical_plan(self._substitute_large_local_sources(w_offsets.node)),
                 sort_rows=False,
             )
         )

@@ -532,6 +559,54 @@ def _validate_result_schema(
                 f"This error should only occur while testing. Ibis schema: {ibis_schema} does not match actual schema: {actual_schema}"
             )

+    def _substitute_large_local_sources(self, original_root: nodes.BigFrameNode):
+        """
+        Replace large local sources with the uploaded version of those datasources.
+        """
+        # Step 1: Upload all previously un-uploaded data
+        for leaf in original_root.unique_nodes():
+            if isinstance(leaf, nodes.ReadLocalNode):
+                if (
+                    leaf.local_data_source.metadata.total_bytes
+                    > bigframes.constants.MAX_INLINE_BYTES
+                ):
+                    self._upload_local_data(leaf.local_data_source)
+
+        # Step 2: Replace local scans with remote scans
+        def map_local_scans(node: nodes.BigFrameNode):
+            if not isinstance(node, nodes.ReadLocalNode):
+                return node
+            if node.local_data_source not in self.cache._uploaded_local_data:
+                return node
+            bq_source, source_mapping = self.cache._uploaded_local_data[
+                node.local_data_source
+            ]
+            scan_list = node.scan_list.remap_source_ids(source_mapping)
+            # offsets_col isn't part of ReadTableNode, so emulate by adding to end of scan_list
+            if node.offsets_col is not None:
+                # Offsets are always implicitly the final column of uploaded data
+                # See: Loader.load_data
+                scan_list = scan_list.append(
+                    bq_source.table.physical_schema[-1].name,
+                    bigframes.dtypes.INT_DTYPE,
+                    node.offsets_col,
+                )
+            return nodes.ReadTableNode(bq_source, scan_list, node.session)
+
+        return original_root.bottom_up(map_local_scans)
+
+    def _upload_local_data(self, local_table: local_data.ManagedArrowTable):
+        if local_table in self.cache._uploaded_local_data:
+            return
+        # Lock prevents concurrent repeated work, but slows things down.
+        # Might be better as a queue and a worker thread
+        with self._upload_lock:
+            if local_table not in self.cache._uploaded_local_data:
+                uploaded = self.loader.load_data(
+                    local_table, bigframes.core.guid.generate_guid()
+                )
+                self.cache.cache_remote_replacement(local_table, uploaded)
+
     def _execute_plan(
         self,
         plan: nodes.BigFrameNode,

@@ -562,6 +637,8 @@ def _execute_plan(
         # Use explicit destination to avoid 10GB limit of temporary table
         if destination_table is not None:
             job_config.destination = destination_table
+
+        plan = self._substitute_large_local_sources(plan)
         compiled = compile.compile_sql(
             compile.CompileRequest(plan, sort_rows=ordered, peek_count=peek)
         )
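
`_upload_local_data` guards the upload with a double-checked pattern around a single lock, so concurrent executions stage each local table at most once. A minimal standalone sketch of that pattern, with a fake upload step standing in for the loader:

import threading

_uploaded: dict[str, str] = {}
_upload_lock = threading.Lock()

def upload_once(table_key: str) -> str:
    # Fast path: skip the lock entirely if the table was already uploaded.
    if table_key in _uploaded:
        return _uploaded[table_key]
    with _upload_lock:
        if table_key not in _uploaded:  # re-check under the lock
            _uploaded[table_key] = f"staging_table_for_{table_key}"  # fake upload
    return _uploaded[table_key]

print(upload_once("my_local_df"))  # uploads once; later calls hit the cache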
