Skip to content

perf: Add warning when using synthetic ordering on large table. #1524

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions bigframes/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,10 @@ class ObsoleteVersionWarning(Warning):
"""The BigFrames version is too old."""


class SyntheticTotalOrderWarning(Warning):
    """Emitted when an expensive synthetic total ordering must be generated.

    Raised for tables that lack a natural ordering (no primary key / unique
    index), where a row-hash based total order will be computed instead.
    """


def format_message(message: str, fill: bool = True):
"""Formats a warning message with ANSI color codes for the warning color.

Expand Down
15 changes: 15 additions & 0 deletions bigframes/session/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import os
import typing
from typing import Dict, Hashable, IO, Iterable, List, Optional, Sequence, Tuple, Union
import warnings

import bigframes_vendored.constants as constants
import bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq
Expand Down Expand Up @@ -449,6 +450,20 @@ def read_gbq_table(
# if we don't have a unique index, we order by row hash if we are in strict mode
if self._force_total_order:
if not primary_key:
# 5gb chosen rather arbitrarily, mostly want to avoid warning for very small tables, for which the
# price is trivial
if (table.num_bytes or 0) > 5000000000:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: Might be good to extract a named constant here, or at least use `_` as a digit separator.

Suggested change
if (table.num_bytes or 0) > 5000000000:
if (table.num_bytes or 0) > 5_000_000_000:

msg = bigframes.exceptions.format_message(
"Large table {table} has no natural ordering. "
"Will use a synthetic total ordering, which is very expensive. "
"For better performance, set bigframes.options.bigquery.ordering_mode = 'partial', "
"provide a unique key as the index_col argument, or set a valid primary key constraint on the table "
"before importing it into a BigQuery DataFrame "
"(see: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-definition-language#primary_keys_and_foreign_keys).",
)
warnings.warn(
msg, category=bigframes.exceptions.SyntheticTotalOrderWarning
)
array_value = array_value.order_by(
[
bigframes.core.ordering.OrderingExpression(
Expand Down
23 changes: 23 additions & 0 deletions tests/system/small/test_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import bigframes.core.indexes.base
import bigframes.dataframe
import bigframes.dtypes
import bigframes.exceptions
import bigframes.ml.linear_model
from tests.system import utils

Expand Down Expand Up @@ -1557,3 +1558,25 @@ def test_read_json_gcs_default_engine(session, scalars_dfs, gcs_folder):

assert df.shape[0] == scalars_df.shape[0]
pd.testing.assert_series_equal(df.dtypes, scalars_df.dtypes)


def test_read_gbq_large_table_ordering_warning(
    session: bigframes.Session,
):
    """Reading a large keyless table in strict ordering mode must warn."""
    # covid19_open_data is >10gb and has no primary key, so the session has
    # to fall back to an expensive synthetic total ordering.
    table_id = "bigquery-public-data.covid19_open_data.covid19_open_data"
    with pytest.warns(bigframes.exceptions.SyntheticTotalOrderWarning):
        session.read_gbq(table_id)


def test_read_gbq_large_table_no_ordering_no_warning(
    unordered_session: bigframes.Session,
):
    """A partial-ordering session must NOT warn about synthetic ordering."""
    with warnings.catch_warnings():
        # Escalate the warning to an error so the test actually fails if it
        # is emitted. The previous `catch_warnings(category=...)` form
        # installed no filter at all (and raises TypeError on Python < 3.11),
        # so it never asserted the warning's absence.
        warnings.simplefilter(
            "error", category=bigframes.exceptions.SyntheticTotalOrderWarning
        )
        # table is >10gb and has no primary key
        unordered_session.read_gbq(
            "bigquery-public-data.covid19_open_data.covid19_open_data"
        )
Loading