Skip to content

Commit cd89fbf

Browse files
Merge branch 'main' into df_local_tests
2 parents 4c4d939 + d6b7ab4 commit cd89fbf

File tree

39 files changed

+775
-224
lines changed

39 files changed

+775
-224
lines changed

bigframes/bigquery/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
json_extract,
3838
json_extract_array,
3939
json_extract_string_array,
40+
json_query,
4041
json_set,
4142
json_value,
4243
parse_json,
@@ -58,10 +59,11 @@
5859
"st_distance",
5960
"st_intersection",
6061
# json ops
61-
"json_set",
6262
"json_extract",
6363
"json_extract_array",
6464
"json_extract_string_array",
65+
"json_query",
66+
"json_set",
6567
"json_value",
6668
"parse_json",
6769
# search ops

bigframes/bigquery/_operations/json.py

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,11 @@
2222
from __future__ import annotations
2323

2424
from typing import Any, cast, Optional, Sequence, Tuple, Union
25+
import warnings
2526

2627
import bigframes.core.utils as utils
2728
import bigframes.dtypes
29+
import bigframes.exceptions as bfe
2830
import bigframes.operations as ops
2931
import bigframes.series as series
3032

@@ -87,9 +89,13 @@ def json_extract(
8789
input: series.Series,
8890
json_path: str,
8991
) -> series.Series:
90-
"""Extracts a JSON value and converts it to a SQL JSON-formatted `STRING` or `JSON`
91-
value. This function uses single quotes and brackets to escape invalid JSONPath
92-
characters in JSON keys.
92+
"""Extracts a JSON value and converts it to a SQL JSON-formatted ``STRING`` or
93+
``JSON`` value. This function uses single quotes and brackets to escape invalid
94+
JSONPath characters in JSON keys.
95+
96+
.. deprecated:: 2.5.0
97+
The ``json_extract`` is deprecated and will be removed in a future version.
98+
Use ``json_query`` instead.
9399
94100
**Examples:**
95101
@@ -111,6 +117,11 @@ def json_extract(
111117
Returns:
112118
bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING.
113119
"""
120+
msg = (
121+
"The `json_extract` is deprecated and will be removed in a future version. "
122+
"Use `json_query` instead."
123+
)
124+
warnings.warn(bfe.format_message(msg), category=UserWarning)
114125
return input._apply_unary_op(ops.JSONExtract(json_path=json_path))
115126

116127

@@ -231,6 +242,37 @@ def json_extract_string_array(
231242
return array_series
232243

233244

245+
def json_query(
246+
input: series.Series,
247+
json_path: str,
248+
) -> series.Series:
249+
"""Extracts a JSON value and converts it to a SQL JSON-formatted ``STRING``
250+
or ``JSON`` value. This function uses double quotes to escape invalid JSONPath
251+
characters in JSON keys. For example: ``"a.b"``.
252+
253+
**Examples:**
254+
255+
>>> import bigframes.pandas as bpd
256+
>>> import bigframes.bigquery as bbq
257+
>>> bpd.options.display.progress_bar = None
258+
259+
>>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}'])
260+
>>> bbq.json_query(s, json_path="$.class")
261+
0 {"students":[{"id":5},{"id":12}]}
262+
dtype: string
263+
264+
Args:
265+
input (bigframes.series.Series):
266+
The Series containing JSON data (as native JSON objects or JSON-formatted strings).
267+
json_path (str):
268+
The JSON path identifying the data that you want to obtain from the input.
269+
270+
Returns:
271+
bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING.
272+
"""
273+
return input._apply_unary_op(ops.JSONQuery(json_path=json_path))
274+
275+
234276
def json_value(
235277
input: series.Series,
236278
json_path: str,

bigframes/core/bigframe_node.py

Lines changed: 3 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
import typing
2323
from typing import Callable, Dict, Generator, Iterable, Mapping, Sequence, Set, Tuple
2424

25-
from bigframes.core import identifiers
25+
from bigframes.core import field, identifiers
2626
import bigframes.core.schema as schemata
2727
import bigframes.dtypes
2828

@@ -34,23 +34,6 @@
3434
T = typing.TypeVar("T")
3535

3636

37-
@dataclasses.dataclass(frozen=True)
38-
class Field:
39-
id: identifiers.ColumnId
40-
dtype: bigframes.dtypes.Dtype
41-
# Best effort, nullable=True if not certain
42-
nullable: bool = True
43-
44-
def with_nullable(self) -> Field:
45-
return Field(self.id, self.dtype, nullable=True)
46-
47-
def with_nonnull(self) -> Field:
48-
return Field(self.id, self.dtype, nullable=False)
49-
50-
def with_id(self, id: identifiers.ColumnId) -> Field:
51-
return Field(id, self.dtype, nullable=self.nullable)
52-
53-
5437
@dataclasses.dataclass(eq=False, frozen=True)
5538
class BigFrameNode:
5639
"""
@@ -162,7 +145,7 @@ def roots(self) -> typing.Set[BigFrameNode]:
162145
# TODO: Store some local data lazily for select, aggregate nodes.
163146
@property
164147
@abc.abstractmethod
165-
def fields(self) -> Sequence[Field]:
148+
def fields(self) -> Sequence[field.Field]:
166149
...
167150

168151
@property
@@ -292,7 +275,7 @@ def _dtype_lookup(self) -> dict[identifiers.ColumnId, bigframes.dtypes.Dtype]:
292275
return {field.id: field.dtype for field in self.fields}
293276

294277
@functools.cached_property
295-
def field_by_id(self) -> Mapping[identifiers.ColumnId, Field]:
278+
def field_by_id(self) -> Mapping[identifiers.ColumnId, field.Field]:
296279
return {field.id: field for field in self.fields}
297280

298281
# Plan algorithms

bigframes/core/blocks.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2166,7 +2166,7 @@ def merge(
21662166
result_columns.append(get_column_left[col_id])
21672167
for col_id in other.value_columns:
21682168
if col_id in right_join_ids:
2169-
if other.col_id_to_label[matching_right_id] in matching_join_labels:
2169+
if other.col_id_to_label[col_id] in matching_join_labels:
21702170
pass
21712171
else:
21722172
result_columns.append(get_column_right[col_id])

bigframes/core/compile/scalar_op_compiler.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1356,6 +1356,19 @@ def json_extract_string_array_op_impl(
13561356
return json_extract_string_array(json_obj=x, json_path=op.json_path)
13571357

13581358

1359+
@scalar_op_compiler.register_unary_op(ops.JSONQuery, pass_op=True)
1360+
def json_query_op_impl(x: ibis_types.Value, op: ops.JSONQuery):
1361+
# Define a user-defined function whose returned type is dynamically matching the input.
1362+
def json_query(json_or_json_string, json_path: ibis_dtypes.str): # type: ignore
1363+
"""Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value."""
1364+
...
1365+
1366+
return_type = x.type()
1367+
json_query.__annotations__["return"] = return_type
1368+
json_query_op = ibis_udf.scalar.builtin(json_query)
1369+
return json_query_op(json_or_json_string=x, json_path=op.json_path)
1370+
1371+
13591372
@scalar_op_compiler.register_unary_op(ops.ParseJSON, pass_op=True)
13601373
def parse_json_op_impl(x: ibis_types.Value, op: ops.ParseJSON):
13611374
return parse_json(json_str=x)

bigframes/core/compile/sqlglot/compiler.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import bigframes.core.compile.sqlglot.scalar_compiler as scalar_compiler
2727
import bigframes.core.compile.sqlglot.sqlglot_ir as ir
2828
import bigframes.core.ordering as bf_ordering
29+
from bigframes.core.rewrite import schema_binding
2930

3031

3132
class SQLGlotCompiler:
@@ -120,7 +121,14 @@ def _remap_variables(self, node: nodes.ResultNode) -> nodes.ResultNode:
120121

121122
def _compile_result_node(self, root: nodes.ResultNode) -> str:
122123
sqlglot_ir = self.compile_node(root.child)
123-
# TODO: add order_by, limit, and selections to sqlglot_expr
124+
125+
selected_cols: tuple[tuple[str, sge.Expression], ...] = tuple(
126+
(name, scalar_compiler.compile_scalar_expression(ref))
127+
for ref, name in root.output_cols
128+
)
129+
sqlglot_ir = sqlglot_ir.select(selected_cols)
130+
131+
# TODO: add order_by, limit to sqlglot_expr
124132
return sqlglot_ir.sql
125133

126134
@functools.lru_cache(maxsize=5000)
@@ -176,6 +184,6 @@ def compile_projection(
176184

177185
def _replace_unsupported_ops(node: nodes.BigFrameNode):
178186
node = nodes.bottom_up(node, rewrite.rewrite_slice)
179-
node = nodes.bottom_up(node, rewrite.rewrite_timedelta_expressions)
187+
node = nodes.bottom_up(node, schema_binding.bind_schema_to_expressions)
180188
node = nodes.bottom_up(node, rewrite.rewrite_range_rolling)
181189
return node

bigframes/core/compile/sqlglot/scalar_compiler.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,13 @@ def compile_deref_expression(expr: expression.DerefOp) -> sge.Expression:
3535
return sge.ColumnDef(this=sge.to_identifier(expr.id.sql, quoted=True))
3636

3737

38+
@compile_scalar_expression.register
39+
def compile_field_ref_expression(
40+
expr: expression.SchemaFieldRefExpression,
41+
) -> sge.Expression:
42+
return sge.ColumnDef(this=sge.to_identifier(expr.field.id.sql, quoted=True))
43+
44+
3845
@compile_scalar_expression.register
3946
def compile_constant_expression(
4047
expr: expression.ScalarConstantExpression,

bigframes/core/compile/sqlglot/sqlglot_ir.py

Lines changed: 70 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -128,15 +128,22 @@ def select(
128128
self,
129129
selected_cols: tuple[tuple[str, sge.Expression], ...],
130130
) -> SQLGlotIR:
131-
cols_expr = [
131+
selections = [
132132
sge.Alias(
133133
this=expr,
134134
alias=sge.to_identifier(id, quoted=self.quoted),
135135
)
136136
for id, expr in selected_cols
137137
]
138-
new_expr = self._encapsulate_as_cte().select(*cols_expr, append=False)
139-
return SQLGlotIR(expr=new_expr)
138+
# Attempts to simplify selected columns when the original and new column
139+
# names are simply aliases of each other.
140+
squashed_selections = _squash_selections(self.expr.expressions, selections)
141+
if squashed_selections != []:
142+
new_expr = self.expr.select(*squashed_selections, append=False)
143+
return SQLGlotIR(expr=new_expr)
144+
else:
145+
new_expr = self._encapsulate_as_cte().select(*selections, append=False)
146+
return SQLGlotIR(expr=new_expr)
140147

141148
def project(
142149
self,
@@ -199,7 +206,7 @@ def _encapsulate_as_cte(
199206
this=select_expr,
200207
alias=new_cte_name,
201208
)
202-
new_with_clause = sge.With(expressions=existing_ctes + [new_cte])
209+
new_with_clause = sge.With(expressions=[*existing_ctes, new_cte])
203210
new_select_expr = (
204211
sge.Select().select(sge.Star()).from_(sge.Table(this=new_cte_name))
205212
)
@@ -254,3 +261,62 @@ def _table(table: bigquery.TableReference) -> sge.Table:
254261
db=sg.to_identifier(table.dataset_id, quoted=True),
255262
catalog=sg.to_identifier(table.project, quoted=True),
256263
)
264+
265+
266+
def _squash_selections(
267+
old_expr: list[sge.Expression], new_expr: list[sge.Alias]
268+
) -> list[sge.Alias]:
269+
"""
270+
Simplifies the select column expressions if existing (old_expr) and
271+
new (new_expr) selected columns are both simple aliases of column definitions.
272+
273+
Example:
274+
old_expr: [A AS X, B AS Y]
275+
new_expr: [X AS P, Y AS Q]
276+
Result: [A AS P, B AS Q]
277+
"""
278+
old_alias_map: typing.Dict[str, str] = {}
279+
for selected in old_expr:
280+
column_alias_pair = _get_column_alias_pair(selected)
281+
if column_alias_pair is None:
282+
return []
283+
else:
284+
old_alias_map[column_alias_pair[1]] = column_alias_pair[0]
285+
286+
new_selected_cols: typing.List[sge.Alias] = []
287+
for selected in new_expr:
288+
column_alias_pair = _get_column_alias_pair(selected)
289+
if column_alias_pair is None or column_alias_pair[0] not in old_alias_map:
290+
return []
291+
else:
292+
new_alias_expr = sge.Alias(
293+
this=sge.ColumnDef(
294+
this=sge.to_identifier(
295+
old_alias_map[column_alias_pair[0]], quoted=True
296+
)
297+
),
298+
alias=sg.to_identifier(column_alias_pair[1], quoted=True),
299+
)
300+
new_selected_cols.append(new_alias_expr)
301+
return new_selected_cols
302+
303+
304+
def _get_column_alias_pair(
305+
expr: sge.Expression,
306+
) -> typing.Optional[typing.Tuple[str, str]]:
307+
"""Checks if an expression is a simple alias of a column definition
308+
(e.g., "column_name AS alias_name").
309+
If it is, returns a tuple containing the alias name and original column name.
310+
Returns `None` otherwise.
311+
"""
312+
if not isinstance(expr, sge.Alias):
313+
return None
314+
if not isinstance(expr.this, sge.ColumnDef):
315+
return None
316+
317+
column_def_expr: sge.ColumnDef = expr.this
318+
if not isinstance(column_def_expr.this, sge.Identifier):
319+
return None
320+
321+
original_identifier: sge.Identifier = column_def_expr.this
322+
return (original_identifier.this, expr.alias)

0 commit comments

Comments
 (0)