Skip to content

Commit f04ec39

Browse files
Merge remote-tracking branch 'github/main' into geo_window_part
2 parents 9401370 + 578081e commit f04ec39

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

42 files changed

+1697
-457
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,6 @@ repos:
3838
rev: v1.10.0
3939
hooks:
4040
- id: mypy
41-
additional_dependencies: [types-requests, types-tabulate, pandas-stubs]
41+
additional_dependencies: [types-requests, types-tabulate, pandas-stubs<=2.2.3.241126]
4242
exclude: "^third_party"
4343
args: ["--check-untyped-defs", "--explicit-package-bases", "--ignore-missing-imports"]

bigframes/bigquery/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
unix_millis,
2828
unix_seconds,
2929
)
30-
from bigframes.bigquery._operations.geo import st_area
30+
from bigframes.bigquery._operations.geo import st_area, st_difference
3131
from bigframes.bigquery._operations.json import (
3232
json_extract,
3333
json_extract_array,
@@ -48,6 +48,7 @@
4848
"array_to_string",
4949
# geo ops
5050
"st_area",
51+
"st_difference",
5152
# json ops
5253
"json_set",
5354
"json_extract",

bigframes/bigquery/_operations/geo.py

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from __future__ import annotations
1616

1717
from bigframes import operations as ops
18+
import bigframes.dtypes
1819
import bigframes.geopandas
1920
import bigframes.series
2021

@@ -91,3 +92,122 @@ def st_area(series: bigframes.series.Series) -> bigframes.series.Series:
9192
series = series._apply_unary_op(ops.geo_area_op)
9293
series.name = None
9394
return series
95+
96+
97+
def st_difference(
98+
series: bigframes.series.Series, other: bigframes.series.Series
99+
) -> bigframes.series.Series:
100+
"""
101+
Returns a GEOGRAPHY that represents the point set difference of
102+
`geography_1` and `geography_2`. Therefore, the result consists of the part
103+
of `geography_1` that doesn't intersect with `geography_2`.
104+
105+
If `geography_1` is completely contained in `geography_2`, then ST_DIFFERENCE
106+
returns an empty GEOGRAPHY.
107+
108+
.. note::
109+
BigQuery's geography functions, like `st_difference`, interpret the geography
110+
data type as a point set on the Earth's surface. A point set is a set
111+
of points, lines, and polygons on the WGS84 reference spheroid, with
112+
geodesic edges. See: https://cloud.google.com/bigquery/docs/geospatial-data
113+
114+
**Examples:**
115+
116+
>>> import bigframes as bpd
117+
>>> import bigframes.bigquery as bbq
118+
>>> import bigframes.geopandas
119+
>>> from shapely.geometry import Polygon, LineString, Point
120+
>>> bpd.options.display.progress_bar = None
121+
122+
We can check two GeoSeries against each other, row by row.
123+
124+
>>> s1 = bigframes.geopandas.GeoSeries(
125+
... [
126+
... Polygon([(0, 0), (2, 2), (0, 2)]),
127+
... Polygon([(0, 0), (2, 2), (0, 2)]),
128+
... LineString([(0, 0), (2, 2)]),
129+
... LineString([(2, 0), (0, 2)]),
130+
... Point(0, 1),
131+
... ],
132+
... )
133+
>>> s2 = bigframes.geopandas.GeoSeries(
134+
... [
135+
... Polygon([(0, 0), (1, 1), (0, 1)]),
136+
... LineString([(1, 0), (1, 3)]),
137+
... LineString([(2, 0), (0, 2)]),
138+
... Point(1, 1),
139+
... Point(0, 1),
140+
... ],
141+
... index=range(1, 6),
142+
... )
143+
144+
>>> s1
145+
0 POLYGON ((0 0, 2 2, 0 2, 0 0))
146+
1 POLYGON ((0 0, 2 2, 0 2, 0 0))
147+
2 LINESTRING (0 0, 2 2)
148+
3 LINESTRING (2 0, 0 2)
149+
4 POINT (0 1)
150+
dtype: geometry
151+
152+
>>> s2
153+
1 POLYGON ((0 0, 1 1, 0 1, 0 0))
154+
2 LINESTRING (1 0, 1 3)
155+
3 LINESTRING (2 0, 0 2)
156+
4 POINT (1 1)
157+
5 POINT (0 1)
158+
dtype: geometry
159+
160+
>>> bbq.st_difference(s1, s2)
161+
0 None
162+
1 POLYGON ((0.99954 1, 2 2, 0 2, 0 1, 0.99954 1))
163+
2 LINESTRING (0 0, 1 1.00046, 2 2)
164+
3 GEOMETRYCOLLECTION EMPTY
165+
4 POINT (0 1)
166+
5 None
167+
dtype: geometry
168+
169+
We can also compute the difference of single shapely geometries:
170+
171+
>>> sbq1 = bigframes.geopandas.GeoSeries(
172+
... [
173+
... Polygon([(0, 0), (10, 0), (10, 10), (0, 0)])
174+
... ]
175+
... )
176+
>>> sbq2 = bigframes.geopandas.GeoSeries(
177+
... [
178+
... Polygon([(4, 2), (6, 2), (8, 6), (4, 2)])
179+
... ]
180+
... )
181+
182+
>>> sbq1
183+
0 POLYGON ((0 0, 10 0, 10 10, 0 0))
184+
dtype: geometry
185+
186+
>>> sbq2
187+
0 POLYGON ((4 2, 6 2, 8 6, 4 2))
188+
dtype: geometry
189+
190+
>>> bbq.st_difference(sbq1, sbq2)
191+
0 POLYGON ((0 0, 10 0, 10 10, 0 0), (8 6, 6 2, 4...
192+
dtype: geometry
193+
194+
Additionally, we can compute the difference of a GeoSeries against a single shapely geometry:
195+
196+
>>> bbq.st_difference(s1, sbq2)
197+
0 POLYGON ((0 0, 2 2, 0 2, 0 0))
198+
1 None
199+
2 None
200+
3 None
201+
4 None
202+
dtype: geometry
203+
204+
Args:
205+
other (bigframes.series.Series or geometric object):
206+
The GeoSeries (elementwise) or geometric object to find the difference to.
207+
208+
Returns:
209+
bigframes.series.Series:
210+
A GeoSeries of the points in each aligned geometry that are not
211+
in other.
212+
"""
213+
return series._apply_binary_op(other, ops.geo_st_difference_op)

bigframes/blob/_functions.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,9 @@ def _output_bq_type(self):
6868

6969
def _create_udf(self):
7070
"""Create Python UDF in BQ. Return name of the UDF."""
71-
udf_name = str(self._session._loader._storage_manager._random_table())
71+
udf_name = str(
72+
self._session._loader._storage_manager.generate_unique_resource_id()
73+
)
7274

7375
func_body = inspect.getsource(self._func)
7476
func_name = self._func.__name__

bigframes/clients.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -94,16 +94,24 @@ def create_bq_connection(
9494
# https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function
9595
self._ensure_iam_binding(project_id, service_account_id, iam_role)
9696

97-
# Introduce retries to accommodate transient errors like etag mismatch,
98-
# which can be caused by concurrent operation on the same resource, and
99-
# manifests with message like:
100-
# google.api_core.exceptions.Aborted: 409 There were concurrent policy
101-
# changes. Please retry the whole read-modify-write with exponential
102-
# backoff. The request's ETag '\007\006\003,\264\304\337\272' did not match
103-
# the current policy's ETag '\007\006\003,\3750&\363'.
97+
# Introduce retries to accommodate transient errors like:
98+
# (1) Etag mismatch,
99+
# which can be caused by concurrent operation on the same resource, and
100+
# manifests with message like:
101+
# google.api_core.exceptions.Aborted: 409 There were concurrent policy
102+
# changes. Please retry the whole read-modify-write with exponential
103+
# backoff. The request's ETag '\007\006\003,\264\304\337\272' did not
104+
# match the current policy's ETag '\007\006\003,\3750&\363'.
105+
# (2) Connection creation,
106+
# for which sometimes it takes a bit for its service account to reflect
107+
# across APIs (e.g. b/397662004, b/386838767), before which, an attempt
108+
# to set an IAM policy for the service account may throw an error like:
109+
# google.api_core.exceptions.InvalidArgument: 400 Service account
110+
# bqcx-*@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not exist.
104111
@google.api_core.retry.Retry(
105112
predicate=google.api_core.retry.if_exception_type(
106-
google.api_core.exceptions.Aborted
113+
google.api_core.exceptions.Aborted,
114+
google.api_core.exceptions.InvalidArgument,
107115
),
108116
initial=10,
109117
maximum=20,

bigframes/core/blocks.py

Lines changed: 101 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from __future__ import annotations
2323

2424
import ast
25+
import copy
2526
import dataclasses
2627
import datetime
2728
import functools
@@ -30,6 +31,7 @@
3031
import textwrap
3132
import typing
3233
from typing import (
34+
Any,
3335
Iterable,
3436
List,
3537
Literal,
@@ -49,7 +51,7 @@
4951
import pyarrow as pa
5052

5153
from bigframes import session
52-
import bigframes._config.sampling_options as sampling_options
54+
from bigframes._config import sampling_options
5355
import bigframes.constants
5456
import bigframes.core as core
5557
import bigframes.core.compile.googlesql as googlesql
@@ -535,19 +537,9 @@ def to_pandas(
535537
Returns:
536538
pandas.DataFrame, QueryJob
537539
"""
538-
if (sampling_method is not None) and (sampling_method not in _SAMPLING_METHODS):
539-
raise NotImplementedError(
540-
f"The downsampling method {sampling_method} is not implemented, "
541-
f"please choose from {','.join(_SAMPLING_METHODS)}."
542-
)
543-
544-
sampling = bigframes.options.sampling.with_max_download_size(max_download_size)
545-
if sampling_method is not None:
546-
sampling = sampling.with_method(sampling_method).with_random_state( # type: ignore
547-
random_state
548-
)
549-
else:
550-
sampling = sampling.with_disabled()
540+
sampling = self._get_sampling_option(
541+
max_download_size, sampling_method, random_state
542+
)
551543

552544
df, query_job = self._materialize_local(
553545
materialize_options=MaterializationOptions(
@@ -559,6 +551,27 @@ def to_pandas(
559551
df.set_axis(self.column_labels, axis=1, copy=False)
560552
return df, query_job
561553

554+
def _get_sampling_option(
555+
self,
556+
max_download_size: Optional[int] = None,
557+
sampling_method: Optional[str] = None,
558+
random_state: Optional[int] = None,
559+
) -> sampling_options.SamplingOptions:
560+
561+
if (sampling_method is not None) and (sampling_method not in _SAMPLING_METHODS):
562+
raise NotImplementedError(
563+
f"The downsampling method {sampling_method} is not implemented, "
564+
f"please choose from {','.join(_SAMPLING_METHODS)}."
565+
)
566+
567+
sampling = bigframes.options.sampling.with_max_download_size(max_download_size)
568+
if sampling_method is None:
569+
return sampling.with_disabled()
570+
571+
return sampling.with_method(sampling_method).with_random_state( # type: ignore
572+
random_state
573+
)
574+
562575
def try_peek(
563576
self, n: int = 20, force: bool = False, allow_large_results=None
564577
) -> typing.Optional[pd.DataFrame]:
@@ -798,11 +811,73 @@ def split(
798811
return [sliced_block.drop_columns(drop_cols) for sliced_block in sliced_blocks]
799812

800813
def _compute_dry_run(
801-
self, value_keys: Optional[Iterable[str]] = None
802-
) -> bigquery.QueryJob:
814+
self,
815+
value_keys: Optional[Iterable[str]] = None,
816+
*,
817+
ordered: bool = True,
818+
max_download_size: Optional[int] = None,
819+
sampling_method: Optional[str] = None,
820+
random_state: Optional[int] = None,
821+
) -> typing.Tuple[pd.Series, bigquery.QueryJob]:
822+
sampling = self._get_sampling_option(
823+
max_download_size, sampling_method, random_state
824+
)
825+
if sampling.enable_downsampling:
826+
raise NotImplementedError("Dry run with sampling is not supported")
827+
828+
index: List[Any] = []
829+
values: List[Any] = []
830+
831+
index.append("columnCount")
832+
values.append(len(self.value_columns))
833+
index.append("columnDtypes")
834+
values.append(
835+
{
836+
col: self.expr.get_column_type(self.resolve_label_exact_or_error(col))
837+
for col in self.column_labels
838+
}
839+
)
840+
841+
index.append("indexLevel")
842+
values.append(self.index.nlevels)
843+
index.append("indexDtypes")
844+
values.append(self.index.dtypes)
845+
803846
expr = self._apply_value_keys_to_expr(value_keys=value_keys)
804-
query_job = self.session._executor.dry_run(expr)
805-
return query_job
847+
query_job = self.session._executor.dry_run(expr, ordered)
848+
job_api_repr = copy.deepcopy(query_job._properties)
849+
850+
job_ref = job_api_repr["jobReference"]
851+
for key, val in job_ref.items():
852+
index.append(key)
853+
values.append(val)
854+
855+
index.append("jobType")
856+
values.append(job_api_repr["configuration"]["jobType"])
857+
858+
query_config = job_api_repr["configuration"]["query"]
859+
for key in ("destinationTable", "useLegacySql"):
860+
index.append(key)
861+
values.append(query_config.get(key))
862+
863+
query_stats = job_api_repr["statistics"]["query"]
864+
for key in (
865+
"referencedTables",
866+
"totalBytesProcessed",
867+
"cacheHit",
868+
"statementType",
869+
):
870+
index.append(key)
871+
values.append(query_stats.get(key))
872+
873+
index.append("creationTime")
874+
values.append(
875+
pd.Timestamp(
876+
job_api_repr["statistics"]["creationTime"], unit="ms", tz="UTC"
877+
)
878+
)
879+
880+
return pd.Series(values, index=index), query_job
806881

807882
def _apply_value_keys_to_expr(self, value_keys: Optional[Iterable[str]] = None):
808883
expr = self._expr
@@ -2703,11 +2778,18 @@ def to_pandas(
27032778
"Cannot materialize index, as this object does not have an index. Set index column(s) using set_index."
27042779
)
27052780
ordered = ordered if ordered is not None else True
2781+
27062782
df, query_job = self._block.select_columns([]).to_pandas(
2707-
ordered=ordered, allow_large_results=allow_large_results
2783+
ordered=ordered,
2784+
allow_large_results=allow_large_results,
27082785
)
27092786
return df.index, query_job
27102787

2788+
def _compute_dry_run(
2789+
self, *, ordered: bool = True
2790+
) -> Tuple[pd.Series, bigquery.QueryJob]:
2791+
return self._block.select_columns([])._compute_dry_run(ordered=ordered)
2792+
27112793
def resolve_level(self, level: LevelsType) -> typing.Sequence[str]:
27122794
if utils.is_list_like(level):
27132795
levels = list(level)

0 commit comments

Comments
 (0)