googleapis · shuoweil · Jun 6, 2025 · May 20, 2025 · May 22, 2025 · May 23, 2025
@@ -15,7 +15,7 @@
 from __future__ import annotations
 
 import os
-from typing import cast, Optional, Union
+from typing import cast, Literal, Optional, Union
 import warnings
 
 import IPython.display as ipy_display
@@ -736,3 +736,77 @@ def pdf_chunk(
             return struct_series
         else:
             return content_series
+
+    def audio_transcribe(
+        self,
+        *,
+        connection: Optional[str] = None,
+        model_name: Optional[
+            Literal[
+                "gemini-2.0-flash-001",
+                "gemini-2.0-flash-lite-001",
+            ]
+        ] = None,
+        verbose: bool = False,
+    ) -> bigframes.series.Series:
+        """
+        Transcribe audio content using a Gemini multimodal model.
+
+        Args:
+            connection (str or None, default None): BQ connection handed to
+                the Gemini model as its connection name. If None, uses
+                default connection of the session.
+            model_name (str or None, default None): The Gemini model used
+                for transcription. Accepted values are
+                "gemini-2.0-flash-lite-001", and "gemini-2.0-flash-001".
+                See "https://ai.google.dev/gemini-api/docs/models" for model choices.
+            verbose (bool, default False): controls the verbosity of the output.
+                When set to True, each row is a struct holding the status
+                reported for the row ("status") and its text ("content").
+                When set to False, only the transcribed text is returned.
+
+        Returns:
+            bigframes.series.Series: str or struct[str, str],
+                depending on the "verbose" parameter. The str Series is named
+                "transcribed_content"; the struct Series is named
+                "transcription_results".
+        """
+        import bigframes.bigquery as bbq
+        import bigframes.ml.llm as llm
+        import bigframes.pandas as bpd
+
+        # Single literal on purpose: this is the exact instruction text sent.
+        prompt_text = "**Task:** Transcribe the provided audio. **Instructions:** - Your response must contain only the verbatim transcription of the audio. - Do not include any introductory text, summaries, or conversational filler in your response. The output should begin directly with the first word of the audio."
+
+        # Wrap this accessor's block in a Series so the model can consume it.
+        audio_series = bigframes.series.Series(self._block)
+
+        transcriber = llm.GeminiTextGenerator(
+            model_name=model_name,
+            session=self._block.session,
+            connection_name=connection,
+        )
+
+        # transcribe audio using ML.GENERATE_TEXT
+        predictions = transcriber.predict(
+            X=audio_series,
+            prompt=[prompt_text, audio_series],
+            temperature=0.0,
+        )
+
+        content = cast(bpd.Series, predictions["ml_generate_text_llm_result"])
+        content = content.rename("transcribed_content")
+
+        # Non-verbose callers only get the transcribed text.
+        if not verbose:
+            return content
+
+        # Verbose callers get status and text packed into one struct column.
+        status = cast(bpd.Series, predictions["ml_generate_text_status"])
+        status_and_content = bpd.DataFrame(
+            {
+                "status": status,
+                "content": content,
+            }
+        )
+        return bbq.struct(status_and_content).rename("transcription_results")
@@ -1521,3 +1521,17 @@ def pdf_mm_df(
     pdf_gcs_path, session: bigframes.Session, bq_connection: str
 ) -> bpd.DataFrame:
     return session.from_glob_path(pdf_gcs_path, name="pdf", connection=bq_connection)
+
+
+@pytest.fixture(scope="session")  # evaluated once per test session
+def audio_gcs_path() -> str:
+    return "gs://bigframes_blob_test/audio/*"  # GCS glob consumed by audio_mm_df
+
+
+@pytest.fixture(scope="session")  # built once and reused across the session
+def audio_mm_df(
+    audio_gcs_path: str, session: bigframes.Session, bq_connection: str
+) -> bpd.DataFrame:
+    return session.from_glob_path(  # resulting column is named "audio"
+        audio_gcs_path, name="audio", connection=bq_connection
+    )
@@ -385,3 +385,54 @@ def test_blob_pdf_chunk(
         check_dtype=False,
         check_index=False,
     )
+
+
+@pytest.mark.parametrize(
+    "model_name, verbose",
+    [
+        ("gemini-2.0-flash-001", True),
+        ("gemini-2.0-flash-001", False),
+        ("gemini-2.0-flash-lite-001", True),
+        ("gemini-2.0-flash-lite-001", False),
+    ],
+)
+def test_blob_transcribe(
+    audio_mm_df: bpd.DataFrame,
+    model_name: str,
+    verbose: bool,
+):
+    """Transcribe the audio fixture and loosely check the entry at index 0.
+
+    Model output is not pinned verbatim: only its length (within 20% of a
+    reference sentence) and the presence of two keywords are asserted, for
+    each parametrized model and ``verbose`` setting.
+    """
+    transcribe = audio_mm_df["audio"].blob.audio_transcribe
+    actual = transcribe(model_name=model_name, verbose=verbose).to_pandas()
+
+    # verbose=True yields a struct per row; the text is its "content" field.
+    first_row = actual[0]
+    actual_text = first_row["content"] if verbose else first_row
+    actual_len = len(actual_text)
+
+    # check relative length
+    expected_text = "Now, as all books not primarily intended as picture-books consist principally of types composed to form letterpress"
+    expected_len = len(expected_text)
+
+    # accept lengths within +/-20% of the reference sentence
+    relative_length_tolerance = 0.2
+    min_acceptable_len = expected_len * (1 - relative_length_tolerance)
+    max_acceptable_len = expected_len * (1 + relative_length_tolerance)
+    assert min_acceptable_len <= actual_len <= max_acceptable_len, (
+        f"Item (verbose={verbose}): Transcribed text length {actual_len} is outside the acceptable range "
+        f"[{min_acceptable_len:.0f}, {max_acceptable_len:.0f}]. "
+        f"Expected reference length was {expected_len}. "
+    )
+
+    # check for major keywords
+    major_keywords = ("book", "picture")
+    lowered_text = actual_text.lower()
+    for keyword in major_keywords:
+        assert (
+            keyword.lower() in lowered_text
+        ), f"Item (verbose={verbose}): Expected keyword '{keyword}' not found in transcribed text. "