Skip to content

feat: add blob.transcribe function #1773

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Jun 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 75 additions & 1 deletion bigframes/operations/blob.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from __future__ import annotations

import os
from typing import cast, Optional, Union
from typing import cast, Literal, Optional, Union
import warnings

import IPython.display as ipy_display
Expand Down Expand Up @@ -736,3 +736,77 @@ def pdf_chunk(
return struct_series
else:
return content_series

def audio_transcribe(
    self,
    *,
    connection: Optional[str] = None,
    model_name: Optional[
        Literal[
            "gemini-2.0-flash-001",
            "gemini-2.0-flash-lite-001",
        ]
    ] = None,
    verbose: bool = False,
) -> bigframes.series.Series:
    """
    Transcribe the audio blobs behind this accessor with a Gemini model.

    Args:
        connection (str or None, default None): BQ connection, forwarded
            to the Gemini model as its ``connection_name``. If None, uses
            default connection of the session.
        model_name (str or None, default None): The Gemini model to run.
            Accepted values are "gemini-2.0-flash-001" and
            "gemini-2.0-flash-lite-001". None is forwarded unchanged to
            ``GeminiTextGenerator``.
            See "https://ai.google.dev/gemini-api/docs/models" for model choices.
        verbose (bool, default False): Selects the shape of the result.
            When True, the model status (error messages) is returned
            together with the transcribed text. When False, only the
            transcribed text is returned.

    Returns:
        bigframes.series.Series: With ``verbose=False``, a series named
        "transcribed_content" holding the transcribed text. With
        ``verbose=True``, a struct series named "transcription_results"
        with the fields "status" and "content".
    """
    import bigframes.bigquery as bbq
    import bigframes.ml.llm as llm
    import bigframes.pandas as bpd

    # Wrap the accessor's underlying block in a Series; it is used both as
    # the model input and as the audio part of the prompt.
    audio_input = bigframes.series.Series(self._block)

    instructions = "**Task:** Transcribe the provided audio. **Instructions:** - Your response must contain only the verbatim transcription of the audio. - Do not include any introductory text, summaries, or conversational filler in your response. The output should begin directly with the first word of the audio."

    gemini = llm.GeminiTextGenerator(
        model_name=model_name,
        session=self._block.session,
        connection_name=connection,
    )

    # Run the model over every row; temperature 0.0 is requested so the
    # transcription is as deterministic as the model allows.
    predictions = gemini.predict(
        X=audio_input,
        prompt=[instructions, audio_input],
        temperature=0.0,
    )

    text_column = cast(bpd.Series, predictions["ml_generate_text_llm_result"])
    content = text_column.rename("transcribed_content")

    if not verbose:
        return content

    # Verbose mode: pair each transcription with the model's status column.
    status = cast(bpd.Series, predictions["ml_generate_text_status"])
    status_and_content = bpd.DataFrame(
        {
            "status": status,
            "content": content,
        }
    )
    return bbq.struct(status_and_content).rename("transcription_results")
Binary file added scripts/data/audio/audio_LJ001-0010.wav
Binary file not shown.
Binary file added scripts/data/pdfs/pdfs_sample-local-pdf.pdf
Binary file not shown.
Binary file added scripts/data/pdfs/test-protected.pdf
Binary file not shown.
14 changes: 14 additions & 0 deletions tests/system/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1521,3 +1521,17 @@ def pdf_mm_df(
pdf_gcs_path, session: bigframes.Session, bq_connection: str
) -> bpd.DataFrame:
return session.from_glob_path(pdf_gcs_path, name="pdf", connection=bq_connection)


@pytest.fixture(scope="session")
def audio_gcs_path() -> str:
    """Return the GCS glob pattern that matches the audio test objects."""
    audio_glob = "gs://bigframes_blob_test/audio/*"
    return audio_glob


@pytest.fixture(scope="session")
def audio_mm_df(
    audio_gcs_path, session: bigframes.Session, bq_connection: str
) -> bpd.DataFrame:
    """Build a DataFrame with an "audio" column from the audio glob path."""
    audio_frame = session.from_glob_path(
        audio_gcs_path,
        name="audio",
        connection=bq_connection,
    )
    return audio_frame
51 changes: 51 additions & 0 deletions tests/system/large/blob/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,3 +385,54 @@ def test_blob_pdf_chunk(
check_dtype=False,
check_index=False,
)


@pytest.mark.parametrize(
    "model_name, verbose",
    [
        # Every supported model, each with and without verbose output.
        (gemini_model, with_status)
        for gemini_model in ("gemini-2.0-flash-001", "gemini-2.0-flash-lite-001")
        for with_status in (True, False)
    ],
)
def test_blob_transcribe(
    audio_mm_df: bpd.DataFrame,
    model_name: str,
    verbose: bool,
):
    """Check that the first transcription roughly matches a reference sentence.

    Model output is not guaranteed to be verbatim, so the test only checks
    that the text length is within a relative tolerance of the reference
    and that a few major keywords are present.
    """
    transcribed = (
        audio_mm_df["audio"]
        .blob.audio_transcribe(model_name=model_name, verbose=verbose)
        .to_pandas()
    )

    # In verbose mode each row is a struct; the text lives under "content".
    first_row = transcribed[0]
    actual_text = first_row["content"] if verbose else first_row

    # check relative length
    reference_text = "Now, as all books not primarily intended as picture-books consist principally of types composed to form letterpress"
    expected_len = len(reference_text)
    tolerance = 0.2
    min_acceptable_len = expected_len * (1 - tolerance)
    max_acceptable_len = expected_len * (1 + tolerance)

    actual_len = len(actual_text)
    length_in_range = min_acceptable_len <= actual_len <= max_acceptable_len
    assert length_in_range, (
        f"Item (verbose={verbose}): Transcribed text length {actual_len} is outside the acceptable range "
        f"[{min_acceptable_len:.0f}, {max_acceptable_len:.0f}]. "
        f"Expected reference length was {expected_len}. "
    )

    # check for major keywords (case-insensitive)
    lowered_text = actual_text.lower()
    for keyword in ("book", "picture"):
        assert (
            keyword.lower() in lowered_text
        ), f"Item (verbose={verbose}): Expected keyword '{keyword}' not found in transcribed text. "