 from ...utils import (
     add_start_docstrings_to_model_forward,
     logging,
-    replace_return_docstrings,
 )
 from ...utils.deprecation import deprecate_kwarg
 from ..auto import AutoModelForImageTextToText
             Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
             this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
             the complete sequence length.
-
-    Returns:
-        A `ShieldGemma2ImageClassifierOutputWithNoAttention` instance continaing the logits and probabilities
-        associated with the model predicting the `Yes` or `No` token as the response to that prompt, captured in the
-        following properties.
-
-        * `logits` (`torch.Tensor` of shape `(batch_size, 2)`):
-            The first position along dim=1 is the logits for the `Yes` token and the second position along dim=1 is
-            the logits for the `No` token.
-        * `probabilities` (`torch.Tensor` of shape `(batch_size, 2)`):
-            The first position along dim=1 is the probability of predicting the `Yes` token and the second position
-            along dim=1 is the probability of predicting the `No` token.
-
-        ShieldGemma prompts are constructed such that predicting the `Yes` token means the content *does violate* the
-        policy as described. If you are only interested in the violative condition, use
-        `violated = outputs.probabilities[:, 1]` to extract that slice from the output tensors.
-
-        When used with the `ShieldGemma2Processor`, the `batch_size` will be equal to `len(images) * len(policies)`,
-        and the order within the batch will be img1_policy1, ... img1_policyN, ... imgM_policyN.
 """


@@ -172,9 +152,6 @@ def tie_weights(self):
 
     @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
     @add_start_docstrings_to_model_forward(SHIELDGEMMA2_INPUTS_DOCSTRING)
-    @replace_return_docstrings(
-        output_type=ShieldGemma2ImageClassifierOutputWithNoAttention, config_class=_CONFIG_FOR_DOC
-    )
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -193,9 +170,26 @@ def forward(
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **lm_kwargs,
     ) -> ShieldGemma2ImageClassifierOutputWithNoAttention:
-        """Predicts the binary probability that the image violates the speicfied policy.
+        """Predicts the binary probability that the image violates the specified policy.
 
         Returns:
+            A `ShieldGemma2ImageClassifierOutputWithNoAttention` instance containing the logits and probabilities
+            associated with the model predicting the `Yes` or `No` token as the response to that prompt, captured in the
+            following properties.
+
+            * `logits` (`torch.Tensor` of shape `(batch_size, 2)`):
+                The first position along dim=1 is the logits for the `Yes` token and the second position along dim=1 is
+                the logits for the `No` token.
+            * `probabilities` (`torch.Tensor` of shape `(batch_size, 2)`):
+                The first position along dim=1 is the probability of predicting the `Yes` token and the second position
+                along dim=1 is the probability of predicting the `No` token.
+
+            ShieldGemma prompts are constructed such that predicting the `Yes` token means the content *does violate* the
+            policy as described. If you are only interested in the violative condition, use
+            `violated = outputs.probabilities[:, 1]` to extract that slice from the output tensors.
+
+            When used with the `ShieldGemma2Processor`, the `batch_size` will be equal to `len(images) * len(policies)`,
+            and the order within the batch will be img1_policy1, ... img1_policyN, ... imgM_policyN.
         """
         outputs = self.model(
             input_ids=input_ids,
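For reviewers, a quick sanity check of the output contract documented in the relocated docstring. The sketch below is illustrative only: the checkpoint id and the reliance on the processor's built-in default policies are assumptions, not part of this diff, and the slicing recipe is the one quoted in the docstring above.

# Minimal usage sketch (assumed checkpoint id and default processor policies).
import requests
from PIL import Image
from transformers import AutoProcessor, ShieldGemma2ForImageClassification

model_id = "google/shieldgemma-2-4b-it"  # assumed checkpoint id, for illustration only
processor = AutoProcessor.from_pretrained(model_id)
model = ShieldGemma2ForImageClassification.from_pretrained(model_id, device_map="auto")

# Any PIL image works; the URL here is a placeholder.
image = Image.open(requests.get("https://example.com/image.png", stream=True).raw)

# With no explicit policies, the processor pairs the image with its built-in default
# policies, so batch_size == len(images) * len(policies) in the order described above.
inputs = processor(images=[image], return_tensors="pt").to(model.device)

outputs = model(**inputs)
print(outputs.probabilities)             # shape (batch_size, 2): Yes/No probabilities per image-policy pair
violated = outputs.probabilities[:, 1]   # slice suggested by the docstring for the violative condition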