
Commit df4aa5f

Merge pull request #962 from lrdoflnlss/add-js-scraping
tune scraper
2 parents bdf813e + 98a7bab commit df4aa5f

File tree

3 files changed: +55 −58 lines changed


scrapegraphai/docloaders/chromium.py

+2 −1
@@ -360,7 +360,8 @@ async def ascrape_playwright(self, url: str, browser_name: str = "chromium") ->
             else:
                 raise ValueError(f"Invalid browser name: {browser_name}")
             context = await browser.new_context(
-                storage_state=self.storage_state
+                storage_state=self.storage_state,
+                ignore_https_errors=True,
             )
             await Malenia.apply_stealth(context)
             page = await context.new_page()
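For context, a minimal standalone Playwright sketch (not the project's loader; the target URL is only illustrative) of what ignore_https_errors=True buys: the context can navigate pages served with self-signed or otherwise invalid TLS certificates instead of failing the request.

# Minimal sketch outside ScrapeGraphAI: a Playwright context created with
# ignore_https_errors=True can load a host with an invalid certificate.
import asyncio
from playwright.async_api import async_playwright

async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        context = await browser.new_context(ignore_https_errors=True)
        page = await context.new_page()
        # Example host that intentionally serves a self-signed certificate.
        await page.goto("https://self-signed.badssl.com/")
        print(await page.title())
        await browser.close()

asyncio.run(main())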

scrapegraphai/utils/cleanup_html.py

+41 −7
@@ -3,12 +3,44 @@
 """
 
 import re
+import json
 from urllib.parse import urljoin
 
 from bs4 import BeautifulSoup, Comment
 from minify_html import minify
 
 
+def extract_from_script_tags(soup):
+    script_content = []
+
+    for script in soup.find_all("script"):
+        content = script.string
+        if content:
+            try:
+                json_pattern = r'(?:const|let|var)?\s*\w+\s*=\s*({[\s\S]*?});?$'
+                json_matches = re.findall(json_pattern, content)
+
+                for potential_json in json_matches:
+                    try:
+                        parsed = json.loads(potential_json)
+                        if parsed:
+                            script_content.append(f"JSON data from script: {json.dumps(parsed, indent=2)}")
+                    except json.JSONDecodeError:
+                        pass
+
+                if "window." in content or "document." in content:
+                    data_pattern = r'(?:window|document)\.(\w+)\s*=\s*([^;]+);'
+                    data_matches = re.findall(data_pattern, content)
+
+                    for var_name, var_value in data_matches:
+                        script_content.append(f"Dynamic data - {var_name}: {var_value.strip()}")
+            except Exception:
+                if len(content) < 1000:
+                    script_content.append(f"Script content: {content.strip()}")
+
+    return "\n\n".join(script_content)
+
+
 def cleanup_html(html_content: str, base_url: str) -> str:
     """
     Processes HTML content by removing unnecessary tags,
@@ -34,8 +66,10 @@ def cleanup_html(html_content: str, base_url: str) -> str:
 
     title_tag = soup.find("title")
     title = title_tag.get_text() if title_tag else ""
-
-    for tag in soup.find_all(["script", "style"]):
+
+    script_content = extract_from_script_tags(soup)
+
+    for tag in soup.find_all("style"):
         tag.extract()
 
     link_urls = [
@@ -54,7 +88,7 @@ def cleanup_html(html_content: str, base_url: str) -> str:
     body_content = soup.find("body")
     if body_content:
         minimized_body = minify(str(body_content))
-        return title, minimized_body, link_urls, image_urls
+        return title, minimized_body, link_urls, image_urls, script_content
 
     else:
         raise ValueError(
@@ -106,10 +140,10 @@ def reduce_html(html, reduction):
     for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
         comment.extract()
 
-    for tag in soup(["script", "style"]):
+    for tag in soup(["style"]):
         tag.string = ""
 
-    attrs_to_keep = ["class", "id", "href", "src"]
+    attrs_to_keep = ["class", "id", "href", "src", "type"]
     for tag in soup.find_all(True):
         for attr in list(tag.attrs):
             if attr not in attrs_to_keep:
@@ -118,15 +152,15 @@ def reduce_html(html, reduction):
     if reduction == 1:
         return minify_html(str(soup))
 
-    for tag in soup(["script", "style"]):
+    for tag in soup(["style"]):
         tag.decompose()
 
     body = soup.body
     if not body:
         return "No <body> tag found in the HTML"
 
     for tag in body.find_all(string=True):
-        if tag.parent.name not in ["script", "style"]:
+        if tag.parent.name not in ["script"]:
             tag.replace_with(re.sub(r"\s+", " ", tag.strip())[:20])
 
     reduced_html = str(body)
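As a rough usage sketch, assuming this commit is installed, the new extract_from_script_tags helper can be exercised on its own; the HTML snippet below is made up for illustration.

# Illustrative check of the new helper on a fabricated page.
from bs4 import BeautifulSoup

from scrapegraphai.utils.cleanup_html import extract_from_script_tags

html = """
<html><body>
  <script>const config = {"apiUrl": "https://example.com/api", "retries": 3};</script>
  <script>window.userCount = 42;</script>
</body></html>
"""

soup = BeautifulSoup(html, "html.parser")
print(extract_from_script_tags(soup))
# Expected output (roughly): the config object pretty-printed as
# "JSON data from script: {...}", followed by "Dynamic data - userCount: 42".

Since cleanup_html now returns script_content as a fifth element, any caller unpacking the previous four-tuple has to be adjusted accordingly.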

scrapegraphai/utils/proxy_rotation.py

+12 −50
@@ -10,7 +10,7 @@
 import requests
 from fp.errors import FreeProxyException
 from fp.fp import FreeProxy
-
+from urllib.parse import urlparse
 
 class ProxyBrokerCriteria(TypedDict, total=False):
     """
@@ -188,59 +188,21 @@ def is_ipv4_address(address: str) -> bool:
 
 
 def parse_or_search_proxy(proxy: Proxy) -> ProxySettings:
-    """parses a proxy configuration or searches for a new one matching
-    the specified broker criteria
-
-    Args:
-        proxy: The proxy configuration to parse or search for.
-
-    Returns:
-        A 'playwright' compliant proxy configuration.
-
-    Notes:
-        - If the proxy server is a IP address, it is assumed to be
-        a proxy server address.
-        - If the proxy server is 'broker', a proxy server is searched for
-        based on the provided broker criteria.
-
-    Example:
-        >>> proxy = {
-        ...     "server": "broker",
-        ...     "criteria": {
-        ...         "anonymous": True,
-        ...         "countryset": {"GB", "US"},
-        ...         "secure": True,
-        ...         "timeout": 5.0
-        ...         "search_outside_if_empty": False
-        ...     }
-        ... }
-
-        >>> parse_or_search_proxy(proxy)
-        {
-            "server": "<proxy-server-matching-criteria>",
-        }
-
-    Example:
-        >>> proxy = {
-        ...     "server": "192.168.1.1:8080",
-        ...     "username": "<username>",
-        ...     "password": "<password>"
-        ... }
-
-        >>> parse_or_search_proxy(proxy)
-        {
-            "server": "192.168.1.1:8080",
-            "username": "<username>",
-            "password": "<password>"
-        }
     """
-    assert "server" in proxy, "missing server in the proxy configuration"
+    Parses a proxy configuration or searches for a matching one via broker.
+    """
+    assert "server" in proxy, "Missing 'server' field in the proxy configuration."
+
+    parsed_url = urlparse(proxy["server"])
+    server_address = parsed_url.hostname
 
-    server_address = re.sub(r"^\w+://", "", proxy["server"]).split(":", maxsplit=1)[0]
+    if server_address is None:
+        raise ValueError(f"Invalid proxy server format: {proxy['server']}")
 
-    if is_ipv4_address(server_address):
+    # Accept both IP addresses and domain names like 'gate.nodemaven.com'
+    if is_ipv4_address(server_address) or re.match(r"^[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$", server_address):
         return _parse_proxy(proxy)
 
-    assert proxy["server"] == "broker", "unknown proxy server"
+    assert proxy["server"] == "broker", f"Unknown proxy server type: {proxy['server']}"
 
     return _search_proxy(proxy)
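A rough sketch of how the reworked parse_or_search_proxy behaves with a hostname-based proxy; the credentials and port below are made up, and gate.nodemaven.com is taken from the comment in the diff. The server field must carry a scheme so that urlparse().hostname resolves it.

# Hypothetical call: with this commit, domain names pass the new check
# (previously only IPv4 addresses or the literal "broker" were accepted).
from scrapegraphai.utils.proxy_rotation import parse_or_search_proxy

proxy = {
    "server": "http://gate.nodemaven.com:8080",
    "username": "<username>",
    "password": "<password>",
}

settings = parse_or_search_proxy(proxy)
print(settings)  # Playwright-compliant settings keeping server, username, password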
