
Commit 1c090cb

Merge pull request #808 from ScrapeGraphAI/main
allignment
2 parents: af901a5 + 98cf5f1

File tree

9 files changed: +34 -81 lines changed

.github/FUNDING.yml

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 
 github: ScrapeGraphAI
 patreon: # Replace with a single Patreon username
-open_collective:
+open_collective: https://opencollective.com/scrapegraphai
 ko_fi: # Replace with a single Ko-fi username
 tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
 community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
@@ -1,3 +1,5 @@
+## [1.30.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.29.0...v1.30.0) (2024-11-06)
+
 ## [1.30.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.30.0-beta.4...v1.30.0-beta.5) (2024-11-18)
 
 
@@ -36,6 +38,8 @@
 
 ### Features
 
+* update chromium ([38c6dd2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/38c6dd2aa1ce31b981eb8c35a56e9533d19df81b))
+
 * Turkish language support has been added to README.md ([60f673d](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/60f673dc39cba70706291e11211b9ad180860e24))
 
 ## [1.29.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.28.0...v1.29.0) (2024-11-04)

README.md

Lines changed: 4 additions & 0 deletions
@@ -12,6 +12,10 @@
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg?style=for-the-badge)](https://opensource.org/licenses/MIT)
 [![](https://dcbadge.vercel.app/api/server/gkxQDAjfeX)](https://discord.gg/gkxQDAjfeX)
 
+<p align="center">
+<a href="https://trendshift.io/repositories/9761" target="_blank"><img src="https://trendshift.io/api/badge/repositories/9761" alt="VinciGit00%2FScrapegraph-ai | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
+<p align="center">
+
 ScrapeGraphAI is a *web scraping* python library that uses LLM and direct graph logic to create scraping pipelines for websites and local documents (XML, HTML, JSON, Markdown, etc.).
 
 Just say which information you want to extract and the library will do it for you!

extract_data.py

Lines changed: 0 additions & 27 deletions
This file was deleted.

extracted_data.py

Lines changed: 0 additions & 28 deletions
This file was deleted.

pyproject.toml

Lines changed: 2 additions & 0 deletions
@@ -2,11 +2,13 @@
 name = "scrapegraphai"
 
 
+
 version = "1.30.0b5"
 
 
 
 
+
 description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
 authors = [
     { name = "Marco Vinciguerra", email = "[email protected]" },

scrapegraphai/docloaders/chromium.py

Lines changed: 14 additions & 9 deletions
@@ -1,6 +1,3 @@
-"""
-chromiumloader module
-"""
 import asyncio
 from typing import Any, AsyncIterator, Iterator, List, Optional
 from langchain_community.document_loaders.base import BaseLoader
@@ -12,15 +9,16 @@
 logger = get_logger("web-loader")
 
 class ChromiumLoader(BaseLoader):
-    """scrapes HTML pages from URLs using a (headless) instance of the
-    Chromium web driver with proxy protection
+    """Scrapes HTML pages from URLs using a (headless) instance of the
+    Chromium web driver with proxy protection.
 
     Attributes:
         backend: The web driver backend library; defaults to 'playwright'.
         browser_config: A dictionary containing additional browser kwargs.
-        headless: whether to run browser in headless mode.
+        headless: Whether to run browser in headless mode.
         proxy: A dictionary containing proxy settings; None disables protection.
         urls: A list of URLs to scrape content from.
+        requires_js_support: Flag to determine if JS rendering is required.
     """
 
     RETRY_LIMIT = 3
@@ -34,15 +32,17 @@ def __init__(
         headless: bool = True,
         proxy: Optional[Proxy] = None,
         load_state: str = "domcontentloaded",
+        requires_js_support: bool = False,
         **kwargs: Any,
     ):
         """Initialize the loader with a list of URL paths.
 
         Args:
             backend: The web driver backend library; defaults to 'playwright'.
-            headless: whether to run browser in headless mode.
+            headless: Whether to run browser in headless mode.
             proxy: A dictionary containing proxy information; None disables protection.
             urls: A list of URLs to scrape content from.
+            requires_js_support: Whether to use JS rendering for scraping.
             kwargs: A dictionary containing additional browser kwargs.
 
         Raises:
@@ -61,6 +61,7 @@ def __init__(
         self.proxy = parse_or_search_proxy(proxy) if proxy else None
         self.urls = urls
         self.load_state = load_state
+        self.requires_js_support = requires_js_support
 
     async def ascrape_undetected_chromedriver(self, url: str) -> str:
         """
@@ -186,7 +187,9 @@ def lazy_load(self) -> Iterator[Document]:
         Yields:
             Document: The scraped content encapsulated within a Document object.
         """
-        scraping_fn = getattr(self, f"ascrape_{self.backend}")
+        scraping_fn = (
+            self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
+        )
 
         for url in self.urls:
             html_content = asyncio.run(scraping_fn(url))
@@ -206,7 +209,9 @@ async def alazy_load(self) -> AsyncIterator[Document]:
             Document: A Document object containing the scraped content, along with its
             source URL as metadata.
         """
-        scraping_fn = getattr(self, f"ascrape_{self.backend}")
+        scraping_fn = (
+            self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
+        )
 
         tasks = [scraping_fn(url) for url in self.urls]
         results = await asyncio.gather(*tasks)
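With this change, both lazy_load and alazy_load route scraping through ascrape_with_js_support whenever requires_js_support is set, and otherwise fall back to the backend-specific coroutine via getattr(self, f"ascrape_{self.backend}"). A rough usage sketch of the new flag follows; it assumes scrapegraphai and Playwright browsers are installed, and the URL is purely illustrative.

    # Rough sketch: exercising the new requires_js_support flag (URL is illustrative).
    from scrapegraphai.docloaders.chromium import ChromiumLoader

    loader = ChromiumLoader(
        urls=["https://example.com"],
        backend="playwright",       # default backend
        headless=True,
        requires_js_support=True,   # lazy_load/alazy_load will pick ascrape_with_js_support
    )

    # With the flag off, the loader falls back to getattr(self, f"ascrape_{self.backend}").
    for doc in loader.lazy_load():
        print(doc.metadata["source"], len(doc.page_content))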

scrapegraphai/nodes/fetch_node.py

Lines changed: 6 additions & 4 deletions
@@ -170,8 +170,9 @@ def handle_file(self, state, input_type, source):
 
         compressed_document = self.load_file_content(source, input_type)
 
-        return self.update_state(state, compressed_document)
-
+        # return self.update_state(state, compressed_document)
+        state.update({self.output[0]: compressed_document})
+        return state
     def load_file_content(self, source, input_type):
         """
         Loads the content of a file based on its input type.
@@ -230,8 +231,9 @@ def handle_local_source(self, state, source):
             Document(page_content=parsed_content, metadata={"source": "local_dir"})
         ]
 
-        return self.update_state(state, compressed_document)
-
+        # return self.update_state(state, compressed_document)
+        state.update({self.output[0]: compressed_document})
+        return state
     def handle_web_source(self, state, source):
         """
         Handles the web source by fetching HTML content from a URL,
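In both handle_file and handle_local_source, the node now writes the loaded document straight into the shared graph state under its first output key and returns the state, instead of delegating to update_state. A minimal standalone sketch of that pattern follows; the key name and sample content are illustrative, not taken from the repository.

    # Minimal sketch of the direct state-update pattern used above (placeholder values).
    from typing import Any, Dict, List

    def store_result(state: Dict[str, Any], output: List[str], compressed_document: list) -> Dict[str, Any]:
        # Same shape as: state.update({self.output[0]: compressed_document}); return state
        state.update({output[0]: compressed_document})
        return state

    state = {"user_prompt": "Extract the page title"}
    state = store_result(state, ["doc"], ["<html><title>Example</title></html>"])
    print(state["doc"])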

scrapegraphai/nodes/generate_answer_node.py

Lines changed: 3 additions & 12 deletions
@@ -138,26 +138,17 @@ def invoke_with_timeout(chain, inputs, timeout):
             partial_variables={"context": doc, "format_instructions": format_instructions}
         )
         chain = prompt | self.llm_model
+
         try:
             raw_response = invoke_with_timeout(chain, {"question": user_prompt}, self.timeout)
         except Timeout:
             state.update({self.output[0]: {"error": "Response timeout exceeded"}})
             return state
 
         if output_parser:
-            try:
-                answer = output_parser.parse(raw_response.content)
-            except JSONDecodeError:
-                lines = raw_response.split('\n')
-                if lines[0].strip().startswith('```'):
-                    lines = lines[1:]
-                if lines[-1].strip().endswith('```'):
-                    lines = lines[:-1]
-                cleaned_response = '\n'.join(lines)
-                answer = output_parser.parse(cleaned_response)
-        else:
-            answer = raw_response.content
+            chain = chain | output_parser
 
+        answer = chain.invoke({"question": user_prompt})
         state.update({self.output[0]: answer})
         return state
 
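The manual fallback that stripped Markdown code fences before re-parsing is removed; instead the output parser is piped onto the runnable chain, so parsing happens inside chain.invoke. A standalone sketch of that composition with LangChain primitives follows; the model name, prompt text, and inputs are illustrative, and an OpenAI API key is assumed to be available in the environment.

    # Sketch of the prompt | llm | parser composition the node now relies on.
    from langchain_core.output_parsers import JsonOutputParser
    from langchain_core.prompts import PromptTemplate
    from langchain_openai import ChatOpenAI

    prompt = PromptTemplate(
        template="Context: {context}\nQuestion: {question}\nReturn a JSON object with an 'answer' key.",
        input_variables=["question"],
        partial_variables={"context": "ScrapeGraphAI is a web scraping python library."},
    )
    llm_model = ChatOpenAI(model="gpt-4o-mini")   # illustrative model choice
    output_parser = JsonOutputParser()            # handles fenced ```json output by itself

    chain = prompt | llm_model        # what the node builds first
    chain = chain | output_parser     # the step this commit adds when a parser is configured

    answer = chain.invoke({"question": "What is ScrapeGraphAI?"})
    print(answer)                     # already parsed into a dict; no manual fence stripping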
