
Commit 1c090cb

Merge pull request #808 from ScrapeGraphAI/main
allignment
2 parents: af901a5 + 98cf5f1

File tree

9 files changed: +34 -81 lines changed

.github/FUNDING.yml

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 
 github: ScrapeGraphAI
 patreon: # Replace with a single Patreon username
-open_collective:
+open_collective: https://opencollective.com/scrapegraphai
 ko_fi: # Replace with a single Ko-fi username
 tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
 community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
@@ -1,3 +1,5 @@
+## [1.30.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.29.0...v1.30.0) (2024-11-06)
+
 ## [1.30.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.30.0-beta.4...v1.30.0-beta.5) (2024-11-18)
 
 
@@ -36,6 +38,8 @@
 
 ### Features
 
+* update chromium ([38c6dd2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/38c6dd2aa1ce31b981eb8c35a56e9533d19df81b))
+
 * Turkish language support has been added to README.md ([60f673d](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/60f673dc39cba70706291e11211b9ad180860e24))
 
 ## [1.29.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.28.0...v1.29.0) (2024-11-04)

README.md

Lines changed: 4 additions & 0 deletions
@@ -12,6 +12,10 @@
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg?style=for-the-badge)](https://opensource.org/licenses/MIT)
 [![](https://dcbadge.vercel.app/api/server/gkxQDAjfeX)](https://discord.gg/gkxQDAjfeX)
 
+<p align="center">
+<a href="https://trendshift.io/repositories/9761" target="_blank"><img src="https://trendshift.io/api/badge/repositories/9761" alt="VinciGit00%2FScrapegraph-ai | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
+<p align="center">
+
 ScrapeGraphAI is a *web scraping* python library that uses LLM and direct graph logic to create scraping pipelines for websites and local documents (XML, HTML, JSON, Markdown, etc.).
 
 Just say which information you want to extract and the library will do it for you!

extract_data.py

Lines changed: 0 additions & 27 deletions
This file was deleted.

extracted_data.py

Lines changed: 0 additions & 28 deletions
This file was deleted.

pyproject.toml

Lines changed: 2 additions & 0 deletions
@@ -2,11 +2,13 @@
 name = "scrapegraphai"
 
 
+
 version = "1.30.0b5"
 
 
 
 
+
 description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
 authors = [
     { name = "Marco Vinciguerra", email = "[email protected]" },

scrapegraphai/docloaders/chromium.py

Lines changed: 14 additions & 9 deletions
@@ -1,6 +1,3 @@
-"""
-chromiumloader module
-"""
 import asyncio
 from typing import Any, AsyncIterator, Iterator, List, Optional
 from langchain_community.document_loaders.base import BaseLoader
@@ -12,15 +9,16 @@
 logger = get_logger("web-loader")
 
 class ChromiumLoader(BaseLoader):
-    """scrapes HTML pages from URLs using a (headless) instance of the
-    Chromium web driver with proxy protection
+    """Scrapes HTML pages from URLs using a (headless) instance of the
+    Chromium web driver with proxy protection.
 
     Attributes:
         backend: The web driver backend library; defaults to 'playwright'.
         browser_config: A dictionary containing additional browser kwargs.
-        headless: whether to run browser in headless mode.
+        headless: Whether to run browser in headless mode.
         proxy: A dictionary containing proxy settings; None disables protection.
         urls: A list of URLs to scrape content from.
+        requires_js_support: Flag to determine if JS rendering is required.
     """
 
     RETRY_LIMIT = 3
@@ -34,15 +32,17 @@ def __init__(
         headless: bool = True,
         proxy: Optional[Proxy] = None,
         load_state: str = "domcontentloaded",
+        requires_js_support: bool = False,
         **kwargs: Any,
     ):
         """Initialize the loader with a list of URL paths.
 
         Args:
             backend: The web driver backend library; defaults to 'playwright'.
-            headless: whether to run browser in headless mode.
+            headless: Whether to run browser in headless mode.
             proxy: A dictionary containing proxy information; None disables protection.
             urls: A list of URLs to scrape content from.
+            requires_js_support: Whether to use JS rendering for scraping.
             kwargs: A dictionary containing additional browser kwargs.
 
         Raises:
@@ -61,6 +61,7 @@ def __init__(
         self.proxy = parse_or_search_proxy(proxy) if proxy else None
         self.urls = urls
         self.load_state = load_state
+        self.requires_js_support = requires_js_support
 
     async def ascrape_undetected_chromedriver(self, url: str) -> str:
         """
@@ -186,7 +187,9 @@ def lazy_load(self) -> Iterator[Document]:
         Yields:
             Document: The scraped content encapsulated within a Document object.
         """
-        scraping_fn = getattr(self, f"ascrape_{self.backend}")
+        scraping_fn = (
+            self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
+        )
 
         for url in self.urls:
             html_content = asyncio.run(scraping_fn(url))
@@ -206,7 +209,9 @@ async def alazy_load(self) -> AsyncIterator[Document]:
             Document: A Document object containing the scraped content, along with its
             source URL as metadata.
         """
-        scraping_fn = getattr(self, f"ascrape_{self.backend}")
+        scraping_fn = (
+            self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
+        )
 
         tasks = [scraping_fn(url) for url in self.urls]
         results = await asyncio.gather(*tasks)
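With this change, both lazy_load and alazy_load route scraping through ascrape_with_js_support whenever requires_js_support is set, and otherwise fall back to the backend-specific coroutine via getattr(self, f"ascrape_{self.backend}"). A rough usage sketch of the new flag follows; it assumes scrapegraphai and Playwright browsers are installed, and the URL is purely illustrative.

    # Rough sketch: exercising the new requires_js_support flag (URL is illustrative).
    from scrapegraphai.docloaders.chromium import ChromiumLoader

    loader = ChromiumLoader(
        urls=["https://example.com"],
        backend="playwright",       # default backend
        headless=True,
        requires_js_support=True,   # lazy_load/alazy_load will pick ascrape_with_js_support
    )

    # With the flag off, the loader falls back to getattr(self, f"ascrape_{self.backend}").
    for doc in loader.lazy_load():
        print(doc.metadata["source"], len(doc.page_content))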

scrapegraphai/nodes/fetch_node.py

Lines changed: 6 additions & 4 deletions
@@ -170,8 +170,9 @@ def handle_file(self, state, input_type, source):
 
         compressed_document = self.load_file_content(source, input_type)
 
-        return self.update_state(state, compressed_document)
-
+        # return self.update_state(state, compressed_document)
+        state.update({self.output[0]: compressed_document})
+        return state
     def load_file_content(self, source, input_type):
         """
         Loads the content of a file based on its input type.
@@ -230,8 +231,9 @@ def handle_local_source(self, state, source):
             Document(page_content=parsed_content, metadata={"source": "local_dir"})
         ]
 
-        return self.update_state(state, compressed_document)
-
+        # return self.update_state(state, compressed_document)
+        state.update({self.output[0]: compressed_document})
+        return state
     def handle_web_source(self, state, source):
         """
         Handles the web source by fetching HTML content from a URL,
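In both handle_file and handle_local_source, the node now writes the loaded document straight into the shared graph state under its first output key and returns the state, instead of delegating to update_state. A minimal standalone sketch of that pattern follows; the key name and sample content are illustrative, not taken from the repository.

    # Minimal sketch of the direct state-update pattern used above (placeholder values).
    from typing import Any, Dict, List

    def store_result(state: Dict[str, Any], output: List[str], compressed_document: list) -> Dict[str, Any]:
        # Same shape as: state.update({self.output[0]: compressed_document}); return state
        state.update({output[0]: compressed_document})
        return state

    state = {"user_prompt": "Extract the page title"}
    state = store_result(state, ["doc"], ["<html><title>Example</title></html>"])
    print(state["doc"])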

scrapegraphai/nodes/generate_answer_node.py

Lines changed: 3 additions & 12 deletions
@@ -138,26 +138,17 @@ def invoke_with_timeout(chain, inputs, timeout):
             partial_variables={"context": doc, "format_instructions": format_instructions}
         )
         chain = prompt | self.llm_model
+
         try:
             raw_response = invoke_with_timeout(chain, {"question": user_prompt}, self.timeout)
         except Timeout:
             state.update({self.output[0]: {"error": "Response timeout exceeded"}})
             return state
 
         if output_parser:
-            try:
-                answer = output_parser.parse(raw_response.content)
-            except JSONDecodeError:
-                lines = raw_response.split('\n')
-                if lines[0].strip().startswith('```'):
-                    lines = lines[1:]
-                if lines[-1].strip().endswith('```'):
-                    lines = lines[:-1]
-                cleaned_response = '\n'.join(lines)
-                answer = output_parser.parse(cleaned_response)
-        else:
-            answer = raw_response.content
+            chain = chain | output_parser
 
+        answer = chain.invoke({"question": user_prompt})
         state.update({self.output[0]: answer})
         return state
 
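The manual fallback that stripped Markdown code fences before re-parsing is removed; instead the output parser is piped onto the runnable chain, so parsing happens inside chain.invoke. A standalone sketch of that composition with LangChain primitives follows; the model name, prompt text, and inputs are illustrative, and an OpenAI API key is assumed to be available in the environment.

    # Sketch of the prompt | llm | parser composition the node now relies on.
    from langchain_core.output_parsers import JsonOutputParser
    from langchain_core.prompts import PromptTemplate
    from langchain_openai import ChatOpenAI

    prompt = PromptTemplate(
        template="Context: {context}\nQuestion: {question}\nReturn a JSON object with an 'answer' key.",
        input_variables=["question"],
        partial_variables={"context": "ScrapeGraphAI is a web scraping python library."},
    )
    llm_model = ChatOpenAI(model="gpt-4o-mini")   # illustrative model choice
    output_parser = JsonOutputParser()            # handles fenced ```json output by itself

    chain = prompt | llm_model        # what the node builds first
    chain = chain | output_parser     # the step this commit adds when a parser is configured

    answer = chain.invoke({"question": "What is ScrapeGraphAI?"})
    print(answer)                     # already parsed into a dict; no manual fence stripping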
