
Commit 1c8b910

Author: Alin Cristian Preda

feat: added scrolling method to chromium docloader

1 parent fde878f · commit 1c8b910

1 file changed: +137, -4 lines

scrapegraphai/docloaders/chromium.py

@@ -4,9 +4,11 @@
 from langchain_core.documents import Document
 import aiohttp
 import async_timeout
+from typing import Union
 from ..utils import Proxy, dynamic_import, get_logger, parse_or_search_proxy
 
 logger = get_logger("web-loader")
+logger.setLevel("INFO")
 
 class ChromiumLoader(BaseLoader):
     """Scrapes HTML pages from URLs using a (headless) instance of the
@@ -97,14 +99,144 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:
 
         return results
 
+    async def ascrape_playwright_scroll(
+        self,
+        url: str,
+        timeout: Union[int, None] = 30,
+        scroll: int = 15000,
+        sleep: float = 2,
+        scroll_to_bottom: bool = False
+    ) -> str:
+        """
+        Asynchronously scrape the content of a given URL using Playwright's async API and scrolling.
+
+        Notes:
+        - The user gets to decide between scrolling to the bottom of the page or scrolling for a finite amount of time.
+        - If the user chooses to scroll to the bottom, the scraper stops when the page height stops changing or when
+          the timeout is reached. In this case, the user should opt for an appropriately large timeout value.
+        - Sleep must be set to a value greater than 0 to allow lazy-loaded content to load.
+          Additionally, when used with scroll_to_bottom=True, the sleep value should be set higher, to
+          make sure that the scrolling actually happens and the page height gets a chance to change.
+        - Probably the best website to test this is https://www.reddit.com/ as it has infinite scrolling.
+
+        Args:
+        - url (str): The URL to scrape.
+        - timeout (Union[int, None]): The maximum time to spend scrolling, separate from the global timeout. If set, it must be greater than 0.
+          Can also be set to None, in which case the scraper only stops when the page height stops changing.
+        - scroll (int): The number of pixels to scroll down by. Defaults to 15000. Cannot be less than 5000 pixels;
+          less than this and we don't scroll enough to see any content change.
+        - sleep (float): The number of seconds to sleep after each scroll, to allow the page to load.
+          Defaults to 2. Must be greater than 0.
+
+        Returns:
+            str: The scraped HTML content
+
+        Raises:
+        - ValueError: If the timeout value is less than or equal to 0.
+        - ValueError: If the sleep value is less than or equal to 0.
+        - ValueError: If the scroll value is less than 5000.
+        """
+        # NB: I have tested using scrollHeight to determine when to stop scrolling,
+        # but it doesn't always work as expected. The page height doesn't change on some sites like
+        # https://www.steelwood.amsterdam/. The site does not scroll to the bottom.
+        # In my browser I can scroll vertically, but in Chromium it scrolls horizontally?!?
+
+        if timeout and timeout <= 0:
+            raise ValueError("If set, timeout value for scrolling scraper must be greater than 0.")
+
+        if sleep <= 0:
+            raise ValueError("Sleep value for scrolling scraper must be greater than 0.")
+
+        if scroll < 5000:
+            raise ValueError("Scroll value for scrolling scraper must be greater than or equal to 5000.")
+
+        from playwright.async_api import async_playwright
+        from undetected_playwright import Malenia
+        import time
+
+        logger.info(f"Starting scraping with scrolling support for {url}...")
+
+        results = ""
+        attempt = 0
+
+        while attempt < self.RETRY_LIMIT:
+            try:
+                async with async_playwright() as p:
+                    browser = await p.chromium.launch(
+                        headless=self.headless, proxy=self.proxy, **self.browser_config
+                    )
+                    context = await browser.new_context()
+                    await Malenia.apply_stealth(context)
+                    page = await context.new_page()
+                    await page.goto(url, wait_until="domcontentloaded")
+                    await page.wait_for_load_state(self.load_state)
+
+                    previous_height = None
+                    start_time = time.time()
+
+                    # Store the heights of the page after each scroll.
+                    # This is useful in case we scroll with a timer and want to stop shortly after reaching the bottom,
+                    # or simply when the page stops changing for some reason.
+                    heights = []
+
+                    while True:
+                        current_height = await page.evaluate("document.body.scrollHeight")
+                        heights.append(current_height)
+                        heights = heights[-5:]  # Keep only the last 5 heights, to not run out of memory
+
+                        # Break if we've reached the bottom of the page, i.e. if scrolling makes no more progress.
+                        # Attention!!! This is not always reliable. Sometimes the page might not change due to lazy loading
+                        # or other reasons. In such cases, the user should set scroll_to_bottom=False and set a timeout.
+                        if scroll_to_bottom and previous_height == current_height:
+                            logger.info(f"Reached bottom of page for url {url}")
+                            break
+
+                        previous_height = current_height
+
+                        await page.mouse.wheel(0, scroll)
+                        logger.debug(f"Scrolled {url} to current height {current_height}px...")
+                        time.sleep(sleep)  # Allow some time for any lazy-loaded content to load
+
+                        current_time = time.time()
+                        elapsed_time = current_time - start_time
+                        logger.debug(f"Elapsed time: {elapsed_time} seconds")
+
+                        if timeout:
+                            if elapsed_time >= timeout:
+                                logger.info(f"Reached timeout of {timeout} seconds for url {url}")
+                                break
+                        elif len(heights) == 5 and len(set(heights)) == 1:
+                            logger.info(f"Page height has not changed for url {url} for the last 5 scrolls. Stopping.")
+                            break
+
+                    results = await page.content()
+                    break
+
+            except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
+                attempt += 1
+                logger.error(f"Attempt {attempt} failed: {e}")
+                if attempt == self.RETRY_LIMIT:
+                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+            finally:
+                await browser.close()
+
+        return results
+
     async def ascrape_playwright(self, url: str) -> str:
         """
         Asynchronously scrape the content of a given URL using Playwright's async API.
+
+        Args:
+            url (str): The URL to scrape.
+
+        Returns:
+            str: The scraped HTML content or an error message if an exception occurs.
         """
         from playwright.async_api import async_playwright
         from undetected_playwright import Malenia
 
         logger.info(f"Starting scraping with {self.backend}...")
+        results = ""
         attempt = 0
 
         while attempt < self.RETRY_LIMIT:
@@ -120,15 +252,16 @@ async def ascrape_playwright(self, url: str) -> str:
                     await page.wait_for_load_state(self.load_state)
                     results = await page.content()
                     logger.info("Content scraped")
-                    return results
+                    break
             except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
                 attempt += 1
                 logger.error(f"Attempt {attempt} failed: {e}")
                 if attempt == self.RETRY_LIMIT:
-                    raise RuntimeError(f"Failed to fetch {url} after {self.RETRY_LIMIT} attempts: {e}")
+                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
             finally:
-                if 'browser' in locals():
-                    await browser.close()
+                await browser.close()
+
+        return results
 
     async def ascrape_with_js_support(self, url: str) -> str:
         """
