from langchain_core.documents import Document
import aiohttp
import async_timeout
+from typing import Union
from ..utils import Proxy, dynamic_import, get_logger, parse_or_search_proxy


logger = get_logger("web-loader")
+logger.setLevel("INFO")


class ChromiumLoader(BaseLoader):
    """Scrapes HTML pages from URLs using a (headless) instance of the
@@ -97,14 +99,144 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:
        return results

+    async def ascrape_playwright_scroll(
+        self,
+        url: str,
+        timeout: Union[int, None] = 30,
+        scroll: int = 15000,
+        sleep: float = 2,
+        scroll_to_bottom: bool = False
+    ) -> str:
+ """
111
+ Asynchronously scrape the content of a given URL using Playwright's sync API and scrolling.
112
+
113
+ Notes:
114
+ - The user gets to decide between scrolling to the bottom of the page or scrolling by a finite amount of time.
115
+ - If the user chooses to scroll to the bottom, the scraper will stop when the page height stops changing or when
116
+ the timeout is reached. In this case, the user should opt for an appropriate timeout value i.e. larger than usual.
117
+ - Sleep needs to be set to a value greater than 0 to allow lazy-loaded content to load.
118
+ Additionally, if used with scroll_to_bottom=True, the sleep value should be set to a higher value, to
119
+ make sure that the scrolling actually happens, thereby allowing the page height to change.
120
+ - Probably the best website to test this is https://www.reddit.com/ as it has infinite scrolling.
121
+
122
+ Args:
123
+ - url (str): The URL to scrape.
124
+ - timeout (Union[int, None]): The maximum time to spend scrolling. This is separate from the global timeout. If set, must be greater than 0.
125
+ Can also be set to None, in which case the scraper will only stop when the page height stops changing.
126
+ - scroll (float): The number of pixels to scroll down by. Defaults to 15000. Cannot be less than 5000 pixels.
127
+ Less than this and we don't scroll enough to see any content change.
128
+ - sleep (int): The number of seconds to sleep after each scroll, to allow the page to load.
129
+ Defaults to 2. Must be greater than 0.
130
+
131
+ Returns:
132
+ str: The scraped HTML content
133
+
134
+ Raises:
135
+ - ValueError: If the timeout value is less than or equal to 0.
136
+ - ValueError: If the sleep value is less than or equal to 0.
137
+ - ValueError: If the scroll value is less than 5000.
138
+ """
+        # NB: I have tested using scrollHeight to determine when to stop scrolling,
+        # but it doesn't always work as expected. The page height doesn't change on some sites,
+        # e.g. https://www.steelwood.amsterdam/, and the site does not scroll to the bottom.
+        # In my browser I can scroll vertically, but in Chromium it scrolls horizontally?!?
+
+        if timeout and timeout <= 0:
+            raise ValueError("If set, timeout value for scrolling scraper must be greater than 0.")
+
+        if sleep <= 0:
+            raise ValueError("Sleep value for scrolling scraper must be greater than 0.")
+
+        if scroll < 5000:
+            raise ValueError("Scroll value for scrolling scraper must be greater than or equal to 5000.")
+
+        from playwright.async_api import async_playwright
+        from undetected_playwright import Malenia
+        import time
+
+        logger.info(f"Starting scraping with scrolling support for {url}...")
+
+        results = ""
+        attempt = 0
+
+        while attempt < self.RETRY_LIMIT:
+            try:
+                async with async_playwright() as p:
+                    browser = await p.chromium.launch(
+                        headless=self.headless, proxy=self.proxy, **self.browser_config
+                    )
+                    context = await browser.new_context()
+                    await Malenia.apply_stealth(context)
+                    page = await context.new_page()
+                    await page.goto(url, wait_until="domcontentloaded")
+                    await page.wait_for_load_state(self.load_state)
+
+                    previous_height = None
+                    start_time = time.time()
+
+                    # Store the heights of the page after each scroll.
+                    # This is useful in case we scroll with a timer and want to stop shortly after reaching the bottom,
+                    # or simply when the page stops changing for some reason.
+                    heights = []
+
+                    while True:
+                        current_height = await page.evaluate("document.body.scrollHeight")
+                        heights.append(current_height)
+                        heights = heights[-5:]  # Keep only the last 5 heights, to not run out of memory
+
+                        # Break if we've reached the bottom of the page, i.e. if scrolling makes no more progress.
+                        # Attention!!! This is not always reliable. Sometimes the page might not change due to lazy loading
+                        # or other reasons. In such cases, the user should set scroll_to_bottom=False and set a timeout.
+                        if scroll_to_bottom and previous_height == current_height:
+                            logger.info(f"Reached bottom of page for url {url}")
+                            break
+
+                        previous_height = current_height
+
+                        await page.mouse.wheel(0, scroll)
+                        logger.debug(f"Scrolled {url} to current height {current_height}px...")
+                        time.sleep(sleep)  # Allow some time for any lazy-loaded content to load
+
+                        current_time = time.time()
+                        elapsed_time = current_time - start_time
+                        logger.debug(f"Elapsed time: {elapsed_time} seconds")
+
+                        if timeout:
+                            if elapsed_time >= timeout:
+                                logger.info(f"Reached timeout of {timeout} seconds for url {url}")
+                                break
+                        elif len(heights) == 5 and len(set(heights)) == 1:
+                            logger.info(f"Page height has not changed for url {url} for the last 5 scrolls. Stopping.")
+                            break
+
+                    results = await page.content()
+                    break
+
+            except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
+                attempt += 1
+                logger.error(f"Attempt {attempt} failed: {e}")
+                if attempt == self.RETRY_LIMIT:
+                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+            finally:
+                await browser.close()
+
+        return results
+
    async def ascrape_playwright(self, url: str) -> str:
        """
        Asynchronously scrape the content of a given URL using Playwright's async API.
+
+        Args:
+            url (str): The URL to scrape.
+
+        Returns:
+            str: The scraped HTML content or an error message if an exception occurs.
        """
        from playwright.async_api import async_playwright
        from undetected_playwright import Malenia

        logger.info(f"Starting scraping with {self.backend}...")
+        results = ""
        attempt = 0

        while attempt < self.RETRY_LIMIT:
@@ -120,15 +252,16 @@ async def ascrape_playwright(self, url: str) -> str:
                    await page.wait_for_load_state(self.load_state)
                    results = await page.content()
                    logger.info("Content scraped")
-                    return results
+                    break
            except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
                attempt += 1
                logger.error(f"Attempt {attempt} failed: {e}")
                if attempt == self.RETRY_LIMIT:
-                    raise RuntimeError(f"Failed to fetch {url} after {self.RETRY_LIMIT} attempts: {e}")
+                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
            finally:
-                if 'browser' in locals():
-                    await browser.close()
+                await browser.close()
+
+        return results

    async def ascrape_with_js_support(self, url: str) -> str:
        """
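A minimal usage sketch of the new scrolling scraper; the import path, constructor arguments, and parameter values here are assumptions for illustration, not part of the change:

import asyncio

from scrapegraphai.docloaders import ChromiumLoader  # import path assumed

async def main():
    # Loader construction is assumed: a list of URLs plus backend/headless keyword arguments.
    loader = ChromiumLoader(["https://www.reddit.com/"], backend="playwright", headless=True)
    # Scroll in 15000 px steps, pausing 2 s between scrolls, stopping after at most 60 s.
    html = await loader.ascrape_playwright_scroll(
        "https://www.reddit.com/", timeout=60, scroll=15000, sleep=2, scroll_to_bottom=False
    )
    print(len(html))

asyncio.run(main())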