 from langchain_core.documents import Document
 import aiohttp
 import async_timeout
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options as ChromeOptions
 from typing import Union
 from ..utils import Proxy, dynamic_import, get_logger, parse_or_search_proxy

@@ -36,6 +38,7 @@ def __init__(
         load_state: str = "domcontentloaded",
         requires_js_support: bool = False,
         storage_state: Optional[str] = None,
+        browser_name: str = "chromium",  # default: chromium
         **kwargs: Any,
     ):
         """Initialize the loader with a list of URL paths.
@@ -66,6 +69,7 @@ def __init__(
         self.load_state = load_state
         self.requires_js_support = requires_js_support
         self.storage_state = storage_state
+        self.browser_name = browser_name

     async def scrape(self, url: str) -> str:
         if self.backend == "playwright":
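
For context, a minimal caller-side sketch of the new parameter; the class name (ChromiumLoader) and the other constructor arguments are assumed from the surrounding file rather than defined in this diff:

import asyncio

# Hypothetical usage: only browser_name is introduced by this PR; the class
# name and the remaining constructor arguments are assumed.
loader = ChromiumLoader(
    ["https://example.com"],
    backend="playwright",
    headless=True,
    browser_name="firefox",   # new parameter; defaults to "chromium"
)
html = asyncio.run(loader.scrape("https://example.com"))
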
@@ -95,11 +99,35 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:
         while attempt < self.RETRY_LIMIT:
             try:
                 async with async_timeout.timeout(self.TIMEOUT):
-                    driver = uc.Chrome(headless=self.headless)
-                    driver.get(url)
-                    results = driver.page_source
-                    logger.info(f"Successfully scraped {url}")
-                    break
+                    # Select the driver based on the configured backend and browser_name
+                    if self.backend == "selenium":
+                        if self.browser_name == "chromium":
+                            options = ChromeOptions()
+                            options.headless = self.headless
+                            # Undetected chromedriver for Chromium
+                            driver = uc.Chrome(options=options)
+                            driver.get(url)
+                            results = driver.page_source
+                            logger.info(f"Successfully scraped {url} with {self.browser_name}")
+                            break
+                        elif self.browser_name == "firefox":
+                            from selenium.webdriver.firefox.options import Options as FirefoxOptions
+                            options = FirefoxOptions()
+                            options.headless = self.headless
+                            # Plain Selenium Firefox driver (undetected-chromedriver supports Chrome only)
+                            driver = webdriver.Firefox(options=options)
+                            driver.get(url)
+                            results = driver.page_source
+                            logger.info(f"Successfully scraped {url} with {self.browser_name}")
+                            break
+                        else:
+                            logger.error(f"Unsupported browser {self.browser_name} for Selenium.")
+                            results = f"Error: Unsupported browser {self.browser_name}."
+                            break
+                    else:
+                        logger.error(f"Unsupported backend {self.backend}.")
+                        results = f"Error: Unsupported backend {self.backend}."
+                        break
             except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                 attempt += 1
                 logger.error(f"Attempt {attempt} failed: {e}")
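
Note that recent Selenium 4 releases deprecate the Options.headless setter used above; a sketch of the same branch with explicit headless flags (flag values assumed for current Chromium/Firefox builds):

# Sketch only: pass headless via add_argument instead of the deprecated
# Options.headless setter.
if self.browser_name == "chromium":
    options = ChromeOptions()
    if self.headless:
        options.add_argument("--headless=new")   # Chromium headless mode
    driver = uc.Chrome(options=options)
elif self.browser_name == "firefox":
    from selenium.webdriver.firefox.options import Options as FirefoxOptions
    options = FirefoxOptions()
    if self.headless:
        options.add_argument("-headless")        # Firefox headless flag
    driver = webdriver.Firefox(options=options)
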
@@ -118,7 +146,8 @@ async def ascrape_playwright_scroll(
         timeout: Union[int, None] = 30,
         scroll: int = 15000,
         sleep: float = 2,
-        scroll_to_bottom: bool = False
+        scroll_to_bottom: bool = False,
+        browser_name: str = "chromium"  # default: chromium
     ) -> str:
         """
         Asynchronously scrape the content of a given URL using Playwright's sync API and scrolling.
@@ -175,9 +204,17 @@ async def ascrape_playwright_scroll(
         while attempt < self.RETRY_LIMIT:
             try:
                 async with async_playwright() as p:
-                    browser = await p.chromium.launch(
+                    browser = None
+                    if browser_name == "chromium":
+                        browser = await p.chromium.launch(
                         headless=self.headless, proxy=self.proxy, **self.browser_config
                     )
+                    elif browser_name == "firefox":
+                        browser = await p.firefox.launch(
+                            headless=self.headless, proxy=self.proxy, **self.browser_config
+                        )
+                    else:
+                        raise ValueError(f"Invalid browser name: {browser_name}")
                     context = await browser.new_context()
                     await Malenia.apply_stealth(context)
                     page = await context.new_page()
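
Since the same chromium/firefox launch branch now appears in three methods, a possible consolidation is sketched below; the helper name is hypothetical and not part of this diff:

# Hypothetical helper (not in this PR): resolve the Playwright browser type by
# name so the launch branch is not repeated in every ascrape_* method.
async def _launch_browser(self, p, browser_name: str):
    if browser_name not in ("chromium", "firefox"):
        raise ValueError(f"Invalid browser name: {browser_name}")
    browser_type = getattr(p, browser_name)  # p.chromium or p.firefox
    return await browser_type.launch(
        headless=self.headless, proxy=self.proxy, **self.browser_config
    )
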
@@ -235,7 +272,7 @@ async def ascrape_playwright_scroll(

         return results

-    async def ascrape_playwright(self, url: str) -> str:
+    async def ascrape_playwright(self, url: str, browser_name: str = "chromium") -> str:
         """
         Asynchronously scrape the content of a given URL using Playwright's async API.

@@ -255,9 +292,17 @@ async def ascrape_playwright(self, url: str) -> str:
         while attempt < self.RETRY_LIMIT:
             try:
                 async with async_playwright() as p, async_timeout.timeout(self.TIMEOUT):
-                    browser = await p.chromium.launch(
+                    browser = None
+                    if browser_name == "chromium":
+                        browser = await p.chromium.launch(
                         headless=self.headless, proxy=self.proxy, **self.browser_config
                     )
+                    elif browser_name == "firefox":
+                        browser = await p.firefox.launch(
+                            headless=self.headless, proxy=self.proxy, **self.browser_config
+                        )
+                    else:
+                        raise ValueError(f"Invalid browser name: {browser_name}")
                     context = await browser.new_context(
                         storage_state=self.storage_state
                     )
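
A quick way to exercise the changed signature (a sketch; the class name and constructor arguments are assumed, and the Firefox build must already be installed for Playwright, e.g. via the playwright install firefox CLI):

import asyncio

async def check_firefox_scrape():
    # Class name and constructor arguments assumed from the surrounding file.
    loader = ChromiumLoader(["https://example.com"], backend="playwright", headless=True)
    html = await loader.ascrape_playwright("https://example.com", browser_name="firefox")
    print(html[:200])

asyncio.run(check_firefox_scrape())
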
@@ -282,7 +327,7 @@ async def ascrape_playwright(self, url: str) -> str:



-    async def ascrape_with_js_support(self, url: str) -> str:
+    async def ascrape_with_js_support(self, url: str, browser_name: str = "chromium") -> str:
         """
         Asynchronously scrape the content of a given URL by rendering JavaScript using Playwright.

@@ -302,9 +347,17 @@ async def ascrape_with_js_support(self, url: str) -> str:
         while attempt < self.RETRY_LIMIT:
             try:
                 async with async_playwright() as p, async_timeout.timeout(self.TIMEOUT):
-                    browser = await p.chromium.launch(
+                    browser = None
+                    if browser_name == "chromium":
+                        browser = await p.chromium.launch(
                         headless=self.headless, proxy=self.proxy, **self.browser_config
                     )
+                    elif browser_name == "firefox":
+                        browser = await p.firefox.launch(
+                            headless=self.headless, proxy=self.proxy, **self.browser_config
+                        )
+                    else:
+                        raise ValueError(f"Invalid browser name: {browser_name}")
                     context = await browser.new_context(
                         storage_state=self.storage_state
                     )
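
One follow-up worth noting: the browser_name stored on the instance in __init__ is not automatically forwarded to these Playwright methods, which take their own browser_name argument defaulting to "chromium". A sketch of how scrape() could pass it through (the exact call sites inside scrape() are assumed, since that method appears here only as context):

# Sketch only: forward the stored attribute so the constructor setting takes effect.
async def scrape(self, url: str) -> str:
    if self.backend == "playwright":
        if self.requires_js_support:
            return await self.ascrape_with_js_support(url, browser_name=self.browser_name)
        return await self.ascrape_playwright(url, browser_name=self.browser_name)
    # ... other backends unchanged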