
Commit 1dc2e5b

Merge pull request #848 from SwapnilSonker/add/changebrowser_to_Firefox
2 parents 9cba928 + 4914928 commit 1dc2e5b

File tree: 2 files changed (+74 -15 lines)

examples/extras/chromium_selenium.py

+10 -4
@@ -87,8 +87,11 @@ async def main():
     # Test with Playwright backend
     print("\n--- Testing Playwright Backend ---")
     try:
-        scraper_playwright = ChromiumLoader(urls=urls_to_scrape, backend="playwright", headless=True)
-        await test_scraper_with_analysis(scraper_playwright, urls_to_scrape)
+        scraper_playwright_chromium = ChromiumLoader(urls=urls_to_scrape, backend="playwright", headless=True, browser_name="chromium")
+        await test_scraper_with_analysis(scraper_playwright_chromium, urls_to_scrape)
+
+        scraper_playwright_firefox = ChromiumLoader(urls=urls_to_scrape, backend="playwright", headless=True, browser_name="firefox")
+        await test_scraper_with_analysis(scraper_playwright_firefox, urls_to_scrape)
     except ImportError as ie:
         print(f"❌ Playwright ImportError: {ie}")
     except Exception as e:
@@ -97,8 +100,11 @@ async def main():
     # Test with Selenium backend
     print("\n--- Testing Selenium Backend ---")
     try:
-        scraper_selenium = ChromiumLoader(urls=urls_to_scrape, backend="selenium", headless=True)
-        await test_scraper_with_analysis(scraper_selenium, urls_to_scrape)
+        scraper_selenium_chromium = ChromiumLoader(urls=urls_to_scrape, backend="selenium", headless=True, browser_name="chromium")
+        await test_scraper_with_analysis(scraper_selenium_chromium, urls_to_scrape)
+
+        scraper_selenium_firefox = ChromiumLoader(urls=urls_to_scrape, backend="selenium", headless=True, browser_name="firefox")
+        await test_scraper_with_analysis(scraper_selenium_firefox, urls_to_scrape)
     except ImportError as ie:
         print(f"❌ Selenium ImportError: {ie}")
     except Exception as e:
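
The example now exercises both browsers for each backend. Below is a minimal standalone sketch of the same pattern; the import path for ChromiumLoader and the URL are assumptions, and test_scraper_with_analysis is the example's own helper, not needed here:

import asyncio
from scrapegraphai.docloaders.chromium import ChromiumLoader  # assumed import path

async def main():
    urls_to_scrape = ["https://example.com"]  # placeholder URL
    for name in ("chromium", "firefox"):
        loader = ChromiumLoader(urls=urls_to_scrape, backend="playwright", headless=True, browser_name=name)
        html = await loader.scrape(urls_to_scrape[0])
        print(f"{name}: scraped {len(html)} characters")

asyncio.run(main())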

scrapegraphai/docloaders/chromium.py

+64 -11
@@ -4,6 +4,8 @@
 from langchain_core.documents import Document
 import aiohttp
 import async_timeout
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options as ChromeOptions
 from typing import Union
 from ..utils import Proxy, dynamic_import, get_logger, parse_or_search_proxy

@@ -36,6 +38,7 @@ def __init__(
         load_state: str = "domcontentloaded",
         requires_js_support: bool = False,
         storage_state: Optional[str] = None,
+        browser_name: str = "chromium",  # defaults to chromium
         **kwargs: Any,
     ):
         """Initialize the loader with a list of URL paths.
@@ -66,6 +69,7 @@ def __init__(
         self.load_state = load_state
         self.requires_js_support = requires_js_support
         self.storage_state = storage_state
+        self.browser_name = browser_name

     async def scrape(self, url:str) -> str:
         if self.backend == "playwright":
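
Because the new parameter defaults to "chromium", the change is backward compatible: existing call sites construct the loader exactly as before and get the old behavior. A short sketch (placeholder URL):

loader = ChromiumLoader(urls=["https://example.com"], backend="selenium", headless=True)
assert loader.browser_name == "chromium"  # default preserved for pre-existing callers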
@@ -95,11 +99,35 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:
         while attempt < self.RETRY_LIMIT:
             try:
                 async with async_timeout.timeout(self.TIMEOUT):
-                    driver = uc.Chrome(headless=self.headless)
-                    driver.get(url)
-                    results = driver.page_source
-                    logger.info(f"Successfully scraped {url}")
-                    break
+                    # Handle browser selection
+                    if self.backend == "selenium":
+                        if self.browser_name == "chromium":
+                            options = ChromeOptions()
+                            options.headless = self.headless
+                            # Initialize undetected chromedriver for Selenium
+                            driver = uc.Chrome(options=options)
+                            driver.get(url)
+                            results = driver.page_source
+                            logger.info(f"Successfully scraped {url} with {self.browser_name}")
+                            break
+                        elif self.browser_name == "firefox":
+                            from selenium.webdriver.firefox.options import Options as FirefoxOptions
+                            options = FirefoxOptions()
+                            options.headless = self.headless
+                            # Initialize the standard Firefox driver (no undetected variant is used here)
+                            driver = webdriver.Firefox(options=options)
+                            driver.get(url)
+                            results = driver.page_source
+                            logger.info(f"Successfully scraped {url} with {self.browser_name}")
+                            break
+                        else:
+                            logger.error(f"Unsupported browser {self.browser_name} for Selenium.")
+                            results = f"Error: Unsupported browser {self.browser_name}."
+                            break
+                    else:
+                        logger.error(f"Unsupported backend {self.backend}.")
+                        results = f"Error: Unsupported backend {self.backend}."
+                        break
             except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                 attempt += 1
                 logger.error(f"Attempt {attempt} failed: {e}")
@@ -118,7 +146,8 @@ async def ascrape_playwright_scroll(
         timeout: Union[int, None]=30,
         scroll: int=15000,
         sleep: float=2,
-        scroll_to_bottom: bool=False
+        scroll_to_bottom: bool=False,
+        browser_name: str = "chromium"  # defaults to chromium
     ) -> str:
         """
         Asynchronously scrape the content of a given URL using Playwright's sync API and scrolling.
@@ -175,9 +204,17 @@ async def ascrape_playwright_scroll(
         while attempt < self.RETRY_LIMIT:
             try:
                 async with async_playwright() as p:
-                    browser = await p.chromium.launch(
-                        headless=self.headless, proxy=self.proxy, **self.browser_config
-                    )
+                    browser = None
+                    if browser_name == "chromium":
+                        browser = await p.chromium.launch(
+                            headless=self.headless, proxy=self.proxy, **self.browser_config
+                        )
+                    elif browser_name == "firefox":
+                        browser = await p.firefox.launch(
+                            headless=self.headless, proxy=self.proxy, **self.browser_config
+                        )
+                    else:
+                        raise ValueError(f"Invalid browser name: {browser_name}")
                     context = await browser.new_context()
                     await Malenia.apply_stealth(context)
                     page = await context.new_page()
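
The same if/elif ladder recurs in three methods (this one, ascrape_playwright, and ascrape_with_js_support). A possible consolidation, sketched here only as a suggestion (launch_browser is a hypothetical helper, not part of this PR), maps the validated name to Playwright's browser-type attribute:

async def launch_browser(p, browser_name, headless, proxy, browser_config):
    # p.chromium and p.firefox are Playwright BrowserType objects
    if browser_name not in ("chromium", "firefox"):
        raise ValueError(f"Invalid browser name: {browser_name}")
    browser_type = getattr(p, browser_name)
    return await browser_type.launch(headless=headless, proxy=proxy, **browser_config)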
@@ -235,7 +272,7 @@ async def ascrape_playwright_scroll(

         return results

-    async def ascrape_playwright(self, url: str) -> str:
+    async def ascrape_playwright(self, url: str, browser_name: str = "chromium") -> str:
         """
         Asynchronously scrape the content of a given URL using Playwright's async API.

@@ -255,9 +292,17 @@ async def ascrape_playwright(self, url: str) -> str:
         while attempt < self.RETRY_LIMIT:
             try:
                 async with async_playwright() as p, async_timeout.timeout(self.TIMEOUT):
-                    browser = await p.chromium.launch(
-                        headless=self.headless, proxy=self.proxy, **self.browser_config
-                    )
+                    browser = None
+                    if browser_name == "chromium":
+                        browser = await p.chromium.launch(
+                            headless=self.headless, proxy=self.proxy, **self.browser_config
+                        )
+                    elif browser_name == "firefox":
+                        browser = await p.firefox.launch(
+                            headless=self.headless, proxy=self.proxy, **self.browser_config
+                        )
+                    else:
+                        raise ValueError(f"Invalid browser name: {browser_name}")
                     context = await browser.new_context(
                         storage_state=self.storage_state
                     )
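
Note that browser_name here is a per-call parameter defaulting to "chromium", while the constructor stores self.browser_name; nothing in this diff forwards the stored value to the Playwright methods, so a caller who set Firefox at construction time would pass it again explicitly. Usage sketch (placeholder URL, loader built as above):

html = await loader.ascrape_playwright("https://example.com", browser_name=loader.browser_name)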
@@ -282,7 +327,7 @@ async def ascrape_playwright(self, url: str) -> str:



-    async def ascrape_with_js_support(self, url: str) -> str:
+    async def ascrape_with_js_support(self, url: str, browser_name: str = "chromium") -> str:
         """
         Asynchronously scrape the content of a given URL by rendering JavaScript using Playwright.

@@ -302,9 +347,17 @@ async def ascrape_with_js_support(self, url: str) -> str:
         while attempt < self.RETRY_LIMIT:
             try:
                 async with async_playwright() as p, async_timeout.timeout(self.TIMEOUT):
-                    browser = await p.chromium.launch(
-                        headless=self.headless, proxy=self.proxy, **self.browser_config
-                    )
+                    browser = None
+                    if browser_name == "chromium":
+                        browser = await p.chromium.launch(
+                            headless=self.headless, proxy=self.proxy, **self.browser_config
+                        )
+                    elif browser_name == "firefox":
+                        browser = await p.firefox.launch(
+                            headless=self.headless, proxy=self.proxy, **self.browser_config
+                        )
+                    else:
+                        raise ValueError(f"Invalid browser name: {browser_name}")
                     context = await browser.new_context(
                         storage_state=self.storage_state
                     )
