Commit 043ae2d

Merge pull request #846 from SwapnilSonker/add/selenium-support
Add/selenium support
2 parents f97c45c + cbc75ad

23 files changed (+191 -63 lines)

CHANGELOG.md

Lines changed: 0 additions & 14 deletions
@@ -1,17 +1,3 @@
-## [1.34.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.33.2...v1.34.0-beta.1) (2024-12-08)
-
-
-### Features
-
-* add new model token ([2a032d6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/2a032d6d7cf18c435fba59764e7cb28707737f0c))
-* added scrolling method to chromium docloader ([1c8b910](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/1c8b910562112947a357277bca9dc81619b72e61))
-
-
-### CI
-
-* **release:** 1.33.0-beta.1 [skip ci] ([60e2fdf](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/60e2fdff78e405e127ba8b10daa454d634bccf46)), closes [#822](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/822) [#822](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/822)
-* **release:** 1.33.0-beta.2 [skip ci] ([09995cd](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/09995cd56c96cfa709a68bea73113ab5debfcb97))
-
 ## [1.33.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.33.1...v1.33.2) (2024-12-06)
README.md

Lines changed: 19 additions & 7 deletions
@@ -87,8 +87,8 @@ graph_config = {
 
 # Create the SmartScraperGraph instance
 smart_scraper_graph = SmartScraperGraph(
-    prompt="Find some information about what does the company do, the name and a contact email.",
-    source="https://scrapegraphai.com/",
+    prompt="Extract me all the news from the website",
+    source="https://www.wired.com",
     config=graph_config
 )
 
@@ -100,10 +100,20 @@ print(json.dumps(result, indent=4))
 The output will be a dictionary like the following:
 
 ```python
-{
-    "company": "ScrapeGraphAI",
-    "name": "ScrapeGraphAI Extracting content from websites and local documents using LLM",
-    "contact_email": "[email protected]"
+"result": {
+    "news": [
+        {
+            "title": "The New Jersey Drone Mystery May Not Actually Be That Mysterious",
+            "link": "https://www.wired.com/story/new-jersey-drone-mystery-maybe-not-drones/",
+            "author": "Lily Hay Newman"
+        },
+        {
+            "title": "Former ByteDance Intern Accused of Sabotage Among Winners of Prestigious AI Award",
+            "link": "https://www.wired.com/story/bytedance-intern-best-paper-neurips/",
+            "author": "Louise Matsakis"
+        },
+        ...
+    ]
 }
 ```
 There are other pipelines that can be used to extract information from multiple pages, generate Python scripts, or even generate audio files.
 
@@ -126,7 +136,7 @@ Remember to have [Ollama](https://ollama.com/) installed and download the models
 ## 🔍 Demo
 Official streamlit demo:
 
-[![My Skills](https://skillicons.dev/icons?i=react)](https://scrapegraph-ai-web-dashboard.streamlit.app)
+[![My Skills](https://skillicons.dev/icons?i=react)](https://scrapegraph-demo-demo.streamlit.app)
 
 Try it directly on the web using Google Colab:
 
@@ -203,3 +213,5 @@ ScrapeGraphAI is licensed under the MIT License. See the [LICENSE](https://githu
 
 - We would like to thank all the contributors to the project and the open-source community for their support.
 - ScrapeGraphAI is meant to be used for data exploration and research purposes only. We are not responsible for any misuse of the library.
+
+Made with ❤️ by [ScrapeGraph AI](https://scrapegraphai.com)
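
Assembled into a standalone script, the updated README example corresponds roughly to the sketch below. The `graph_config` shown here is an assumption borrowed from the config dict used in the new example file in this commit; it is not part of the README hunk itself, which defines `graph_config` earlier in the file.

```python
import json
import os

from scrapegraphai.graphs import SmartScraperGraph

# Assumed config, mirroring examples/extras/chromium_selenium.py below.
graph_config = {
    "llm": {
        "api_key": os.getenv("OPENAI_API_KEY"),
        "model": "openai/gpt-4o",
    },
    "verbose": True,
}

smart_scraper_graph = SmartScraperGraph(
    prompt="Extract me all the news from the website",
    source="https://www.wired.com",
    config=graph_config,
)

result = smart_scraper_graph.run()
print(json.dumps(result, indent=4))
```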

examples/extras/chromium_selenium.py

Lines changed: 113 additions & 0 deletions
@@ -0,0 +1,113 @@
import asyncio
import os
import json
from dotenv import load_dotenv
from scrapegraphai.docloaders.chromium import ChromiumLoader  # Import your ChromiumLoader class
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info
from aiohttp import ClientError

# Load environment variables for API keys
load_dotenv()

# ************************************************
# Define function to analyze content with ScrapegraphAI
# ************************************************
async def analyze_content_with_scrapegraph(content: str):
    """
    Analyze scraped content using ScrapegraphAI.

    Args:
        content (str): The scraped HTML or text content.

    Returns:
        dict: The result from ScrapegraphAI analysis.
    """
    try:
        # Initialize ScrapegraphAI SmartScraperGraph
        smart_scraper = SmartScraperGraph(
            prompt="Summarize the main content of this webpage and extract any contact information.",
            source=content,  # Pass the content directly
            config={
                "llm": {
                    "api_key": os.getenv("OPENAI_API_KEY"),
                    "model": "openai/gpt-4o",
                },
                "verbose": True
            }
        )
        result = smart_scraper.run()
        return result
    except Exception as e:
        print(f"❌ ScrapegraphAI analysis failed: {e}")
        return {"error": str(e)}

# ************************************************
# Test scraper and ScrapegraphAI pipeline
# ************************************************
async def test_scraper_with_analysis(scraper: ChromiumLoader, urls: list):
    """
    Test scraper for the given backend and URLs, then analyze content with ScrapegraphAI.

    Args:
        scraper (ChromiumLoader): The ChromiumLoader instance.
        urls (list): A list of URLs to scrape.
    """
    for url in urls:
        try:
            print(f"\n🔎 Scraping: {url} using {scraper.backend}...")
            result = await scraper.scrape(url)

            if "Error" in result or not result.strip():
                print(f"❌ Failed to scrape {url}: {result}")
            else:
                print(f"✅ Successfully scraped {url}. Content (first 200 chars): {result[:200]}")

                # Pass scraped content to ScrapegraphAI for analysis
                print("🤖 Analyzing content with ScrapegraphAI...")
                analysis_result = await analyze_content_with_scrapegraph(result)
                print("📝 Analysis Result:")
                print(json.dumps(analysis_result, indent=4))

        except ClientError as ce:
            print(f"❌ Network error while scraping {url}: {ce}")
        except Exception as e:
            print(f"❌ Unexpected error while scraping {url}: {e}")

# ************************************************
# Main Execution
# ************************************************
async def main():
    urls_to_scrape = [
        "https://example.com",
        "https://www.python.org",
        "https://invalid-url.test"
    ]

    # Test with Playwright backend
    print("\n--- Testing Playwright Backend ---")
    try:
        scraper_playwright = ChromiumLoader(urls=urls_to_scrape, backend="playwright", headless=True)
        await test_scraper_with_analysis(scraper_playwright, urls_to_scrape)
    except ImportError as ie:
        print(f"❌ Playwright ImportError: {ie}")
    except Exception as e:
        print(f"❌ Error initializing Playwright ChromiumLoader: {e}")

    # Test with Selenium backend
    print("\n--- Testing Selenium Backend ---")
    try:
        scraper_selenium = ChromiumLoader(urls=urls_to_scrape, backend="selenium", headless=True)
        await test_scraper_with_analysis(scraper_selenium, urls_to_scrape)
    except ImportError as ie:
        print(f"❌ Selenium ImportError: {ie}")
    except Exception as e:
        print(f"❌ Error initializing Selenium ChromiumLoader: {e}")

if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("❌ Program interrupted by user.")
    except Exception as e:
        print(f"❌ Program crashed: {e}")

pyproject.toml

Lines changed: 30 additions & 2 deletions
@@ -3,7 +3,8 @@ name = "scrapegraphai"
 
 
 
-version = "1.34.0b1"
+version = "1.33.2"
+
 
 
 
@@ -114,9 +115,36 @@ screenshot_scraper = [
 ]
 
 [build-system]
-requires = ["hatchling"]
+requires = ["hatchling>=1.0.0", "hatch-vcs"]
 build-backend = "hatchling.build"
 
+[tool.hatch.build]
+packages = ["scrapegraphai"]
+exclude = [
+    "tests/**",
+    "examples/**",
+]
+
+[tool.hatch.version]
+source = "vcs"
+
+[tool.hatch.build.hooks.vcs]
+version-file = "scrapegraphai/_version.py"
+
+[tool.hatch.build.targets.wheel]
+packages = ["scrapegraphai"]
+
+[tool.hatch.build.targets.sdist]
+include = [
+    "/scrapegraphai",
+    "pyproject.toml",
+    "README.md",
+    "LICENSE",
+]
+
+[tool.hatch.metadata]
+allow-direct-references = true
+
 [dependency-groups]
 dev = [
     "burr[start]==0.22.1",

scrapegraphai/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
 """
 __init__.py file for scrapegraphai folder
 """
+__version__ = "1.33.7"

scrapegraphai/_version.py

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+"""Version information."""
+__version__ = "1.33.7"
+version = __version__
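
With the hatch-vcs hook configured in pyproject.toml above (`version-file = "scrapegraphai/_version.py"`), this file is normally regenerated at build time from the latest git tag, so the checked-in value acts as a fallback. A minimal inspection sketch, assuming the package is installed (the two values are not guaranteed to agree until the next tagged build):

```python
# Compare the installed distribution metadata with the generated module.
from importlib.metadata import version as dist_version

from scrapegraphai._version import __version__

print("distribution:", dist_version("scrapegraphai"))  # from package metadata
print("_version.py :", __version__)                    # from the generated file
```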

scrapegraphai/docloaders/chromium.py

Lines changed: 9 additions & 0 deletions
@@ -66,6 +66,15 @@ def __init__(
         self.load_state = load_state
         self.requires_js_support = requires_js_support
         self.storage_state = storage_state
+
+    async def scrape(self, url: str) -> str:
+        if self.backend == "playwright":
+            return await self.ascrape_playwright(url)
+        elif self.backend == "selenium":
+            return await self.ascrape_undetected_chromedriver(url)
+        else:
+            raise ValueError(f"Unsupported backend: {self.backend}")
+
 
     async def ascrape_undetected_chromedriver(self, url: str) -> str:
         """

scrapegraphai/graphs/base_graph.py

Lines changed: 13 additions & 20 deletions
@@ -56,13 +56,11 @@ def __init__(self, nodes: list, edges: list, entry_point: str,
         self.callback_manager = CustomLLMCallbackManager()
 
         if nodes[0].node_name != entry_point.node_name:
-            # raise a warning if the entry point is not the first node in the list
             warnings.warn(
                 "Careful! The entry point node is different from the first node in the graph.")
 
         self._set_conditional_node_edges()
 
-        # Burr configuration
         self.use_burr = use_burr
         self.burr_config = burr_config or {}
 
@@ -91,7 +89,8 @@ def _set_conditional_node_edges(self):
         if node.node_type == 'conditional_node':
             outgoing_edges = [(from_node, to_node) for from_node, to_node in self.raw_edges if from_node.node_name == node.node_name]
             if len(outgoing_edges) != 2:
-                raise ValueError(f"ConditionalNode '{node.node_name}' must have exactly two outgoing edges.")
+                raise ValueError(f"""ConditionalNode '{node.node_name}'
+                                 must have exactly two outgoing edges.""")
             node.true_node_name = outgoing_edges[0][1].node_name
             try:
                 node.false_node_name = outgoing_edges[1][1].node_name
@@ -151,14 +150,14 @@ def _get_schema(self, current_node):
         """Extracts schema information from the node configuration."""
         if not hasattr(current_node, "node_config"):
             return None
-
+
         if not isinstance(current_node.node_config, dict):
             return None
-
+
         schema_config = current_node.node_config.get("schema")
         if not schema_config or isinstance(schema_config, dict):
             return None
-
+
         try:
             return schema_config.schema()
         except Exception:
@@ -167,7 +166,7 @@ def _execute_node(self, current_node, state, llm_model, llm_model_name):
         """Executes a single node and returns execution information."""
         curr_time = time.time()
-
+
         with self.callback_manager.exclusive_get_callback(llm_model, llm_model_name) as cb:
             result = current_node.execute(state)
             node_exec_time = time.time() - curr_time
@@ -197,17 +196,17 @@ def _get_next_node(self, current_node, result):
             raise ValueError(
                 f"Conditional Node returned a node name '{result}' that does not exist in the graph"
             )
-
+
         return self.edges.get(current_node.node_name)
 
     def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:
         """
-        Executes the graph by traversing nodes starting from the entry point using the standard method.
+        Executes the graph by traversing nodes
+        starting from the entry point using the standard method.
         """
         current_node_name = self.entry_point
         state = initial_state
-
-        # Tracking variables
+
         total_exec_time = 0.0
         exec_info = []
         cb_total = {
@@ -230,16 +229,13 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:
 
         while current_node_name:
             current_node = self._get_node_by_name(current_node_name)
-
-            # Update source information if needed
+
             if source_type is None:
                 source_type, source, prompt = self._update_source_info(current_node, state)
-
-            # Get model information if needed
+
             if llm_model is None:
                 llm_model, llm_model_name, embedder_model = self._get_model_info(current_node)
-
-            # Get schema if needed
+
             if schema is None:
                 schema = self._get_schema(current_node)
 
@@ -273,7 +269,6 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:
             )
             raise e
 
-        # Add total results to execution info
        exec_info.append({
             "node_name": "TOTAL RESULT",
             "total_tokens": cb_total["total_tokens"],
@@ -284,7 +279,6 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:
             "exec_time": total_exec_time,
         })
 
-        # Log final execution results
         graph_execution_time = time.time() - start_time
         response = state.get("answer", None) if source_type == "url" else None
         content = state.get("parsed_doc", None) if response is not None else None
@@ -343,4 +337,3 @@ def append_node(self, node):
         self.raw_edges.append((last_node, node))
         self.nodes.append(node)
         self.edges = self._create_edges({e for e in self.raw_edges})
-
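
One side effect of the reformatted ValueError in `_set_conditional_node_edges` is worth noting: a triple-quoted f-string embeds the line break and the leading indentation of its second line in the exception message. Implicit string-literal concatenation keeps the source line short without changing the message text; a sketch, not part of the commit:

```python
# Same message on one logical line; adjacent string literals are concatenated.
raise ValueError(
    f"ConditionalNode '{node.node_name}' "
    "must have exactly two outgoing edges."
)
```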

scrapegraphai/graphs/code_generator_graph.py

Lines changed: 0 additions & 1 deletion
@@ -17,7 +17,6 @@
     GenerateCodeNode,
 )
 
-
 class CodeGeneratorGraph(AbstractGraph):
     """
     CodeGeneratorGraph is a script generator pipeline that generates

scrapegraphai/graphs/csv_scraper_graph.py

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ def _create_graph(self):
         """
         Creates the graph of nodes representing the workflow for web scraping.
         """
-
+
         fetch_node = FetchNode(
             input="csv | csv_dir",
             output=["doc"],

scrapegraphai/graphs/depth_search_graph.py

Lines changed: 0 additions & 1 deletion
@@ -15,7 +15,6 @@
     GenerateAnswerNodeKLevel,
 )
 
-
 class DepthSearchGraph(AbstractGraph):
     """
     CodeGeneratorGraph is a script generator pipeline that generates
