
Commit 8aa9103

feat: add api integration
1 parent 92bb8bb commit 8aa9103

5 files changed (+66 lines, -5 lines)

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+"""
+Basic example of scraping pipeline using SmartScraper
+"""
+import os
+import json
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SmartScraperGraph
+from scrapegraphai.utils import prettify_exec_info
+
+load_dotenv()
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+
+graph_config = {
+    "llm": {
+        "model": "scrapegraphai/smart-scraper",
+        "api_key": os.getenv("SCRAPEGRAPH_API_KEY")
+    },
+    "verbose": True,
+    "headless": False,
+}
+
+# ************************************************
+# Create the SmartScraperGraph instance and run it
+# ************************************************
+
+smart_scraper_graph = SmartScraperGraph(
+    prompt="Extract me all the articles",
+    source="https://www.wired.com",
+    config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(json.dumps(result, indent=4))
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = smart_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
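
The example above reads SCRAPEGRAPH_API_KEY from the environment via load_dotenv(). As an aside (not part of the commit), a minimal pre-flight check that the key is actually set could look like this; the .env convention and the error message are illustrative assumptions:

# Illustrative pre-flight check, not part of this commit.
# Assumes SCRAPEGRAPH_API_KEY is supplied via a local .env file or the shell environment.
import os

from dotenv import load_dotenv

load_dotenv()  # loads variables from a .env file in the working directory, if present

if not os.getenv("SCRAPEGRAPH_API_KEY"):
    raise RuntimeError(
        "SCRAPEGRAPH_API_KEY is not set; add it to your .env file or export it "
        "before running the SmartScraper API example."
    )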

pyproject.toml

Lines changed: 2 additions & 1 deletion
@@ -43,7 +43,8 @@ dependencies = [
     "transformers>=4.44.2",
     "googlesearch-python>=1.2.5",
     "simpleeval>=1.0.0",
-    "async_timeout>=4.0.3"
+    "async_timeout>=4.0.3",
+    "scrapegraph-py>=0.0.3"
 ]

 license = "MIT"
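
scrapegraph-py becomes a hard dependency here. A quick, hypothetical way to confirm a synced environment satisfies the new pin (not part of the commit):

# Hypothetical check that the newly pinned dependency is available; not part of this commit.
from importlib.metadata import PackageNotFoundError, version

try:
    print("scrapegraph-py", version("scrapegraph-py"))  # expected to be >= 0.0.3
except PackageNotFoundError:
    print("scrapegraph-py is missing; re-sync the project dependencies first.")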

requirements-dev.lock

Lines changed: 7 additions & 2 deletions
@@ -353,7 +353,7 @@ pyasn1==0.6.0
     # via rsa
 pyasn1-modules==0.4.0
     # via google-auth
-pydantic==2.8.2
+pydantic==2.10.1
     # via burr
     # via fastapi
     # via fastapi-pagination
@@ -368,7 +368,8 @@ pydantic==2.8.2
     # via openai
     # via pydantic-settings
     # via qdrant-client
-pydantic-core==2.20.1
+    # via scrapegraph-py
+pydantic-core==2.27.1
     # via pydantic
 pydantic-settings==2.5.2
     # via langchain-community
@@ -396,6 +397,7 @@ python-dateutil==2.9.0.post0
     # via pandas
 python-dotenv==1.0.1
     # via pydantic-settings
+    # via scrapegraph-py
     # via scrapegraphai
 pytz==2024.1
     # via pandas
@@ -424,6 +426,7 @@ requests==2.32.3
     # via langchain-community
     # via langsmith
     # via mistral-common
+    # via scrapegraph-py
     # via sphinx
     # via streamlit
     # via tiktoken
@@ -439,6 +442,8 @@ s3transfer==0.10.2
     # via boto3
 safetensors==0.4.5
     # via transformers
+scrapegraph-py==0.0.3
+    # via scrapegraphai
 semchunk==2.2.0
     # via scrapegraphai
 sentencepiece==0.2.0

requirements.lock

Lines changed: 7 additions & 2 deletions
@@ -257,7 +257,7 @@ pyasn1==0.6.0
     # via rsa
 pyasn1-modules==0.4.0
     # via google-auth
-pydantic==2.8.2
+pydantic==2.10.1
     # via google-generativeai
     # via langchain
     # via langchain-aws
@@ -269,7 +269,8 @@ pydantic==2.8.2
     # via openai
     # via pydantic-settings
     # via qdrant-client
-pydantic-core==2.20.1
+    # via scrapegraph-py
+pydantic-core==2.27.1
     # via pydantic
 pydantic-settings==2.5.2
     # via langchain-community
@@ -286,6 +287,7 @@ python-dateutil==2.9.0.post0
     # via pandas
 python-dotenv==1.0.1
     # via pydantic-settings
+    # via scrapegraph-py
     # via scrapegraphai
 pytz==2024.1
     # via pandas
@@ -313,6 +315,7 @@ requests==2.32.3
     # via langchain-community
     # via langsmith
     # via mistral-common
+    # via scrapegraph-py
     # via tiktoken
     # via transformers
 rpds-py==0.20.0
@@ -324,6 +327,8 @@ s3transfer==0.10.2
     # via boto3
 safetensors==0.4.5
     # via transformers
+scrapegraph-py==0.0.3
+    # via scrapegraphai
 semchunk==2.2.0
     # via scrapegraphai
 sentencepiece==0.2.0

scrapegraphai/graphs/smart_scraper_graph.py

Lines changed: 6 additions & 0 deletions
@@ -13,6 +13,7 @@
     ConditionalNode
 )
 from ..prompts import REGEN_ADDITIONAL_INFO
+from scrapegraph_py import ScrapeGraphClient, smart_scraper

 class SmartScraperGraph(AbstractGraph):
     """
@@ -59,6 +60,11 @@ def _create_graph(self) -> BaseGraph:
         Returns:
             BaseGraph: A graph instance representing the web scraping workflow.
         """
+        if self.llm_model == "scrapegraphai/smart-scraper":
+            client = ScrapeGraphClient(self.config.get("api_key"))
+
+            result = smart_scraper(client, self.source, self.prompt)
+            return result

         fetch_node = FetchNode(
             input="url| local_dir",
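
The new branch in _create_graph() short-circuits local graph construction: when the configured model is "scrapegraphai/smart-scraper", the whole job is delegated to the hosted API through scrapegraph-py instead of building the FetchNode-based pipeline. A standalone sketch of that call path follows; the ScrapeGraphClient and smart_scraper signatures are inferred from this diff only, and the URL and prompt are reused from the example file:

# Sketch of the API-only path added above; the argument order (client, source, prompt)
# is inferred from the diff and may differ in the released scrapegraph-py client.
import os

from dotenv import load_dotenv
from scrapegraph_py import ScrapeGraphClient, smart_scraper

load_dotenv()

client = ScrapeGraphClient(os.getenv("SCRAPEGRAPH_API_KEY"))
result = smart_scraper(client, "https://www.wired.com", "Extract me all the articles")
print(result)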
