File tree: 5 files changed, +66 -5 lines changed
Original file line number | Diff line number | Diff line change
1
+ """
2
+ Basic example of a scraping pipeline using SmartScraper
3
+ """
4
+ import os
5
+ import json
6
+ from dotenv import load_dotenv
7
+ from scrapegraphai .graphs import SmartScraperGraph
8
+ from scrapegraphai .utils import prettify_exec_info
9
+
10
+ load_dotenv ()
11
+
12
+ # ************************************************
13
+ # Define the configuration for the graph
14
+ # ************************************************
15
+
16
+
17
+ graph_config = {
18
+ "llm" : {
19
+ "model" : "scrapegraphai/smart-scraper" ,
20
+ "api_key" : os .getenv ("SCRAPEGRAPH_API_KEY" )
21
+ },
22
+ "verbose" : True ,
23
+ "headless" : False ,
24
+ }
25
+
26
+ # ************************************************
27
+ # Create the SmartScraperGraph instance and run it
28
+ # ************************************************
29
+
30
+ smart_scraper_graph = SmartScraperGraph (
31
+ prompt = "Extract me all the articles" ,
32
+ source = "https://www.wired.com" ,
33
+ config = graph_config
34
+ )
35
+
36
+ result = smart_scraper_graph .run ()
37
+ print (json .dumps (result , indent = 4 ))
38
+
39
+ # ************************************************
40
+ # Get graph execution info
41
+ # ************************************************
42
+
43
+ graph_exec_info = smart_scraper_graph .get_execution_info ()
44
+ print (prettify_exec_info (graph_exec_info ))
Original file line number Diff line number Diff line change @@ -43,7 +43,8 @@ dependencies = [
43
43
" transformers>=4.44.2" ,
44
44
" googlesearch-python>=1.2.5" ,
45
45
" simpleeval>=1.0.0" ,
46
- " async_timeout>=4.0.3"
46
+ " async_timeout>=4.0.3" ,
47
+ " scrapegraph-py>=0.0.3"
47
48
]
48
49
49
50
license = " MIT"
Original file line number Diff line number Diff line change @@ -353,7 +353,7 @@ pyasn1==0.6.0
353
353
# via rsa
354
354
pyasn1-modules==0.4.0
355
355
# via google-auth
356
- pydantic==2.8.2
356
+ pydantic==2.10.1
357
357
# via burr
358
358
# via fastapi
359
359
# via fastapi-pagination
@@ -368,7 +368,8 @@ pydantic==2.8.2
368
368
# via openai
369
369
# via pydantic-settings
370
370
# via qdrant-client
371
- pydantic-core==2.20.1
371
+ # via scrapegraph-py
372
+ pydantic-core==2.27.1
372
373
# via pydantic
373
374
pydantic-settings==2.5.2
374
375
# via langchain-community
@@ -396,6 +397,7 @@ python-dateutil==2.9.0.post0
396
397
# via pandas
397
398
python-dotenv==1.0.1
398
399
# via pydantic-settings
400
+ # via scrapegraph-py
399
401
# via scrapegraphai
400
402
pytz==2024.1
401
403
# via pandas
@@ -424,6 +426,7 @@ requests==2.32.3
424
426
# via langchain-community
425
427
# via langsmith
426
428
# via mistral-common
429
+ # via scrapegraph-py
427
430
# via sphinx
428
431
# via streamlit
429
432
# via tiktoken
@@ -439,6 +442,8 @@ s3transfer==0.10.2
439
442
# via boto3
440
443
safetensors==0.4.5
441
444
# via transformers
445
+ scrapegraph-py==0.0.3
446
+ # via scrapegraphai
442
447
semchunk==2.2.0
443
448
# via scrapegraphai
444
449
sentencepiece==0.2.0
Original file line number Diff line number Diff line change @@ -257,7 +257,7 @@ pyasn1==0.6.0
257
257
# via rsa
258
258
pyasn1-modules==0.4.0
259
259
# via google-auth
260
- pydantic==2.8.2
260
+ pydantic==2.10.1
261
261
# via google-generativeai
262
262
# via langchain
263
263
# via langchain-aws
@@ -269,7 +269,8 @@ pydantic==2.8.2
269
269
# via openai
270
270
# via pydantic-settings
271
271
# via qdrant-client
272
- pydantic-core==2.20.1
272
+ # via scrapegraph-py
273
+ pydantic-core==2.27.1
273
274
# via pydantic
274
275
pydantic-settings==2.5.2
275
276
# via langchain-community
@@ -286,6 +287,7 @@ python-dateutil==2.9.0.post0
286
287
# via pandas
287
288
python-dotenv==1.0.1
288
289
# via pydantic-settings
290
+ # via scrapegraph-py
289
291
# via scrapegraphai
290
292
pytz==2024.1
291
293
# via pandas
@@ -313,6 +315,7 @@ requests==2.32.3
313
315
# via langchain-community
314
316
# via langsmith
315
317
# via mistral-common
318
+ # via scrapegraph-py
316
319
# via tiktoken
317
320
# via transformers
318
321
rpds-py==0.20.0
@@ -324,6 +327,8 @@ s3transfer==0.10.2
324
327
# via boto3
325
328
safetensors==0.4.5
326
329
# via transformers
330
+ scrapegraph-py==0.0.3
331
+ # via scrapegraphai
327
332
semchunk==2.2.0
328
333
# via scrapegraphai
329
334
sentencepiece==0.2.0
Original file line number Diff line number Diff line change 13
13
ConditionalNode
14
14
)
15
15
from ..prompts import REGEN_ADDITIONAL_INFO
16
+ from scrapegraph_py import ScrapeGraphClient , smart_scraper
16
17
17
18
class SmartScraperGraph (AbstractGraph ):
18
19
"""
@@ -59,6 +60,11 @@ def _create_graph(self) -> BaseGraph:
59
60
Returns:
60
61
BaseGraph: A graph instance representing the web scraping workflow.
61
62
"""
63
+ if self .llm_model == "scrapegraphai/smart-scraper" :
64
+ client = ScrapeGraphClient (self .config .get ("api_key" ))
65
+
66
+ result = smart_scraper (client , self .source , self .prompt )
67
+ return result
62
68
63
69
fetch_node = FetchNode (
64
70
input = "url| local_dir" ,
You can’t perform that action at this time.
0 commit comments