Skip to content

Commit e3d0194

Browse files
committed
fix: script generator and add new benchmarks
1 parent 7e81f7c commit e3d0194

File tree

7 files changed

+149
-33
lines changed

7 files changed

+149
-33
lines changed
Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# Local models
2+
# Local models
23
The two benchmark websites are:
34
- Example 1: https://perinim.github.io/projects
45
- Example 2: https://www.wired.com (at 17/4/2024)
@@ -9,14 +10,12 @@ The time is measured in seconds
910

1011
The model run for this benchmark is Mistral on Ollama with nomic-embed-text
1112

12-
In particular, is tested with ScriptCreatorGraph
13-
1413
| Hardware | Model | Example 1 | Example 2 |
1514
| ---------------------- | --------------------------------------- | --------- | --------- |
1615
| Macbook 14' m1 pro | Mistral on Ollama with nomic-embed-text | 30.54s | 35.76s |
17-
| Macbook m2 max | Mistral on Ollama with nomic-embed-text | 18,46s | 19.59 |
18-
| Macbook 14' m1 pro<br> | Llama3 on Ollama with nomic-embed-text | 27.82s | 29.98s |
19-
| Macbook m2 max<br> | Llama3 on Ollama with nomic-embed-text | 20.83s | 12.29s |
16+
| Macbook m2 max | Mistral on Ollama with nomic-embed-text | | |
17+
| Macbook 14' m1 pro<br> | Llama3 on Ollama with nomic-embed-text | 27.82s | 29.986s |
18+
| Macbook m2 max<br> | Llama3 on Ollama with nomic-embed-text | | |
2019

2120

2221
**Note**: the Docker examples were not run on devices other than the Macbook because the performance is too slow (10 times slower than Ollama).
@@ -25,17 +24,20 @@ In particular, is tested with ScriptCreatorGraph
2524
**URL**: https://perinim.github.io/projects
2625
**Task**: List me all the projects with their description.
2726

28-
| Name | Execution time | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
29-
| ------------------- | ---------------| ------------ | ------------- | ----------------- | ------------------- | -------------- |
30-
| gpt-3.5-turbo | 4.50s | 1897 | 1802 | 95 | 1 | 0.002893 |
31-
| gpt-4-turbo | 7.88s | 1920 | 1802 | 118 | 1 | 0.02156 |
27+
| Name | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
28+
| --------------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- |
29+
| gpt-3.5-turbo | 24.21 | 1892 | 1802 | 90 | 1 | 0.002883 |
30+
| gpt-4-turbo-preview | 6.614 | 1936 | 1802 | 134 | 1 | 0.02204 |
31+
| Grooq with nomic-embed-text | 6.71 | 2201 | 2024 | 177 | 1 | 0 |
3232

3333
### Example 2: Wired
3434
**URL**: https://www.wired.com
3535
**Task**: List me all the articles with their description.
3636

37-
| Name | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
38-
| ------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- |
39-
| gpt-3.5-turbo | Error (text too long) | - | - | - | - | - |
40-
| gpt-4-turbo | Error (TPM limit reach)| - | - | - | - | - |
37+
| Name | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
38+
| --------------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- |
39+
| gpt-3.5-turbo | | | | | | |
40+
| gpt-4-turbo-preview | | | | | | |
41+
| Grooq with nomic-embed-text | | | | | | |
42+
4143

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
"""
2+
Basic example of scraping pipeline using SmartScraper from text
3+
"""
4+
import os
5+
from dotenv import load_dotenv
6+
from scrapegraphai.graphs import ScriptCreatorGraph
7+
from scrapegraphai.utils import prettify_exec_info
8+
9+
load_dotenv()
10+
11+
# ************************************************
12+
# Read the text file
13+
# ************************************************
14+
files = ["inputs/example_1.txt", "inputs/example_2.txt"]
15+
tasks = ["List me all the projects with their description.",
16+
"List me all the articles with their description."]
17+
18+
# ************************************************
19+
# Define the configuration for the graph
20+
# ************************************************
21+
22+
groq_key = os.getenv("GROQ_APIKEY")
23+
24+
graph_config = {
25+
"llm": {
26+
"model": "groq/gemma-7b-it",
27+
"api_key": groq_key,
28+
"temperature": 0
29+
},
30+
"embeddings": {
31+
"model": "ollama/nomic-embed-text",
32+
"temperature": 0,
33+
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
34+
},
35+
"headless": False,
36+
"library": "beautifoulsoup"
37+
}
38+
39+
40+
# ************************************************
41+
# Create the SmartScraperGraph instance and run it
42+
# ************************************************
43+
44+
for i in range(0, 2):
45+
with open(files[i], 'r', encoding="utf-8") as file:
46+
text = file.read()
47+
48+
smart_scraper_graph = ScriptCreatorGraph(
49+
prompt=tasks[i],
50+
source=text,
51+
config=graph_config
52+
)
53+
54+
result = smart_scraper_graph.run()
55+
print(result)
56+
# ************************************************
57+
# Get graph execution info
58+
# ************************************************
59+
60+
graph_exec_info = smart_scraper_graph.get_execution_info()
61+
print(prettify_exec_info(graph_exec_info))

examples/benchmarks/GenerateScraper/benchmark_llama3.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,8 @@
22
Basic example of scraping pipeline using SmartScraper from text
33
"""
44

5-
import os
6-
from dotenv import load_dotenv
75
from scrapegraphai.graphs import ScriptCreatorGraph
86
from scrapegraphai.utils import prettify_exec_info
9-
load_dotenv()
107

118
# ************************************************
129
# Read the text file
@@ -19,8 +16,6 @@
1916
# Define the configuration for the graph
2017
# ************************************************
2118

22-
openai_key = os.getenv("GPT4_KEY")
23-
2419

2520
graph_config = {
2621
"llm": {

examples/benchmarks/SmartScraper/Readme.md

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,37 +5,37 @@ The two websites benchmark are:
55

66
Both are stored locally as .txt files, so the benchmark does not depend on an internet connection
77

8-
In particular, is tested with SmartScraper
9-
10-
| Hardware | Moodel | Example 1 | Example 2 |
8+
| Hardware | Model | Example 1 | Example 2 |
119
| ------------------ | --------------------------------------- | --------- | --------- |
1210
| Macbook 14' m1 pro | Mistral on Ollama with nomic-embed-text | 11.60s | 26.61s |
1311
| Macbook m2 max | Mistral on Ollama with nomic-embed-text | 8.05s | 12.17s |
14-
| Macbook 14' m1 pro | Llama3 on Ollama with nomic-embed-text | 29.871s | 35.32s |
12+
| Macbook 14' m1 pro | Llama3 on Ollama with nomic-embed-text | 29.87s | 35.32s |
1513
| Macbook m2 max | Llama3 on Ollama with nomic-embed-text | 18.36s | 78.32s |
1614

17-
1815
**Note**: the Docker examples were not run on devices other than the Macbook because the performance is too slow (10 times slower than Ollama). The results are the following:
1916

2017
| Hardware | Example 1 | Example 2 |
2118
| ------------------ | --------- | --------- |
22-
| Macbook 14' m1 pro | 139.89s | Too long |
19+
| Macbook 14' m1 pro | 139.89 | Too long |
2320
# Performance on APIs services
2421
### Example 1: personal portfolio
2522
**URL**: https://perinim.github.io/projects
2623
**Task**: List me all the projects with their description.
2724

28-
| Name | Execution time | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
29-
| ------------------- | ---------------| ------------ | ------------- | ----------------- | ------------------- | -------------- |
30-
| gpt-3.5-turbo | 5.58s | 445 | 272 | 173 | 1 | 0.000754 |
31-
| gpt-4-turbo | 9.76s | 445 | 272 | 173 | 1 | 0.00791 |
25+
| Name | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
26+
| --------------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- |
27+
| gpt-3.5-turbo | 25.22 | 445 | 272 | 173 | 1 | 0.000754 |
28+
| gpt-4-turbo-preview | 9.53 | 449 | 272 | 177 | 1 | 0.00803 |
29+
| Grooq with nomic-embed-text | 1.99 | 474 | 284 | 190 | 1 | 0 |
3230

3331
### Example 2: Wired
3432
**URL**: https://www.wired.com
3533
**Task**: List me all the articles with their description.
3634

37-
| Name | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
38-
| ------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- |
39-
| gpt-3.5-turbo | 6.50 | 2442 | 2199 | 243 | 1 | 0.003784 |
40-
| gpt-4-turbo | 76.07 | 3521 | 2199 | 1322 | 1 | 0.06165 |
35+
| Name | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
36+
| --------------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- |
37+
| gpt-3.5-turbo | 25.89 | 445 | 272 | 173 | 1 | 0.000754 |
38+
| gpt-4-turbo-preview | 64.70 | 3573 | 2199 | 1374 | 1 | 0.06321 |
39+
| Grooq with nomic-embed-text | 3.82 | 2459 | 2192 | 267 | 1 | 0 |
40+
4141

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
"""
2+
Basic example of scraping pipeline using SmartScraper from text
3+
"""
4+
import os
5+
from dotenv import load_dotenv
6+
from scrapegraphai.graphs import SmartScraperGraph
7+
from scrapegraphai.utils import prettify_exec_info
8+
9+
load_dotenv()
10+
11+
files = ["inputs/example_1.txt", "inputs/example_2.txt"]
12+
tasks = ["List me all the projects with their description.",
13+
"List me all the articles with their description."]
14+
15+
16+
# ************************************************
17+
# Define the configuration for the graph
18+
# ************************************************
19+
20+
groq_key = os.getenv("GROQ_APIKEY")
21+
22+
graph_config = {
23+
"llm": {
24+
"model": "groq/gemma-7b-it",
25+
"api_key": groq_key,
26+
"temperature": 0
27+
},
28+
"embeddings": {
29+
"model": "ollama/nomic-embed-text",
30+
"temperature": 0,
31+
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
32+
},
33+
"headless": False
34+
}
35+
36+
# ************************************************
37+
# Create the SmartScraperGraph instance and run it
38+
# ************************************************
39+
40+
for i in range(0, 2):
41+
with open(files[i], 'r', encoding="utf-8") as file:
42+
text = file.read()
43+
44+
smart_scraper_graph = SmartScraperGraph(
45+
prompt=tasks[i],
46+
source=text,
47+
config=graph_config
48+
)
49+
50+
result = smart_scraper_graph.run()
51+
print(result)
52+
# ************************************************
53+
# Get graph execution info
54+
# ************************************************
55+
56+
graph_exec_info = smart_scraper_graph.get_execution_info()
57+
print(prettify_exec_info(graph_exec_info))

examples/benchmarks/SmartScraper/benchmark_llama3.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
Basic example of scraping pipeline using SmartScraper from text
33
"""
44

5-
import os
65
from scrapegraphai.graphs import SmartScraperGraph
76
from scrapegraphai.utils import prettify_exec_info
87

scrapegraphai/graphs/script_creator_graph.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ def _create_graph(self):
3434
fetch_node = FetchNode(
3535
input="url | local_dir",
3636
output=["doc"],
37+
node_config={
38+
"headless": True if self.config is None else self.config.get("headless", True)}
3739
)
3840
parse_node = ParseNode(
3941
input="doc",

0 commit comments

Comments
 (0)