Skip to content

Commit 61c732f

Browse files
authored
Merge pull request #124 from VinciGit00/csv-scraper
add csv scraper
2 parents 9356124 + 4d542a8 commit 61c732f

17 files changed

+680
-1
lines changed

.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@ venv/
2828
*.sqlite
2929
*.google-cookie
3030
examples/graph_examples/ScrapeGraphAI_generated_graph
31-
examples/**/*.csv
31+
examples/**/result.csv
32+
examples/**/result.json
3233
main.py
3334
poetry.lock
3435

examples/gemini/csv_scraper_gemini.py

+60
Original file line numberDiff line numberDiff line change
"""
Basic example of scraping pipeline using CSVScraperGraph from CSV documents
"""

import os
from dotenv import load_dotenv
import pandas as pd
from scrapegraphai.graphs import CSVScraperGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info

load_dotenv()

# ************************************************
# Read the csv file
# ************************************************

text = pd.read_csv("inputs/username.csv")

# ************************************************
# Define the configuration for the graph
# ************************************************

# NOTE(review): this file lives under examples/gemini but configures a local
# Ollama model — confirm whether a Gemini configuration was intended here.
graph_config = {
    "llm": {
        "model": "ollama/mistral",
        "temperature": 0,
        "format": "json",  # Ollama needs the format to be specified explicitly
        # "model_tokens": 2000, # set context length arbitrarily
        "base_url": "http://localhost:11434",
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        "temperature": 0,
        "base_url": "http://localhost:11434",
    }
}

# ************************************************
# Create the CSVScraperGraph instance and run it
# ************************************************

csv_scraper_graph = CSVScraperGraph(
    prompt="List me all the last names",
    # Pass the content of the file, not the file object.
    # to_string() renders every row; str(df) elides rows/columns on large
    # frames under pandas' default display options.
    source=text.to_string(),
    config=graph_config
)

result = csv_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = csv_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to json or csv
convert_to_csv(result, "result")
convert_to_json(result, "result")

examples/gemini/inputs/username.csv

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Username; Identifier;First name;Last name
2+
booker12;9012;Rachel;Booker
3+
grey07;2070;Laura;Grey
4+
johnson81;4081;Craig;Johnson
5+
jenkins46;9346;Mary;Jenkins
6+
smith79;5079;Jamie;Smith
7+

examples/gemini/scrape_xml_gemini.py

+1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from dotenv import load_dotenv
77
from scrapegraphai.graphs import SmartScraperGraph
88
from scrapegraphai.utils import prettify_exec_info
9+
910
load_dotenv()
1011

1112
# ************************************************
Original file line numberDiff line numberDiff line change
"""
Basic example of scraping pipeline using CSVScraperGraph from CSV documents
"""

import pandas as pd
from scrapegraphai.graphs import CSVScraperGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info

# ************************************************
# Read the csv file
# ************************************************

text = pd.read_csv("inputs/username.csv")

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "model": "ollama/mistral",
        "temperature": 0,
        "format": "json",  # Ollama needs the format to be specified explicitly
        # "model_tokens": 2000, # set context length arbitrarily
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        "temperature": 0,
    }
}

# ************************************************
# Create the CSVScraperGraph instance and run it
# ************************************************

csv_scraper_graph = CSVScraperGraph(
    prompt="List me all the last names",
    # Pass the content of the file, not the file object.
    # to_string() renders every row; str(df) elides rows/columns on large
    # frames under pandas' default display options.
    source=text.to_string(),
    config=graph_config
)

result = csv_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = csv_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to json or csv
convert_to_csv(result, "result")
convert_to_json(result, "result")
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Username; Identifier;First name;Last name
2+
booker12;9012;Rachel;Booker
3+
grey07;2070;Laura;Grey
4+
johnson81;4081;Craig;Johnson
5+
jenkins46;9346;Mary;Jenkins
6+
smith79;5079;Jamie;Smith
7+
Original file line numberDiff line numberDiff line change
"""
Basic example of scraping pipeline using CSVScraperGraph from CSV documents
"""

import pandas as pd
from scrapegraphai.graphs import CSVScraperGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info

# ************************************************
# Read the csv file
# ************************************************

text = pd.read_csv("inputs/username.csv")

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "model": "ollama/mistral",
        "temperature": 0,
        "format": "json",  # Ollama needs the format to be specified explicitly
        # "model_tokens": 2000, # set context length arbitrarily
        "base_url": "http://localhost:11434",
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        "temperature": 0,
        "base_url": "http://localhost:11434",
    }
}

# ************************************************
# Create the CSVScraperGraph instance and run it
# ************************************************

csv_scraper_graph = CSVScraperGraph(
    prompt="List me all the last names",
    # Pass the content of the file, not the file object.
    # to_string() renders every row; str(df) elides rows/columns on large
    # frames under pandas' default display options.
    source=text.to_string(),
    config=graph_config
)

result = csv_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = csv_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to json or csv
convert_to_csv(result, "result")
convert_to_json(result, "result")
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Username; Identifier;First name;Last name
2+
booker12;9012;Rachel;Booker
3+
grey07;2070;Laura;Grey
4+
johnson81;4081;Craig;Johnson
5+
jenkins46;9346;Mary;Jenkins
6+
smith79;5079;Jamie;Smith
7+

examples/openai/csv_scraper_openai.py

+53
Original file line numberDiff line numberDiff line change
"""
Basic example of scraping pipeline using CSVScraperGraph from CSV documents
"""

import os
from dotenv import load_dotenv
import pandas as pd
from scrapegraphai.graphs import CSVScraperGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info

load_dotenv()
# ************************************************
# Read the csv file
# ************************************************

text = pd.read_csv("inputs/username.csv")

# ************************************************
# Define the configuration for the graph
# ************************************************

openai_key = os.getenv("OPENAI_APIKEY")

graph_config = {
    "llm": {
        "api_key": openai_key,
        "model": "gpt-3.5-turbo",
    },
}

# ************************************************
# Create the CSVScraperGraph instance and run it
# ************************************************

csv_scraper_graph = CSVScraperGraph(
    prompt="List me all the last names",
    # Pass the content of the file, not the file object.
    # to_string() renders every row; str(df) elides rows/columns on large
    # frames under pandas' default display options.
    source=text.to_string(),
    config=graph_config
)

result = csv_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = csv_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to json or csv
convert_to_csv(result, "result")
convert_to_json(result, "result")

examples/openai/inputs/username.csv

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Username; Identifier;First name;Last name
2+
booker12;9012;Rachel;Booker
3+
grey07;2070;Laura;Grey
4+
johnson81;4081;Craig;Johnson
5+
jenkins46;9346;Mary;Jenkins
6+
smith79;5079;Jamie;Smith
7+

examples/openai/scrape_plain_text_openai.py

+1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from dotenv import load_dotenv
77
from scrapegraphai.graphs import SmartScraperGraph
88
from scrapegraphai.utils import prettify_exec_info
9+
910
load_dotenv()
1011

1112
# ************************************************

scrapegraphai/graphs/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@
88
from .script_creator_graph import ScriptCreatorGraph
99
from .xml_scraper_graph import XMLScraperGraph
1010
from .json_scraper_graph import JSONScraperGraph
11+
from .csv_scraper_graph import CSVScraperGraph
+88
Original file line numberDiff line numberDiff line change
"""
Module for creating the CSV scraper graph
"""
from .base_graph import BaseGraph
from ..nodes import (
    FetchNode,
    ParseNode,
    RAGNode,
    GenerateAnswerCSVNode
)
from .abstract_graph import AbstractGraph


class CSVScraperGraph(AbstractGraph):
    """
    CSVScraperGraph is a scraping pipeline that automates the process of extracting
    information from CSV documents using a natural language model to interpret and
    answer prompts.
    """

    def __init__(self, prompt: str, source: str, config: dict):
        """
        Initializes the CSVScraperGraph with a prompt, source, and configuration.

        Args:
            prompt: The natural-language question to answer over the CSV data.
            source: Either a path/content ending in "csv" (single document)
                or a directory of CSV files.
            config: Graph configuration (llm/embeddings settings, etc.).
        """
        super().__init__(prompt, config, source)

        # Single CSV sources are keyed as "csv"; anything else is treated as
        # a directory of CSV files ("csv_dir").
        self.input_key = "csv" if source.endswith("csv") else "csv_dir"

    def _create_graph(self):
        """
        Creates the graph of nodes representing the workflow for CSV scraping.

        Returns:
            BaseGraph: fetch -> parse -> rag -> generate_answer pipeline.
        """
        fetch_node = FetchNode(
            # BUG FIX: was hard-coded to "csv_dir", which never matched the
            # "csv" input key set in __init__ for single-file sources. The
            # "|" expression form mirrors the sibling nodes below.
            input="csv | csv_dir",
            output=["doc"],
            node_config={
                "headless": self.headless,
                "verbose": self.verbose
            }
        )
        parse_node = ParseNode(
            input="doc",
            output=["parsed_doc"],
            node_config={
                "chunk_size": self.model_token,
                "verbose": self.verbose
            }
        )
        rag_node = RAGNode(
            input="user_prompt & (parsed_doc | doc)",
            output=["relevant_chunks"],
            node_config={
                "llm": self.llm_model,
                "embedder_model": self.embedder_model,
                "verbose": self.verbose
            }
        )
        generate_answer_node = GenerateAnswerCSVNode(
            input="user_prompt & (relevant_chunks | parsed_doc | doc)",
            output=["answer"],
            node_config={
                "llm": self.llm_model,
                "verbose": self.verbose
            }
        )

        return BaseGraph(
            nodes=[
                fetch_node,
                parse_node,
                rag_node,
                generate_answer_node,
            ],
            edges=[
                (fetch_node, parse_node),
                (parse_node, rag_node),
                (rag_node, generate_answer_node)
            ],
            entry_point=fetch_node
        )

    def run(self) -> str:
        """
        Executes the scraping process and returns the answer to the prompt.

        Returns:
            The generated answer, or "No answer found." if the graph produced none.
        """
        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

        return self.final_state.get("answer", "No answer found.")

scrapegraphai/nodes/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,4 @@
1414
from .generate_scraper_node import GenerateScraperNode
1515
from .search_link_node import SearchLinkNode
1616
from .robots_node import RobotsNode
17+
from .generate_answer_csv_node import GenerateAnswerCSVNode

0 commit comments

Comments
 (0)