Skip to content

Commit 61c732f

Browse files
authored
Merge pull request #124 from VinciGit00/csv-scraper
add csv scraper
2 parents 9356124 + 4d542a8 commit 61c732f

17 files changed

+680
-1
lines changed

.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@ venv/
2828
*.sqlite
2929
*.google-cookie
3030
examples/graph_examples/ScrapeGraphAI_generated_graph
31-
examples/**/*.csv
31+
examples/**/result.csv
32+
examples/**/result.json
3233
main.py
3334
poetry.lock
3435

examples/gemini/csv_scraper_gemini.py

+60
Original file line numberDiff line numberDiff line change
"""
Basic example of scraping pipeline using CSVScraperGraph from CSV documents
"""

import os
from dotenv import load_dotenv
import pandas as pd
from scrapegraphai.graphs import CSVScraperGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info

load_dotenv()

# ************************************************
# Read the csv file
# ************************************************

text = pd.read_csv("inputs/username.csv")

# ************************************************
# Define the configuration for the graph
# ************************************************

# NOTE(review): this file lives under examples/gemini but configures a local
# Ollama model — confirm whether a Gemini configuration was intended here.
graph_config = {
    "llm": {
        "model": "ollama/mistral",
        "temperature": 0,
        "format": "json",  # Ollama needs the format to be specified explicitly
        # "model_tokens": 2000, # set context length arbitrarily
        "base_url": "http://localhost:11434",
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        "temperature": 0,
        "base_url": "http://localhost:11434",
    }
}

# ************************************************
# Create the CSVScraperGraph instance and run it
# ************************************************

csv_scraper_graph = CSVScraperGraph(
    prompt="List me all the last names",
    # Pass the content of the file, not the file object.
    # to_string() renders every row; str(df) elides rows/columns on large
    # frames under pandas' default display options.
    source=text.to_string(),
    config=graph_config
)

result = csv_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = csv_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to json or csv
convert_to_csv(result, "result")
convert_to_json(result, "result")

examples/gemini/inputs/username.csv

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Username; Identifier;First name;Last name
2+
booker12;9012;Rachel;Booker
3+
grey07;2070;Laura;Grey
4+
johnson81;4081;Craig;Johnson
5+
jenkins46;9346;Mary;Jenkins
6+
smith79;5079;Jamie;Smith
7+

examples/gemini/scrape_xml_gemini.py

+1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from dotenv import load_dotenv
77
from scrapegraphai.graphs import SmartScraperGraph
88
from scrapegraphai.utils import prettify_exec_info
9+
910
load_dotenv()
1011

1112
# ************************************************
Original file line numberDiff line numberDiff line change
"""
Basic example of scraping pipeline using CSVScraperGraph from CSV documents
"""

import pandas as pd
from scrapegraphai.graphs import CSVScraperGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info

# ************************************************
# Read the csv file
# ************************************************

text = pd.read_csv("inputs/username.csv")

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "model": "ollama/mistral",
        "temperature": 0,
        "format": "json",  # Ollama needs the format to be specified explicitly
        # "model_tokens": 2000, # set context length arbitrarily
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        "temperature": 0,
    }
}

# ************************************************
# Create the CSVScraperGraph instance and run it
# ************************************************

csv_scraper_graph = CSVScraperGraph(
    prompt="List me all the last names",
    # Pass the content of the file, not the file object.
    # to_string() renders every row; str(df) elides rows/columns on large
    # frames under pandas' default display options.
    source=text.to_string(),
    config=graph_config
)

result = csv_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = csv_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to json or csv
convert_to_csv(result, "result")
convert_to_json(result, "result")
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Username; Identifier;First name;Last name
2+
booker12;9012;Rachel;Booker
3+
grey07;2070;Laura;Grey
4+
johnson81;4081;Craig;Johnson
5+
jenkins46;9346;Mary;Jenkins
6+
smith79;5079;Jamie;Smith
7+
Original file line numberDiff line numberDiff line change
"""
Basic example of scraping pipeline using CSVScraperGraph from CSV documents
"""

import pandas as pd
from scrapegraphai.graphs import CSVScraperGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info

# ************************************************
# Read the csv file
# ************************************************

text = pd.read_csv("inputs/username.csv")

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "model": "ollama/mistral",
        "temperature": 0,
        "format": "json",  # Ollama needs the format to be specified explicitly
        # "model_tokens": 2000, # set context length arbitrarily
        "base_url": "http://localhost:11434",
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        "temperature": 0,
        "base_url": "http://localhost:11434",
    }
}

# ************************************************
# Create the CSVScraperGraph instance and run it
# ************************************************

csv_scraper_graph = CSVScraperGraph(
    prompt="List me all the last names",
    # Pass the content of the file, not the file object.
    # to_string() renders every row; str(df) elides rows/columns on large
    # frames under pandas' default display options.
    source=text.to_string(),
    config=graph_config
)

result = csv_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = csv_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to json or csv
convert_to_csv(result, "result")
convert_to_json(result, "result")
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Username; Identifier;First name;Last name
2+
booker12;9012;Rachel;Booker
3+
grey07;2070;Laura;Grey
4+
johnson81;4081;Craig;Johnson
5+
jenkins46;9346;Mary;Jenkins
6+
smith79;5079;Jamie;Smith
7+

examples/openai/csv_scraper_openai.py

+53
Original file line numberDiff line numberDiff line change
"""
Basic example of scraping pipeline using CSVScraperGraph from CSV documents
"""

import os
from dotenv import load_dotenv
import pandas as pd
from scrapegraphai.graphs import CSVScraperGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info

load_dotenv()
# ************************************************
# Read the csv file
# ************************************************

text = pd.read_csv("inputs/username.csv")

# ************************************************
# Define the configuration for the graph
# ************************************************

openai_key = os.getenv("OPENAI_APIKEY")

graph_config = {
    "llm": {
        "api_key": openai_key,
        "model": "gpt-3.5-turbo",
    },
}

# ************************************************
# Create the CSVScraperGraph instance and run it
# ************************************************

csv_scraper_graph = CSVScraperGraph(
    prompt="List me all the last names",
    # Pass the content of the file, not the file object.
    # to_string() renders every row; str(df) elides rows/columns on large
    # frames under pandas' default display options.
    source=text.to_string(),
    config=graph_config
)

result = csv_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = csv_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to json or csv
convert_to_csv(result, "result")
convert_to_json(result, "result")

examples/openai/inputs/username.csv

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Username; Identifier;First name;Last name
2+
booker12;9012;Rachel;Booker
3+
grey07;2070;Laura;Grey
4+
johnson81;4081;Craig;Johnson
5+
jenkins46;9346;Mary;Jenkins
6+
smith79;5079;Jamie;Smith
7+

examples/openai/scrape_plain_text_openai.py

+1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from dotenv import load_dotenv
77
from scrapegraphai.graphs import SmartScraperGraph
88
from scrapegraphai.utils import prettify_exec_info
9+
910
load_dotenv()
1011

1112
# ************************************************

scrapegraphai/graphs/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@
88
from .script_creator_graph import ScriptCreatorGraph
99
from .xml_scraper_graph import XMLScraperGraph
1010
from .json_scraper_graph import JSONScraperGraph
11+
from .csv_scraper_graph import CSVScraperGraph
+88
Original file line numberDiff line numberDiff line change
"""
Module for creating the CSV scraper graph
"""
from .base_graph import BaseGraph
from ..nodes import (
    FetchNode,
    ParseNode,
    RAGNode,
    GenerateAnswerCSVNode
)
from .abstract_graph import AbstractGraph


class CSVScraperGraph(AbstractGraph):
    """
    CSVScraperGraph is a scraping pipeline that automates the process of extracting
    information from CSV documents using a natural language model to interpret and
    answer prompts.
    """

    def __init__(self, prompt: str, source: str, config: dict):
        """
        Initializes the CSVScraperGraph with a prompt, source, and configuration.

        Args:
            prompt: The natural-language question to answer over the CSV data.
            source: Either a path/content ending in "csv" (single document)
                or a directory of CSV files.
            config: Graph configuration (llm/embeddings settings, etc.).
        """
        super().__init__(prompt, config, source)

        # Single CSV sources are keyed as "csv"; anything else is treated as
        # a directory of CSV files ("csv_dir").
        self.input_key = "csv" if source.endswith("csv") else "csv_dir"

    def _create_graph(self):
        """
        Creates the graph of nodes representing the workflow for CSV scraping.

        Returns:
            BaseGraph: fetch -> parse -> rag -> generate_answer pipeline.
        """
        fetch_node = FetchNode(
            # BUG FIX: was hard-coded to "csv_dir", which never matched the
            # "csv" input key set in __init__ for single-file sources. The
            # "|" expression form mirrors the sibling nodes below.
            input="csv | csv_dir",
            output=["doc"],
            node_config={
                "headless": self.headless,
                "verbose": self.verbose
            }
        )
        parse_node = ParseNode(
            input="doc",
            output=["parsed_doc"],
            node_config={
                "chunk_size": self.model_token,
                "verbose": self.verbose
            }
        )
        rag_node = RAGNode(
            input="user_prompt & (parsed_doc | doc)",
            output=["relevant_chunks"],
            node_config={
                "llm": self.llm_model,
                "embedder_model": self.embedder_model,
                "verbose": self.verbose
            }
        )
        generate_answer_node = GenerateAnswerCSVNode(
            input="user_prompt & (relevant_chunks | parsed_doc | doc)",
            output=["answer"],
            node_config={
                "llm": self.llm_model,
                "verbose": self.verbose
            }
        )

        return BaseGraph(
            nodes=[
                fetch_node,
                parse_node,
                rag_node,
                generate_answer_node,
            ],
            edges=[
                (fetch_node, parse_node),
                (parse_node, rag_node),
                (rag_node, generate_answer_node)
            ],
            entry_point=fetch_node
        )

    def run(self) -> str:
        """
        Executes the scraping process and returns the answer to the prompt.

        Returns:
            The generated answer, or "No answer found." if the graph produced none.
        """
        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

        return self.final_state.get("answer", "No answer found.")

scrapegraphai/nodes/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,4 @@
1414
from .generate_scraper_node import GenerateScraperNode
1515
from .search_link_node import SearchLinkNode
1616
from .robots_node import RobotsNode
17+
from .generate_answer_csv_node import GenerateAnswerCSVNode

0 commit comments

Comments
 (0)