Commit 7ff1051

Merge pull request #825 from ScrapeGraphAI/revert-to-1.19

feat: revert search function

2 parents 92bb8bb + faf0c01

File tree: 2 files changed, +57 -121 lines changed


scrapegraphai/nodes/search_internet_node.py (+2 -10)
```diff
@@ -41,19 +41,11 @@ def __init__(
         self.verbose = (
             False if node_config is None else node_config.get("verbose", False)
         )
-        self.proxy = node_config.get("loader_kwargs", {}).get("proxy", None)
         self.search_engine = (
             node_config["search_engine"]
             if node_config.get("search_engine")
             else "google"
         )
-
-        self.serper_api_key = (
-            node_config["serper_api_key"]
-            if node_config.get("serper_api_key")
-            else None
-        )
-
         self.max_results = node_config.get("max_results", 3)
 
     def execute(self, state: dict) -> dict:
@@ -102,10 +94,10 @@ def execute(self, state: dict) -> dict:
         self.logger.info(f"Search Query: {search_query}")
 
         answer = search_on_web(query=search_query, max_results=self.max_results,
-                               search_engine=self.search_engine, proxy=self.proxy, serper_api_key=self.serper_api_key)
+                               search_engine=self.search_engine)
 
         if len(answer) == 0:
             raise ValueError("Zero results found for the search query.")
 
         state.update({self.output[0]: answer})
-        return state
+        return state
```
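After the revert, `SearchInternetNode.__init__` reads only three options from its `node_config`. A minimal sketch of a post-revert configuration, inferred from the diff above (only these keys come from the commit; the surrounding graph wiring is omitted):

```python
# Sketch of the options SearchInternetNode still honors after the revert,
# based on the __init__ shown in the diff above. A "serper_api_key" or a
# "loader_kwargs" proxy is no longer read, so passing one would have no effect.
node_config = {
    "verbose": True,                # default: False
    "search_engine": "duckduckgo",  # falls back to "google" when unset
    "max_results": 3,               # default: 3
}
```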

scrapegraphai/utils/research_web.py (+55 -111)
```diff
@@ -1,129 +1,73 @@
 """
-research_web module
+Research_web module
 """
 import re
 from typing import List
 from langchain_community.tools import DuckDuckGoSearchResults
 from googlesearch import search as google_search
 import requests
 from bs4 import BeautifulSoup
-import json
 
-def search_on_web(query: str, search_engine: str = "Google",
-                  max_results: int = 10, port: int = 8080,
-                  timeout: int = 10, proxy: str | dict = None,
-                  serper_api_key: str = None) -> List[str]:
-    """Search web function with improved error handling and validation"""
-
-    # Input validation
-    if not query or not isinstance(query, str):
-        raise ValueError("Query must be a non-empty string")
-
-    search_engine = search_engine.lower()
-    valid_engines = {"google", "duckduckgo", "bing", "searxng", "serper"}
-    if search_engine not in valid_engines:
-        raise ValueError(f"Search engine must be one of: {', '.join(valid_engines)}")
+def search_on_web(query: str, search_engine: str = "Google",
+                  max_results: int = 10, port: int = 8080) -> List[str]:
+    """
+    Searches the web for a given query using specified search engine options.
 
-    # Format proxy once
-    formatted_proxy = None
-    if proxy:
-        formatted_proxy = format_proxy(proxy)
-
-    try:
-        results = []
-        if search_engine == "google":
-            results = list(google_search(query, num_results=max_results, proxy=formatted_proxy))
-
-        elif search_engine == "duckduckgo":
-            research = DuckDuckGoSearchResults(max_results=max_results)
-            res = research.run(query)
-            results = re.findall(r'https?://[^\s,\]]+', res)
-
-        elif search_engine == "bing":
-            results = _search_bing(query, max_results, timeout, formatted_proxy)
-
-        elif search_engine == "searxng":
-            results = _search_searxng(query, max_results, port, timeout)
-
-        elif search_engine.lower() == "serper":
-            results = _search_serper(query, max_results, serper_api_key, timeout)
-
-        return filter_pdf_links(results)
-
-    except requests.Timeout:
-        raise TimeoutError(f"Search request timed out after {timeout} seconds")
-    except requests.RequestException as e:
-        raise RuntimeError(f"Search request failed: {str(e)}")
+    Args:
+        query (str): The search query to find on the internet.
+        search_engine (str, optional): Specifies the search engine to use,
+        options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'.
+        max_results (int, optional): The maximum number of search results to return.
+        port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080.
 
-def _search_bing(query: str, max_results: int, timeout: int, proxy: str = None) -> List[str]:
-    """Helper function for Bing search"""
-    headers = {
-        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
-    }
-    search_url = f"https://www.bing.com/search?q={query}"
-
-    proxies = {"http": proxy, "https": proxy} if proxy else None
-    response = requests.get(search_url, headers=headers, timeout=timeout, proxies=proxies)
-    response.raise_for_status()
-
-    soup = BeautifulSoup(response.text, "html.parser")
-    return [result.find('a')['href'] for result in soup.find_all('li', class_='b_algo', limit=max_results)]
+    Returns:
+        List[str]: A list of URLs as strings that are the search results.
 
-def _search_searxng(query: str, max_results: int, port: int, timeout: int) -> List[str]:
-    """Helper function for SearXNG search"""
-    url = f"http://localhost:{port}"
-    params = {
-        "q": query,
-        "format": "json",
-        "engines": "google,duckduckgo,brave,qwant,bing"
-    }
-    response = requests.get(url, params=params, timeout=timeout)
-    response.raise_for_status()
-    return [result['url'] for result in response.json().get("results", [])[:max_results]]
+    Raises:
+        ValueError: If the search engine specified is not supported.
 
-def _search_serper(query: str, max_results: int, serper_api_key: str, timeout: int) -> List[str]:
-    """Helper function for serper api"""
-    if not serper_api_key:
-        raise ValueError("API key is required for serper api.")
-
-    url = "https://google.serper.dev/search"
-    payload = json.dumps({
-        "q": query,
-        "num": max_results
-    })
-    headers = {
-        'X-API-KEY': serper_api_key,
-        'Content-Type': 'application/json'
-    }
-    response = requests.post(url, headers=headers, data=payload, timeout=timeout)
-    response.raise_for_status()
-    return [result.get("link") for result in response.json().get("organic", [])]
+    Example:
+        >>> search_on_web("example query", search_engine="Google", max_results=5)
+        ['http://example.com', 'http://example.org', ...]
+    """
 
+    if search_engine.lower() == "google":
+        res = []
+        for url in google_search(query, num_results=max_results):
+            res.append(url)
+        return res
 
-def format_proxy(proxy):
-    if isinstance(proxy, dict):
-        server = proxy.get('server')
-        username = proxy.get('username')
-        password = proxy.get('password')
+    elif search_engine.lower() == "duckduckgo":
+        research = DuckDuckGoSearchResults(max_results=max_results)
+        res = research.run(query)
+        links = re.findall(r'https?://[^\s,\]]+', res)
+        return links
 
-        if all([username, password, server]):
-            proxy_url = f"http://{username}:{password}@{server}"
-            return proxy_url
-        else:
-            raise ValueError("Proxy dictionary is missing required fields.")
-    elif isinstance(proxy, str):
-        return proxy  # "https://username:password@ip:port"
-    else:
-        raise TypeError("Proxy should be a dictionary or a string.")
-
-def filter_pdf_links(links: List[str]) -> List[str]:
-    """
-    Filters out any links that point to PDF files.
+    elif search_engine.lower() == "bing":
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+        }
+        search_url = f"https://www.bing.com/search?q={query}"
+        response = requests.get(search_url, headers=headers)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, "html.parser")
 
-    Args:
-        links (List[str]): A list of URLs as strings.
+        search_results = []
+        for result in soup.find_all('li', class_='b_algo', limit=max_results):
+            link = result.find('a')['href']
+            search_results.append(link)
+        return search_results
 
-    Returns:
-        List[str]: A list of URLs excluding any that end with '.pdf'.
-    """
-    return [link for link in links if not link.lower().endswith('.pdf')]
+    elif search_engine.lower() == "searxng":
+        url = f"http://localhost:{port}"
+        params = {"q": query, "format": "json"}
+
+        # Send the GET request to the server
+        response = requests.get(url, params=params)
+
+        data = response.json()
+        limited_results = data["results"][:max_results]
+        return limited_results
+
+    else:
+        raise ValueError("The only search engines available are DuckDuckGo, Google, Bing, or SearXNG")
```
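A minimal usage sketch of the reverted helper (the import path follows the file location in this commit; the query and engine values are illustrative). One caveat visible in the diff: despite the `List[str]` annotation, the SearXNG branch returns the raw result dicts from the JSON response rather than URL strings, so callers needing plain URLs should use one of the other engines.

```python
from scrapegraphai.utils.research_web import search_on_web

# Illustrative call against the reverted signature: only query, search_engine,
# max_results, and port remain; the timeout, proxy, and serper_api_key
# parameters were removed by this revert. Engine matching is case-insensitive.
urls = search_on_web("web scraping with LLMs", search_engine="DuckDuckGo",
                     max_results=5)
for url in urls:
    print(url)
```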
