feat: revert search function #825

Merged
merged 1 commit into from Nov 24, 2024

12 changes: 2 additions & 10 deletions scrapegraphai/nodes/search_internet_node.py
@@ -41,19 +41,11 @@ def __init__(
         self.verbose = (
             False if node_config is None else node_config.get("verbose", False)
         )
-        self.proxy = node_config.get("loader_kwargs", {}).get("proxy", None)
         self.search_engine = (
             node_config["search_engine"]
             if node_config.get("search_engine")
             else "google"
         )
-
-        self.serper_api_key = (
-            node_config["serper_api_key"]
-            if node_config.get("serper_api_key")
-            else None
-        )
-
         self.max_results = node_config.get("max_results", 3)
 
     def execute(self, state: dict) -> dict:
@@ -102,10 +94,10 @@ def execute(self, state: dict) -> dict:
         self.logger.info(f"Search Query: {search_query}")
 
         answer = search_on_web(query=search_query, max_results=self.max_results,
-                               search_engine=self.search_engine, proxy=self.proxy, serper_api_key=self.serper_api_key)
+                               search_engine=self.search_engine)
 
         if len(answer) == 0:
             raise ValueError("Zero results found for the search query.")
 
         state.update({self.output[0]: answer})
-        return state
+        return state
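For context, a minimal sketch of the node_config keys the reverted __init__ still reads. The key names and defaults come from the diff above; the dictionary itself is illustrative only, not code from the PR:

    # Illustrative only: which node_config keys SearchInternetNode reads after the revert.
    node_config = {
        "verbose": True,                # read via node_config.get("verbose", False)
        "search_engine": "duckduckgo",  # optional; falls back to "google"
        "max_results": 5,               # optional; falls back to 3
        # "loader_kwargs": {"proxy": ...} and "serper_api_key" are no longer
        # consumed by this node after the revert.
    }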
166 changes: 55 additions & 111 deletions scrapegraphai/utils/research_web.py
@@ -1,129 +1,73 @@
 """
-research_web module
+Research_web module
 """
 import re
 from typing import List
 from langchain_community.tools import DuckDuckGoSearchResults
 from googlesearch import search as google_search
 import requests
 from bs4 import BeautifulSoup
-import json
 
-def search_on_web(query: str, search_engine: str = "Google",
-                  max_results: int = 10, port: int = 8080,
-                  timeout: int = 10, proxy: str | dict = None,
-                  serper_api_key: str = None) -> List[str]:
-    """Search web function with improved error handling and validation"""
-
-    # Input validation
-    if not query or not isinstance(query, str):
-        raise ValueError("Query must be a non-empty string")
-
-    search_engine = search_engine.lower()
-    valid_engines = {"google", "duckduckgo", "bing", "searxng", "serper"}
-    if search_engine not in valid_engines:
-        raise ValueError(f"Search engine must be one of: {', '.join(valid_engines)}")
-
-    # Format proxy once
-    formatted_proxy = None
-    if proxy:
-        formatted_proxy = format_proxy(proxy)
-
-    try:
-        results = []
-        if search_engine == "google":
-            results = list(google_search(query, num_results=max_results, proxy=formatted_proxy))
-
-        elif search_engine == "duckduckgo":
-            research = DuckDuckGoSearchResults(max_results=max_results)
-            res = research.run(query)
-            results = re.findall(r'https?://[^\s,\]]+', res)
-
-        elif search_engine == "bing":
-            results = _search_bing(query, max_results, timeout, formatted_proxy)
-
-        elif search_engine == "searxng":
-            results = _search_searxng(query, max_results, port, timeout)
-
-        elif search_engine.lower() == "serper":
-            results = _search_serper(query, max_results, serper_api_key, timeout)
-
-        return filter_pdf_links(results)
-
-    except requests.Timeout:
-        raise TimeoutError(f"Search request timed out after {timeout} seconds")
-    except requests.RequestException as e:
-        raise RuntimeError(f"Search request failed: {str(e)}")
-
-def _search_bing(query: str, max_results: int, timeout: int, proxy: str = None) -> List[str]:
-    """Helper function for Bing search"""
-    headers = {
-        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
-    }
-    search_url = f"https://www.bing.com/search?q={query}"
-
-    proxies = {"http": proxy, "https": proxy} if proxy else None
-    response = requests.get(search_url, headers=headers, timeout=timeout, proxies=proxies)
-    response.raise_for_status()
-
-    soup = BeautifulSoup(response.text, "html.parser")
-    return [result.find('a')['href'] for result in soup.find_all('li', class_='b_algo', limit=max_results)]
-
-def _search_searxng(query: str, max_results: int, port: int, timeout: int) -> List[str]:
-    """Helper function for SearXNG search"""
-    url = f"http://localhost:{port}"
-    params = {
-        "q": query,
-        "format": "json",
-        "engines": "google,duckduckgo,brave,qwant,bing"
-    }
-    response = requests.get(url, params=params, timeout=timeout)
-    response.raise_for_status()
-    return [result['url'] for result in response.json().get("results", [])[:max_results]]
-
-def _search_serper(query: str, max_results: int, serper_api_key: str, timeout: int) -> List[str]:
-    """Helper function for serper api"""
-    if not serper_api_key:
-        raise ValueError("API key is required for serper api.")
-
-    url = "https://google.serper.dev/search"
-    payload = json.dumps({
-        "q": query,
-        "num": max_results
-    })
-    headers = {
-        'X-API-KEY': serper_api_key,
-        'Content-Type': 'application/json'
-    }
-    response = requests.post(url, headers=headers, data=payload, timeout=timeout)
-    response.raise_for_status()
-    return [result.get("link") for result in response.json().get("organic", [])]
-
-def format_proxy(proxy):
-    if isinstance(proxy, dict):
-        server = proxy.get('server')
-        username = proxy.get('username')
-        password = proxy.get('password')
-
-        if all([username, password, server]):
-            proxy_url = f"http://{username}:{password}@{server}"
-            return proxy_url
-        else:
-            raise ValueError("Proxy dictionary is missing required fields.")
-    elif isinstance(proxy, str):
-        return proxy  # "https://username:password@ip:port"
-    else:
-        raise TypeError("Proxy should be a dictionary or a string.")
-
-def filter_pdf_links(links: List[str]) -> List[str]:
-    """
-    Filters out any links that point to PDF files.
-
-    Args:
-        links (List[str]): A list of URLs as strings.
-
-    Returns:
-        List[str]: A list of URLs excluding any that end with '.pdf'.
-    """
-    return [link for link in links if not link.lower().endswith('.pdf')]
+def search_on_web(query: str, search_engine: str = "Google",
+                  max_results: int = 10, port: int = 8080) -> List[str]:
+    """
+    Searches the web for a given query using specified search engine options.
+
+    Args:
+        query (str): The search query to find on the internet.
+        search_engine (str, optional): Specifies the search engine to use,
+            options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'.
+        max_results (int, optional): The maximum number of search results to return.
+        port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080.
+
+    Returns:
+        List[str]: A list of URLs as strings that are the search results.
+
+    Raises:
+        ValueError: If the search engine specified is not supported.
+
+    Example:
+        >>> search_on_web("example query", search_engine="Google", max_results=5)
+        ['http://example.com', 'http://example.org', ...]
+    """
+
+    if search_engine.lower() == "google":
+        res = []
+        for url in google_search(query, num_results=max_results):
+            res.append(url)
+        return res
+
+    elif search_engine.lower() == "duckduckgo":
+        research = DuckDuckGoSearchResults(max_results=max_results)
+        res = research.run(query)
+        links = re.findall(r'https?://[^\s,\]]+', res)
+        return links
+
+    elif search_engine.lower() == "bing":
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+        }
+        search_url = f"https://www.bing.com/search?q={query}"
+        response = requests.get(search_url, headers=headers)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, "html.parser")
+
+        search_results = []
+        for result in soup.find_all('li', class_='b_algo', limit=max_results):
+            link = result.find('a')['href']
+            search_results.append(link)
+        return search_results
+
+    elif search_engine.lower() == "searxng":
+        url = f"http://localhost:{port}"
+        params = {"q": query, "format": "json"}
+
+        # Send the GET request to the server
+        response = requests.get(url, params=params)
+
+        data = response.json()
+        limited_results = data["results"][:max_results]
+        return limited_results
+
+    else:
+        raise ValueError("The only search engines available are DuckDuckGo, Google, Bing, or SearXNG")
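A hypothetical call to the restored helper, following the pattern in the function's own docstring; the import path matches the file location in this PR, while the query string is made up:

    from scrapegraphai.utils.research_web import search_on_web

    # Engine matching is case-insensitive; DuckDuckGo returns a list of URL strings.
    urls = search_on_web("web scraping with LLMs", search_engine="DuckDuckGo", max_results=5)
    for url in urls:
        print(url)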