Commit 7ff1051

Merge pull request #825 from ScrapeGraphAI/revert-to-1.19

feat: revert search function

2 parents 92bb8bb + faf0c01

File tree: 2 files changed, +57 -121 lines changed


scrapegraphai/nodes/search_internet_node.py (+2 -10)
```diff
@@ -41,19 +41,11 @@ def __init__(
         self.verbose = (
             False if node_config is None else node_config.get("verbose", False)
         )
-        self.proxy = node_config.get("loader_kwargs", {}).get("proxy", None)
         self.search_engine = (
             node_config["search_engine"]
             if node_config.get("search_engine")
             else "google"
         )
-
-        self.serper_api_key = (
-            node_config["serper_api_key"]
-            if node_config.get("serper_api_key")
-            else None
-        )
-
         self.max_results = node_config.get("max_results", 3)
 
     def execute(self, state: dict) -> dict:
@@ -102,10 +94,10 @@ def execute(self, state: dict) -> dict:
         self.logger.info(f"Search Query: {search_query}")
 
         answer = search_on_web(query=search_query, max_results=self.max_results,
-                               search_engine=self.search_engine, proxy=self.proxy, serper_api_key=self.serper_api_key)
+                               search_engine=self.search_engine)
 
         if len(answer) == 0:
             raise ValueError("Zero results found for the search query.")
 
         state.update({self.output[0]: answer})
-        return state
+        return state
```
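After the revert, `SearchInternetNode.__init__` reads only three options from its `node_config`. A minimal sketch of a post-revert configuration, inferred from the diff above (only these keys come from the commit; the surrounding graph wiring is omitted):

```python
# Sketch of the options SearchInternetNode still honors after the revert,
# based on the __init__ shown in the diff above. A "serper_api_key" or a
# "loader_kwargs" proxy is no longer read, so passing one would have no effect.
node_config = {
    "verbose": True,                # default: False
    "search_engine": "duckduckgo",  # falls back to "google" when unset
    "max_results": 3,               # default: 3
}
```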

scrapegraphai/utils/research_web.py (+55 -111)
```diff
@@ -1,129 +1,73 @@
 """
-research_web module
+Research_web module
 """
 import re
 from typing import List
 from langchain_community.tools import DuckDuckGoSearchResults
 from googlesearch import search as google_search
 import requests
 from bs4 import BeautifulSoup
-import json
 
-def search_on_web(query: str, search_engine: str = "Google",
-                  max_results: int = 10, port: int = 8080,
-                  timeout: int = 10, proxy: str | dict = None,
-                  serper_api_key: str = None) -> List[str]:
-    """Search web function with improved error handling and validation"""
-
-    # Input validation
-    if not query or not isinstance(query, str):
-        raise ValueError("Query must be a non-empty string")
-
-    search_engine = search_engine.lower()
-    valid_engines = {"google", "duckduckgo", "bing", "searxng", "serper"}
-    if search_engine not in valid_engines:
-        raise ValueError(f"Search engine must be one of: {', '.join(valid_engines)}")
+def search_on_web(query: str, search_engine: str = "Google",
+                  max_results: int = 10, port: int = 8080) -> List[str]:
+    """
+    Searches the web for a given query using specified search engine options.
 
-    # Format proxy once
-    formatted_proxy = None
-    if proxy:
-        formatted_proxy = format_proxy(proxy)
-
-    try:
-        results = []
-        if search_engine == "google":
-            results = list(google_search(query, num_results=max_results, proxy=formatted_proxy))
-
-        elif search_engine == "duckduckgo":
-            research = DuckDuckGoSearchResults(max_results=max_results)
-            res = research.run(query)
-            results = re.findall(r'https?://[^\s,\]]+', res)
-
-        elif search_engine == "bing":
-            results = _search_bing(query, max_results, timeout, formatted_proxy)
-
-        elif search_engine == "searxng":
-            results = _search_searxng(query, max_results, port, timeout)
-
-        elif search_engine.lower() == "serper":
-            results = _search_serper(query, max_results, serper_api_key, timeout)
-
-        return filter_pdf_links(results)
-
-    except requests.Timeout:
-        raise TimeoutError(f"Search request timed out after {timeout} seconds")
-    except requests.RequestException as e:
-        raise RuntimeError(f"Search request failed: {str(e)}")
+    Args:
+        query (str): The search query to find on the internet.
+        search_engine (str, optional): Specifies the search engine to use,
+        options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'.
+        max_results (int, optional): The maximum number of search results to return.
+        port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080.
 
-def _search_bing(query: str, max_results: int, timeout: int, proxy: str = None) -> List[str]:
-    """Helper function for Bing search"""
-    headers = {
-        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
-    }
-    search_url = f"https://www.bing.com/search?q={query}"
-
-    proxies = {"http": proxy, "https": proxy} if proxy else None
-    response = requests.get(search_url, headers=headers, timeout=timeout, proxies=proxies)
-    response.raise_for_status()
-
-    soup = BeautifulSoup(response.text, "html.parser")
-    return [result.find('a')['href'] for result in soup.find_all('li', class_='b_algo', limit=max_results)]
+    Returns:
+        List[str]: A list of URLs as strings that are the search results.
 
-def _search_searxng(query: str, max_results: int, port: int, timeout: int) -> List[str]:
-    """Helper function for SearXNG search"""
-    url = f"http://localhost:{port}"
-    params = {
-        "q": query,
-        "format": "json",
-        "engines": "google,duckduckgo,brave,qwant,bing"
-    }
-    response = requests.get(url, params=params, timeout=timeout)
-    response.raise_for_status()
-    return [result['url'] for result in response.json().get("results", [])[:max_results]]
+    Raises:
+        ValueError: If the search engine specified is not supported.
 
-def _search_serper(query: str, max_results: int, serper_api_key: str, timeout: int) -> List[str]:
-    """Helper function for serper api"""
-    if not serper_api_key:
-        raise ValueError("API key is required for serper api.")
-
-    url = "https://google.serper.dev/search"
-    payload = json.dumps({
-        "q": query,
-        "num": max_results
-    })
-    headers = {
-        'X-API-KEY': serper_api_key,
-        'Content-Type': 'application/json'
-    }
-    response = requests.post(url, headers=headers, data=payload, timeout=timeout)
-    response.raise_for_status()
-    return [result.get("link") for result in response.json().get("organic", [])]
+    Example:
+        >>> search_on_web("example query", search_engine="Google", max_results=5)
+        ['http://example.com', 'http://example.org', ...]
+    """
 
+    if search_engine.lower() == "google":
+        res = []
+        for url in google_search(query, num_results=max_results):
+            res.append(url)
+        return res
 
-def format_proxy(proxy):
-    if isinstance(proxy, dict):
-        server = proxy.get('server')
-        username = proxy.get('username')
-        password = proxy.get('password')
+    elif search_engine.lower() == "duckduckgo":
+        research = DuckDuckGoSearchResults(max_results=max_results)
+        res = research.run(query)
+        links = re.findall(r'https?://[^\s,\]]+', res)
+        return links
 
-        if all([username, password, server]):
-            proxy_url = f"http://{username}:{password}@{server}"
-            return proxy_url
-        else:
-            raise ValueError("Proxy dictionary is missing required fields.")
-    elif isinstance(proxy, str):
-        return proxy  # "https://username:password@ip:port"
-    else:
-        raise TypeError("Proxy should be a dictionary or a string.")
-
-def filter_pdf_links(links: List[str]) -> List[str]:
-    """
-    Filters out any links that point to PDF files.
+    elif search_engine.lower() == "bing":
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+        }
+        search_url = f"https://www.bing.com/search?q={query}"
+        response = requests.get(search_url, headers=headers)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, "html.parser")
 
-    Args:
-        links (List[str]): A list of URLs as strings.
+        search_results = []
+        for result in soup.find_all('li', class_='b_algo', limit=max_results):
+            link = result.find('a')['href']
+            search_results.append(link)
+        return search_results
 
-    Returns:
-        List[str]: A list of URLs excluding any that end with '.pdf'.
-    """
-    return [link for link in links if not link.lower().endswith('.pdf')]
+    elif search_engine.lower() == "searxng":
+        url = f"http://localhost:{port}"
+        params = {"q": query, "format": "json"}
+
+        # Send the GET request to the server
+        response = requests.get(url, params=params)
+
+        data = response.json()
+        limited_results = data["results"][:max_results]
+        return limited_results
+
+    else:
+        raise ValueError("The only search engines available are DuckDuckGo, Google, Bing, or SearXNG")
```
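A minimal usage sketch of the reverted helper (the import path follows the file location in this commit; the query and engine values are illustrative). One caveat visible in the diff: despite the `List[str]` annotation, the SearXNG branch returns the raw result dicts from the JSON response rather than URL strings, so callers needing plain URLs should use one of the other engines.

```python
from scrapegraphai.utils.research_web import search_on_web

# Illustrative call against the reverted signature: only query, search_engine,
# max_results, and port remain; the timeout, proxy, and serper_api_key
# parameters were removed by this revert. Engine matching is case-insensitive.
urls = search_on_web("web scraping with LLMs", search_engine="DuckDuckGo",
                     max_results=5)
for url in urls:
    print(url)
```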
