"""
- research_web module
+ Research_web module
"""
import re
from typing import List
from langchain_community.tools import DuckDuckGoSearchResults
from googlesearch import search as google_search
import requests
from bs4 import BeautifulSoup
- import json

- def search_on_web(query: str, search_engine: str = "Google",
-                   max_results: int = 10, port: int = 8080,
-                   timeout: int = 10, proxy: str | dict = None,
-                   serper_api_key: str = None) -> List[str]:
-     """Search web function with improved error handling and validation"""
-
-     # Input validation
-     if not query or not isinstance(query, str):
-         raise ValueError("Query must be a non-empty string")
-
-     search_engine = search_engine.lower()
-     valid_engines = {"google", "duckduckgo", "bing", "searxng", "serper"}
-     if search_engine not in valid_engines:
-         raise ValueError(f"Search engine must be one of: {', '.join(valid_engines)}")
+ def search_on_web(query: str, search_engine: str = "Google",
+                   max_results: int = 10, port: int = 8080) -> List[str]:
+     """
+     Searches the web for a given query using the specified search engine.

-     # Format proxy once
-     formatted_proxy = None
-     if proxy:
-         formatted_proxy = format_proxy(proxy)
-
-     try:
-         results = []
-         if search_engine == "google":
-             results = list(google_search(query, num_results=max_results, proxy=formatted_proxy))
-
-         elif search_engine == "duckduckgo":
-             research = DuckDuckGoSearchResults(max_results=max_results)
-             res = research.run(query)
-             results = re.findall(r'https?://[^\s,\]]+', res)
-
-         elif search_engine == "bing":
-             results = _search_bing(query, max_results, timeout, formatted_proxy)
-
-         elif search_engine == "searxng":
-             results = _search_searxng(query, max_results, port, timeout)
-
-         elif search_engine == "serper":
-             results = _search_serper(query, max_results, serper_api_key, timeout)
-
-         return filter_pdf_links(results)
-
-     except requests.Timeout:
-         raise TimeoutError(f"Search request timed out after {timeout} seconds")
-     except requests.RequestException as e:
-         raise RuntimeError(f"Search request failed: {str(e)}")
+     Args:
+         query (str): The search query to find on the internet.
+         search_engine (str, optional): Specifies the search engine to use;
+             options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'.
+         max_results (int, optional): The maximum number of search results to return.
+         port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080.

- def _search_bing(query: str, max_results: int, timeout: int, proxy: str = None) -> List[str]:
-     """Helper function for Bing search"""
-     headers = {
-         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
-     }
-     search_url = f"https://www.bing.com/search?q={query}"
-
-     proxies = {"http": proxy, "https": proxy} if proxy else None
-     response = requests.get(search_url, headers=headers, timeout=timeout, proxies=proxies)
-     response.raise_for_status()
-
-     soup = BeautifulSoup(response.text, "html.parser")
-     return [result.find('a')['href'] for result in soup.find_all('li', class_='b_algo', limit=max_results)]
+     Returns:
+         List[str]: A list of URLs as strings that are the search results.

- def _search_searxng(query: str, max_results: int, port: int, timeout: int) -> List[str]:
-     """Helper function for SearXNG search"""
-     url = f"http://localhost:{port}"
-     params = {
-         "q": query,
-         "format": "json",
-         "engines": "google,duckduckgo,brave,qwant,bing"
-     }
-     response = requests.get(url, params=params, timeout=timeout)
-     response.raise_for_status()
-     return [result['url'] for result in response.json().get("results", [])[:max_results]]
+     Raises:
+         ValueError: If the search engine specified is not supported.

- def _search_serper(query: str, max_results: int, serper_api_key: str, timeout: int) -> List[str]:
-     """Helper function for the Serper API"""
-     if not serper_api_key:
-         raise ValueError("API key is required for the Serper API.")
-
-     url = "https://google.serper.dev/search"
-     payload = json.dumps({
-         "q": query,
-         "num": max_results
-     })
-     headers = {
-         'X-API-KEY': serper_api_key,
-         'Content-Type': 'application/json'
-     }
-     response = requests.post(url, headers=headers, data=payload, timeout=timeout)
-     response.raise_for_status()
-     return [result.get("link") for result in response.json().get("organic", [])]
+     Example:
+         >>> search_on_web("example query", search_engine="Google", max_results=5)
+         ['http://example.com', 'http://example.org', ...]
+     """

+     if search_engine.lower() == "google":
+         res = []
+         for url in google_search(query, num_results=max_results):
+             res.append(url)
+         return res

- def format_proxy(proxy):
-     if isinstance(proxy, dict):
-         server = proxy.get('server')
-         username = proxy.get('username')
-         password = proxy.get('password')
+     elif search_engine.lower() == "duckduckgo":
+         research = DuckDuckGoSearchResults(max_results=max_results)
+         res = research.run(query)
+         links = re.findall(r'https?://[^\s,\]]+', res)
+         return links

-         if all([username, password, server]):
-             proxy_url = f"http://{username}:{password}@{server}"
-             return proxy_url
-         else:
-             raise ValueError("Proxy dictionary is missing required fields.")
-     elif isinstance(proxy, str):
-         return proxy  # "https://username:password@ip:port"
-     else:
-         raise TypeError("Proxy should be a dictionary or a string.")
-
- def filter_pdf_links(links: List[str]) -> List[str]:
-     """
-     Filters out any links that point to PDF files.
+     elif search_engine.lower() == "bing":
+         headers = {
+             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+         }
+         search_url = f"https://www.bing.com/search?q={query}"
+         response = requests.get(search_url, headers=headers)
+         response.raise_for_status()
+         soup = BeautifulSoup(response.text, "html.parser")

-     Args:
-         links (List[str]): A list of URLs as strings.
+         search_results = []
+         for result in soup.find_all('li', class_='b_algo', limit=max_results):
+             link = result.find('a')['href']
+             search_results.append(link)
+         return search_results

-     Returns:
-         List[str]: A list of URLs excluding any that end with '.pdf'.
-     """
-     return [link for link in links if not link.lower().endswith('.pdf')]
+     elif search_engine.lower() == "searxng":
+         url = f"http://localhost:{port}"
+         params = {"q": query, "format": "json"}
+
+         # Send the GET request to the SearXNG server
+         response = requests.get(url, params=params)
+
+         data = response.json()
+         limited_results = data["results"][:max_results]
+         # Each SearXNG result is a dict; return just the URLs to match List[str]
+         return [result["url"] for result in limited_results]
+
+     else:
+         raise ValueError("The only search engines available are DuckDuckGo, Google, Bing, or SearXNG")
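
For context, a minimal sketch of how the retained search_on_web might be called. The import path is an assumption, the SearXNG call assumes an instance listening on localhost:8080, and the returned URLs are illustrative only:

from scrapegraphai.utils.research_web import search_on_web  # assumed module path

# Default engine (Google); returns a list of result URLs
urls = search_on_web("web scraping with python", max_results=5)

# SearXNG requires a local instance listening on the given port
local_urls = search_on_web("web scraping with python",
                           search_engine="SearXNG", max_results=5, port=8080)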
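This commit also drops the proxy support shown on the removed side. For reference, the deleted format_proxy helper accepted either a ready-made proxy URL string or a dict with server, username, and password keys; a sketch of both call shapes against the removed signature, with placeholder credentials:

# String form: forwarded unchanged by format_proxy
search_on_web("query", max_results=5, proxy="https://user:pass@127.0.0.1:3128")

# Dict form: format_proxy assembles "http://user:pass@127.0.0.1:3128" from these fields
search_on_web("query", max_results=5,
              proxy={"server": "127.0.0.1:3128", "username": "user", "password": "pass"})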