
Commit df4aa5f

Merge pull request #962 from lrdoflnlss/add-js-scraping
tune scraper
2 parents bdf813e + 98a7bab commit df4aa5f

File tree

3 files changed: +55 −58 lines changed


scrapegraphai/docloaders/chromium.py

+2 −1
@@ -360,7 +360,8 @@ async def ascrape_playwright(self, url: str, browser_name: str = "chromium") ->
             else:
                 raise ValueError(f"Invalid browser name: {browser_name}")
             context = await browser.new_context(
-                storage_state=self.storage_state
+                storage_state=self.storage_state,
+                ignore_https_errors=True,
             )
             await Malenia.apply_stealth(context)
             page = await context.new_page()
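For context, a minimal standalone Playwright sketch (not the project's loader; the target URL is only illustrative) of what ignore_https_errors=True buys: the context can navigate pages served with self-signed or otherwise invalid TLS certificates instead of failing the request.

# Minimal sketch outside ScrapeGraphAI: a Playwright context created with
# ignore_https_errors=True can load a host with an invalid certificate.
import asyncio
from playwright.async_api import async_playwright

async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        context = await browser.new_context(ignore_https_errors=True)
        page = await context.new_page()
        # Example host that intentionally serves a self-signed certificate.
        await page.goto("https://self-signed.badssl.com/")
        print(await page.title())
        await browser.close()

asyncio.run(main())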

scrapegraphai/utils/cleanup_html.py

+41 −7
@@ -3,12 +3,44 @@
 """
 
 import re
+import json
 from urllib.parse import urljoin
 
 from bs4 import BeautifulSoup, Comment
 from minify_html import minify
 
 
+def extract_from_script_tags(soup):
+    script_content = []
+
+    for script in soup.find_all("script"):
+        content = script.string
+        if content:
+            try:
+                json_pattern = r'(?:const|let|var)?\s*\w+\s*=\s*({[\s\S]*?});?$'
+                json_matches = re.findall(json_pattern, content)
+
+                for potential_json in json_matches:
+                    try:
+                        parsed = json.loads(potential_json)
+                        if parsed:
+                            script_content.append(f"JSON data from script: {json.dumps(parsed, indent=2)}")
+                    except json.JSONDecodeError:
+                        pass
+
+                if "window." in content or "document." in content:
+                    data_pattern = r'(?:window|document)\.(\w+)\s*=\s*([^;]+);'
+                    data_matches = re.findall(data_pattern, content)
+
+                    for var_name, var_value in data_matches:
+                        script_content.append(f"Dynamic data - {var_name}: {var_value.strip()}")
+            except Exception:
+                if len(content) < 1000:
+                    script_content.append(f"Script content: {content.strip()}")
+
+    return "\n\n".join(script_content)
+
+
 def cleanup_html(html_content: str, base_url: str) -> str:
     """
     Processes HTML content by removing unnecessary tags,
@@ -34,8 +66,10 @@ def cleanup_html(html_content: str, base_url: str) -> str:
 
     title_tag = soup.find("title")
     title = title_tag.get_text() if title_tag else ""
-
-    for tag in soup.find_all(["script", "style"]):
+
+    script_content = extract_from_script_tags(soup)
+
+    for tag in soup.find_all("style"):
         tag.extract()
 
     link_urls = [
@@ -54,7 +88,7 @@ def cleanup_html(html_content: str, base_url: str) -> str:
     body_content = soup.find("body")
     if body_content:
         minimized_body = minify(str(body_content))
-        return title, minimized_body, link_urls, image_urls
+        return title, minimized_body, link_urls, image_urls, script_content
 
     else:
         raise ValueError(
@@ -106,10 +140,10 @@ def reduce_html(html, reduction):
     for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
         comment.extract()
 
-    for tag in soup(["script", "style"]):
+    for tag in soup(["style"]):
         tag.string = ""
 
-    attrs_to_keep = ["class", "id", "href", "src"]
+    attrs_to_keep = ["class", "id", "href", "src", "type"]
     for tag in soup.find_all(True):
         for attr in list(tag.attrs):
             if attr not in attrs_to_keep:
@@ -118,15 +152,15 @@ def reduce_html(html, reduction):
     if reduction == 1:
         return minify_html(str(soup))
 
-    for tag in soup(["script", "style"]):
+    for tag in soup(["style"]):
         tag.decompose()
 
     body = soup.body
     if not body:
         return "No <body> tag found in the HTML"
 
     for tag in body.find_all(string=True):
-        if tag.parent.name not in ["script", "style"]:
+        if tag.parent.name not in ["script"]:
             tag.replace_with(re.sub(r"\s+", " ", tag.strip())[:20])
 
     reduced_html = str(body)
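As a rough usage sketch, assuming this commit is installed, the new extract_from_script_tags helper can be exercised on its own; the HTML snippet below is made up for illustration.

# Illustrative check of the new helper on a fabricated page.
from bs4 import BeautifulSoup

from scrapegraphai.utils.cleanup_html import extract_from_script_tags

html = """
<html><body>
  <script>const config = {"apiUrl": "https://example.com/api", "retries": 3};</script>
  <script>window.userCount = 42;</script>
</body></html>
"""

soup = BeautifulSoup(html, "html.parser")
print(extract_from_script_tags(soup))
# Expected output (roughly): the config object pretty-printed as
# "JSON data from script: {...}", followed by "Dynamic data - userCount: 42".

Since cleanup_html now returns script_content as a fifth element, any caller unpacking the previous four-tuple has to be adjusted accordingly.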

scrapegraphai/utils/proxy_rotation.py

+12 −50
@@ -10,7 +10,7 @@
 import requests
 from fp.errors import FreeProxyException
 from fp.fp import FreeProxy
-
+from urllib.parse import urlparse
 
 class ProxyBrokerCriteria(TypedDict, total=False):
     """
@@ -188,59 +188,21 @@ def is_ipv4_address(address: str) -> bool:
 
 
 def parse_or_search_proxy(proxy: Proxy) -> ProxySettings:
-    """parses a proxy configuration or searches for a new one matching
-    the specified broker criteria
-
-    Args:
-        proxy: The proxy configuration to parse or search for.
-
-    Returns:
-        A 'playwright' compliant proxy configuration.
-
-    Notes:
-        - If the proxy server is a IP address, it is assumed to be
-        a proxy server address.
-        - If the proxy server is 'broker', a proxy server is searched for
-        based on the provided broker criteria.
-
-    Example:
-        >>> proxy = {
-        ...     "server": "broker",
-        ...     "criteria": {
-        ...         "anonymous": True,
-        ...         "countryset": {"GB", "US"},
-        ...         "secure": True,
-        ...         "timeout": 5.0
-        ...         "search_outside_if_empty": False
-        ...     }
-        ... }
-
-        >>> parse_or_search_proxy(proxy)
-        {
-            "server": "<proxy-server-matching-criteria>",
-        }
-
-    Example:
-        >>> proxy = {
-        ...     "server": "192.168.1.1:8080",
-        ...     "username": "<username>",
-        ...     "password": "<password>"
-        ... }
-
-        >>> parse_or_search_proxy(proxy)
-        {
-            "server": "192.168.1.1:8080",
-            "username": "<username>",
-            "password": "<password>"
-        }
     """
-    assert "server" in proxy, "missing server in the proxy configuration"
+    Parses a proxy configuration or searches for a matching one via broker.
+    """
+    assert "server" in proxy, "Missing 'server' field in the proxy configuration."
+
+    parsed_url = urlparse(proxy["server"])
+    server_address = parsed_url.hostname
 
-    server_address = re.sub(r"^\w+://", "", proxy["server"]).split(":", maxsplit=1)[0]
+    if server_address is None:
+        raise ValueError(f"Invalid proxy server format: {proxy['server']}")
 
-    if is_ipv4_address(server_address):
+    # Accept both IP addresses and domain names like 'gate.nodemaven.com'
+    if is_ipv4_address(server_address) or re.match(r"^[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$", server_address):
         return _parse_proxy(proxy)
 
-    assert proxy["server"] == "broker", "unknown proxy server"
+    assert proxy["server"] == "broker", f"Unknown proxy server type: {proxy['server']}"
 
     return _search_proxy(proxy)
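A rough sketch of how the reworked parse_or_search_proxy behaves with a hostname-based proxy; the credentials and port below are made up, and gate.nodemaven.com is taken from the comment in the diff. The server field must carry a scheme so that urlparse().hostname resolves it.

# Hypothetical call: with this commit, domain names pass the new check
# (previously only IPv4 addresses or the literal "broker" were accepted).
from scrapegraphai.utils.proxy_rotation import parse_or_search_proxy

proxy = {
    "server": "http://gate.nodemaven.com:8080",
    "username": "<username>",
    "password": "<password>",
}

settings = parse_or_search_proxy(proxy)
print(settings)  # Playwright-compliant settings keeping server, username, password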
