3
3
"""
4
4
5
5
import re
6
+ import json
6
7
from urllib .parse import urljoin
7
8
8
9
from bs4 import BeautifulSoup , Comment
9
10
from minify_html import minify
10
11
11
12
13
def extract_from_script_tags(soup):
    """Extract embedded data from the <script> tags of a parsed HTML document.

    Three kinds of content are collected from each script element:
      * JSON object literals assigned to a variable (``var x = {...};``) —
        parsed with ``json.loads`` and re-emitted pretty-printed,
      * assignments to ``window.*`` / ``document.*`` globals — reported as
        ``name: value`` pairs,
      * scripts shorter than 1000 characters whose scanning raised an
        unexpected error — included verbatim as a best-effort fallback.

    Args:
        soup: a BeautifulSoup document (or any object exposing
            ``find_all("script")`` whose elements have a ``.string``
            attribute; ``.string`` may be ``None`` for empty/compound tags).

    Returns:
        str: all collected snippets joined by blank lines, or "" when no
        script yielded anything.
    """
    script_content = []

    for script in soup.find_all("script"):
        content = script.string
        if not content:
            # .string is None for empty tags or tags with multiple children.
            continue
        try:
            # Candidate JSON object literals assigned to a variable.
            # re.MULTILINE makes ``$`` anchor at every line end, so each
            # ``var x = {...};`` statement is captured individually.
            # Without it ``$`` only matches at end-of-string, and a script
            # with several such statements yields one giant unparseable
            # blob (everything up to the final ``};``) instead of the
            # individual objects — they were all silently dropped.
            json_pattern = r'(?:const|let|var)?\s*\w+\s*=\s*({[\s\S]*?});?$'
            json_matches = re.findall(json_pattern, content, re.MULTILINE)

            for potential_json in json_matches:
                try:
                    parsed = json.loads(potential_json)
                    if parsed:
                        script_content.append(
                            f"JSON data from script: {json.dumps(parsed, indent=2)}"
                        )
                except json.JSONDecodeError:
                    # Not strict JSON (e.g. a JS object literal with
                    # unquoted keys) — skip it.
                    pass

            if "window." in content or "document." in content:
                # Assignments to browser globals, e.g. ``window.appData = ...;``
                data_pattern = r'(?:window|document)\.(\w+)\s*=\s*([^;]+);'
                data_matches = re.findall(data_pattern, content)

                for var_name, var_value in data_matches:
                    script_content.append(
                        f"Dynamic data - {var_name}: {var_value.strip()}"
                    )
        except Exception:
            # Best-effort fallback: keep small scripts verbatim rather
            # than losing them entirely; large ones are dropped to avoid
            # flooding the output.
            if len(content) < 1000:
                script_content.append(f"Script content: {content.strip()}")

    return "\n\n".join(script_content)
12
44
def cleanup_html (html_content : str , base_url : str ) -> str :
13
45
"""
14
46
Processes HTML content by removing unnecessary tags,
@@ -34,8 +66,10 @@ def cleanup_html(html_content: str, base_url: str) -> str:
34
66
35
67
title_tag = soup .find ("title" )
36
68
title = title_tag .get_text () if title_tag else ""
37
-
38
- for tag in soup .find_all (["script" , "style" ]):
69
+
70
+ script_content = extract_from_script_tags (soup )
71
+
72
+ for tag in soup .find_all ("style" ):
39
73
tag .extract ()
40
74
41
75
link_urls = [
@@ -54,7 +88,7 @@ def cleanup_html(html_content: str, base_url: str) -> str:
54
88
body_content = soup .find ("body" )
55
89
if body_content :
56
90
minimized_body = minify (str (body_content ))
57
- return title , minimized_body , link_urls , image_urls
91
+ return title , minimized_body , link_urls , image_urls , script_content
58
92
59
93
else :
60
94
raise ValueError (
@@ -106,10 +140,10 @@ def reduce_html(html, reduction):
106
140
for comment in soup .find_all (string = lambda text : isinstance (text , Comment )):
107
141
comment .extract ()
108
142
109
- for tag in soup (["script" , " style" ]):
143
+ for tag in soup (["style" ]):
110
144
tag .string = ""
111
145
112
- attrs_to_keep = ["class" , "id" , "href" , "src" ]
146
+ attrs_to_keep = ["class" , "id" , "href" , "src" , "type" ]
113
147
for tag in soup .find_all (True ):
114
148
for attr in list (tag .attrs ):
115
149
if attr not in attrs_to_keep :
@@ -118,15 +152,15 @@ def reduce_html(html, reduction):
118
152
if reduction == 1 :
119
153
return minify_html (str (soup ))
120
154
121
- for tag in soup (["script" , " style" ]):
155
+ for tag in soup (["style" ]):
122
156
tag .decompose ()
123
157
124
158
body = soup .body
125
159
if not body :
126
160
return "No <body> tag found in the HTML"
127
161
128
162
for tag in body .find_all (string = True ):
129
- if tag .parent .name not in ["script" , "style" ]:
163
+ if tag .parent .name not in ["script" ]:
130
164
tag .replace_with (re .sub (r"\s+" , " " , tag .strip ())[:20 ])
131
165
132
166
reduced_html = str (body )
0 commit comments