@@ -27,6 +27,8 @@ class ParseNode(BaseNode):
         node_config (dict): Additional configuration for the node.
         node_name (str): The unique identifier name for the node, defaulting to "Parse".
     """
+    url_pattern = re.compile(r"[http[s]?:\/\/]?(www\.)?([-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b[-a-zA-Z0-9()@:%_\+.~#?&\/\/=]*)")
+    relative_url_pattern = re.compile(r"[\(](/[^\(\)\s]*)")

     def __init__(
         self,
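
For reference, a rough standalone sketch of what these two class-level patterns capture. The sample text, variable names, and printed results below are illustrative only and not part of the change:

```python
import re
from urllib.parse import urljoin

# Illustrative copies of the two class attributes added above.
url_pattern = re.compile(r"[http[s]?:\/\/]?(www\.)?([-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b[-a-zA-Z0-9()@:%_\+.~#?&\/\/=]*)")
relative_url_pattern = re.compile(r"[\(](/[^\(\)\s]*)")

text = "Read the [docs](https://www.example.com/guide) and the [FAQ](/faq)."

# Absolute URLs: findall yields one (www-prefix, rest-of-url) tuple per match.
print(url_pattern.findall(text))           # [('www.', 'example.com/guide)')]

# Markdown-style relative links: the single capture group is the path.
print(relative_url_pattern.findall(text))  # ['/faq']

# Relative paths are later resolved against the source page with urljoin.
print(urljoin("https://www.example.com/guide", "/faq"))  # https://www.example.com/faq
```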
@@ -123,12 +125,26 @@ def _extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]:
             return [], []

         image_extensions = default_filters.filter_dict["img_exts"]
-        image_extension_seq = '|'.join(image_extensions).replace('.', '')
-        url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))')
-
-        all_urls = url_pattern.findall(text)
+        url = ""
+        all_urls = set()
+
+        for group in ParseNode.url_pattern.findall(text):
+            for el in group:
+                if el != '':
+                    url += el
+            all_urls.add(url)
+            url = ""
+
+        url = ""
+        for group in ParseNode.relative_url_pattern.findall(text):
+            for el in group:
+                if el not in ['', '[', ']', '(', ')', '{', '}']:
+                    url += el
+            all_urls.add(urljoin(source, url))
+            url = ""
+
+        all_urls = list(all_urls)

         all_urls = self._clean_urls(all_urls)
-
         if not source.startswith("http"):
             all_urls = [url for url in all_urls if url.startswith("http")]
         else:
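
Because `findall` returns a tuple of capture groups per match, the loops above rebuild each URL by concatenating the non-empty elements, and a set handles deduplication. A rough illustration of that reassembly step, with made-up sample data standing in for the `findall` results:

```python
from urllib.parse import urljoin

# Hypothetical output of ParseNode.url_pattern.findall(text): (www-prefix, rest) tuples.
absolute_groups = [('www.', 'example.com/guide)'), ('', 'docs.example.org/start')]
all_urls = set()
for group in absolute_groups:
    # Keep only the non-empty capture groups and glue them back together.
    all_urls.add("".join(el for el in group if el != ''))

# Hypothetical output of ParseNode.relative_url_pattern.findall(text): plain paths.
source = "https://www.example.com/guide"
for path in ['/faq', '/about']:
    # Relative links are resolved against the page they were found on.
    all_urls.add(urljoin(source, path))

print(len(all_urls))  # 4 unique candidates; set order is arbitrary
```

Note that the set deduplicates candidates but does not preserve the order in which URLs appear in the text.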
@@ -151,9 +167,32 @@ def _clean_urls(self, urls: List[str]) -> List[str]:
         """
         cleaned_urls = []
         for url in urls:
-            url = re.sub(r'.*?\]\(', '', url)
-            url = url.rstrip(').')
+            if not ParseNode._is_valid_url(url):
+                url = re.sub(r'.*?\]\(', '', url)
+                url = re.sub(r'.*?\[\(', '', url)
+                url = re.sub(r'.*?\[\)', '', url)
+                url = re.sub(r'.*?\]\)', '', url)
+                url = re.sub(r'.*?\)\[', '', url)
+                url = re.sub(r'.*?\)\[', '', url)
+                url = re.sub(r'.*?\(\]', '', url)
+                url = re.sub(r'.*?\)\]', '', url)
+                url = url.rstrip(').-')
+            if len(url) > 0:
+                cleaned_urls.append(url)
+
+        return cleaned_urls
+
+    @staticmethod
+    def _is_valid_url(url: str) -> bool:
+        """
+        Checks if the URL format is valid.

-            cleaned_urls.append(url)
+        Args:
+            url (str): The URL to check.

-        return cleaned_urls
+        Returns:
+            bool: True if the URL format is valid, False otherwise
+        """
+        if re.fullmatch(ParseNode.url_pattern, url) is not None:
+            return True
+        return False
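
A minimal sketch of the cleanup branch: when a candidate still carries markdown residue (and therefore fails `_is_valid_url`), the chained `re.sub` calls strip the bracket/paren fragments and `rstrip` trims trailing punctuation. The sample string below is illustrative only:

```python
import re

url = "[docs](https://example.com/guide)."

# Drop everything up to a "](" sequence, leaving only the link target.
url = re.sub(r'.*?\]\(', '', url)
# Trim trailing ")", ".", and "-" characters left over from markdown.
url = url.rstrip(').-')

print(url)  # https://example.com/guide
```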