
Commit 7da7bfe

fix: improved links extraction for parse_node, resolves #822
1 parent b98dd39 commit 7da7bfe

1 file changed: scrapegraphai/nodes/parse_node.py (+48, -9 lines)
@@ -27,6 +27,8 @@ class ParseNode(BaseNode):
         node_config (dict): Additional configuration for the node.
         node_name (str): The unique identifier name for the node, defaulting to "Parse".
     """
+    url_pattern = re.compile(r"[http[s]?:\/\/]?(www\.)?([-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b[-a-zA-Z0-9()@:%_\+.~#?&\/\/=]*)")
+    relative_url_pattern = re.compile(r"[\(](/[^\(\)\s]*)")

     def __init__(
         self,
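
For context (not part of the patch), a quick standalone check of what the two new class-level patterns capture; the sample text and variable names below are illustrative only:

import re

# Patterns copied verbatim from the hunk above.
url_pattern = re.compile(r"[http[s]?:\/\/]?(www\.)?([-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b[-a-zA-Z0-9()@:%_\+.~#?&\/\/=]*)")
relative_url_pattern = re.compile(r"[\(](/[^\(\)\s]*)")

sample = "Visit https://www.example.com/docs and [team](/about/team) today."

# Two capturing groups, so findall() returns 2-tuples of
# (optional "www." prefix, domain-and-path), e.g. [('www.', 'example.com/docs')].
print(url_pattern.findall(sample))

# A single capturing group, so findall() returns plain strings, e.g. ['/about/team'].
print(relative_url_pattern.findall(sample))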
@@ -123,12 +125,26 @@ def _extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]:
             return [], []

         image_extensions = default_filters.filter_dict["img_exts"]
-        image_extension_seq = '|'.join(image_extensions).replace('.','')
-        url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))')
-
-        all_urls = url_pattern.findall(text)
+        url = ""
+        all_urls = set()
+
+        for group in ParseNode.url_pattern.findall(text):
+            for el in group:
+                if el != '':
+                    url += el
+            all_urls.add(url)
+            url = ""
+
+        url = ""
+        for group in ParseNode.relative_url_pattern.findall(text):
+            for el in group:
+                if el not in ['', '[', ']', '(', ')', '{', '}']:
+                    url += el
+            all_urls.add(urljoin(source, url))
+            url = ""
+
+        all_urls = list(all_urls)
         all_urls = self._clean_urls(all_urls)
-
         if not source.startswith("http"):
             all_urls = [url for url in all_urls if url.startswith("http")]
         else:
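
Again for reference only, a condensed sketch of the two collection loops above, written outside the class; the character-by-character concatenation is folded into join(), and the function name and sample inputs are illustrative:

import re
from urllib.parse import urljoin

url_pattern = re.compile(r"[http[s]?:\/\/]?(www\.)?([-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b[-a-zA-Z0-9()@:%_\+.~#?&\/\/=]*)")
relative_url_pattern = re.compile(r"[\(](/[^\(\)\s]*)")

def extract_urls_sketch(text: str, source: str) -> list:
    all_urls = set()

    # Absolute links: each findall() hit is a tuple of groups, so the
    # non-empty parts are glued back together into one string.
    for group in url_pattern.findall(text):
        all_urls.add("".join(el for el in group if el != ""))

    # Relative links: each findall() hit is a single string; bracket
    # characters are dropped and the result is resolved against the source.
    for match in relative_url_pattern.findall(text):
        cleaned = "".join(ch for ch in match if ch not in "[](){}")
        all_urls.add(urljoin(source, cleaned))

    return list(all_urls)

# Yields the concatenated absolute hit plus the resolved relative link.
print(extract_urls_sketch(
    "Visit https://www.example.com/docs and [team](/about/team) today.",
    "https://www.example.com/",
))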
@@ -151,9 +167,32 @@ def _clean_urls(self, urls: List[str]) -> List[str]:
         """
         cleaned_urls = []
         for url in urls:
-            url = re.sub(r'.*?\]\(', '', url)
-            url = url.rstrip(').')
+            if not ParseNode._is_valid_url(url):
+                url = re.sub(r'.*?\]\(', '', url)
+                url = re.sub(r'.*?\[\(', '', url)
+                url = re.sub(r'.*?\[\)', '', url)
+                url = re.sub(r'.*?\]\)', '', url)
+                url = re.sub(r'.*?\)\[', '', url)
+                url = re.sub(r'.*?\)\[', '', url)
+                url = re.sub(r'.*?\(\]', '', url)
+                url = re.sub(r'.*?\)\]', '', url)
+                url = url.rstrip(').-')
+            if len(url) > 0:
+                cleaned_urls.append(url)
+
+        return cleaned_urls
+
+    @staticmethod
+    def _is_valid_url(url: str) -> bool:
+        """
+        Checks if the URL format is valid.

-            cleaned_urls.append(url)
+        Args:
+            url (str): The URL to check.

-        return cleaned_urls
+        Returns:
+            bool: True if the URL format is valid, False otherwise
+        """
+        if re.fullmatch(ParseNode.url_pattern, url) is not None:
+            return True
+        return False
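
Also for reference only, a rough standalone sketch of the cleanup pass added above: anything that does not fullmatch url_pattern gets the Markdown-residue substitutions and a trailing-punctuation strip, mirroring _clean_urls and _is_valid_url. The function name, RESIDUE list, and sample inputs are illustrative, and the substitution that appears twice in the patch is listed once here:

import re

url_pattern = re.compile(r"[http[s]?:\/\/]?(www\.)?([-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b[-a-zA-Z0-9()@:%_\+.~#?&\/\/=]*)")

# The bracket/paren residue patterns stripped in _clean_urls.
RESIDUE = [r'.*?\]\(', r'.*?\[\(', r'.*?\[\)', r'.*?\]\)',
           r'.*?\)\[', r'.*?\(\]', r'.*?\)\]']

def clean_urls_sketch(urls):
    cleaned = []
    for url in urls:
        if re.fullmatch(url_pattern, url) is None:   # same check as _is_valid_url
            for pattern in RESIDUE:
                url = re.sub(pattern, '', url)
            url = url.rstrip(').-')                  # drop trailing ')', '.', '-'
        if url:
            cleaned.append(url)
    return cleaned

# 'docs](https://www.example.com' loses the leading Markdown residue;
# 'www.example.com/docs).' loses its trailing ').' characters.
print(clean_urls_sketch(['docs](https://www.example.com', 'www.example.com/docs).']))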
