Commit 626a65f

Added test_urls under tests to check for dead urls

Signed-off-by: Shrish Mishra [email protected]
Signed-off-by: Shrish0098 <[email protected]>
1 parent 432a7d4 commit 626a65f

File tree

2 files changed: +172 -0 lines changed


.github/workflows/upstream_test.yml (+3)

@@ -31,3 +31,6 @@ jobs:
         run: make webtest
         env:
           GH_TOKEN: 1
+
+      - name: Run URL health checks
+        run: pytest -v tests/test_urls.py

vulnerabilities/tests/test_urls.py (+169)

@@ -0,0 +1,169 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# VulnerableCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import re
import time
from urllib.parse import urlparse
from pathlib import Path
import pytest
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Configuration - Update EXCLUDED_URLS with problematic URLs
EXCLUDED_URLS = {
    # Timeout-prone URLs
    'https://www.cisa.gov/sites/default/files/publications/cisa-ssvc-guide%20508c.pdf',

    # Template URLs with placeholders
    'https://www.debian.org/security/oval/oval-definitions-{release}.xml.bz2',
    'https://www.postgresql.org/support/security/{cve_id}',
    'https://www.wireshark.org/security/{wnpa_sec_id}.html',
    'https://xenbits.xen.org/xsa/advisory-{number}.html',
    'https://xenbits.xen.org/xsa/advisory-{numid}.html',

    # Invalid URL patterns
    'https://{token}@',

    # Known 403/404 URLs
    'https://www.openssl.org/news/vulnerabilities.xml',
    'https://www.softwaretestinghelp.com/how-to-write-good-bug-report/',

    # XML namespace URLs
    'http://www.w3.org/2001/XMLSchema-instance'
}

USER_AGENT = 'VulnerableCode URL Checker/1.0'
MAX_RETRIES = 3
TIMEOUT = 25  # Increased timeout

def sanitize_url(url):
    """Clean up URLs from documentation syntax artifacts"""
    # Remove template placeholders
    url = re.sub(r'\{.*?\}', '', url)
    # Remove RST/Markdown formatting
    url = re.sub(r'[>`_\[\](){}\\]+$', '', url)
    # Remove URL-encoded variables
    url = re.sub(r'%7[Bb]raw_data%5[BbDd].*', '', url)
    return url.rstrip('.,;:').strip()

def is_valid_url(url):
    """Validate URL structure and exclude templates"""
    try:
        # Reject URLs with residual placeholders or invalid patterns
        if re.search(r'\{\w+?\}', url) or '@' in url.split('//')[-1]:
            return False

        parsed = urlparse(url)
        if not all([parsed.scheme, parsed.netloc]):
            return False

        return re.match(r'^https?://[^\s/$.?#]+\.[^\s]{2,}', url) is not None
    except ValueError:
        return False

def extract_urls(content):
    """Find URLs while ignoring documentation syntax"""
    # regex to avoid capturing template URLs
    url_pattern = re.compile(
        r'\bhttps?://(?:[^\s>"\'\\\]`<{}]+|%[0-9a-fA-F]{2})+\b'
    )
    return [sanitize_url(url) for url in url_pattern.findall(content)]

def get_all_urls():
    """Get all unique URLs from code and docs with enhanced filtering"""
    urls = []

    # Scan documentation
    docs_dir = Path("docs")
    for ext in ('*.rst', '*.md'):
        for path in docs_dir.rglob(ext):
            urls.extend(extract_urls(path.read_text()))

    # Scan codebase
    code_dirs = [
        Path("vulnerabilities/management/commands"),
        Path("vulnerabilities/")
    ]
    for code_dir in code_dirs:
        for path in code_dir.rglob('*.py'):
            urls.extend(extract_urls(path.read_text()))

    return sorted({
        url for url in urls
        if is_valid_url(url) and url not in EXCLUDED_URLS
    })

@pytest.fixture(scope="module")
def session():
    """Configure HTTP session with enhanced retry logic"""
    session = requests.Session()
    retries = Retry(
        total=MAX_RETRIES,
        backoff_factor=1.5,
        status_forcelist=[500, 502, 503, 504, 429],
        allowed_methods=['HEAD', 'GET'],
        respect_retry_after_header=True
    )
    session.mount('http://', HTTPAdapter(max_retries=retries))
    session.mount('https://', HTTPAdapter(max_retries=retries))
    return session

@pytest.mark.parametrize("url", get_all_urls())
def test_url_is_alive(url, session):
    headers = {'User-Agent': USER_AGENT}

    try:
        # Initial attempt with HEAD
        try:
            response = session.head(
                url,
                headers=headers,
                allow_redirects=True,
                timeout=TIMEOUT
            )
            if response.status_code == 405:
                response = session.get(
                    url,
                    headers=headers,
                    allow_redirects=True,
                    timeout=TIMEOUT
                )
        except requests.exceptions.SSLError:
            # Fallback to GET without SSL verification
            response = session.get(
                url,
                headers=headers,
                verify=False,
                timeout=TIMEOUT
            )

        # Handle special cases
        if response.status_code in [403, 404] and url in EXCLUDED_URLS:
            pytest.skip(f"Skipping excluded URL: {url}")

        if response.status_code == 403:
            pytest.xfail(f"Expected 403 Forbidden for protected resource: {url}")

        assert 200 <= response.status_code < 400, \
            f"URL {url} returned status {response.status_code}"

    except requests.exceptions.Timeout:
        pytest.xfail(f"Timeout occurred for {url} - may be temporary")

    except requests.exceptions.RequestException as e:
        if isinstance(e, requests.exceptions.ConnectionError):
            if url in EXCLUDED_URLS:
                pytest.skip(f"Skipping connection error for excluded URL: {url}")
            pytest.xfail(f"Connection failed for {url} - possible network issue")

        pytest.fail(f"Failed to access {url}: {str(e)}")

    finally:
        time.sleep(1)
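
For a quick sense of how the helpers in this test treat documentation-style URLs, here is a minimal standalone sketch that mirrors sanitize_url and is_valid_url. The regexes are copied from the file above; the sample inputs are invented for illustration only and are not taken from the VulnerableCode docs or code.

import re
from urllib.parse import urlparse


def sanitize_url(url):
    # Same cleanup steps as the test module: placeholders, trailing markup, encoded variables
    url = re.sub(r'\{.*?\}', '', url)
    url = re.sub(r'[>`_\[\](){}\\]+$', '', url)
    url = re.sub(r'%7[Bb]raw_data%5[BbDd].*', '', url)
    return url.rstrip('.,;:').strip()


def is_valid_url(url):
    # Reject leftover placeholders or credential-style URLs, then require scheme + host
    if re.search(r'\{\w+?\}', url) or '@' in url.split('//')[-1]:
        return False
    parsed = urlparse(url)
    if not all([parsed.scheme, parsed.netloc]):
        return False
    return re.match(r'^https?://[^\s/$.?#]+\.[^\s]{2,}', url) is not None


# Invented sample inputs, chosen only to show the filtering behaviour
samples = [
    'https://aboutcode.org>`_',                            # RST link artifact stripped, URL kept
    'https://xenbits.xen.org/xsa/advisory-{number}.html',  # placeholder removed before validation
    'https://{token}@example.org',                         # credential-style URL rejected
]
for raw in samples:
    cleaned = sanitize_url(raw)
    print(f"{raw!r} -> {cleaned!r} valid={is_valid_url(cleaned)}")

Running the sketch prints the cleaned form of each sample and whether it would survive the validity filter, before the EXCLUDED_URLS check that get_all_urls applies on top.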
