#
# Copyright (c) nexB Inc. and others. All rights reserved.
# VulnerableCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import re
import time
from pathlib import Path
from urllib.parse import urlparse

import pytest
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Configuration: add problematic URLs to EXCLUDED_URLS to skip them.
EXCLUDED_URLS = {
    # Timeout-prone URLs
    'https://www.cisa.gov/sites/default/files/publications/cisa-ssvc-guide%20508c.pdf',

    # Template URLs with placeholders
    'https://www.debian.org/security/oval/oval-definitions-{release}.xml.bz2',
    'https://www.postgresql.org/support/security/{cve_id}',
    'https://www.wireshark.org/security/{wnpa_sec_id}.html',
    'https://xenbits.xen.org/xsa/advisory-{number}.html',
    'https://xenbits.xen.org/xsa/advisory-{numid}.html',

    # Invalid URL patterns
    'https://{token}@',

    # Known 403/404 URLs
    'https://www.openssl.org/news/vulnerabilities.xml',
    'https://www.softwaretestinghelp.com/how-to-write-good-bug-report/',

    # XML namespace URLs
    'http://www.w3.org/2001/XMLSchema-instance',
}

USER_AGENT = 'VulnerableCode URL Checker/1.0'
MAX_RETRIES = 3
TIMEOUT = 25  # request timeout in seconds
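
# These checks are network-bound and deliberately rate-limited, so a full run
# can take a while. Assuming pytest's default discovery picks up this module,
# a typical invocation is:
#
#     pytest -v <path-to-this-module>
#
# Each URL collected by get_all_urls() below becomes one parametrized test.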

def sanitize_url(url):
    """Clean up URLs from documentation syntax artifacts."""
    # Remove template placeholders such as {release} or {cve_id}
    url = re.sub(r'\{.*?\}', '', url)
    # Strip trailing RST/Markdown punctuation such as >`_ or brackets
    url = re.sub(r'[>`_\[\](){}\\]+$', '', url)
    # Drop URL-encoded template fragments ({ is %7B, [ is %5B)
    url = re.sub(r'%7[Bb]raw_data%5[BbDd].*', '', url)
    return url.rstrip('.,;:').strip()
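
# Illustrative examples of the intended behavior, assuming inputs scraped
# from RST or Markdown sources (not executed as part of the test run):
#
#     'https://example.com/{cve_id}'   ->  'https://example.com/'
#     'https://example.com/page>`_'    ->  'https://example.com/page'
#     'https://example.com/api.'       ->  'https://example.com/api'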

def is_valid_url(url):
    """Validate URL structure and exclude templates."""
    try:
        # Reject URLs with residual placeholders or credentials-style '@'
        if re.search(r'\{\w+?\}', url) or '@' in url.split('//')[-1]:
            return False

        parsed = urlparse(url)
        if not all([parsed.scheme, parsed.netloc]):
            return False

        return re.match(r'^https?://[^\s/$.?#]+\.[^\s]{2,}', url) is not None
    except ValueError:
        return False
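
# For reference, given the rules above:
#
#     is_valid_url('https://example.com/page')  ->  True
#     is_valid_url('https://{token}@host')      ->  False (placeholder and '@')
#     is_valid_url('ftp://example.com')         ->  False (not http/https)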

def extract_urls(content):
    """Find URLs while ignoring documentation syntax"""
    # regex to avoid capturing template URLs
    url_pattern = re.compile(
        r'\bhttps?://(?:[^\s>"\'\\\]`<{}]+|%[0-9a-fA-F]{2})+\b'
    )
    return [sanitize_url(url) for url in url_pattern.findall(content)]
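
# Example, assuming typical RST content: in the text
#
#     See `the guide <https://example.com/guide>`_ for details.
#
# the pattern stops at the closing '>' and yields 'https://example.com/guide'.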

def get_all_urls():
    """Get all unique URLs from code and docs with enhanced filtering"""
    urls = []

    # Scan documentation
    docs_dir = Path("docs")
    for ext in ('*.rst', '*.md'):
        for path in docs_dir.rglob(ext):
            urls.extend(extract_urls(path.read_text(encoding="utf-8")))

    # Scan codebase
    code_dirs = [
        Path("vulnerabilities/management/commands"),
        Path("vulnerabilities/"),
    ]
    for code_dir in code_dirs:
        for path in code_dir.rglob('*.py'):
            urls.extend(extract_urls(path.read_text(encoding="utf-8")))

    return sorted({
        url for url in urls
        if is_valid_url(url) and url not in EXCLUDED_URLS
    })
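
# Note: get_all_urls() runs once, at pytest collection time, because it is
# called directly inside the parametrize decorator below; the filesystem
# scan is not repeated for every test case.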

@pytest.fixture(scope="module")
def session():
    """Configure an HTTP session with retry and backoff logic."""
    session = requests.Session()
    retries = Retry(
        total=MAX_RETRIES,
        backoff_factor=1.5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=['HEAD', 'GET'],
        respect_retry_after_header=True,
    )
    session.mount('http://', HTTPAdapter(max_retries=retries))
    session.mount('https://', HTTPAdapter(max_retries=retries))
    return session
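
# With backoff_factor=1.5, urllib3 sleeps on a roughly exponential schedule
# between retries (about 1.5s, 3s, 6s; the exact timing depends on the
# urllib3 version), and 429/5xx responses are retried while honoring any
# Retry-After header the server sends.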

@pytest.mark.parametrize("url", get_all_urls())
def test_url_is_alive(url, session):
    headers = {'User-Agent': USER_AGENT}

    try:
        # Initial attempt with HEAD; fall back to GET if the server rejects it
        try:
            response = session.head(
                url,
                headers=headers,
                allow_redirects=True,
                timeout=TIMEOUT,
            )
            if response.status_code == 405:
                response = session.get(
                    url,
                    headers=headers,
                    allow_redirects=True,
                    timeout=TIMEOUT,
                )
        except requests.exceptions.SSLError:
            # Fall back to GET without SSL verification
            response = session.get(
                url,
                headers=headers,
                verify=False,
                timeout=TIMEOUT,
            )

        # Handle special cases
        if response.status_code in [403, 404] and url in EXCLUDED_URLS:
            pytest.skip(f"Skipping excluded URL: {url}")

        if response.status_code == 403:
            pytest.xfail(f"Expected 403 Forbidden for protected resource: {url}")

        assert 200 <= response.status_code < 400, (
            f"URL {url} returned status {response.status_code}"
        )

    except requests.exceptions.Timeout:
        pytest.xfail(f"Timeout occurred for {url} - may be temporary")

    except requests.exceptions.RequestException as e:
        if isinstance(e, requests.exceptions.ConnectionError):
            if url in EXCLUDED_URLS:
                pytest.skip(f"Skipping connection error for excluded URL: {url}")
            pytest.xfail(f"Connection failed for {url} - possible network issue")

        pytest.fail(f"Failed to access {url}: {e}")

    finally:
        # Be polite to remote servers between checks
        time.sleep(1)