@@ -20,6 +20,7 @@
 import time
 from collections import defaultdict
 from functools import cache
+from pathlib import Path
 from typing import Dict, Iterable, List, Tuple
 from urllib.parse import quote, urlparse
 
@@ -50,6 +51,7 @@
     integrations=[sentry_logging],
 )
 
+DEFAULT_SITEMAP_FILENAME = "sitemap/sitemap.json"
 DEFAULT_BATCH__NOOP = "1_1"  # By default treat all URLs as a single batch
 UNEXPECTED_URLS_FILENAME_FRAGMENT = "unexpected_urls_for"
 URL_RETRY_LIMIT = 3
@@ -63,15 +65,14 @@
 
 @click.command()
 @click.option(
-    "--sitemap-url",
-    default=None,
-    help="URL of an XML sitemap to use as source data",
+    "--hostname",
+    required=True,
+    help="Hostname of the site instance to scan",
 )
 @click.option(
-    "--maintain-hostname",
-    default=False,
-    is_flag=True,
-    help="If the sitemap points to a different domain (eg a CDN domain), override it and replace it with the hostname that served the sitemap",
+    "--sitemap-filename",
+    default=DEFAULT_SITEMAP_FILENAME,
+    help="Filename of a JSON sitemap file",
 )
 @click.option(
     "--specific-url",
@@ -104,8 +105,8 @@
     help="If True, we'll export the cached pages as an artifact to {hostname}-cached-pages/batch{batch-id}, for other checks to use",
 )
 def run_checks(
-    sitemap_url: str,
-    maintain_hostname: bool,
+    sitemap_filename: str,
+    hostname: str,
     specific_url: Iterable,
     batch: str,
     allowlist: str,
@@ -114,24 +115,20 @@ def run_checks(
 ) -> None:
     # Let's tidy up the variables we get from the input options
     specific_urls = specific_url
+    sitemap_path = Path(sitemap_filename)
 
-    if not sitemap_url and not specific_urls:
+    if not sitemap_path.exists() and not specific_urls:
         raise Exception("No sitemap or input URLs specified. Cannot proceed.")
 
-    host_url = (
-        sitemap_url or specific_urls[0]
-    )  # TODO: ensure all specific URLs use the same hostname
-    hostname = urlparse(host_url).netloc
-
     allowlist_config = _get_allowlist_config(
         hostname,
         allowlist_pathname=allowlist,
     )
 
     urls_to_check = _build_urls_to_check(
-        sitemap_url=sitemap_url,
+        sitemap_path=sitemap_path,
+        hostname=hostname,
         specific_urls=specific_url,
-        maintain_hostname=maintain_hostname,
     )
 
     if additional_urls_file:
@@ -437,80 +434,42 @@ def _check_pages_for_outbound_links(urls: List[str], allowlist_config: Dict) ->
 
 
 def _build_urls_to_check(
-    sitemap_url: str,
+    sitemap_path: Path,
+    hostname: str,
     specific_urls: Iterable,
-    maintain_hostname: bool,
 ) -> List[str]:
-    """Given a sitemap URL and/or specific URLs to check, put together a list
-    of overall URLs whose content wen want to check"""
+    """Given a sitemap path and/or specific URLs to check, put together a list
+    of overall URLs whose content we want to check"""
 
     urls = []
-    if sitemap_url:
-        urls += _get_urls_from_sitemap(sitemap_url, maintain_hostname)
+    if sitemap_path.exists():
+        urls += _get_urls_from_json_file(sitemap_path, hostname)
     if specific_urls:
         # Don't forget any manually specified URLs
         urls += specific_urls
     click.echo(f"Discovered {len(urls)} URLs to check")
     return urls
 
 
-def _get_urls_from_sitemap(
-    sitemap_url: str,
-    maintain_hostname: bool,
+def _get_urls_from_json_file(
+    sitemap_path: Path,
+    hostname: str,
 ) -> List[str]:
-    """Extract URLs to explore from a sitemap, optionally ensuring the hostname in
-    any URLs found is swapped ('maintained') to be the same as that of the source
-    sitemap -- this is needed when checking an origin server whose sitemap returns
-    the CDN's hostname"""
-
     urls = []
+    with sitemap_path.open() as fh:
+        sitemap = json.load(fh)
 
-    _parsed_origin_sitemap_url = urlparse(sitemap_url)
-    origin_hostname_with_scheme = (
-        f"{_parsed_origin_sitemap_url.scheme}://{_parsed_origin_sitemap_url.netloc}"  # noqa E231
-    )
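+    # Each entry in the JSON sitemap maps a URL path to the list of locales
+    # it is available in; an empty locale list means the path is served
+    # without a locale prefix.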
+    for path, locales in sitemap.items():
+        if not locales:
+            urls.append(f"https://{hostname}{path}")
+            continue
+
+        for locale in locales:
+            urls.append(f"https://{hostname}/{locale}{path}")
 
-    resp = _get_url_with_retry(sitemap_url)
-
-    sitemap_xml = resp.text
-    soup = BeautifulSoup(sitemap_xml, "lxml")
-
-    # Look for a <sitemap> node, and get each as a URL for a locale-specific sitemap
-    sitemap_nodes = soup.find_all("sitemap")
-    if len(sitemap_nodes):
-        click.echo(f"Discovered {len(sitemap_nodes)} child sitemaps")
-
-        for sitemap_node in sitemap_nodes:
-            sitemap_url = sitemap_node.loc.text
-
-            if maintain_hostname:
-                sitemap_url = _update_hostname(
-                    origin_hostname_with_scheme=origin_hostname_with_scheme,
-                    urls=[sitemap_url],
-                )[0]
-
-            click.echo(f"Diving into {sitemap_url}")
-            urls.extend(_get_urls_from_sitemap(sitemap_url, maintain_hostname))
-
-    # look for regular URL nodes, which may or may not co-exist alongside sitemap nodes
-    url_nodes = soup.find_all("url")
-    if url_nodes:
-        click.echo(f"Adding {len(url_nodes)} URLs")
-        for url in url_nodes:
-            try:
-                urls.append(url.loc.text)
-            except AttributeError as ae:
-                sentry_sdk.capture_message(
-                    f"URL node {url} missing '<loc>' - exception to follow"
-                )
-                sentry_sdk.capture_exception(ae)
-
-    # Also remember to update the hostname on the final set of URLs, if required
-    if maintain_hostname:
-        urls = _update_hostname(
-            origin_hostname_with_scheme=origin_hostname_with_scheme,
-            urls=urls,
-        )
     return urls
 
 
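For reference, a minimal, hypothetical sketch of the JSON sitemap shape that the new _get_urls_from_json_file helper consumes, inferred from the loop in the hunk above (the paths, locales, and hostname are illustrative placeholders, not values taken from the real sitemap):

    # Illustrative contents of sitemap/sitemap.json, shown as a Python literal:
    example_sitemap = {
        "/": ["en-US", "de"],  # served under locale prefixes
        "/robots.txt": [],     # empty list: served with no locale prefix
    }
    # With --hostname www.example.com, the loader derives:
    #   https://www.example.com/en-US/
    #   https://www.example.com/de/
    #   https://www.example.com/robots.txt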