|
| 1 | +import re |
| 2 | +import urllib.parse |
| 3 | +import requests |
| 4 | +import bs4 |
| 5 | + |
| 6 | + |
class ImgLinkInspector(object):
    """Collect the image URLs referenced by a web page and report their HTTP status.

    Typical use::

        statuses = ImgLinkInspector('https://example.com/page.html').inspect()

    ``inspect()`` returns ``(status_code, url)`` tuples; see its docstring for
    which URLs are included.
    """

    def __init__(self, page_url, **kwargs):
        """Store the page URL and the optional settings.

        Args:
            page_url: URL of the HTML page whose ``<img>`` elements are checked.
            **kwargs: Optional settings:
                src_attr: Name of the attribute to read when an image's
                    ``src`` is missing, empty or a ``data:`` URI (for example
                    ``'data-src'`` on lazily loaded images). Defaults to
                    ``'src'``.
                verbose: When truthy, print the number of image URLs found
                    and report every URL, not only the failing ones.
                    Defaults to ``False``.
                timeout: Value passed as ``timeout`` to every ``requests``
                    call. Defaults to ``None`` (no timeout).
        """
        self.page_url = page_url
        self.src_attr = kwargs.get('src_attr', 'src')
        self.verbose = bool(kwargs.get('verbose', False))
        self.timeout = kwargs.get('timeout')

        parsed_url = urllib.parse.urlparse(self.page_url)
        self.default_scheme = parsed_url.scheme
        self.default_scheme_host = f'{parsed_url.scheme}://{parsed_url.netloc}'
        # Inline ``data:`` URIs carry the image bytes themselves; there is
        # nothing to request for them.
        self._re_ignore_images_urls = re.compile(r'^data:', re.I)

    def inspect(self):
        """Load the page, collect its image URLs and check each of them.

        Returns:
            A list of ``(status_code, url)`` tuples. Only URLs answering with
            a status of 400 or above are listed, unless ``verbose`` is set,
            in which case every checked URL is listed.

        Raises:
            requests.HTTPError: If the page itself answers with an error
                status (raised by ``raise_for_status``).
        """
        html = self._load_page()
        image_urls = self._find_image_urls(html)
        if self.verbose:
            print(f'{len(image_urls)} image urls found')
        return self._check_url_response(image_urls)

    def _load_page(self):
        """Download ``page_url`` and return the response body as text."""
        resp = requests.get(self.page_url, timeout=self.timeout)
        resp.raise_for_status()
        return resp.text

    def _find_image_urls(self, html):
        """Return the set of absolute image URLs referenced by *html*.

        For every ``<img>`` element the ``src`` attribute is used. When it is
        missing, empty or a ``data:`` URI, the attribute named by ``src_attr``
        is tried instead. Elements that yield no usable URL are skipped
        rather than raising ``KeyError``.
        """
        soup = bs4.BeautifulSoup(html, 'html.parser')
        is_ignored = self._re_ignore_images_urls.match
        image_urls = set()
        for el in soup.find_all('img'):
            src = (el.get('src') or '').strip()
            if not src or is_ignored(src):
                # Placeholder or absent ``src``: fall back to the
                # configured attribute.
                src = (el.get(self.src_attr) or '').strip()
            if src and not is_ignored(src):
                image_urls.add(self._assert_default_scheme(src))
        return image_urls

    def _assert_default_scheme(self, url):
        """Turn *url* into an absolute URL based on the inspected page.

        * ``//host/path`` gets the page's scheme.
        * ``/path`` gets the page's scheme and host.
        * A URL without a scheme (``img/a.png``, ``../a.png``) is resolved
          against ``page_url``.
        * Anything that already has a scheme is returned unchanged.
        """
        if url.startswith('//'):
            return f'{self.default_scheme}:{url}'
        if url.startswith('/'):
            return f'{self.default_scheme_host}{url}'
        if not urllib.parse.urlparse(url).scheme:
            # Document-relative reference: without this it would reach
            # ``requests`` as a URL that has no scheme or host.
            return urllib.parse.urljoin(self.page_url, url)
        return url

    def _check_url_response(self, image_urls):
        """Send a HEAD request to every URL and collect the status codes.

        Args:
            image_urls: Iterable of absolute URLs.

        Returns:
            A list of ``(status_code, url)`` tuples for URLs whose final
            status (after following redirects) is 400 or above, or for all
            URLs when ``verbose`` is set.
        """
        statuses = []
        # Sorted so the result order does not depend on set iteration order.
        for url in sorted(image_urls):
            url_resp = requests.head(
                url, allow_redirects=True, timeout=self.timeout)
            if url_resp.status_code >= 400 or self.verbose:
                statuses.append((url_resp.status_code, url))

        return statuses
0 commit comments