Skip to content

Commit 58f755f

Browse files
committed
ImgLinkInspector
1 parent 5bf4747 commit 58f755f

File tree

1 file changed

+61
-0
lines changed

1 file changed

+61
-0
lines changed

inspector.py

+61
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
import re
2+
import urllib.parse
3+
import requests
4+
import bs4
5+
6+
7+
class ImgLinkInspector(object):
    """Find the images referenced by a web page and check that their URLs respond.

    The page is downloaded, every ``<img>`` element is collected, each image
    URL is made absolute, and a HEAD request is sent to it. ``inspect()``
    returns ``(status_code, url)`` tuples for the URLs worth reporting.
    """

    def __init__(self, page_url, **kwargs):
        """Store the inspection settings.

        Args:
            page_url: URL of the page to inspect; also the base against which
                relative image URLs are resolved.
            **kwargs: Optional settings:
                src_attr: Attribute to read when an image's ``src`` is
                    missing or is a ``data:`` URI, e.g. ``'data-src'``
                    (default ``'src'``).
                verbose: When truthy, print the number of image URLs found
                    and report every URL, not only failing ones
                    (default ``False``).
                timeout: Timeout passed to every ``requests`` call; ``None``
                    waits indefinitely (default ``10``).
        """
        self.page_url = page_url
        self.src_attr = kwargs.get('src_attr', 'src')
        self.verbose = bool(kwargs.get('verbose', False))
        # Without a timeout a single unresponsive host blocks the whole run.
        self.timeout = kwargs.get('timeout', 10)

        parsed_url = urllib.parse.urlparse(self.page_url)
        self.default_scheme = parsed_url.scheme
        self.default_scheme_host = f'{parsed_url.scheme}://{parsed_url.netloc}'
        # Inline ``data:`` URIs carry the image itself; there is nothing to fetch.
        self._re_ignore_images_urls = re.compile(r'^data:', re.I)

    def inspect(self):
        """Run the full inspection of ``page_url``.

        Returns:
            A list of ``(status_code, url)`` tuples as produced by
            ``_check_url_response``.

        Raises:
            requests.RequestException: If the page or an image URL cannot be
                requested (including HTTP error status for the page itself).
        """
        html = self._load_page()
        image_urls = self._find_image_urls(html)
        if self.verbose:
            print(f'{len(image_urls)} image urls found')
        url_statuses = self._check_url_response(image_urls)
        return url_statuses

    def _load_page(self):
        """Download ``page_url`` and return its body as text.

        Raises:
            requests.HTTPError: If the response has an error status code.
        """
        resp = requests.get(self.page_url, timeout=self.timeout)
        resp.raise_for_status()
        return resp.text

    def _find_image_urls(self, html):
        """Return the set of absolute image URLs referenced by ``html``.

        Images without a usable source (no attribute, empty value, or only
        ``data:`` URIs) are skipped.
        """
        soup = bs4.BeautifulSoup(html, 'html.parser')
        image_urls = set()
        for el in soup.find_all('img'):
            src = self._pick_image_src(el)
            if src:
                image_urls.add(self._assert_default_scheme(src))
        return image_urls

    def _pick_image_src(self, el):
        """Return the URL to check for one ``<img>`` element, or ``None``.

        ``src`` is preferred; ``src_attr`` is consulted only when ``src`` is
        missing, empty, or a ``data:`` URI (the usual lazy-loading placeholder).
        ``el`` only needs a mapping-style ``get(attr)`` method.
        """
        for attr in ('src', self.src_attr):
            # .get() instead of [] so an <img> lacking the attribute is
            # skipped rather than aborting the scan with KeyError.
            value = el.get(attr)
            if not isinstance(value, str):
                continue
            value = value.strip()
            if value and not self._re_ignore_images_urls.match(value):
                return value
        return None

    def _assert_default_scheme(self, url):
        """Return ``url`` as an absolute URL, using ``page_url`` as the base.

        Protocol-relative (``//host/path``) URLs get the page's scheme,
        root-relative (``/path``) URLs get the page's scheme and host, URLs
        that already carry a scheme are returned unchanged, and any other
        (document-relative) URL is resolved against ``page_url``.
        """
        if url.startswith('//'):
            return f'{self.default_scheme}:{url}'
        if url.startswith('/'):
            return f'{self.default_scheme_host}{url}'
        if urllib.parse.urlparse(url).scheme:
            return url
        # Document-relative reference such as 'img/a.png' or '../a.png'.
        return urllib.parse.urljoin(self.page_url, url)

    def _check_url_response(self, image_urls):
        """Send a HEAD request to every URL and collect the notable results.

        Args:
            image_urls: Iterable of absolute URLs.

        Returns:
            A list of ``(status_code, url)`` tuples. Only responses with a
            status code of 400 or above are included, unless ``verbose`` is
            set, in which case every URL is included.
        """
        statuses = []
        for url in image_urls:
            url_resp = requests.head(
                url, allow_redirects=True, timeout=self.timeout)
            if url_resp.status_code >= 400 or self.verbose:
                statuses.append((url_resp.status_code, url))

        return statuses

0 commit comments

Comments
 (0)