-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathsmashwords.py
61 lines (44 loc) · 1.74 KB
/
smashwords.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
"""Book scraping script for smashwords.com.
Usage: python smashwords.py <scrape_link> [output_dir]

scrape_link is required; output_dir defaults to data/books.
"""
import os
import re
import sys
from bs4 import BeautifulSoup
import requests
def browse(url, timeout=30):
    """Retrieve the server response contents of the given URL.

    Args:
        url: Address to fetch with an HTTP GET request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection could block the caller indefinitely.

    Returns:
        The response body as text (``Response.text``). The HTTP status code
        is not checked, so error pages are returned like any other body.

    Raises:
        requests.exceptions.RequestException: Propagated from
            ``requests.get`` on connection failure or timeout.
    """
    # A cookie is required to allow books with adult content to be served.
    response = requests.get(url, cookies={"adultOff": "no"}, timeout=timeout)
    return response.text
def to_filename(s):
    """Turn the given value into a string usable as a filename.

    The value is converted with ``str()``, leading and trailing whitespace
    is trimmed, each remaining space becomes an underscore, and every
    character that is not a word character, a hyphen or a dot is removed.
    The result may be empty if nothing survives the filtering.
    """
    trimmed = str(s).strip()
    underscored = '_'.join(trimmed.split(' '))
    disallowed = re.compile(r'(?u)[^-\w.]')
    return disallowed.sub('', underscored)
def _is_english(book_div):
    """Return True when the listing's language note mentions English."""
    subnote = book_div.find('div', {'class': 'subnote'})
    if subnote is None:
        # No subnote block on this listing: language unknown, so skip it
        # instead of failing on a None lookup.
        return False
    spans = subnote.find_all('span', {'class': 'text-nowrap'})
    language_text = ''.join(tag.get_text() for tag in spans)
    return 'english' in language_text.lower()


def _download_url(book_href):
    """Build the download URL for a book from its listing link."""
    parts = book_href.split('/')
    # Swap the second-to-last path segment for the download endpoint.
    parts[-2] = 'download'
    parts.append('6')  # text file format
    return '/'.join(parts)


def main(argv):
    """Download every English-language book found on the listing pages.

    Listing pages are requested as ``<scrape_link>/<offset>`` with the offset
    advancing by 20 per request, until a page contains no book entries.

    Args:
        argv: Command-line arguments; ``argv[1]`` is the listing URL and the
            optional ``argv[2]`` is the output directory (default
            ``data/books``).
    """
    if len(argv) < 2:
        sys.exit('Usage: python smashwords.py [scrape_link] '
                 '[output_dir (defaults to data/books)]')
    base_url = argv[1]
    write_dir = argv[2] if len(argv) > 2 else 'data/books'
    # open() does not create missing directories, so make sure it exists.
    os.makedirs(write_dir, exist_ok=True)

    page_step = 20
    count = 0
    num_downloaded = 0
    while True:
        # Format both parts explicitly so braces in the URL cannot break it.
        res = browse('{}/{}'.format(base_url, count))
        soup = BeautifulSoup(res, 'html.parser')
        books = soup.find_all('div', {'class': 'library-book'})
        if not books:
            # No entries at this offset: assume the listing is exhausted and
            # stop instead of requesting empty pages forever.
            break
        for div in books:
            if not _is_english(div):
                continue
            # Get title and download link
            link_html = div.find('a', {'class': 'library-title'})
            if link_html is None or not link_html.get('href'):
                continue
            title = link_html.get_text()
            filename = to_filename(title)
            if not filename:
                # Nothing usable left of the title; an empty name would make
                # open() target the output directory itself.
                continue
            download = browse(_download_url(link_html.get('href')))
            # NOTE(review): an HTML body presumably means no text-format
            # download is available for this book - confirm against the site.
            if download.startswith('<!DOCTYPE html>'):
                continue
            num_downloaded += 1
            print(num_downloaded, title, sep='\t')
            # Explicit UTF-8 so non-ASCII text does not depend on the locale.
            target = os.path.join(write_dir, filename)
            with open(target, 'w', encoding='utf-8') as f:
                f.write(download)
        count += page_step


if __name__ == '__main__':
    main(sys.argv)