Idea Transcript
de3sw2aq1 / wattpad-ebook-scraper
master
wattpad-ebook-scraper / scrape.py
de3sw2aq1 Allow passing either story or chapter urls to the scraper
Find file
Copy path
4b7eb57 Nov 11, 2015
2 contributors
Executable File 135 lines (101 sloc) 4.65 KB 1
#!/usr/bin/env python3
2 3
import sys
4
import io
5
import re
6 7
import requests
8
import dateutil.parser
9
from genshi.input import HTML
10
import smartypants
11 12
import ez_epub
13 14
# Shared HTTP session so all requests reuse connections and headers.
# Set up to not hit the Android download-app interstitial page.
session = requests.session()
# No user agent. Wattpad now blocks all user agents containing "Python".
session.headers['User-Agent'] = ''

# Used by Android app normally
# Example parameters are what Android provides
API_STORYINFO = 'https://www.wattpad.com/api/v3/stories/' #9876543?drafts=0&include_deleted=1

# Used by website and Android app normally
API_STORYTEXT = 'https://www.wattpad.com/apiv2/storytext' # ?id=23456789
# Webpage uses a page parameter: ?id=23456789&page=1
# Android uses these parameters: ?id=23456789&increment_read_count=1&include_paragraph_id=1&output=text_zip
# Now (2015-06-15), returns HTML instead of JSON. output=json will get JSON again

API_CHAPTERINFO = 'https://www.wattpad.com/apiv2/info' # ?id=23456789

# Documented api
API_GETCATEGORIES = 'https://www.wattpad.com/apiv2/getcategories'

# Translation table mapping characters that are illegal/awkward in filenames
# to '-'. BUG FIX: str.maketrans requires both strings to be the same length;
# the original mapped 9 characters to 11 dashes, which raises ValueError at
# import time. '-' * 9 keeps the two sides in sync.
ILLEAGAL_FILENAME_CHARACTERS = str.maketrans(r'.:"/\|?*^', '-' * 9)

# Fixup the categories data, this could probably be cached too.
# The API returns string keys; convert them to ints so lookups by the
# integer category ids in story metadata work.
categories = session.get(API_GETCATEGORIES).json()
categories = {int(k): v for k, v in categories.items()}
39 40
def download_story(story_id):
    """Download a Wattpad story and write it to an EPUB in the current directory.

    Fetches story metadata from the story-info API, downloads the text of
    every published part, and assembles an epub named after the story title
    (with filename-illegal characters replaced). Draft and deleted parts
    are skipped with a message.

    :param story_id: numeric Wattpad story id as a string.
    """
    # TODO: probably use {'drafts': 0, 'include_deleted': 0}
    storyinfo = session.get(API_STORYINFO + story_id, params={'drafts': 1, 'include_deleted': 1}).json()

    story_title = storyinfo['title']
    story_description = storyinfo['description']
    story_createDate = dateutil.parser.parse(storyinfo['createDate'])
    story_modifyDate = dateutil.parser.parse(storyinfo['modifyDate'])
    story_author = storyinfo['user']['name']
    story_categories = [categories[c] for c in storyinfo['categories'] if c in categories] # category can be 0
    story_rating = storyinfo['rating'] # TODO: I think 4 is adult?
    story_cover = io.BytesIO(session.get(storyinfo['cover']).content)
    story_url = storyinfo['url']

    print('Story "{story_title}": {story_id}'.format(story_title=story_title, story_id=story_id))

    # Setup epub
    book = ez_epub.Book()
    book.title = story_title
    book.authors = [story_author]
    book.sections = []
    book.impl.addCover(fileobj=story_cover)
    book.impl.description = HTML(story_description, encoding='utf-8') # TODO: not sure if this is HTML or text
    book.impl.url = story_url
    book.impl.addMeta('publisher', 'Wattpad - scraped')
    book.impl.addMeta('source', story_url)

    for part in storyinfo['parts']:
        chapter_title = part['title']
        # BUG FIX: chapter_id must be assigned before the draft/deleted
        # guards below — the original referenced it in those skip messages
        # before it was bound, raising NameError on any draft part.
        chapter_id = part['id']

        if part['draft']:
            print('Skipping "{chapter_title}": {chapter_id}, part is draft'.format(chapter_title=chapter_title, chapter_id=chapter_id))
            continue

        if 'deleted' in part and part['deleted']:
            print('Skipping "{chapter_title}": {chapter_id}, part is deleted'.format(chapter_title=chapter_title, chapter_id=chapter_id))
            continue

        # TODO: could intelligently only redownload modified parts
        chapter_modifyDate = dateutil.parser.parse(part['modifyDate'])

        print('Downloading "{chapter_title}": {chapter_id}'.format(chapter_title=chapter_title, chapter_id=chapter_id))

        # output=json restores the pre-2015-06-15 JSON response shape.
        chapter_html = session.get(API_STORYTEXT, params={'id': chapter_id, 'output': 'json'}).json()['text']
        # Smarten straight quotes/dashes into typographic equivalents.
        chapter_html = smartypants.smartypants(chapter_html)

        section = ez_epub.Section()
        section.html = HTML(chapter_html, encoding='utf-8')
        section.title = chapter_title
        book.sections.append(section)

    print('Saving epub')
    book.make('./{title}'.format(title=book.title.translate(ILLEAGAL_FILENAME_CHARACTERS)))
96 97 98
def get_story_id(url):
    """Resolve a story or chapter url (or bare id) to a story id string.

    Extracts the first run of digits from *url*. If that id names a valid
    story it is returned directly; otherwise it is tried as a chapter id
    and the parent story id is extracted from the chapter's story url.
    Returns None when no id can be resolved.
    """
    # Extract the id number from the url
    match = re.search(r'\d+', url)
    if not match:
        return None

    # Check if it's a valid id of a story
    url_id = match.group()
    storyinfo_req = session.get(API_STORYINFO + url_id)
    if storyinfo_req.ok:
        return url_id

    # If not, check if it's a chapter id and retrieve the story id
    chapterinfo_req = session.get(API_CHAPTERINFO, params={'id': url_id})
    if not chapterinfo_req.ok:
        return None
    story_url = chapterinfo_req.json()['url']
    # Robustness fix: the original called .group() unconditionally and
    # raised AttributeError if the returned url contained no digits.
    story_match = re.search(r'\d+', story_url)
    return story_match.group() if story_match else None
117 118 119
def main():
    """Scrape each story url given on the command line, or read urls from stdin."""
    # Command-line arguments win; with none supplied, iterate stdin line by line.
    args = sys.argv[1:]
    url_source = args if args else sys.stdin

    for url in url_source:
        resolved_id = get_story_id(url)
        if not resolved_id:
            print('ERROR: could not retrieve story', url)
            continue
        download_story(resolved_id)


if __name__ == '__main__':
    main()