After reading Harry Potter and the Methods of Rationality, which is both hilarious and absolutely brilliant, I was looking at some other stories on www.fanfiction.net but was disappointed to find that I couldn't easily read them on my e-reader. To fix that, I wrote fanfiction.py, a little script that fetches the separate chapters of a book from fanfiction.net and writes them to a single HTML file. That HTML file can then be imported into calibre and easily converted to EPUB.
The script requires a couple of command-line parameters to work. First, it needs the path to the book you want to download. For instance, if your book is at https://www.fanfiction.net/s/5782108/1/Harry_Potter_and_the_Methods_of_Rationality
you need to enter /s/5782108/1/Harry_Potter_and_the_Methods_of_Rationality
as the path. The second parameter is the number of chapters that should be downloaded. An optional parameter -o followed by a filename can be used to specify where the output should be written. If the -o parameter is not provided the book will be written to the standard output stream of the script.
Here's the script:
#!/usr/bin/env python3
# fanfiction.py
#
# Copyright 2015 Hans Alves <halves@localhost>
# This work is free. You can redistribute it and/or modify it under the
# terms of the Do What The Fuck You Want To Public License, Version 2,
# as published by Sam Hocevar. See http://www.wtfpl.net/ for more details.
#
"""Download the chapters of a fanfiction.net story into one HTML file."""

import re
import sys
import time
import argparse
import html
import http.client
from html.parser import HTMLParser

HOST = 'www.fanfiction.net'

# Pause between chapter requests: go easy on the server and avoid being
# banned.
REQUEST_DELAY = 0.5

# Splits a raw start tag into its name and the rest, so that only the
# name can be lower-cased while the attributes are left untouched.
_START_TAG = re.compile(r'^<(\w+)\b(.*)$', re.DOTALL)


class FFStripper(HTMLParser):
    """Copy the contents of ``<div id="storytext">`` to ``outfile``.

    The page ``<title>`` is matched against ``title_exp``; its first group
    (or the whole title when it does not match) is written as an
    ``<h2 class="chapter">`` heading. Everything nested inside the
    ``storytext`` div is then re-emitted, one tag or text run per line.

    Attributes:
        depth: number of currently open ``<div>`` elements.
        printdepth: value of ``depth`` inside the ``storytext`` div, or
            ``None`` while the parser is outside of it.
        title: ``True`` while inside the ``<title>`` element.
    """

    def __init__(self, title_exp, outfile=sys.stdout):
        # Text is delivered to handle_data() already entity-decoded, which
        # is why handle_data() re-escapes it on output.
        super().__init__(convert_charrefs=True)
        self.depth = 0
        self.printdepth = None
        self.title = False
        self.title_exp = title_exp
        self.outfile = outfile

    def reset(self):
        """Forget all per-page state so the next chapter can be fed."""
        super().reset()
        self.depth = 0
        self.printdepth = None
        self.title = False

    def _printing(self):
        """Return True while the parser is inside the story text div."""
        return self.printdepth is not None and self.depth >= self.printdepth

    def _print_starttag(self):
        """Write the current start tag verbatim, tag name lower-cased."""
        raw = self.get_starttag_text()
        normalized = _START_TAG.sub(
            lambda m: '<' + m.group(1).lower() + m.group(2), raw)
        print(normalized, file=self.outfile)

    def handle_starttag(self, tag, attrs):
        if tag == 'title':
            self.title = True
        if self._printing():
            self._print_starttag()
        # ``tag`` must stay the bare element name here: nested divs inside
        # the story have to be counted, otherwise their closing tag would
        # end the output early.
        if tag == 'div':
            self.depth += 1
            if (self.printdepth is None
                    and dict(attrs).get('id', '') == 'storytext'):
                self.printdepth = self.depth

    def handle_startendtag(self, tag, attrs):
        # A self-closing tag (``<br/>``) is complete on its own. The
        # default implementation would also call handle_endtag() and emit
        # a bogus ``</br>``.
        if self._printing():
            self._print_starttag()

    def handle_endtag(self, tag):
        if tag == 'title':
            self.title = False
        if tag == 'div':
            self.depth -= 1
            if self.printdepth is not None and self.depth < self.printdepth:
                # This was the closing tag of the storytext div itself.
                self.printdepth = None
        if self._printing():
            print('</{0}>'.format(tag), file=self.outfile)

    def handle_data(self, data):
        if self.title:
            m = self.title_exp.match(data)
            if m is None:
                print('Error: could not match title:', data, file=sys.stderr)
                title = data
            else:
                title = m.group(1)
            print('<h2 class="chapter">{0}</h2>'.format(
                html.escape(title, quote=False)), file=self.outfile)
            # Progress indication for the user.
            print(data, file=sys.stderr)
        if self._printing():
            # The text was entity-decoded by the parser; escape it again so
            # that '<' and '&' in the story do not corrupt the output.
            print(html.escape(data, quote=False), file=self.outfile)


def _title_pattern(title):
    """Build the regex that extracts the chapter name from a page title.

    ``title`` is the last component of the story path. Every character
    that is not a letter or digit (including ``_``, which ``\\W`` alone
    would miss) becomes a wildcard so it can stand for the spaces and
    punctuation of the real title.
    """
    return re.compile(re.sub(r'[\W_]', '.', title)
                      + r'\s+(.*), a .* fanfic \| FanFiction')


def _fetch_chapter(conn, path):
    """Return the decoded page at ``path``, or None on a non-200 reply."""
    conn.request('GET', path)
    response = conn.getresponse()
    # Always drain the body so the connection can be reused.
    body = response.read()
    if response.status != 200:
        print('Error: GET {0} returned {1} {2}'.format(
            path, response.status, response.reason), file=sys.stderr)
        return None
    return body.decode('utf-8', errors='replace')


def main():
    """Parse the command line and write the requested book.

    Returns:
        0 on success, 1 when the path argument is malformed, 2 when the
        output file cannot be opened.
    """
    argparser = argparse.ArgumentParser(description='Rip a book from '
                                        'fanfiction.net')
    argparser.add_argument('-o', '--output', metavar='FILE',
                           help='output file, by default the book will be '
                           'printed to stdout')
    argparser.add_argument('path', type=str,
                           help='the path of the book ex: /s/5782108/1/Harry-'
                           'Potter-and-the-Methods-of-Rationality')
    argparser.add_argument('chapters', type=int,
                           help='the number of chapters')
    args = argparser.parse_args()

    m = re.match(r'/s/(\d+)/\d+/(.+)', args.path)
    if m is None:
        print('Error: path should be of the form /s/#id/#chapter/title',
              file=sys.stderr)
        return 1
    story_id, title = m.groups()

    if args.output is not None:
        try:
            # The document declares charset utf-8, so write it as such.
            outfile = open(args.output, 'w', encoding='utf-8')
        except (OSError, ValueError) as ex:
            print(ex, file=sys.stderr)
            return 2
    else:
        outfile = sys.stdout

    conn = http.client.HTTPSConnection(HOST)
    try:
        stripper = FFStripper(_title_pattern(title), outfile)
        print('<html><head><meta charset="utf-8"></head><body>',
              file=outfile)
        for chapter in range(1, args.chapters + 1):
            page = _fetch_chapter(
                conn, '/s/{0}/{1}/{2}'.format(story_id, chapter, title))
            if page is not None:
                stripper.feed(page)
                stripper.close()  # flush any buffered trailing text
            stripper.reset()
            time.sleep(REQUEST_DELAY)
        print('</body></html>', file=outfile)
    finally:
        conn.close()
        if outfile is not sys.stdout:
            outfile.close()
    return 0


if __name__ == '__main__':
    sys.exit(main())