qzwx.nl | FanFiction

After reading Harry Potter and the Methods of Rationality, which is both hilarious and absolutely brilliant, I looked at some other stories on www.fanfiction.net but was disappointed to find that I couldn't easily read them on my e-reader. To fix that, I wrote fanfiction.py, a little script that fetches the separate chapters of a book from fanfiction.net and writes them to a single HTML file. That HTML file can then be imported into calibre and easily converted to EPUB.

The script requires a couple of command-line parameters to work. First, it needs the path to the book you want to download. For instance, if your book is at https://www.fanfiction.net/s/5782108/1/Harry_Potter_and_the_Methods_of_Rationality you need to enter /s/5782108/1/Harry_Potter_and_the_Methods_of_Rationality as the path. The second parameter is the number of chapters that should be downloaded. An optional parameter, -o followed by a filename, can be used to specify where the output should be written. If the -o parameter is not provided, the book will be written to the standard output stream of the script.

Here's the script:

#!/usr/bin/env python3
# fanfiction.py
#
#  Copyright 2015 Hans Alves <halves@localhost>
#  This work is free. You can redistribute it and/or modify it under the
#  terms of the Do What The Fuck You Want To Public License, Version 2,
#  as published by Sam Hocevar. See http://www.wtfpl.net/ for more details.
#

import argparse
import html
import http.client
import re
import sys
import time
from html.parser import HTMLParser

class FFStripper(HTMLParser):
    """Copy the story text of one chapter page to ``outfile``.

    The parser tracks ``<div>`` nesting.  Everything found inside the
    ``<div id="storytext">`` element (but not that div itself) is written
    to ``outfile``, one tag or text chunk per line.  The text of the page's
    ``<title>`` element is matched against ``title_exp`` and written as an
    ``<h2 class="chapter">`` heading; the raw title is echoed to stderr.

    Parameters:
        title_exp: compiled regular expression whose group 1 is the
            chapter title within the page title.
        outfile: text stream the extracted HTML is written to.
    """

    # Elements that never have an end tag; emitting "</br>" for "<br/>"
    # would be read by browsers as a second line break.
    _VOID_ELEMENTS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'source', 'track', 'wbr',
    ))

    # Splits a raw start tag into its name and the remainder.  DOTALL lets
    # the remainder span several lines.
    _TAG_NAME_EXP = re.compile(r'^<(\w+)\b(.*)$', re.DOTALL)

    def __init__(self, title_exp, outfile=sys.stdout):
        # convert_charrefs=True makes handle_data() receive decoded text,
        # which handle_data() then re-escapes before writing it out.
        # HTMLParser.__init__ calls reset(), which sets up the
        # per-document state (depth, printdepth, title).
        super().__init__(convert_charrefs=True)
        self.title_exp = title_exp
        self.outfile = outfile

    def reset(self):
        """Reset the parser and the per-document state."""
        super().reset()
        self.depth = 0          # current <div> nesting level
        self.printdepth = None  # depth inside storytext, None when outside
        self.title = False      # True while inside <title>

    def _printing(self):
        """Return True while the parser is inside the story text div."""
        return self.printdepth is not None and self.depth >= self.printdepth

    def handle_starttag(self, tag, attrs):
        """Echo start tags inside the story text and track div nesting."""
        if tag == 'title':
            self.title = True

        if self._printing():
            # Keep the tag as written in the page, lowercasing only its
            # name.  The result goes into a separate variable so that the
            # div checks below still see the bare tag name.
            start_text = self._TAG_NAME_EXP.sub(
                lambda m: '<' + m.group(1).lower() + m.group(2),
                self.get_starttag_text())
            print(start_text, file=self.outfile)

        if tag == 'div':
            self.depth += 1
            if (self.printdepth is None
                    and dict(attrs).get('id', '') == 'storytext'):
                self.printdepth = self.depth

    def handle_startendtag(self, tag, attrs):
        """Handle ``<tag/>``; void elements get no separate end tag."""
        self.handle_starttag(tag, attrs)
        if tag not in self._VOID_ELEMENTS:
            self.handle_endtag(tag)

    def handle_endtag(self, tag):
        """Echo end tags inside the story text and track div nesting."""
        if tag == 'title':
            self.title = False

        if tag == 'div':
            self.depth -= 1
        # Leaving the storytext div itself: stop printing before its own
        # end tag would be echoed (its start tag was not echoed either).
        if self.printdepth is not None and self.depth < self.printdepth:
            self.printdepth = None

        if self._printing():
            print('</{0}>'.format(tag), file=self.outfile)

    def handle_data(self, data):
        """Write the chapter heading and any text inside the story."""
        if self.title:
            m = self.title_exp.match(data)
            if m is None:
                print('Error: could not match title:', data, file=sys.stderr)
                chapter_title = data
            else:
                chapter_title = m.group(1)
            print('<h2 class="chapter">{0}</h2>'.format(
                      html.escape(chapter_title, quote=False)),
                  file=self.outfile)
            # Echo the raw page title to stderr.
            print(data, file=sys.stderr)

        if self._printing():
            # data is already decoded, so &, < and > must be escaped again
            # to keep the generated document well-formed.
            print(html.escape(data, quote=False), file=self.outfile)

def main():
    """Download a story from fanfiction.net and write it as one HTML file.

    Command line arguments are read with argparse: the story path
    (``/s/<id>/<chapter>/<title>``), the number of chapters and an
    optional ``-o FILE`` output file (standard output by default).

    Returns the process exit status:
        0 on success,
        1 if the path does not have the expected form,
        2 if the output file cannot be opened,
        3 if at least one chapter request did not return HTTP 200.
    """
    arg_parser = argparse.ArgumentParser(description='Rip a book from '
                                                     'fanfiction.net')
    arg_parser.add_argument('-o', '--output', metavar='FILE',
                            help='output file, by default the book will be '
                                 'printed to stdout')
    arg_parser.add_argument('path', type=str,
                            help='the path of the book ex: /s/5782108/1/'
                                 'Harry-Potter-and-the-Methods-of-Rationality')
    arg_parser.add_argument('chapters', type=int,
                            help='the number of chapters')
    args = arg_parser.parse_args()

    m = re.match(r'/s/(\d+)/\d+/(.+)', args.path)
    if m is None:
        print('Error: path should be of the form /s/#id/#chapter/title',
              file=sys.stderr)
        return 1
    story_id, title = m.groups()

    # The page title separates words with spaces while the path uses '-'
    # or '_'.  '_' counts as a word character, so it has to be replaced
    # explicitly for underscore-style paths to match.
    title_exp = re.compile(re.sub(r'[\W_]', '.', title) +
                           r'\s+(.*), a .* fanfic \| FanFiction')

    outfile = sys.stdout
    if args.output is not None:
        try:
            # The generated document declares charset utf-8, so the file
            # must be written as UTF-8 regardless of the locale.
            outfile = open(args.output, 'w', encoding='utf-8')
        except OSError as ex:
            print(ex, file=sys.stderr)
            return 2

    conn = http.client.HTTPSConnection("www.fanfiction.net")
    stripper = FFStripper(title_exp, outfile)
    failed = False
    try:
        print('<html><head><meta charset="utf-8"></head><body>', file=outfile)
        for i in range(1, args.chapters + 1):
            if i > 1:
                # go easy on the server and avoid being banned
                time.sleep(0.5)
            # Build the path directly so that braces in the title cannot
            # be misread as format fields.
            conn.request("GET", '/s/{0}/{1}/{2}'.format(story_id, i, title))
            r = conn.getresponse()
            # Always read the body so the connection can be reused.
            body = r.read()
            if r.status != 200:
                print('Error: chapter {0}: HTTP {1} {2}'.format(
                          i, r.status, r.reason), file=sys.stderr)
                failed = True
                continue
            stripper.feed(body.decode('utf-8', errors='replace'))
            stripper.close()  # flush any text still buffered by the parser
            stripper.reset()
        print('</body></html>', file=outfile)
    finally:
        conn.close()
        if outfile is not sys.stdout:
            outfile.close()
    return 3 if failed else 0

# Run main() only when executed as a script and use its return value as
# the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())