Epubscan is a script that reads EPUB files and either outputs a list of words with the number of times they occur in the epubs, or searches for a string or regular expression in the files.

Epub files are basically zip files containing HTML or XHTML files plus some CSS files for markup. The script opens the book as a zip file, using Python's zipfile module, and parses all the (X)HTML files while ignoring other file types. The HTML in those files is parsed with html.parser, so only data within tags is taken into account.

#!/usr/bin/env python3
#
# Copyright © 2016 Hans Alves <halves@localhost>
# This work is free. You can redistribute it and/or modify it under the
# terms of the Do What The Fuck You Want To Public License, Version 2,
# as published by Sam Hocevar. See http://www.wtfpl.net/ for more details.
#
import argparse
import fnmatch
import html.parser
import logging
import os.path
import re
import sys
import zipfile

# Log to stderr so search/count results on stdout stay machine-readable.
logging.basicConfig(format='{levelname: <8s} - {message}',
                    style='{',
                    stream=sys.stderr)
logger = logging.getLogger('EpubWords')

# Raw strings: '\W' and '\d' inside plain string literals are invalid escape
# sequences and trigger DeprecationWarning/SyntaxWarning on modern Python.
wordexp = re.compile(r'\W+')    # splits a text node into word tokens
digexp = re.compile(r'.*\d.*')  # matches any token that contains a digit


class WordCountParser(html.parser.HTMLParser):
    """HTML parser that tallies word frequencies into a shared dict.

    Text nodes are split on non-word characters (module-level ``wordexp``),
    lowercased, and any token containing a digit (``digexp``) is skipped.
    """

    def __init__(self, words):
        super().__init__()
        self.words = words  # shared {word: count} mapping, mutated in place

    def handle_data(self, data):
        # Removed unused local `prev` from the original implementation.
        for token in wordexp.split(data):
            token = token.lower()
            # Empty tokens come from leading/trailing separators; skip them
            # along with anything that contains a digit.
            if token and not digexp.match(token):
                self.words[token] = self.words.get(token, 0) + 1

class SearchParser(html.parser.HTMLParser):
    """Print every text node that contains a literal search string."""

    def __init__(self, filename, search_string):
        super().__init__()
        self.filename = filename            # label prefixed to each hit
        self.search_string = search_string  # case-sensitive needle

    def handle_data(self, data):
        # Case-sensitive substring search on the raw text node.
        if data.find(self.search_string) != -1:
            print(self.filename + ':', data)

class ISearchParser(html.parser.HTMLParser):
    """Print every text node containing the search string, ignoring case."""

    def __init__(self, filename, search_string):
        super().__init__()
        self.filename = filename
        # Normalize the needle once; each text node is lowered on arrival.
        self.search_string = search_string.lower()

    def handle_data(self, data):
        haystack = data.lower()
        if self.search_string in haystack:
            # Print the original (un-lowered) text so the output is faithful.
            print(self.filename + ':', data)

class RegexParser(html.parser.HTMLParser):
    """Print every text node in which a compiled regex finds a match."""

    def __init__(self, filename, regex):
        super().__init__()
        self.filename = filename
        self.regex = regex  # pre-compiled pattern, searched per text node

    def handle_data(self, data):
        match = self.regex.search(data)
        if match is not None:
            print(self.filename + ':', data)


def read_book_count(b, words):
    """Feed the decoded contents of file object *b* to a WordCountParser
    that accumulates word counts into the *words* dict in place."""
    markup = b.read().decode()
    WordCountParser(words).feed(markup)


def read_archive_count(b, words):
    """Count words from every (x)html file inside the epub at path *b*.

    Counts accumulate into *words* in place; non-(x)html members (css,
    images, ...) are ignored.  Files are visited in sorted order for
    consistency with read_archive_search — counting itself is
    order-independent, but the debug log becomes deterministic.

    Raises zipfile.BadZipFile if *b* is not a valid zip archive.
    """
    with zipfile.ZipFile(b) as ziparchive:
        for f in sorted(ziparchive.namelist()):
            if fnmatch.fnmatch(f, '*.xhtml') or fnmatch.fnmatch(f, '*.html'):
                logger.debug('Reading file {}'.format(f))
                with ziparchive.open(f) as xhtml:
                    read_book_count(xhtml, words)


def read_book_search(f, b, searchclass, **args):
    """Run a *searchclass* instance (built with filename=f plus **args)
    over the decoded contents of file object *b*."""
    text = b.read().decode()
    searchclass(filename=f, **args).feed(text)


def read_archive_search(b, searchclass, **args):
    """Open the epub at path *b* and run *searchclass* over every
    (x)html member, in sorted name order; other file types are skipped.

    Raises zipfile.BadZipFile if *b* is not a valid zip archive.
    """
    with zipfile.ZipFile(b) as ziparchive:
        pages = [name for name in sorted(ziparchive.namelist())
                 if fnmatch.fnmatch(name, '*.xhtml')
                 or fnmatch.fnmatch(name, '*.html')]
        for name in pages:
            logger.debug('Reading file {}'.format(name))
            with ziparchive.open(name) as xhtml:
                # Prefix hits with "<epub path>/<member name>".
                read_book_search(os.path.join(b, name),
                                 xhtml, searchclass, **args)

def read_archives_search(iterator, searchclass, **args):
    """Search every epub named by *iterator* (one path per item, trailing
    newlines tolerated), logging and skipping invalid zip files."""
    for entry in iterator:
        book = entry.strip('\n')
        logger.info('Reading epub {}'.format(book))
        try:
            read_archive_search(book, searchclass, **args)
        except zipfile.BadZipFile as error:
            logger.error(error)


def main():
    """Parse command-line options and dispatch to word counting or one of
    the three search modes (literal, case-insensitive, regex)."""
    parser = argparse.ArgumentParser(description='Epub scanner.')
    parser.add_argument('-v', '--verbose', action='count', default=0,
                        help='Be more verbose')
    parser.add_argument('-I', '--stdin', action='store_true', default=False,
                        help='Read list of books to parse from stdin')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-w', '--count-words', action='store_true',
                       default=False, help='Count words in the books '
                       'and output a list of words and the number of '
                       'times they occur')
    group.add_argument('-s', '--search', metavar='STRING',
                       help='Search for STRING in the books')
    group.add_argument('-i', '--isearch', metavar='STRING',
                       help='Search for STRING in the books (case insensitive)')
    group.add_argument('-r', '--regex', metavar='EXPRESSION',
                       help='Search for regular expression in the books')
    parser.add_argument('books', nargs='*', help='books to parse')

    options = parser.parse_args()

    # Each -v raises verbosity: WARNING (30) -> INFO (20) -> DEBUG (10).
    lvl = 30 - min(options.verbose, 2) * 10
    logger.setLevel(lvl)
    logger.debug("set loglevel to {}".format(lvl))

    if options.stdin:
        iterator = sys.stdin
    else:
        iterator = options.books

    if options.count_words:
        words = {}
        for b in iterator:
            try:
                b = b.strip('\n')
                logger.info('Reading epub {}'.format(b))
                read_archive_count(b, words)
            except zipfile.BadZipFile as e:
                logger.error(e)

        # Drop words occurring only once — but do it ONCE, after all books
        # are read.  The previous version filtered inside the loop, which
        # also discarded words that occur once in each of several books and
        # under-reported their true totals.
        words = {w: c for w, c in words.items() if c > 1}

        # Highest counts first; ties broken by reverse-alphabetical word.
        for c, w in sorted(((words[k], k) for k in words), reverse=True):
            print('{0:>10d} {1}'.format(c, w))
    elif options.search is not None:
        read_archives_search(iterator, SearchParser,
                             search_string=options.search)
    elif options.isearch is not None:
        read_archives_search(iterator, ISearchParser,
                             search_string=options.isearch)
    elif options.regex is not None:
        read_archives_search(iterator, RegexParser,
                             regex=re.compile(options.regex))

if __name__ == '__main__':
    main()