This is a very simple script that I use to generate the HTML code to show syntax-highlighted Python code on this website.
The script reads in a file and uses Python's own tokenize module to split the code into tokens that should get different colors.
The lists of keywords and identifiers in the script were taken from Geany's configuration file for Python syntax highlighting.
#!/usr/bin/env python3
#
# Copyright 2015 Hans Alves <halves@localhost>
# This work is free. You can redistribute it and/or modify it under the
# terms of the Do What The Fuck You Want To Public License, Version 2,
# as published by Sam Hocevar. See http://www.wtfpl.net/ for more details.
#
"""Annotate Python source code with HTML tags for syntax highlighting.

Reads the file named on the command line, splits it into tokens with the
stdlib ``tokenize`` module, and prints the code to stdout with keywords,
well-known identifiers, strings and comments wrapped in
``<span class="...">`` tags.  All emitted source text is HTML-escaped.
"""

import sys
import argparse
import tokenize
import html

# these were stolen from one of the config files of my favorite editor, Geany
keywords = set('''and as assert break class continue def del elif else except
exec finally for from global if import in is lambda not or pass print raise
return try while with yield False None True'''.split())

identifiers = set('''ArithmeticError AssertionError AttributeError
BaseException BufferError BytesWarning DeprecationWarning EOFError Ellipsis
EnvironmentError Exception False FloatingPointError FutureWarning
GeneratorExit IOError ImportError ImportWarning IndentationError IndexError
KeyError KeyboardInterrupt LookupError MemoryError NameError None
NotImplemented NotImplementedError OSError OverflowError
PendingDeprecationWarning ReferenceError RuntimeError RuntimeWarning
StandardError StopIteration SyntaxError SyntaxWarning SystemError SystemExit
TabError True TypeError UnboundLocalError UnicodeDecodeError
UnicodeEncodeError UnicodeError UnicodeTranslateError UnicodeWarning
UserWarning ValueError Warning ZeroDivisionError __debug__ __doc__ __import__
__name__ __package__ abs all any apply basestring bin bool buffer bytearray
bytes callable chr classmethod cmp coerce compile complex copyright credits
delattr dict dir divmod enumerate eval execfile exit file filter float format
frozenset getattr globals hasattr hash help hex id input int intern
isinstance issubclass iter len license list locals long map max memoryview
min next object oct open ord pow print property quit range raw_input reduce
reload repr reversed round set setattr slice sorted staticmethod str sum
super tuple type unichr unicode vars xrange zip'''.split())


def _css_class(tok_type, text):
    """Return the CSS class name for a token, or None for plain text.

    ``text`` is the already-HTML-escaped token string; keyword and
    identifier names contain no characters that ``html.escape`` rewrites,
    so the set-membership tests still match.
    """
    if tok_type == tokenize.COMMENT:
        return 'comment'
    if tok_type == tokenize.NAME:
        if text in keywords:
            return 'keyword'
        if text in identifiers:
            return 'identifier'
        return None
    if tok_type == tokenize.STRING:
        return 'string'
    return None


def main():
    """Tokenize the file named on the command line, print annotated HTML.

    Returns 0 on success; the value is used as the process exit status.
    """
    parser = argparse.ArgumentParser(description='Annotate python code with '
                                                 'html tags.')
    parser.add_argument('file')
    args = parser.parse_args()
    # tokenize.tokenize() wants a readline that returns bytes, hence 'rb'.
    with open(args.file, 'rb') as f:
        # Sentinel "previous token" ending at the very start of the input.
        prev = tokenize.TokenInfo(None, '', (0, 0), (0, 0), '')
        for tok in tokenize.tokenize(f.readline):
            # The tokenizer skips over whitespace, so any gap between the
            # previous token's end and this token's start has to be printed
            # from the raw source line(s) to keep the layout intact.
            if prev.end[0] != tok.start[0]:
                # this token is on a different line than the previous one
                if prev.end[1] != len(prev.line):
                    # if the last token didn't end on the end of the line
                    # print the last part of the line
                    print(html.escape(prev.line[prev.end[1]:]
                                      .replace('\\\n', '\\\\\n\n')), end='')
                if tok.start[1] != 0:
                    # if the new token doesn't start at the start of the line
                    # print the part of the line before the token
                    print(html.escape(tok.line[0:tok.start[1]]), end='')
            elif tok.start[1] != prev.end[1]:
                # this token is on the same line as the previous, but its not
                # right after it, so print the part of the line in between
                print(html.escape(tok.line[prev.end[1]:tok.start[1]]), end='')
            if tok.type == tokenize.ENCODING:
                # the synthetic ENCODING token carries no source text
                text = ''
            else:
                text = html.escape(tok.string)
            typ = _css_class(tok.type, text)
            if typ is not None:
                print('<span class="{type}">{text}</span>'.format(
                    type=typ, text=text), end='')
            else:
                print(text, end='')
            prev = tok
    return 0


if __name__ == '__main__':
    sys.exit(main())