This is a very simple script that I use to generate the HTML code to show syntax-highlighted Python code on this website.
The script reads in a file and uses Python's own tokenize module to split the code into tokens that should get different colors.
The lists of keywords and identifiers in the script were taken from Geany's configuration file for Python syntax highlighting.
#!/usr/bin/env python3
#
# Copyright 2015 Hans Alves <halves@localhost>
# This work is free. You can redistribute it and/or modify it under the
# terms of the Do What The Fuck You Want To Public License, Version 2,
# as published by Sam Hocevar. See http://www.wtfpl.net/ for more details.
#
"""Annotate Python source code with HTML tags for syntax highlighting.

Reads the file named on the command line, splits it into tokens with the
stdlib ``tokenize`` module, and prints the code to stdout with keywords,
well-known identifiers, strings and comments wrapped in
``<span class="...">`` tags.  All emitted source text is HTML-escaped.
"""

import sys
import argparse
import tokenize
import html

# these were stolen from one of the config files of my favorite editor, Geany
keywords = set('''and as assert break class continue def del elif else except
exec finally for from global if import in is lambda not or pass print raise
return try while with yield False None True'''.split())

identifiers = set('''ArithmeticError AssertionError AttributeError
BaseException BufferError BytesWarning DeprecationWarning EOFError Ellipsis
EnvironmentError Exception False FloatingPointError FutureWarning
GeneratorExit IOError ImportError ImportWarning IndentationError IndexError
KeyError KeyboardInterrupt LookupError MemoryError NameError None
NotImplemented NotImplementedError OSError OverflowError
PendingDeprecationWarning ReferenceError RuntimeError RuntimeWarning
StandardError StopIteration SyntaxError SyntaxWarning SystemError SystemExit
TabError True TypeError UnboundLocalError UnicodeDecodeError
UnicodeEncodeError UnicodeError UnicodeTranslateError UnicodeWarning
UserWarning ValueError Warning ZeroDivisionError __debug__ __doc__ __import__
__name__ __package__ abs all any apply basestring bin bool buffer bytearray
bytes callable chr classmethod cmp coerce compile complex copyright credits
delattr dict dir divmod enumerate eval execfile exit file filter float format
frozenset getattr globals hasattr hash help hex id input int intern
isinstance issubclass iter len license list locals long map max memoryview
min next object oct open ord pow print property quit range raw_input reduce
reload repr reversed round set setattr slice sorted staticmethod str sum
super tuple type unichr unicode vars xrange zip'''.split())


def _css_class(tok_type, text):
    """Return the CSS class name for a token, or None for plain text.

    ``text`` is the already-HTML-escaped token string; keyword and
    identifier names contain no characters that ``html.escape`` rewrites,
    so the set-membership tests still match.
    """
    if tok_type == tokenize.COMMENT:
        return 'comment'
    if tok_type == tokenize.NAME:
        if text in keywords:
            return 'keyword'
        if text in identifiers:
            return 'identifier'
        return None
    if tok_type == tokenize.STRING:
        return 'string'
    return None


def main():
    """Tokenize the file named on the command line, print annotated HTML.

    Returns 0 on success; the value is used as the process exit status.
    """
    parser = argparse.ArgumentParser(description='Annotate python code with '
                                                 'html tags.')
    parser.add_argument('file')
    args = parser.parse_args()
    # tokenize.tokenize() wants a readline that returns bytes, hence 'rb'.
    with open(args.file, 'rb') as f:
        # Sentinel "previous token" ending at the very start of the input.
        prev = tokenize.TokenInfo(None, '', (0, 0), (0, 0), '')
        for tok in tokenize.tokenize(f.readline):
            # The tokenizer skips over whitespace, so any gap between the
            # previous token's end and this token's start has to be printed
            # from the raw source line(s) to keep the layout intact.
            if prev.end[0] != tok.start[0]:
                # this token is on a different line than the previous one
                if prev.end[1] != len(prev.line):
                    # if the last token didn't end on the end of the line
                    # print the last part of the line
                    print(html.escape(prev.line[prev.end[1]:]
                                      .replace('\\\n', '\\\\\n\n')), end='')
                if tok.start[1] != 0:
                    # if the new token doesn't start at the start of the line
                    # print the part of the line before the token
                    print(html.escape(tok.line[0:tok.start[1]]), end='')
            elif tok.start[1] != prev.end[1]:
                # this token is on the same line as the previous, but its not
                # right after it, so print the part of the line in between
                print(html.escape(tok.line[prev.end[1]:tok.start[1]]), end='')
            if tok.type == tokenize.ENCODING:
                # the synthetic ENCODING token carries no source text
                text = ''
            else:
                text = html.escape(tok.string)
            typ = _css_class(tok.type, text)
            if typ is not None:
                print('<span class="{type}">{text}</span>'.format(
                    type=typ, text=text), end='')
            else:
                print(text, end='')
            prev = tok
    return 0


if __name__ == '__main__':
    sys.exit(main())