#!/usr/bin/python
"""
Small script to extract helptexts from the ACC ServerAdminHandbook.

Usage: First, use pdftohtml with the pdf, then
    python extractMessages.py ../path/to/ServerAdminHandbooks.html
"""
import json
import re
import sys

USAGE = 'Usage: python extractMessages.py ../path/to/ServerAdminHandbooks.html'
OUTPUT_FILE = 'messages.json'

# NOTE(review): the regex literals and the cleanup table below are reproduced
# exactly as they appear in this copy of the script. Several of them look as
# if they once contained HTML markup that is no longer present: an empty
# optional group `(?:)?`, a closing `<\/b>` with no opening counterpart, a
# lookahead `(?=.*|\Z)` that can never fail, a duplicated newline replacement
# and a space-to-space replacement. Confirm them against the upstream script
# and real pdftohtml output before relying on the extraction results.

# One "Property / Remarks" table; group 1 is the table body.
TABLE_RE = re.compile(r"Property.*\nRemarks.*\n((?:.|\n)*?)\n(?:)? (?:<\/b>)?")

# One entry inside a table body; group 1 is the key, group 2 the help text.
ENTRY_RE = re.compile(r"(.*)(?:.*)\n((?:.|\n)+?(?=.*|\Z))")

# (old, new) substitutions applied, in order, to every key and value.
CLEANUPS = (
    ('\n', ''),
    ('\n', ''),
    (' ', ' '),
    ('“', '"'),
    ('”', '"'),
)

# Quoted cross reference such as: see "Some section"
QUOTED_REFERENCE_RE = re.compile(r'((?:S|s)ee\s+".*")')

# Reference to a table that only exists in the handbook itself.
NEXT_TABLE_RE = re.compile(r'(?:S|s)ee.*next table')


def clean_text(text):
    """Apply every CLEANUPS substitution to *text*, stripping after each one.

    Returns the cleaned string: newlines removed, curly double quotes
    replaced by straight ones, surrounding whitespace trimmed.
    """
    for old, new in CLEANUPS:
        text = text.replace(old, new).strip()
    return text


def annotate_references(value):
    """Point cross references in *value* at the ServerAdminHandbook.

    A quoted reference (``see "..."``) gets `` (ServerAdminHandbook)``
    appended; a ``see ... next table`` phrase is replaced by
    ``see table in ServerAdminHandbook``. Text without either is returned
    unchanged.
    """
    value = QUOTED_REFERENCE_RE.sub(r'\1 (ServerAdminHandbook)', value)
    return NEXT_TABLE_RE.sub('see table in ServerAdminHandbook', value)


def extract_messages(html):
    """Return a dict mapping each extracted key to its help text.

    *html* is the full text of the converted handbook. Every table matched
    by TABLE_RE is scanned with ENTRY_RE; when a key occurs more than once
    the last occurrence wins. Returns an empty dict when nothing matches.
    """
    messages = {}
    for table in TABLE_RE.findall(html):
        for key, value in ENTRY_RE.findall(table):
            key = clean_text(key)
            value = clean_text(value)
            # Kept from the original script; after clean_text() no newline
            # is left, so as written this is a no-op (see NOTE(review)).
            value = re.sub(r'\n', '', value)
            messages[key] = annotate_references(value)
    return messages


def main(argv=None):
    """Run the extraction for the file named in *argv* and write the JSON.

    *argv* defaults to ``sys.argv``. Returns the process exit status:
    1 when the argument count is wrong, 0 otherwise. Errors from opening
    or reading the files are not caught.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) != 2:
        print(USAGE)
        return 1

    # Context managers close both files even if reading or dumping fails.
    with open(argv[1]) as handbook:
        messages = extract_messages(handbook.read())

    print(messages.keys())
    with open(OUTPUT_FILE, 'w') as out:
        json.dump(messages, out, sort_keys=True, indent=4)
    return 0


if __name__ == "__main__":
    # sys.exit instead of the site-provided exit(), which is not guaranteed
    # to exist (e.g. when Python is started with -S).
    sys.exit(main())