#!/usr/bin/python
"""
Small script to extract help texts from the ACC ServerAdminHandbook.

Usage: first convert the PDF to HTML with pdftohtml, then run

    python extractMessages.py ../path/to/ServerAdminHandbooks.html
"""

import re, sys, json
|
|
|
|
|
|
if __name__ == "__main__":
|
|
if len(sys.argv) != 2:
|
|
print('Usage: python extractMessages.py ../path/to/ServerAdminHandbooks.html')
|
|
exit(1)
|
|
|
|
outer = """Property.*\nRemarks.*\n((?:.|\n)*?)<br\/>\n(?:<b>)? (?:<\/b>)?<br\/>"""
|
|
pattern = """(.*)<b>(?:.*)<br\/>\n((?:.|\n)+?(?=.*<b>|\Z))"""
|
|
|
|
messages = {}
|
|
for b in re.findall(outer, open(sys.argv[1]).read()):
|
|
for (key, value) in re.findall(pattern, b):
|
|
for c in [('<br/>',''), ('\n',''), (' ',' '), ('“','"'), ('”','"')]:
|
|
key = key.replace(*c).strip()
|
|
value = value.replace(*c).strip()
|
|
value = re.sub('<hr/><a.*</a>', '', value)
|
|
value = re.sub('((?:S|s)ee\s+".*")', r'\1 (ServerAdminHandbook)', value)
|
|
value = re.sub('(?:S|s)ee.*next table', 'see table in ServerAdminHandbook', value)
|
|
messages[key] = value
|
|
|
|
print(messages.keys())
|
|
json.dump(messages, open('messages.json', 'w'), sort_keys=True, indent=4) |