# Version 1.1, February 11 2016
#
# Added ability to process the em-dash (—) and minus-sign (−) entities.
# Added footer message to clarify original source.
#
import os, sys, re
# Text placed in the fourth field of the generated .TH header line.
source = "netpbm documentation"

# Default manual section; makeman() declares this global and rebinds it
# for each page it converts.
section = 1

# troff comment block that makeman() prepends to every generated page.
# The value deliberately starts with a literal backslash followed by a
# newline, exactly as the raw triple-quoted form of this constant did.
warning = (
    "\\\n"
    ".\\\" This man page was generated by the Netpbm tool 'makeman' from HTML source.\n"
    ".\\\" Do not hand-hack it! If you have bug fixes or improvements, please find\n"
    ".\\\" the corresponding HTML page on the Netpbm website, generate a patch\n"
    ".\\\" against that, and send it to the Netpbm maintainer.\n"
)

# Footer section appended to every generated page; makeman() adds the
# base name of the input file directly after the trailing slash.
footerprefix = "\n".join((
    ".SH DOCUMENT SOURCE",
    "This manual page was generated by the Netpbm tool 'makeman' from HTML",
    "source. The master documentation is at",
    ".IP",
    ".B http://netpbm.sourceforge.net/doc/",
))
class LiftException(Exception):
    """Fatal conversion error carrying a message and a process exit status.

    Attributes:
        message: Human-readable description of the failure.
        retval:  Exit status the program should return (defaults to 1).
    """

    def __init__(self, message, retval=1):
        # Initialise the base class with the message only.  Without this
        # call the base class keeps *all* constructor arguments in
        # ``args``, so ``str(LiftException("x", 7))`` would render as
        # "('x', 7)" instead of "x".
        Exception.__init__(self, message)
        self.message = message
        self.retval = retval
def makeman(name, file, indoc):
"Transform a string representing an HTML document into man markup."
# Parameters:
#   name  -- page name supplied by the caller; it is overwritten below by
#            whatever the 'namematch' pattern captures (or None).
#   file  -- input file name; used only for its basename in the footer.
#   indoc -- the whole HTML document as one string.
# Returns the converted text: the 'warning' header, the transformed body,
# then 'footerprefix' + basename(file) + "\n.PP".
# Reads and rebinds the module globals 'section' and 'sectmap'.
#
# The conversion is one long pipeline of str.replace()/re.sub() passes
# applied to 'indoc' in sequence; later passes rely on the text produced
# by earlier ones, so the statement order matters.
#
# NOTE(review): in this copy of the file many pattern and literal strings
# are empty or truncated (e.g. replace('', "") and re.sub("(?i)", ...)),
# and several literals are broken across two lines.  It looks as if the
# HTML tag text inside the literals was stripped -- verify every pattern
# against the upstream source before relying on this code.
global section, sectmap
# A dot at the left margin is a troff request.  This program emits such
# lines itself, so dots already present in the input are hidden behind a
# placeholder cookie here and restored as "\&." at the end.
indoc = indoc.replace("\n.", "\n@%@%@")
# Protect escapes before we try generating font changes.
indoc = indoc.replace("\\", r"\e")
# Header-bashing
indoc = re.sub('(?i)]*>', "", indoc)
indoc = indoc.replace('', "")
indoc = indoc.replace('', "")
indoc = indoc.replace('\n',"")
indoc = indoc.replace('', "")
indoc = indoc.replace('', "")
indoc = indoc.replace("", "").replace("", "")
indoc = indoc.replace("", "").replace("", "")
indoc = re.sub('(?i)Table Of Contents', "", indoc)
# Capture the text after "Updated: " (used as the date field of .TH) and
# then remove that line from the document.
datematch = re.compile("Updated: (.*)\n")
match = datematch.search(indoc)
if match:
date = match.group(1)
else:
date = ""
indoc = datematch.sub("", indoc)
# Extract the page name; this replaces the 'name' argument.
namematch = re.compile("(.*)
", re.I)
match = namematch.search(indoc)
if match:
name = match.group(1)
else:
name = None
# Determine the manual section: prefer the value captured by the 'meta'
# pattern, otherwise look the page name up in sectmap (0 if unknown).
section = 1
meta = re.compile('(?i)')
match = meta.search(indoc)
if match:
section = int(match.group(1))
indoc = meta.sub("", indoc)
else:
section = sectmap.get(name, 0)
indoc = namematch.sub("", indoc)
indoc = re.sub("(?i)]*>", "", indoc)
indoc = re.sub("(?i)", "", indoc)
# Remove more superfluous headers
titlematch = re.compile("(.*)\n+", re.I)
match = titlematch.search(indoc)
if match:
title = match.group(1)
else:
title = None
indoc = titlematch.sub("", indoc)
indoc = re.sub("(?i)\n*
\n+", "\n", indoc)
indoc = re.sub("(?i)
", "\n", indoc)
# Prepend the man page title line built from the values gathered above.
indoc = ('.TH "%s" %d "%s" "%s"\n' % (title,section,date,source)) + indoc
# Literal layout
indoc = re.sub("(?i)\n *", "\n.nf", indoc)
indoc = re.sub("(?i) *
\n", "\n.fi\n", indoc)
indoc = re.sub("(?i)\n *", "\n.RS", indoc)
indoc = re.sub("(?i)\n *
", "\n.RE", indoc)
# Highlight processing
# Each opening pattern is replaced by a troff font escape (\fB, \fI,
# \f(CW) and each closing pattern by \fP; the last two emit \u and \d.
indoc = re.sub("(?i)", r"\\fB", indoc)
indoc = re.sub("(?i)", r"\\fP", indoc)
indoc = re.sub("(?i)", r"\\fI", indoc)
indoc = re.sub("(?i)", r"\\fP", indoc)
indoc = re.sub("(?i)", r"\\fI", indoc)
indoc = re.sub("(?i)", r"\\fP", indoc)
indoc = re.sub("(?i)", r"\\fI", indoc)
indoc = re.sub("(?i)", r"\\fP", indoc)
indoc = re.sub("(?i)", r"\\f(CW", indoc)
indoc = re.sub("(?i)", r"\\fP", indoc)
indoc = re.sub("(?i)", r"\\f(CW", indoc)
indoc = re.sub("(?i)", r"\\fP", indoc)
indoc = re.sub("(?i)", r"\\f(CW", indoc)
indoc = re.sub("(?i)
", r"\\fP", indoc)
indoc = re.sub("(?i)", r"\\fB", indoc)
indoc = re.sub("(?i)", r"\\fP", indoc)
indoc = re.sub("(?i)", r"\\u", indoc)
indoc = re.sub("(?i)", r"\\d", indoc)
# Paragraph handling
indoc = re.sub("(?i)\n*\n*", r"\n.PP\n", indoc)
indoc = re.sub("(?i)
", r"\n.PP\n", indoc)
indoc = re.sub("(?i)
", "", indoc)
indoc = re.sub("(?i)", "", indoc)
indoc = re.sub("(?i)]*>", "", indoc)
# Line-by-line pass tracking list nesting depth.
# NOTE(review): as shown here the loop only ever decrements 'listdepth'
# and never modifies 'lines' or reads 'listdepth' afterwards; part of the
# loop body appears to be missing -- verify against upstream.
lines = indoc.split("\n")
listdepth = 0
for i in range(len(lines)):
lowered = lines[i].lower()
if "" in lowered or "" in lowered or "" in lowered:
listdepth -= 1
indoc = "\n".join(lines)
# Put every ".sp" request on a line of its own.
indoc = re.sub(r"\s*\.sp", "\n.sp", indoc)
# Format email addresses as italic
indoc = re.sub('(?i)]+">([^<]+)', r'\\fI\1\\fP', indoc)
# Format manual cross-references
# xrefmatch() receives the link target in group 1 and the link text in
# group 2.  The target is looked up in sectmap (default section 1); a
# section of 0 yields an italic reference, anything else "name (N)".
def xrefmatch(match):
xrefto = match.group(2)
xrefurl = match.group(1)
xrefsection = sectmap.get(xrefurl, 1)
if xrefsection == 0:
return "\n.I " + xrefto
else:
return '\n.BR "%s" (%d)\\c\n\\&' % (xrefto, xrefsection)
indoc = re.sub(r'(?i)\n* *(?:\\fB)?]+.html)"?>([^<]+)(?:\\fP)?',
xrefmatch, indoc)
# Format URLs
# urlmatch() flattens newlines in both the URL and its text and wraps
# them in a .UR/.UE pair.
def urlmatch(match):
url = match.group(1).replace('\n', ' ')
txt = match.group(2).replace('\n', ' ')
return "\n.UR %s\n%s\n.UE\n\\&" % (url, txt)
indoc = re.sub(r'(?i)\n*(?:<)?]+)">([^<]+)(?:>)?',
urlmatch, indoc)
# Turn some entities into harmless cookies
# (the cookies are turned back into "<", ">" and "&" after the bad-line
# check near the end of this function).
indoc = indoc.replace("<", "@#!#@").replace(">", "#@!@#").replace("&", "#!@!@!#")
indoc = indoc.replace("×", r"\(mu")
indoc = indoc.replace("®", r"\*R")
indoc = indoc.replace("©", r"\(co")
# Turn anchors into .UN tags
indoc = re.sub('(?i)(?: )*\s*', ".UN \\1\n", indoc)
# Strip off the index trailer
trailer = re.compile('
.*', re.DOTALL | re.IGNORECASE)
indoc = re.sub(trailer, "", indoc)
# If there was no index trailer, we still need to strip these
indoc = indoc.replace("", "").replace("", "")
indoc = indoc.replace("", "").replace("", "")
# Recognize sections with IDs
# Three heading levels are emitted: .SH, .SS and .B, each preceded by a
# .UN line carrying the captured ID.
indoc = re.sub('(?i)',
".UN \\1\n.SH \\2", indoc)
indoc = re.sub('(?i)',
".UN \\1\n.SS \\2", indoc)
indoc = re.sub('(?i)',
".UN \\1\n.B \\2", indoc)
indoc = re.sub('(?i)([^><]*)
',
".UN \\1\n.SH \\2", indoc)
indoc = re.sub('(?i)([^><]*)
',
".UN \\1\n.SS \\2", indoc)
indoc = re.sub('(?i)([^><]*)
',
".UN \\1\n.B \\2", indoc)
# Sections without IDs
indoc = re.sub('(?i)([^><]*)
', ".SH \\1", indoc)
indoc = re.sub('(?i)([^><]*)
', ".SS \\1", indoc)
indoc = re.sub('(?i)([^><]*)
', ".B \\1", indoc)
indoc = re.sub('(?i)([^><]*)
', ".B \\1", indoc)
#
# Process definition lists -- just turn them into .TPs
indoc = re.sub("(?i) *", "", indoc)
indoc = re.sub("(?i) *
", "", indoc)
indoc = re.sub("(?i) *- ", ".TP\n", indoc)
indoc = re.sub("(?i) *
", "", indoc)
indoc = re.sub("(?i)\n*- \n*", "\n", indoc)
indoc = re.sub("(?i) *
", "", indoc)
# Process unordered/ordered lists -- the list tags are dropped and each
# item becomes a bulleted .IP paragraph.
# NOTE(review): ".IP \(bu\n" below is a non-raw string containing "\(",
# which newer Python versions report as an invalid escape sequence.
indoc = re.sub("(?i)?[UO]L *(COMPACT)?>", "", indoc)
indoc = re.sub("(?i) *
", ".IP \(bu\n", indoc)
indoc = re.sub("(?i) *", "", indoc)
# No-print tags
indoc = re.sub(".*", "", indoc)
# Passthrough
# The payload of a <?makeman ... ?> processing instruction is copied to
# the output verbatim.
indoc = re.sub(r"<\?makeman (.*) \?>", r'\1', indoc)
# Comments
indoc = re.sub("", r'.\"\1', indoc)
# Acronyms
indoc = re.sub('', "", indoc)
indoc = re.sub("", "", indoc)
# Abbreviation - just erase tags
indoc = re.sub(']+>', '', indoc)
indoc = re.sub('', '', indoc)
# Subscript - just erase tags
indoc = re.sub('(?i)]+>', '', indoc)
indoc = re.sub('(?i)', '', indoc)
# Span - just erase tags
indoc = re.sub('(?i)]+>', '', indoc)
indoc = re.sub('(?i)', '', indoc)
# Image tags
indoc = re.sub(' *', ".B \\2\n.IMG -C \\1", indoc)
# Special characters
# NOTE(review): the first literal below has three consecutive double
# quotes and the left-hand sides look like decoded HTML entities --
# confirm the intended entity names against upstream.
indoc = indoc.replace(""", "'")
indoc = indoc.replace(" ", "\\ ")
indoc = indoc.replace("−", "-")
indoc = indoc.replace("—", "-")
indoc = indoc.replace("μ", "mu")
indoc = indoc.replace("σ", "sigma")
indoc = indoc.replace("'", "'")
# Tables
# This will not handle rowspan
# Tables are bracketed by .TS/.TE; cells are separated by tabs, and cells
# spanning several lines are wrapped in T{ ... T} text blocks.
indoc = re.sub('(?i) *]*>.*', ".TS", indoc)
indoc = re.sub("(?i) *
.*", ".TE", indoc)
# First the single-line case
indoc = re.sub("(?i) *", "\t", indoc)
indoc = re.sub("(?i) | *]*)?>", "", indoc)
indoc = re.sub("(?i) | *
", "", indoc)
# Then the multiline case
indoc = re.sub(r'(?i)\s*]*>([^<\n]*)\s*', '\t\\1', indoc)
indoc = re.sub(r'(?i)\s*]*>([^<]*)\s*', '\tT{\n\\1T}', indoc)
indoc = indoc.replace("\n\\&T}", "\nT}")
indoc = re.sub("(?i) *", "", indoc)
indoc = re.sub("(?i) *]*>\t*", "", indoc)
# A caption directly after .TS is moved in front of the table as bold text.
indoc = re.sub(r"\.TS\s+<[Cc][Aa][Pp][Tt][Ii][Oo][Nn]>([^<]*)[Cc][Aa][Pp][Tt][Ii][Oo][Nn]>\s*", ".B \\1\n.TS\n", indoc)
# Debugging
#sys.stderr.write("Name: %s, Title: %s, Date: %s\n" % (name, title, date))
# Time for error checking now
# We replaced every HTML tag we could above, so any remaining in
# 'indoc' represent material we don't know how to convert, which we call
# bad lines.
# NOTE(review): the 'if' line below is visibly truncated/fused in this
# copy (unterminated regex literal, no statement that fills or reports
# 'badlines', and the tail of a cookie-restoring replace chain) --
# restore it from upstream.
badlines = []
for line in indoc.split("\n"):
if "<" in line or ">" in line.replace(" >", "") or re.search(r'(?").replace("#!@!@!#", "&")
# Collapse any run of trailing newlines into a single one.
indoc = re.sub("\n+$", "\n", indoc)
# Single-quote at left margin confuses troff.
# This program never generates these.
indoc = indoc.replace("\n'", "\n\\&'")
# Finish guarding against leading dots.
indoc = indoc.replace("\n@%@%@", "\n\\&.")
# Mark these generated pages so people won't hand-hack them.
indoc = warning + indoc
# Append the "DOCUMENT SOURCE" footer, completing its URL with the input
# file's base name.
indoc = indoc + footerprefix + os.path.basename(file) +"\n.PP"
return indoc
def main(args, mainout=sys.stdout, mainerr=sys.stderr):
# Command-line driver: convert each HTML file named in 'args' to a man
# page.
#
# Parameters:
#   args    -- argument list without the program name.  Options:
#              -d DIR  prefix joined onto every input file name,
#              -v      increase verbosity (may be repeated).
#   mainout -- not referenced anywhere in the visible body.
#   mainerr -- stream that receives the error messages written by the
#              exception handlers at the end of this function.
# Returns: the retval of a caught LiftException, 3 on IOError, 4 on
# KeyboardInterrupt, 5 on any other exception when not verbose, True if
# the temporary output file cannot be opened, and None (falls off the
# end) otherwise.
#
# NOTE(review): indentation is missing in this copy of the file, so the
# nesting of the statements below cannot be read off the text; and the
# regex literals for 'namere', 'meta' and 'hr' are empty or broken across
# lines.  Verify both against the upstream source.
global sectmap
import getopt
(options, arguments) = getopt.getopt(args, "vd:")
dirprefix = ""
verbosity = 0
for (switch, val) in options:
if switch == '-d': # Set HTML input directory
dirprefix = val
elif switch == '-v': # Enable verbose error reporting
verbosity += 1
try:
# First pass: gather locations for cross-references:
# sectmap maps a page's title, file name and name to its manual section.
sectmap = {}
for file in arguments:
fullfilenm = os.path.join(dirprefix, file)
# NOTE(review): bare 'except' -- this also catches KeyboardInterrupt and
# SystemExit; unreadable files are reported on sys.stderr (not mainerr)
# and skipped.
try:
infp = open(fullfilenm)
except:
sys.stderr.write(
"makeman: can't open input file '%s'\n" % fullfilenm)
continue
indoc = infp.read()
infp.close()
namere = re.compile("(.*)
", re.I)
namematch = namere.search(indoc)
titlere = re.compile("(.*)", re.I)
titlematch = titlere.search(indoc)
# Both a name and a title must be present in every input file.
if not namematch:
raise LiftException("name missing from %s" % file)
if not titlematch:
raise LiftException("title missing from %s" % file)
else:
title = titlematch.group(1)
# NOTE(review): 'name' is taken from titlematch, so it always equals
# 'title'; namematch is only tested for presence.  Possibly
# namematch.group(1) was intended -- confirm before changing, since it
# alters the keys stored in sectmap.
name = titlematch.group(1)
meta = re.compile('(?i)')
match = meta.search(indoc)
# Record the section from the 'meta' match, defaulting to section 1.
if match:
section = int(match.group(1))
sectmap[title] = sectmap[file] = sectmap[name] = section
else:
sectmap[title] = sectmap[file] = sectmap[name] = 1
hr = re.compile("(?i)
")
firsthr = hr.search(indoc)
# NOTE(review): the exception below is constructed but never raised, so
# finding a second match of 'hr' has no effect.
if firsthr and hr.search(indoc[firsthr.start(0)+4:]):
LiftException("%s has two
tags!" % file)
# Second pass: do formatting
for file in arguments:
fullfilenm = os.path.join(dirprefix, file)
# NOTE(review): the message below says "output file" although the file
# is being opened for reading.
try:
infp = open(fullfilenm)
except:
sys.stderr.write(
"makeman: can't open output file '%s'\n" % fullfilenm)
continue
indoc = infp.read()
infp.close()
# The result is written to a temporary file first and renamed at the end.
# NOTE(review): 'name' here is whatever the first pass left behind (its
# last processed file), not a value derived from the current file.
tempfile = file + ".~%s-%d~" % (name, os.getpid())
try:
outfp = open(tempfile, "w")
except OSError:
sys.stderr.write("%s: can't open tempfile" % name)
return True
# If conversion fails, remove the temporary file and re-raise so the
# handlers at the end of this function report the error.
# NOTE(review): 'outfp' is not closed on this path.
try:
if verbosity:
sys.stderr.write("makeman: %s\n" % file)
outdoc = makeman(name, file, indoc)
except:
os.remove(tempfile)
raise
if outdoc == indoc:
os.remove(tempfile)
if outdoc is None:
continue
else:
outfp.write(outdoc)
outfp.close() # under Windows you can't rename an open file
# Output name: the text before the first "." in 'file', then "." and the
# section number recorded for this file.
# NOTE(review): if 'file' contains no ".", find() returns -1 and the
# slice drops the last character instead.
stem = file[:file.find(".")]
os.rename(tempfile, stem + "." + repr(sectmap[file]))
except LiftException as e:
mainerr.write("makeman: " + e.message + "\n")
return e.retval
except IOError as e:
mainerr.write("makeman: file I/O error: %s\n" % e)
return 3
except KeyboardInterrupt:
mainerr.write("makeman: bailing out...\n")
return 4
# Anything else: re-raise for a traceback when verbose, otherwise report
# a generic internal error.
except:
if verbosity:
raise
else:
mainerr.write("makeman: internal error!\n")
return 5
if __name__ == "__main__":
    # Script entry point: run main() on the command-line arguments and
    # hand its return value to the interpreter as the exit status.
    sys.exit(main(sys.argv[1:]))
# The following sets edit modes for GNU EMACS
# Local Variables:
# mode:python
# End: