#!/usr/bin/python
#
# makeman -- compile netpbm's stereotyped HTML to troff markup
#
# Example:
#
#   $ makeman pamcut.html pamcomp.html
#
#   $ makeman -v -d /tmp/inputdir pamcut.html
#
# The output troff file is written under the name given on the command
# line (relative to the current directory, not to the -d input directory)
# with everything from the first dot of the base name replaced by
# ".<manual section>", e.g. pamcut.html -> pamcut.1.
#
# This approach works because we control the entire document universe
# this is going to convert and can reinforce useful stereotypes.
#
# The output of this tool uses cliches parseable by doclifter,
# which should thus be able to recover all the semantic information
# it looks like this thing is losing.
#
# Limitations:
# * Ordered lists are smashed into unordered lists
# * IMG tags are issued as .IMG preceded by a bolded caption containing
#   the alt content.  This will only work if the page is formatted with
#   mwww macros.
# * Loses summary information from tables.
# * Only permits one <HR> in the HTML, right before the index.
#
# You can use the <?makeman ?> PI to pass text directly through to the
# generated manual page.  A major use is to insert format lines for tables.
#
# By Eric S. Raymond
# Version 1.0, July 26 2004
#
# Modified by Akira F. Urushibata
# Version 1.1, February 11 2016
#
# Added ability to process &mdash; &minus;
# Added footer message to clarify original source.
#
# NOTE(review): this copy of the script was rebuilt from a text dump in
# which HTML tags and entities inside string literals had been stripped.
# The tag/entity text in the patterns below was restored from the
# surrounding comments and from what survived of each pattern.  Lines
# carrying a NOTE(review) marker are the least certain; diff them against
# a pristine copy before relying on them.

import os
import re
import sys

source = "netpbm documentation"
section = 1

# Maps page title, file name and H1 name to manual section number.
# main() rebuilds it on every run; the empty default lets makeman() be
# called on its own.
sectmap = {}

warning = r'''\
.\" This man page was generated by the Netpbm tool 'makeman' from HTML source.
.\" Do not hand-hack it!  If you have bug fixes or improvements, please find
.\" the corresponding HTML page on the Netpbm website, generate a patch
.\" against that, and send it to the Netpbm maintainer.
'''

footerprefix = '''.SH DOCUMENT SOURCE
This manual page was generated by the Netpbm tool 'makeman' from HTML
source.  The master documentation is at
.IP
.B http://netpbm.sourceforge.net/doc/'''


class LiftException(Exception):
    "Conversion failure carrying a message and the exit status to return."

    def __init__(self, message, retval=1):
        # Initialise the base class too, so str(e) shows the message.
        Exception.__init__(self, message)
        self.message = message
        self.retval = retval


def makeman(name, file, indoc):
    """Transform a string representing an HTML document into man markup.

    name  -- ignored on entry; it is overwritten with the page's H1 text.
    file  -- input file name; only its base name is used, in the footer
             and in the bad-lines report.
    indoc -- the HTML document text.

    Returns the troff text.  Sets the global 'section' and reads the
    global 'sectmap'.  Lines that still look like HTML after conversion
    are reported on sys.stderr but do not stop the conversion.
    """
    global section, sectmap
    # Dot at left margin confuses troff.
    # This program generates these, so hide the input's own leading dots
    # behind a cookie until the very end.
    indoc = indoc.replace("\n.", "\n@%@%@")
    # Protect escapes before we try generating font changes.
    indoc = indoc.replace("\\", r"\e")

    # Header-bashing
    # NOTE(review): the original had one regex plus five literal replace()
    # calls whose text was lost; these three patterns cover the DOCTYPE,
    # XML declaration and <HTML ...> forms they most likely matched.
    indoc = re.sub(r'(?i)<!DOCTYPE[^>]*>', "", indoc)
    indoc = re.sub(r'(?i)<\?xml[^>]*\?>\n?', "", indoc)
    indoc = re.sub(r'(?i)<HTML[^>]*>', "", indoc)
    indoc = indoc.replace("<HEAD>", "").replace("</HEAD>", "")
    indoc = indoc.replace("<head>", "").replace("</head>", "")
    indoc = re.sub(r'(?i)<A\s+HREF="#index">Table Of Contents</A>',
                   "", indoc)

    datematch = re.compile("Updated: (.*)\n")
    match = datematch.search(indoc)
    if match:
        date = match.group(1)
    else:
        date = ""
    indoc = datematch.sub("", indoc)

    namematch = re.compile("<H1>(.*)</H1>", re.I)
    match = namematch.search(indoc)
    if match:
        name = match.group(1)
    else:
        name = None

    section = 1
    meta = re.compile('(?i)<META NAME="manual_section" CONTENT="([0-9])">')
    match = meta.search(indoc)
    if match:
        section = int(match.group(1))
        indoc = meta.sub("", indoc)
    else:
        section = sectmap.get(name, 0)
    indoc = namematch.sub("", indoc)
    indoc = re.sub(r"(?i)<BODY[^>]*>", "", indoc)
    indoc = re.sub(r"(?i)<HTML>", "", indoc)

    # Remove more superfluous headers
    titlematch = re.compile("<TITLE>(.*)</TITLE>\n+", re.I)
    match = titlematch.search(indoc)
    if match:
        title = match.group(1)
    else:
        title = None
    indoc = titlematch.sub("", indoc)
    indoc = re.sub("(?i)\n*<BR>\n+", "\n", indoc)
    indoc = re.sub("(?i)<BR>", "\n", indoc)
    indoc = ('.TH "%s" %d "%s" "%s"\n' % (title, section, date, source)) \
        + indoc

    # Literal layout
    indoc = re.sub("(?i)\n *<PRE>", "\n.nf", indoc)
    indoc = re.sub("(?i) *</PRE>\n", "\n.fi\n", indoc)
    indoc = re.sub("(?i)\n *<BLOCKQUOTE>", "\n.RS", indoc)
    indoc = re.sub("(?i)\n *</BLOCKQUOTE>", "\n.RE", indoc)

    # Highlight processing
    # NOTE(review): the font for each of the nine pairs survived; the tag
    # names are inferred from the usual HTML meaning of each font.
    for (tag, fonton, fontoff) in (
            ("B", r"\\fB", r"\\fP"),
            ("EM", r"\\fI", r"\\fP"),
            ("CITE", r"\\fI", r"\\fP"),
            ("I", r"\\fI", r"\\fP"),
            ("TT", r"\\f(CW", r"\\fP"),
            ("KBD", r"\\f(CW", r"\\fP"),
            ("CODE", r"\\f(CW", r"\\fP"),
            ("STRONG", r"\\fB", r"\\fP"),
            ("SUP", r"\\u", r"\\d")):
        indoc = re.sub("(?i)<%s>" % tag, fonton, indoc)
        indoc = re.sub("(?i)</%s>" % tag, fontoff, indoc)

    # Paragraph handling
    indoc = re.sub("(?i)\n*<P>\n*", r"\n.PP\n", indoc)
    indoc = re.sub("(?i)<br>", r"\n.PP\n", indoc)
    indoc = re.sub("(?i)</P>", "", indoc)
    # NOTE(review): the next two tags were lost entirely; FONT is a guess.
    indoc = re.sub("(?i)</FONT>", "", indoc)
    indoc = re.sub(r"(?i)<FONT[^>]*>", "", indoc)
    # Inside any kind of list a paragraph break becomes vertical space
    # (.sp) instead of .PP.
    lines = indoc.split("\n")
    listdepth = 0
    for (i, line) in enumerate(lines):
        lowered = line.lower()
        if "<dl" in lowered or "<ol" in lowered or "<ul" in lowered:
            listdepth += 1
        if listdepth:
            lines[i] = line.replace(".PP", ".sp")
        if "</dl>" in lowered or "</ol>" in lowered or "</ul>" in lowered:
            listdepth -= 1
    indoc = "\n".join(lines)
    indoc = re.sub(r"\s*\.sp", "\n.sp", indoc)

    # Format email addresses as italic
    indoc = re.sub(r'(?i)<A\s+HREF="mailto:[^"]+">([^<]+)</A>',
                   r'\\fI\1\\fP', indoc)

    # Format manual cross-references
    def xrefmatch(match):
        xrefto = match.group(2)
        xrefurl = match.group(1)
        xrefsection = sectmap.get(xrefurl, 1)
        if xrefsection == 0:
            return "\n.I " + xrefto
        else:
            return '\n.BR "%s" (%d)\\c\n\\&' % (xrefto, xrefsection)
    # NOTE(review): the HREF part of this pattern was lost.  It is
    # restored as "relative link to an .html page", since the captured
    # URL is looked up in sectmap, whose keys include input file names.
    indoc = re.sub(
        r'(?i)\n* *(?:\\fB)?<A\s+HREF="([^":#]+\.html)">([^<]+)</A>(?:\\fP)?',
        xrefmatch, indoc)

    # Format URLs
    def urlmatch(match):
        url = match.group(1).replace('\n', ' ')
        txt = match.group(2).replace('\n', ' ')
        return "\n.UR %s\n%s\n.UE\n\\&" % (url, txt)
    indoc = re.sub(
        r'(?i)\n*(?:&lt;)?<A\s+HREF *= *"([^>]+)">([^<]+)</A>(?:&gt;)?',
        urlmatch, indoc)

    # Turn some entities into harmless cookies, so the bad-line check
    # below does not mistake legitimate <, > and & for leftover HTML.
    indoc = indoc.replace("&lt;", "@#!#@").replace("&gt;", "#@!@#") \
                 .replace("&amp;", "#!@!@!#")
    # NOTE(review): only the decoded characters survived, so both the
    # named and the numeric spelling of each entity are accepted.
    indoc = indoc.replace("&times;", r"\(mu").replace("&#215;", r"\(mu")
    indoc = indoc.replace("&reg;", r"\*R").replace("&#174;", r"\*R")
    indoc = indoc.replace("&copy;", r"\(co").replace("&#169;", r"\(co")

    # Turn anchors into .UN tags
    indoc = re.sub(
        r'(?i)<A\s+NAME *= *"#?([a-zA-Z_][a-zA-Z_0-9.-]*)">(?:&nbsp;)*</A>\s*',
        ".UN \\1\n", indoc)

    # Strip off the index trailer
    trailer = re.compile('<HR>.*', re.DOTALL | re.IGNORECASE)
    indoc = trailer.sub("", indoc)
    # If there was no index trailer, we still need to strip these
    indoc = indoc.replace("</BODY>", "").replace("</HTML>", "")
    indoc = indoc.replace("</body>", "").replace("</html>", "")

    # Recognize sections with IDs
    # NOTE(review): the attribute part of these six patterns was lost;
    # restored as <Hn><A ID|NAME="x">...</A></Hn> and <Hn ID|NAME="x">.
    for (level, macro) in (("H2", ".SH"), ("H3", ".SS"), ("H4", ".B")):
        indoc = re.sub(
            '(?i)<%s><A (?:ID|NAME)="([^"]+)">([^><]*)</A></%s>'
            % (level, level),
            ".UN \\1\n" + macro + " \\2", indoc)
    for (level, macro) in (("H2", ".SH"), ("H3", ".SS"), ("H4", ".B")):
        indoc = re.sub(
            '(?i)<%s (?:ID|NAME)="([^"]+)">([^><]*)</%s>' % (level, level),
            ".UN \\1\n" + macro + " \\2", indoc)
    # Sections without IDs
    # NOTE(review): the fourth heading level (H5) is a guess.
    for (level, macro) in (("H2", ".SH"), ("H3", ".SS"),
                           ("H4", ".B"), ("H5", ".B")):
        indoc = re.sub('(?i)<%s>([^><]*)</%s>' % (level, level),
                       macro + " \\1", indoc)

    #
    # Process definition lists -- just turn them into .TPs
    indoc = re.sub(r"(?i) *<DL[^>]*>", "", indoc)
    indoc = re.sub("(?i) *</DL>", "", indoc)
    indoc = re.sub("(?i) *<DT>", ".TP\n", indoc)
    indoc = re.sub("(?i) *</DT>", "", indoc)
    indoc = re.sub("(?i)\n*<DD>\n*", "\n", indoc)
    indoc = re.sub("(?i) *</DD>", "", indoc)
    # Process unordered lists -- just turn them into .TPs
    indoc = re.sub("(?i)</?[UO]L>", "", indoc)
    indoc = re.sub("(?i) *<LI>", ".IP \\(bu\n", indoc)
    indoc = re.sub("(?i) *</LI>", "", indoc)
    # No-print tags
    # NOTE(review): marker text was lost; '<!--no_print-->' is a guess.
    indoc = re.sub("<!--no_print-->.*", "", indoc)
    # Passthrough
    indoc = re.sub(r"<\?makeman (.*) \?>", r'\1', indoc)
    # Comments
    # NOTE(review): pattern was lost; restored as a one-line HTML comment.
    indoc = re.sub("<!--([^\n]*)-->", r'.\"\1', indoc)
    # Acronyms
    indoc = re.sub('<acronym title="[^"]*">', "", indoc)
    indoc = re.sub("</acronym>", "", indoc)
    # Abbreviation - just erase tags
    indoc = re.sub('<abbr[^>]+>', '', indoc)
    indoc = re.sub('</abbr>', '', indoc)
    # Subscript - just erase tags
    indoc = re.sub('(?i)<sub[^>]+>', '', indoc)
    indoc = re.sub('(?i)</sub>', '', indoc)
    # Span - just erase tags
    indoc = re.sub('(?i)<span[^>]+>', '', indoc)
    indoc = re.sub('(?i)</span>', '', indoc)
    # Image tags
    # NOTE(review): only the alt capture survived; attribute order
    # (src before alt) follows the \1 = file, \2 = caption replacement.
    indoc = re.sub(' *<img src="([^"]*)" alt="([^"]*)"[^>]*>',
                   ".B \\2\n.IMG -C \\1", indoc)

    # Special characters
    indoc = indoc.replace("&quot;", "'")
    indoc = indoc.replace("&nbsp;", "\\ ")
    indoc = indoc.replace("&minus;", "-")
    indoc = indoc.replace("&mdash;", "-")
    indoc = indoc.replace("&mu;", "mu")
    indoc = indoc.replace("&sigma;", "sigma")
    indoc = indoc.replace("&#39;", "'")

    # Tables
    # This will not handle rowspan
    indoc = re.sub(r'(?i) *<table[^>]*>.*', ".TS", indoc)
    indoc = re.sub("(?i) *</table>.*", ".TE", indoc)
    # First the single-line case
    indoc = re.sub("(?i)</td> *<td>", "\t", indoc)
    indoc = re.sub(r"(?i) *<tr><td( [^>]*)?>", "", indoc)
    indoc = re.sub("(?i)</td> *</tr>", "", indoc)
    # Then the multiline case
    indoc = re.sub(r'(?i)\s*<t[hd][^>]*>([^<\n]*)</t[dh]>\s*',
                   '\t\\1', indoc)
    indoc = re.sub(r'(?i)\s*<t[hd][^>]*>([^<]*)</t[dh]>\s*',
                   '\tT{\n\\1T}', indoc)
    indoc = indoc.replace("\n\\&T}", "\nT}")
    indoc = re.sub("(?i) *</tr>", "", indoc)
    indoc = re.sub("(?i) *<tr[^>]*>\t*", "", indoc)
    indoc = re.sub(
        r"\.TS\s+<[Cc][Aa][Pp][Tt][Ii][Oo][Nn]>([^<]*)"
        r"</[Cc][Aa][Pp][Tt][Ii][Oo][Nn]>\s*",
        ".B \\1\n.TS\n", indoc)

    # Debugging
    #sys.stderr.write("Name: %s, Title: %s, Date: %s\n" % (name, title, date))

    # Time for error checking now
    # We replaced every HTML tag we could above, so any remaining in
    # 'indoc' represent material we don't know how to convert, which we call
    # bad lines.
    # NOTE(review): the entity regex and the report text were lost and
    # are restored by inference.  " >" is tolerated in a line; the
    # lookbehind keeps troff's own \& from counting as an entity.
    badlines = []
    for line in indoc.split("\n"):
        if "<" in line or ">" in line.replace(" >", "") \
           or re.search(r'(?<!\\)&.*;', line):
            badlines.append(line)
    if badlines:
        sys.stderr.write(("Bad lines from %s:\n-----------------\n" % file)
                         + "\n".join(badlines)
                         + "\n-----------------\n")

    # Goes after the bad-line check so the restored characters are not
    # misinterpreted as leftover HTML.
    indoc = indoc.replace("@#!#@", "<").replace("#@!@#", ">") \
                 .replace("#!@!@!#", "&")
    indoc = re.sub("\n+$", "\n", indoc)
    # Single-quote at left margin confuses troff.
    # This program never generates these.
    indoc = indoc.replace("\n'", "\n\\&'")
    # Finish guarding against leading dots.
    indoc = indoc.replace("\n@%@%@", "\n\\&.")
    # Mark these generated pages so people won't hand-hack them.
    indoc = warning + indoc
    indoc = indoc + footerprefix + os.path.basename(file) + "\n.PP"
    return indoc


def _read_input(path, errout):
    "Return the text of 'path', or None (after a message) if it can't open."
    try:
        infp = open(path)
    except (IOError, OSError):
        errout.write("makeman: can't open input file '%s'\n" % path)
        return None
    try:
        return infp.read()
    finally:
        infp.close()


def main(args, mainout=sys.stdout, mainerr=sys.stderr):
    """Convert each HTML file named in 'args' to a troff manual page.

    args    -- command-line arguments without the program name:
               [-v] [-d inputdir] file...
    mainout -- unused; kept for interface compatibility.
    mainerr -- stream that receives diagnostics.

    Returns the process exit status: 0 on success, 1 if the temporary
    output file cannot be created, 2 on a bad option, 3 on a file I/O
    error, 4 on keyboard interrupt, 5 on an unexpected error (re-raised
    instead when -v is given), or the retval of a LiftException (1 by
    default) when a page lacks its H1 name or TITLE or has two <HR> tags.
    Input files that cannot be opened are reported and skipped.
    """
    global sectmap
    import getopt
    try:
        (options, arguments) = getopt.getopt(args, "vd:")
    except getopt.GetoptError as e:
        mainerr.write("makeman: %s\n" % e)
        return 2
    dirprefix = ""
    verbosity = 0
    for (switch, val) in options:
        if switch == '-d':      # Set HTML input directory
            dirprefix = val
        elif switch == '-v':    # Enable verbose error reporting
            verbosity += 1
    try:
        # First pass: gather locations for cross-references:
        sectmap = {}
        namere = re.compile("<H1>(.*)</H1>", re.I)
        titlere = re.compile("<TITLE>(.*)</TITLE>", re.I)
        meta = re.compile(
            '(?i)<META NAME="manual_section" CONTENT="([0-9])">')
        hr = re.compile("(?i)<HR>")
        for file in arguments:
            indoc = _read_input(os.path.join(dirprefix, file), mainerr)
            if indoc is None:
                continue
            namematch = namere.search(indoc)
            titlematch = titlere.search(indoc)
            if not namematch:
                raise LiftException("name missing from %s" % file)
            if not titlematch:
                raise LiftException("title missing from %s" % file)
            title = titlematch.group(1)
            # The H1 text, not the title: makeman() looks the section up
            # by H1 name when the page has no manual_section META tag.
            name = namematch.group(1)
            match = meta.search(indoc)
            if match:
                pagesection = int(match.group(1))
            else:
                pagesection = 1
            sectmap[title] = sectmap[file] = sectmap[name] = pagesection
            # makeman() discards everything from the first <HR> on, so a
            # second one means part of the page would be lost silently.
            firsthr = hr.search(indoc)
            if firsthr and hr.search(indoc[firsthr.start(0)+4:]):
                raise LiftException("%s has two <HR> tags!" % file)

        # Second pass: do formatting
        for file in arguments:
            indoc = _read_input(os.path.join(dirprefix, file), mainerr)
            if indoc is None:
                continue
            if verbosity:
                mainerr.write("makeman: %s\n" % file)
            # Convert before creating the temp file, so a failed
            # conversion leaves nothing behind to clean up.
            outdoc = makeman(None, file, indoc)
            if outdoc is None or outdoc == indoc:
                continue
            tmpname = file + ".~makeman-%d~" % os.getpid()
            try:
                outfp = open(tmpname, "w")
            except (IOError, OSError):
                mainerr.write("makeman: can't open tempfile '%s'\n"
                              % tmpname)
                return 1
            try:
                outfp.write(outdoc)
            finally:
                # under Windows you can't rename an open file
                outfp.close()
            # Cut at the first dot of the base name only, so names with
            # no dot and paths like ./x.html keep their stem.
            (head, tail) = os.path.split(file)
            stem = os.path.join(head, tail.split(".")[0])
            os.rename(tmpname, "%s.%d" % (stem, sectmap.get(file, 1)))
    except LiftException as e:
        mainerr.write("makeman: " + e.message + "\n")
        return e.retval
    except IOError as e:
        mainerr.write("makeman: file I/O error: %s\n" % e)
        return 3
    except KeyboardInterrupt:
        mainerr.write("makeman: bailing out...\n")
        return 4
    except Exception:
        if verbosity:
            raise
        else:
            mainerr.write("makeman: internal error!\n")
            return 5
    return 0


if __name__ == "__main__":
    # Run the main sequence
    raise SystemExit(main(sys.argv[1:]))

# The following sets edit modes for GNU EMACS
# Local Variables:
# mode:python
# End: