#!/usr/bin/python
#
# makeman -- compile netpbm's stereotyped HTML to troff markup
#
# This approach works because we control the entire document universe
# this is going to convert and can reinforce useful stereotypes.
#
# The output of this tool uses cliches parseable by doclifter,
# which should thus be able to recover all the semantic information
# it looks like this thing is losing.
#
# Known bugs:
#  * Ordered lists are smashed into unordered lists
#
# Limitations:
#  * IMG tags are issued as .IMG preceded by a bolded caption containing
#    the alt content.  This will only work if the page is formatted with
#    mwww macros.
#  * Loses summary information from tables.
#  * Only permits one <HR>
in the HTML, right before the index. # # You can use the PI to pass text directly through to the # generated manual page, A major use is to insert format lines for tables. # # By Eric S. Raymond # Version 1.0, July 26 2004 import os, sys, exceptions, re source = "netpbm documentation" section = 1 warning = r'''\ .\" This man page was generated by the Netpbm tool 'makeman' from HTML source. .\" Do not hand-hack it! If you have bug fixes or improvements, please find .\" the corresponding HTML page on the Netpbm website, generate a patch .\" against that, and send it to the Netpbm maintainer. ''' class LiftException(exceptions.Exception): def __init__(self, message, retval=1): self.message = message self.retval = retval def makeman(name, file, indoc): "Transform a string representing an HTML document into man markup." global section, sectmap # Dot at left margin confuses troff. # This program generates these, indoc = indoc.replace("\n.", "\n@%@%@") # Protect escapes before we try generating font changes. indoc = indoc.replace("\\", r"\e") # Header-bashing indoc = re.sub('(?i)]*>', "", indoc) indoc = indoc.replace('', "") indoc = indoc.replace('', "") indoc = indoc.replace('\n',"") indoc = indoc.replace('', "") indoc = indoc.replace("", "").replace("", "") indoc = indoc.replace("", "").replace("", "") indoc = re.sub('(?i)Table Of Contents', "", indoc) datematch = re.compile("Updated: (.*)\n") match = datematch.search(indoc) if match: date = match.group(1) else: date = "" indoc = datematch.sub("", indoc) namematch = re.compile("

(.*)

", re.I) match = namematch.search(indoc) if match: name = match.group(1) else: name = None section = 1 meta = re.compile('(?i)') match = meta.search(indoc) if match: section = int(match.group(1)) indoc = meta.sub("", indoc) else: section = sectmap.get(name, 0) indoc = namematch.sub("", indoc) indoc = re.sub("(?i)]*>", "", indoc) indoc = re.sub("(?i)", "", indoc) # Remove more superfluous headers titlematch = re.compile("(.*)\n+", re.I) match = titlematch.search(indoc) if match: title = match.group(1) else: title = None indoc = titlematch.sub("", indoc) indoc = re.sub("(?i)\n*
\n+", "\n", indoc) indoc = ('.TH "%s" %d "%s" "%s"\n' % (title,section,date,source)) + indoc # Literal layout indoc = re.sub("(?i)\n *
", "\n.nf", indoc)
    indoc = re.sub("(?i)\n *
", "\n.fi", indoc) indoc = re.sub("(?i)\n *
", "\n.RS", indoc) indoc = re.sub("(?i)\n *
", "\n.RE", indoc) # Highlight processing indoc = re.sub("(?i)", r"\\fB", indoc) indoc = re.sub("(?i)", r"\\fP", indoc) indoc = re.sub("(?i)", r"\\fI", indoc) indoc = re.sub("(?i)", r"\\fP", indoc) indoc = re.sub("(?i)", r"\\fI", indoc) indoc = re.sub("(?i)", r"\\fP", indoc) indoc = re.sub("(?i)", r"\\fI", indoc) indoc = re.sub("(?i)", r"\\fP", indoc) indoc = re.sub("(?i)", r"\\f(CW", indoc) indoc = re.sub("(?i)", r"\\fP", indoc) indoc = re.sub("(?i)", r"\\f(CW", indoc) indoc = re.sub("(?i)", r"\\fP", indoc) indoc = re.sub("(?i)", r"\\f(CW", indoc) indoc = re.sub("(?i)", r"\\fP", indoc) indoc = re.sub("(?i)", r"\\fB", indoc) indoc = re.sub("(?i)", r"\\fP", indoc) indoc = re.sub("(?i)", r"\\u", indoc) indoc = re.sub("(?i)", r"\\d", indoc) # Paragraph handling indoc = re.sub("(?i)\n*

\n*", r"\n.PP\n", indoc) indoc = re.sub("(?i)
", r"\n.PP\n", indoc) indoc = re.sub("(?i)

", "", indoc) indoc = re.sub("(?i)", "", indoc) indoc = re.sub("(?i)]*>", "", indoc) lines = indoc.split("\n") listdepth = 0 for i in range(len(lines)): lowered = lines[i].lower() if "" in lowered or "" in lowered or "" in lowered: listdepth -= 1 indoc = "\n".join(lines) indoc = re.sub(r"\s*\.sp", "\n.sp", indoc) # Format email addresses as italic indoc = re.sub('(?i)([^<]+)', r'\\fI\1\\fP', indoc) # Format manual crossreferences def xrefmatch(match): xrefto = match.group(2) xrefurl = match.group(1) xrefsection = sectmap.get(xrefurl, 1) if xrefsection == 0: return "\n.I " + xrefto else: return "\n.BR %s (%d)\n" % (xrefto, xrefsection) indoc = re.sub(r'(?i)\n* *(?:\\fB)?([^<]+)(?:\\fP)?', xrefmatch, indoc) # Format URLs def urlmatch(match): url = match.group(1).replace('\n', ' ') txt = match.group(2).replace('\n', ' ') return "\n.UR %s\n%s\n.UE\n\\&" % (url, txt) indoc = re.sub(r'(?i)\n*(?:<)?]+)">([^<]+)(?:>)?', urlmatch, indoc) # Turn some entities into harmless cookies indoc = indoc.replace("<", "@#!#@").replace(">", "#@!@#").replace("&", "#!@!@!#") indoc = indoc.replace("×", r"\(mu") indoc = indoc.replace("®", r"\*R") indoc = indoc.replace("©", r"\(co") # Turn anchors into .UN tags indoc = re.sub('(?i)(?: )*\s*', ".UN \\1\n", indoc) # Strip off the index trailer trailer = re.compile('
.*', re.DOTALL | re.IGNORECASE) indoc = re.sub(trailer, "", indoc) # If there was no index trailer, we still need to strip these indoc = indoc.replace("", "").replace("", "") indoc = indoc.replace("", "").replace("", "") # Recognize sections with IDs indoc = re.sub('(?i)

([^><]*)

', ".UN \\1\n.SH \\2", indoc) indoc = re.sub('(?i)

([^><]*)

', ".UN \\1\n.SS \\2", indoc) indoc = re.sub('(?i)

([^><]*)

', ".UN \\1\n.B \\2", indoc) indoc = re.sub('(?i)

([^><]*)

', ".UN \\1\n.SH \\2", indoc) indoc = re.sub('(?i)

([^><]*)

', ".UN \\1\n.SS \\2", indoc) indoc = re.sub('(?i)

([^><]*)

', ".UN \\1\n.B \\2", indoc) # Sections without IDs indoc = re.sub('(?i)

([^><]*)

', ".SH \\1", indoc) indoc = re.sub('(?i)

([^><]*)

', ".SS \\1", indoc) indoc = re.sub('(?i)

([^><]*)

', ".B \\1", indoc) # # Process definition lists -- just turn them into .TPs indoc = re.sub("(?i) *
", "", indoc) indoc = re.sub("(?i) *
", "", indoc) indoc = re.sub("(?i) *
", ".TP\n", indoc) indoc = re.sub("(?i) *
", "", indoc) indoc = re.sub("(?i)\n*
\n*", "\n", indoc) indoc = re.sub("(?i) *
", "", indoc) # Process unordered lists -- just turn them into .TPs indoc = re.sub("(?i)", "", indoc) indoc = re.sub("(?i) *
  • ", ".IP \(bu\n", indoc) indoc = re.sub("(?i) *
  • ", "", indoc) # No-print tags indoc = re.sub(".*", "", indoc) # Passthrough indoc = re.sub(r"<\?makeman (.*) \?>", r'\1', indoc) # Comments indoc = re.sub("", r'.\"\1', indoc) # Image tags indoc = re.sub(' *([^', ".B \\2\n.IMG -C \\1", indoc) # Special characters indoc = indoc.replace(""", "'") indoc = indoc.replace(" ", "\\ ") # Tables indoc = re.sub(' *]*>.*', ".TS", indoc) indoc = re.sub(" *.*", ".TE", indoc) # First the single-line case indoc = re.sub(" *", "\t", indoc) indoc = re.sub(" *", "", indoc) indoc = re.sub(" *", "", indoc) # Then the multiline case indoc = re.sub(r'\s*]*>([^<\n]*)\s*', '\t\\1', indoc) indoc = re.sub(r'\s*]*>([^<]*)\s*', '\tT{\n\\1T}', indoc) indoc = indoc.replace("\n\\&T}", "\nT}") indoc = re.sub(" *", "", indoc) indoc = re.sub(" *]*>\t*", "", indoc) indoc = re.sub(r"\.TS\s+([^<]*)\s*", ".B \\1\n.TS\n", indoc) # Debugging #sys.stderr.write("Name: %s, Title: %s, Date: %s\n" % (name, title, date)) # Time for error checking now badlines = [] for line in indoc.split("\n"): if "<" in line or ">" in line.replace(" >", "") or re.search("&.*;", line): badlines.append(line) if badlines: sys.stderr.write(("Bad lines from %s:\n-----------------\n" % file) + "\n".join(badlines) + "\n-----------------\n") # Goes after bad-line check so we don't misinterpret it as an error indoc = indoc.replace("@#!#@", "<").replace("#@!@#", ">").replace("#!@!@!#", "&") indoc = re.sub("\n+$", "\n", indoc) # Single-quote at left margin confuses troff. # This program never generates these. indoc = indoc.replace("\n'", "\n\\&'") # Finish guarding against leading dots. indoc = indoc.replace("\n@%@%@", "\n\\&.") # Mark these generated pages so people won't hand-hack them. 
indoc = warning + indoc return indoc def main(args, mainout=sys.stdout, mainerr=sys.stderr): global sectmap import getopt (options, arguments) = getopt.getopt(args, "vd:") dirprefix = "" verbosity = 0 for (switch, val) in options: if switch == '-d': # Set HTML input directory dirprefix = val elif switch == '-v': # Enable verbose error reporting verbosity += 1 try: # First pass: gather locations for crossreferences: sectmap = {} for file in arguments: try: infp = open(os.path.join(dirprefix, file)) except: sys.stderr.write("makeman: can't open %s\n" % file) continue indoc = infp.read() infp.close() namere = re.compile("

    (.*)

    ", re.I) namematch = namere.search(indoc) titlere = re.compile("(.*)", re.I) titlematch = titlere.search(indoc) if not namematch: raise LiftException("name missing from %s" % file) if not titlematch: raise LiftException("title missing from %s" % file) else: title = titlematch.group(1) name = titlematch.group(1) meta = re.compile('(?i)') match = meta.search(indoc) if match: section = int(match.group(1)) sectmap[title] = sectmap[file] = sectmap[name] = section else: sectmap[title] = sectmap[file] = sectmap[name] = 1 hr = re.compile("(?i)
    ") firsthr = hr.search(indoc) if firsthr and hr.search(indoc[firsthr.start(0)+4:]): LiftException("%s has two
    tags!" % file) # Second pass: do formatting for file in arguments: try: infp = open(os.path.join(dirprefix, file)) except: sys.stderr.write("makeman: can't open %s\n" % file) continue indoc = infp.read() infp.close() tempfile = file + ".~%s-%d~" % (name, os.getpid()) try: outfp = open(tempfile, "w") except OSError: sys.stderr.write("%s: can't open tempfile" % name) return True try: if verbosity: sys.stderr.write("makeman: %s\n" % file) outdoc = makeman(name, file, indoc) except: os.remove(tempfile) # Pass the exception upwards (exc_type, exc_value, exc_traceback) = sys.exc_info() raise exc_type, exc_value, exc_traceback if outdoc == indoc: os.remove(tempfile) if outdoc is None: continue else: outfp.write(outdoc) outfp.close() # under Windows you can't rename an open file stem = file[:file.find(".")] os.rename(tempfile, stem + "." + `sectmap[file]`) except LiftException, e: mainerr.write("makeman: " + e.message + "\n") return e.retval except IOError, e: mainerr.write("makeman: file I/O error: %s\n" % e) return 3 except KeyboardInterrupt: mainerr.write("makeman: bailing out...\n") return 4 except: if verbosity: (exc_type, exc_value, exc_traceback) = sys.exc_info() raise exc_type, exc_value, exc_traceback else: mainerr.write("makeman: internal error!\n") return 5 if __name__ == "__main__": # Run the main sequence raise SystemExit, main(sys.argv[1:]) # The following sets edit modes for GNU EMACS # Local Variables: # mode:python # End: