about summary refs log tree commit diff
path: root/buildtools/makeman
diff options
context:
space:
mode:
Diffstat (limited to 'buildtools/makeman')
-rwxr-xr-x buildtools/makeman 333
1 files changed, 333 insertions, 0 deletions
diff --git a/buildtools/makeman b/buildtools/makeman
new file mode 100755
index 00000000..634a2c79
--- /dev/null
+++ b/buildtools/makeman
@@ -0,0 +1,333 @@
+#!/bin/env python
+#
+# makeman -- compile netpbm's stereotyped HTML to troff markup
+#
+# This approach works because we control the entire document universe 
+# this is going to convert and can reinforce useful stereotypes.
+#
+# The output of this tool uses cliches parseable by doclifter,
+# which should thus be able to recover all the semantic information
+# it looks like this thing is losing.
+#
+# Known bugs:
+#  * Ordered lists are smashed into unordered lists
+#
+# Limitations:
+#  * IMG tags are issued as .IMG preceded by a bolded caption containing
+#    the alt content.  This will only work if the page is formatted with
+#    mwww macros.
+#  * Loses summary information from tables.
+#  * Only permits one <HR> in the HTML, right before the index.
+#
+# Use the <?makeman ... ?> passthrough to insert format lines for tables.
+#
+# By Eric S. Raymond <esr@thyrsus.com>
+# Version 1.0, July 26 2004
+
+import os, sys, exceptions, re
+
+source = "netpbm documentation"   # last field of the generated .TH line
+section = 1                       # default; makeman() reassigns this global per page
+# Backslashes are doubled so each header line really starts with troff's .\" comment.
+warning = '''\
+.\\" This man page was generated by the Netpbm tool 'makeman' from HTML source.
+.\\" Do not hand-hack it!  If you have bug fixes or improvements, please find
+.\\" the corresponding HTML page on the Netpbm website, generate a patch
+.\\" against that, and send it to the Netpbm maintainer.
+'''
+
+class LiftException(exceptions.Exception):
+    "Fatal conversion error: main() prints .message and returns .retval."
+    def __init__(self, message, retval=1):
+        self.message, self.retval = message, retval
+
+def makeman(name, file, indoc):
+    "Return troff man markup for the HTML text indoc; file only labels diagnostics and name is recomputed from <H1>."
+    global section, sectmap
+    # Dot at left margin confuses troff.  This program generates its own
+    # leading-dot request lines, so hide the input's dots behind a cookie first.
+    indoc = indoc.replace("\n.", "\n@%@%@")
+    # Header-bashing: drop doctype, charset, XML and HEAD boilerplate
+    indoc = indoc.replace('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 3.2//EN">\n',"")
+    indoc = indoc.replace('<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">', "")
+    indoc = indoc.replace('<?xml version="1.1" encoding="iso-8859-1" ?>\n',"")
+    indoc = indoc.replace('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "DTD/xhtml11.dtd">', "")
+    indoc = indoc.replace('<html xmlns="http://www.w3.org/1999/xhtml">', "")
+    indoc = indoc.replace("<HEAD>", "").replace("</HEAD>", "")
+    indoc = indoc.replace("<head>", "").replace("</head>", "")
+    indoc = re.sub('(?i)<A HREF="#index">Table Of Contents</A>', "", indoc)
+    datematch = re.compile("Updated: (.*)\n")  # page date, reused in the .TH line
+    match = datematch.search(indoc)
+    if match:
+        date = match.group(1)
+    else:
+        date = ""
+    indoc = datematch.sub("", indoc)
+    namematch = re.compile("<H1>(.*)</H1>", re.I)
+    match = namematch.search(indoc)
+    if match:
+        name = match.group(1)
+    else:
+        name = None
+    section = 1
+    meta = re.compile('(?i)<META NAME="manual_section" CONTENT="([0-9])">')
+    match = meta.search(indoc)
+    if match:
+        section = int(match.group(1))
+        indoc = meta.sub("", indoc)
+    else:
+        section = sectmap.get(name, 0)  # 0 if the <H1> text is not a sectmap key
+    indoc = namematch.sub("", indoc)
+    indoc = re.sub("(?i)<BODY[^>]*>", "", indoc)
+    indoc = re.sub("(?i)<HTML>", "", indoc)
+    # Remove more superfluous headers
+    titlematch = re.compile("<TITLE>(.*)</TITLE>\n+", re.I)
+    match = titlematch.search(indoc)
+    if match:
+        title = match.group(1)
+    else:
+        title = None
+    indoc = titlematch.sub("", indoc)
+    indoc = re.sub("(?i)\n*<BR>\n+", "\n", indoc)
+    indoc = ('.TH "%s" %d "%s" "%s"\n' % (title,section,date,source)) + indoc
+    # Literal layout: PRE and BLOCKQUOTE both become .nf/.fi regions
+    indoc = re.sub("(?i)\n *<PRE>", "\n.nf", indoc)
+    indoc = re.sub("(?i)\n *</PRE>", "\n.fi", indoc)
+    indoc = re.sub("(?i)\n *<BLOCKQUOTE>", "\n.nf", indoc)
+    indoc = re.sub("(?i)\n *</BLOCKQUOTE>", "\n.fi", indoc)
+    # Highlight processing: map inline HTML tags to troff font escapes
+    indoc = re.sub("(?i)<B>", r"\\fB", indoc)
+    indoc = re.sub("(?i)</B>", r"\\fP", indoc)
+    indoc = re.sub("(?i)<EM>", r"\\fI", indoc)
+    indoc = re.sub("(?i)</EM>", r"\\fP", indoc)
+    indoc = re.sub("(?i)<CITE>", r"\\fI", indoc)
+    indoc = re.sub("(?i)</CITE>", r"\\fP", indoc)
+    indoc = re.sub("(?i)<I>", r"\\fI", indoc)
+    indoc = re.sub("(?i)</I>", r"\\fP", indoc)
+    indoc = re.sub("(?i)<TT>", r"\\f(CW", indoc)
+    indoc = re.sub("(?i)</TT>", r"\\fP", indoc)
+    indoc = re.sub("(?i)<KBD>", r"\\f(CW", indoc)
+    indoc = re.sub("(?i)</KBD>", r"\\fP", indoc)
+    indoc = re.sub("(?i)<STRONG>", r"\\fB", indoc)
+    indoc = re.sub("(?i)</STRONG>", r"\\fP", indoc)
+    indoc = re.sub("(?i)<SUP>", r"\\u", indoc)
+    indoc = re.sub("(?i)</SUP>", r"\\d", indoc)
+    # Paragraph handling: <P> becomes .PP, except on list lines where it becomes .sp
+    indoc = re.sub("(?i)\n*<P>\n*", r"\n.PP\n", indoc)
+    indoc = re.sub("(?i)</P>", "", indoc)
+    lines = indoc.split("\n")
+    listdepth = 0  # list nesting depth, adjusted at most once per line
+    for i in range(len(lines)):
+        lowered = lines[i].lower()
+        if "<dl" in lowered or "<ol" in lowered or "<ul" in lowered:
+            listdepth += 1
+        if listdepth:
+            lines[i] = lines[i].replace(".PP", ".sp")
+        if "</dl>" in lowered or "</ol>" in lowered or "</ul>" in lowered:
+            listdepth -= 1
+    indoc = "\n".join(lines)
+    indoc = re.sub(r"\s*\.sp", "\n.sp", indoc)
+    # Format email addresses as italic
+    indoc = re.sub('(?i)<A[ \n]+HREF="mailto:[^>]+">([^<]+)</A>', r'\\fI\1\\fP', indoc)    
+    # Format manual crossreferences: links to .html pages become .BR name (section)
+    def xrefmatch(match):
+        xrefto = match.group(1)
+        xrefsection = sectmap.get(xrefto, 1)
+        if xrefsection == 0:
+            return "\n.I " + xrefto
+        else:
+            return "\n.BR %s (%d)" % (xrefto, xrefsection)
+    indoc = re.sub(r'(?i)\n* *(?:\\fB)?<A[ \n]+HREF="[^>]+.html">([^<]+)</A>(?:\\fP)?',
+                   xrefmatch, indoc)
+    # Format URLs: the remaining links become .UR url / link text / .UE
+    def urlmatch(match):
+        url = match.group(1).replace('\n', ' ')
+        txt = match.group(2).replace('\n', ' ')
+        return "\n.UR %s\n%s\n.UE\n\\&" % (url, txt)
+    indoc = re.sub(r'(?i)\n*(?:&lt;)?<A[ \n]+HREF *= *"([^>]+)">([^<]+)</A>(?:&gt;)?',
+                  urlmatch, indoc)
+    # Turn some entities into harmless cookies (restored after the bad-line check)
+    indoc = indoc.replace("&lt;", "@#!#@").replace("&gt;", "#@!@#").replace("&amp;", "#!@!@!#")
+    indoc = indoc.replace("&#215;", r"\(mu")
+    indoc = indoc.replace("&#174;", r"\*R")
+    # Turn anchors into .UN tags
+    indoc = re.sub('(?i)<A NAME *= *"#?([a-zA-Z][a-zA-Z0-9.-]+)">(?:&nbsp;)*</A>\s*', ".UN \\1\n", indoc)
+    # Strip off the index trailer: everything from the first <HR> onward
+    trailer = re.compile('<HR */*>.*', re.DOTALL | re.IGNORECASE)
+    indoc = re.sub(trailer, "", indoc)
+    # If there was no index trailer, we still need to strip these
+    indoc = indoc.replace("</BODY>", "").replace("</HTML>", "")
+    indoc = indoc.replace("</body>", "").replace("</html>", "")
+    # Recognize sections with IDs
+    indoc = re.sub('(?i)<H2><A (?:ID|NAME)="([a-zA-Z]+)">([^><]*)</A></H2>',
+                   ".UN \\1\n.SH \\2", indoc)
+    indoc = re.sub('(?i)<H3><A (?:ID|NAME)="([a-zA-Z]+)">([^><]*)</A></H3>',
+                   ".UN \\1\n.SS \\2", indoc)
+    indoc = re.sub('(?i)<H4><A (?:ID|NAME)="([a-zA-Z]+)">([^><]*)</A></H4>',
+                   ".UN \\1\n.B \\2", indoc)
+    indoc = re.sub('(?i)<H2 (?:ID|NAME)="([a-zA-Z]+)">([^><]*)</H2>',
+                   ".UN \\1\n.SH \\2", indoc)
+    indoc = re.sub('(?i)<H3 (?:ID|NAME)="([a-zA-Z]+)">([^><]*)</H3>',
+                   ".UN \\1\n.SS \\2", indoc)
+    indoc = re.sub('(?i)<H4 (?:ID|NAME)="([a-zA-Z]+)">([^><]*)</H4>',
+                   ".UN \\1\n.B \\2", indoc)
+    # Sections without IDs
+    indoc = re.sub('(?i)<H2>([^><]*)</H2>', ".SH \\1", indoc)
+    indoc = re.sub('(?i)<H3>([^><]*)</H3>', ".SS \\1", indoc)
+    indoc = re.sub('(?i)<H4>([^><]*)</H4>', ".B \\1", indoc)
+    # 
+    # Process definition lists -- just turn them into .TPs
+    indoc = re.sub("(?i) *<DL *(COMPACT)?>", "", indoc)
+    indoc = re.sub("(?i) *</DL>", "", indoc)
+    indoc = re.sub("(?i) *<DT>", ".TP\n", indoc)
+    indoc = re.sub("(?i) *</DT>", "", indoc)
+    indoc = re.sub("(?i)\n*<DD>\n*", "\n", indoc)
+    indoc = re.sub("(?i) *</DD>", "", indoc)
+    # Process ordered and unordered lists -- turn every item into a bulleted .IP
+    indoc = re.sub("(?i)</?[UO]L *(COMPACT)?>", "", indoc)
+    indoc = re.sub("(?i) *<LI>", ".IP \(bu\n", indoc)
+    indoc = re.sub("(?i) *</LI>", "", indoc)
+    # No-print tags: drop the rest of the line starting at <!--no_print-->
+    indoc = re.sub("<!--no_print-->.*", "", indoc)
+    # Passthrough: <?makeman text ?> is replaced by text itself
+    indoc = re.sub(r"<\?makeman (.*) \?>", r'\1', indoc)
+    # Comments.  NOTE(review): group 1 keeps only the comment's last character and a '-' in the body blocks the match -- confirm intended
+    indoc = re.sub("<!--([^-])*-->", r'.\"\1', indoc)
+    # Image tags: bold caption from the alt text, then .IMG with the src
+    indoc = re.sub(' *<img src="([^"]*)" alt="([^"]*)"( *[a-z]*="?[0-9]*"?)*>', ".B \\2\n.IMG -C \\1", indoc)
+    # Special characters
+    indoc = indoc.replace("&quot;", "'")
+    indoc = indoc.replace("&nbsp;", "\\ ")
+    # Tables: .TS/.TE; the rest of the <table> or </table> line is dropped
+    indoc = re.sub(' *<table[^>]*>.*', ".TS", indoc)
+    indoc = re.sub(" *</table>.*", ".TE", indoc)
+    # First the single-line case
+    indoc = re.sub("</td> *<td>", "\t", indoc)
+    indoc = re.sub("<tr> *<td>", "", indoc)
+    indoc = re.sub("</td> *</tr>", "", indoc)
+    # Then the multiline case
+    indoc = re.sub(r'\s*<t[hd][^>]*>([^<\n]*)</t[dh]>\s*', '\t\\1', indoc)
+    indoc = re.sub(r'\s*<t[hd][^>]*>([^<]*)</t[dh]>\s*', '\tT{\n\\1T}', indoc)
+    indoc = indoc.replace("\n\\&T}", "\nT}")
+    indoc = re.sub(" *</tr>", "", indoc)
+    indoc = re.sub(" *<tr[^>]*>\t*", "", indoc)
+    indoc = re.sub(r"\.TS\s+<caption>([^<]*)</caption>\s*", ".B \\1\n.TS\n", indoc)
+    # Debugging
+    #sys.stderr.write("Name: %s, Title: %s, Date: %s\n" % (name, title, date))
+    # Time for error checking now: report lines that still contain <, > or an &...; entity
+    badlines = []
+    for line in indoc.split("\n"):
+        if "<" in line or ">" in line or re.search("&.*;", line):
+            badlines.append(line)
+    if badlines:
+        sys.stderr.write(("Bad lines from %s:\n-----------------\n" % file) + "\n".join(badlines) + "\n-----------------\n")
+    # Goes after bad-line check so we don't misinterpret it as an error
+    indoc = indoc.replace("@#!#@", "<").replace("#@!@#", ">").replace("#!@!@!#", "&")
+    indoc = re.sub("\n+$", "\n", indoc)
+    # Single-quote at left margin confuses troff.
+    # This program never generates these.
+    indoc = indoc.replace("\n'", "\n\\&'")
+    # Finish guarding against leading dots.
+    indoc = indoc.replace("\n@%@%@", "\n\\&.")
+    # Mark these generated pages so people won't hand-hack them.
+    indoc = warning + indoc
+    return indoc
+
+def main(args, mainout=sys.stdout, mainerr=sys.stderr):
+    "Convert the HTML files named in args to man pages (-v: verbose); returns an exit status, None on success."
+    global sectmap
+    import getopt
+    (options, arguments) = getopt.getopt(args, "v")
+    verbosity = 0
+    for (switch, val) in options:
+        if switch == '-v':
+            verbosity += 1
+    try:
+        # First pass: gather locations for crossreferences:
+        sectmap = {}
+        name = None     # makeman() recomputes the name from each page's <H1>
+        for file in arguments:
+            try:
+                infp = open(file)
+            except IOError:
+                mainerr.write("can't open %s\n" % file)  # 'name' is not bound yet here
+                continue
+            indoc = infp.read()
+            infp.close()
+            namere = re.compile("<H1>(.*)</H1>", re.I)
+            namematch = namere.search(indoc)
+            titlere = re.compile("<TITLE>(.*)</TITLE>", re.I)
+            titlematch = titlere.search(indoc)
+            if not namematch:
+                raise LiftException("name missing from %s" % file)
+            if not titlematch:
+                raise LiftException("title missing from %s" % file)
+            else:
+                title = titlematch.group(1)
+                name = titlematch.group(1)  # NOTE(review): TITLE text, not the H1 match -- confirm intended
+            meta = re.compile('(?i)<META NAME="manual_section" CONTENT="([0-9])">')
+            match = meta.search(indoc)
+            if match:
+                section = int(match.group(1))
+                sectmap[title] = sectmap[file] = sectmap[name] = section
+            else:
+                sectmap[title] = sectmap[file] = sectmap[name] = 1
+            hr = re.compile("(?i)<HR>")
+            firsthr = hr.search(indoc)
+            if firsthr and hr.search(indoc[firsthr.start(0)+4:]):
+                raise LiftException("%s has two <HR> tags!" % file)  # must be raised to take effect
+        # Second pass: do formatting
+        for file in arguments:
+            try:
+                infp = open(file)
+            except IOError:
+                mainerr.write("can't open %s\n" % file)
+                continue
+            indoc = infp.read()
+            infp.close()
+            tempfile = file + ".~makeman-%d~" % os.getpid()  # no page title: it may hold '/' or blanks
+            try:
+                outfp = open(tempfile, "w")
+            except (IOError, OSError):  # open() reports failure as IOError in Python 2
+                mainerr.write("%s: can't open tempfile\n" % file)
+                return True
+            try:
+                if verbosity:
+                    mainerr.write("makeman: %s\n" % file)
+                outdoc = makeman(name, file, indoc)
+            except:
+                outfp.close()           # close before removing (needed on Windows)
+                os.remove(tempfile)
+                raise                   # pass the exception upwards unchanged
+            if outdoc is None or outdoc == indoc:
+                outfp.close()           # nothing to install: drop the temp file
+                os.remove(tempfile)
+                continue
+            outfp.write(outdoc)
+            outfp.close()	# under Windows you can't rename an open file
+            # splitext() strips only the final extension, so dots in directory names or "./" are safe.
+            stem = os.path.splitext(file)[0]
+            os.rename(tempfile, "%s.%d" % (stem, sectmap.get(file, 1)))
+    except LiftException, e:
+        mainerr.write("makeman: " + e.message + "\n")
+        return e.retval
+    except IOError, e:
+        mainerr.write("makeman: file I/O error: %s\n" % e)
+        return 3
+    except KeyboardInterrupt:
+        mainerr.write("makeman: bailing out...\n")
+        return 4
+    except:
+        if verbosity:
+            raise               # with -v, let the traceback through
+        else:
+            return 5
+
+if __name__ == "__main__":
+    # Script entry point: main()'s return value becomes the exit status
+    sys.exit(main(sys.argv[1:]))
+
+# The following sets edit modes for GNU EMACS
+# Local Variables:
+# mode:python
+# End: