about summary refs log tree commit diff
path: root/buildtools/makeman
diff options
context:
space:
mode:
Diffstat (limited to 'buildtools/makeman')
-rwxr-xr-xbuildtools/makeman59
1 files changed, 42 insertions, 17 deletions
diff --git a/buildtools/makeman b/buildtools/makeman
index 196dbd0a..6e4bb52b 100755
--- a/buildtools/makeman
+++ b/buildtools/makeman
@@ -2,17 +2,24 @@
 #
 # makeman -- compile netpbm's stereotyped HTML to troff markup
 #
-# This approach works because we control the entire document universe 
+# Example:
+#
+#    $ makeman pamcut.html pamcomp.html
+#
+#    $ makeman -v -d /tmp/inputdir pamcut.html
+#
+# The output troff file is written to the same directory as the input HTML
+# file, under the same name except that its extension is .1.
+
+# This approach works because we control the entire document universe
 # this is going to convert and can reinforce useful stereotypes.
 #
 # The output of this tool uses cliches parseable by doclifter,
 # which should thus be able to recover all the semantic information
 # it looks like this thing is losing.
 #
-# Known bugs:
-#  * Ordered lists are smashed into unordered lists
-#
 # Limitations:
+#  * Ordered lists are smashed into unordered lists
 #  * IMG tags are issued as .IMG preceded by a bolded caption containing
 #    the alt content.  This will only work if the page is formatted with
 #    mwww macros.
@@ -29,7 +36,7 @@
 # Version 1.1, February 11 2016
 #
 #   Added ability to process — −
-#   Added footer message to clarify original source. 
+#   Added footer message to clarify original source.
 #
 
 import os, sys, re
@@ -110,7 +117,7 @@ def makeman(name, file, indoc):
     indoc = ('.TH "%s" %d "%s" "%s"\n' % (title,section,date,source)) + indoc
     # Literal layout
     indoc = re.sub("(?i)\n *<PRE>", "\n.nf", indoc)
-    indoc = re.sub("(?i)\n *</PRE>", "\n.fi", indoc)
+    indoc = re.sub("(?i) *</PRE>\n", "\n.fi\n", indoc)
     indoc = re.sub("(?i)\n *<BLOCKQUOTE>", "\n.RS", indoc)
     indoc = re.sub("(?i)\n *</BLOCKQUOTE>", "\n.RE", indoc)
     # Highlight processing
@@ -151,7 +158,7 @@ def makeman(name, file, indoc):
     indoc = "\n".join(lines)
     indoc = re.sub(r"\s*\.sp", "\n.sp", indoc)
     # Format email addresses as italic
-    indoc = re.sub('(?i)<A[ \n]+HREF="mailto:[^>]+">([^<]+)</A>', r'\\fI\1\\fP', indoc)    
+    indoc = re.sub('(?i)<A[ \n]+HREF="mailto:[^>]+">([^<]+)</A>', r'\\fI\1\\fP', indoc)
     # Format manual cross-references
     def xrefmatch(match):
         xrefto = match.group(2)
@@ -176,7 +183,7 @@ def makeman(name, file, indoc):
     indoc = indoc.replace("&#174;", r"\*R")
     indoc = indoc.replace("&copy;", r"\(co")
     # Turn anchors into .UN tags
-    indoc = re.sub('(?i)<A NAME *= *"#?([a-zA-Z_][a-zA-Z_0-9.-]+)">(?:&nbsp;)*</A>\s*', ".UN \\1\n", indoc)
+    indoc = re.sub(r'(?i)<A NAME *= *"#?([a-zA-Z_][a-zA-Z_0-9.-]+)">(?:&nbsp;)*</A>\s*', ".UN \\1\n", indoc)
     # Strip off the index trailer
     trailer = re.compile('<HR */*>.*', re.DOTALL | re.IGNORECASE)
     indoc = re.sub(trailer, "", indoc)
@@ -200,7 +207,8 @@ def makeman(name, file, indoc):
     indoc = re.sub('(?i)<H2>([^><]*)</H2>', ".SH \\1", indoc)
     indoc = re.sub('(?i)<H3>([^><]*)</H3>', ".SS \\1", indoc)
     indoc = re.sub('(?i)<H4>([^><]*)</H4>', ".B \\1", indoc)
-    # 
+    indoc = re.sub('(?i)<H5>([^><]*)</H5>', ".B \\1", indoc)
+    #
     # Process definition lists -- just turn them into .TPs
     indoc = re.sub("(?i) *<DL *(COMPACT)?>", "", indoc)
     indoc = re.sub("(?i) *</DL>", "", indoc)
@@ -210,7 +218,7 @@ def makeman(name, file, indoc):
     indoc = re.sub("(?i) *</DD>", "", indoc)
     # Process unordered lists -- just turn them into .TPs
     indoc = re.sub("(?i)</?[UO]L *(COMPACT)?>", "", indoc)
-    indoc = re.sub("(?i) *<LI>", ".IP \(bu\n", indoc)
+    indoc = re.sub("(?i) *<LI>", r".IP \(bu\n", indoc)
     indoc = re.sub("(?i) *</LI>", "", indoc)
     # No-print tags
     indoc = re.sub("<!--no_print-->.*", "", indoc)
@@ -221,6 +229,15 @@ def makeman(name, file, indoc):
     # Acronyms
     indoc = re.sub('<acronym [a-zA-Z0-9:= \n"]*>', "", indoc)
     indoc = re.sub("</acronym>", "", indoc)
+    # Abbreviation - just erase tags
+    indoc = re.sub('<abbr [^>]+>', '', indoc)
+    indoc = re.sub('</abbr>', '', indoc)
+    # Subscript - just erase tags
+    indoc = re.sub('(?i)<sub [^>]+>', '', indoc)
+    indoc = re.sub('(?i)</sub>', '', indoc)
+    # Span - just erase tags
+    indoc = re.sub('(?i)<span [^>]+>', '', indoc)
+    indoc = re.sub('(?i)</span>', '', indoc)
     # Image tags
     indoc = re.sub(' *<img src="([^"]*)" alt="([^"]*)"( *[a-z]*="?[0-9]*"?)*>', ".B \\2\n.IMG -C \\1", indoc)
     # Special characters
@@ -230,13 +247,14 @@ def makeman(name, file, indoc):
     indoc = indoc.replace("&mdash;", "-")
     indoc = indoc.replace("&mu;", "mu")
     indoc = indoc.replace("&sigma;", "sigma")
+    indoc = indoc.replace("&apos;", "'")
     # Tables
     # This will not handle rowspan
     indoc = re.sub('(?i) *<table[^>]*>.*', ".TS", indoc)
     indoc = re.sub("(?i) *</table>.*", ".TE", indoc)
     # First the single-line case
     indoc = re.sub("(?i)</td> *<td>", "\t", indoc)
-    indoc = re.sub("(?i)<tr> *<td>", "", indoc)
+    indoc = re.sub("(?i)<tr> *<td( [^>]*)?>", "", indoc)
     indoc = re.sub("(?i)</td> *</tr>", "", indoc)
     # Then the multiline case
     indoc = re.sub(r'(?i)\s*<t[hd][^>]*>([^<\n]*)</t[dh]>\s*', '\t\\1', indoc)
@@ -248,6 +266,9 @@ def makeman(name, file, indoc):
     # Debugging
     #sys.stderr.write("Name: %s, Title: %s, Date: %s\n" % (name, title, date))
     # Time for error checking now
+    # We replaced every HTML tag and entity we could above, so any still left
+    #   in 'indoc' is material we don't know how to convert.  We call the
+    #   lines that contain such material bad lines.
     badlines = []
     for line in indoc.split("\n"):
         if "<" in line or ">" in line.replace(" >", "") or re.search(r'(?<!^\\)&.*;', line):
@@ -282,10 +303,12 @@ def main(args, mainout=sys.stdout, mainerr=sys.stderr):
         # First pass: gather locations for cross-references:
         sectmap = {}
         for file in arguments:
-            try: 
-                infp = open(os.path.join(dirprefix, file))
+            fullfilenm = os.path.join(dirprefix, file)
+            try:
+                infp = open(fullfilenm)
             except:
-                sys.stderr.write("makeman: can't open %s\n" % file)
+                sys.stderr.write(
+                    "makeman: can't open input file '%s'\n" % fullfilenm)
                 continue
             indoc = infp.read()
             infp.close()
@@ -313,10 +336,12 @@ def main(args, mainout=sys.stdout, mainerr=sys.stderr):
                 LiftException("%s has two <HR> tags!" % file)
         # Second pass: do formatting
         for file in arguments:
-            try: 
-                infp = open(os.path.join(dirprefix, file))
+            fullfilenm = os.path.join(dirprefix, file)
+            try:
+                infp = open(fullfilenm)
             except:
-                sys.stderr.write("makeman: can't open %s\n" % file)
+                sys.stderr.write(
+                    "makeman: can't open output file '%s'\n" % fullfilenm)
                 continue
             indoc = infp.read()
             infp.close()