diff options
author | giraffedata <giraffedata@9d0c8265-081b-0410-96cb-a4ca84ce46f8> | 2006-08-19 03:12:28 +0000 |
---|---|---|
committer | giraffedata <giraffedata@9d0c8265-081b-0410-96cb-a4ca84ce46f8> | 2006-08-19 03:12:28 +0000 |
commit | 1fd361a1ea06e44286c213ca1f814f49306fdc43 (patch) | |
tree | 64c8c96cf54d8718847339a403e5e67b922e8c3f /buildtools/makeman | |
download | netpbm-mirror-1fd361a1ea06e44286c213ca1f814f49306fdc43.tar.gz netpbm-mirror-1fd361a1ea06e44286c213ca1f814f49306fdc43.tar.xz netpbm-mirror-1fd361a1ea06e44286c213ca1f814f49306fdc43.zip |
Create Subversion repository
git-svn-id: http://svn.code.sf.net/p/netpbm/code/trunk@1 9d0c8265-081b-0410-96cb-a4ca84ce46f8
Diffstat (limited to 'buildtools/makeman')
-rwxr-xr-x | buildtools/makeman | 333 |
1 files changed, 333 insertions, 0 deletions
#!/bin/env python
#
# makeman -- compile netpbm's stereotyped HTML to troff markup
#
# This approach works because we control the entire document universe
# this is going to convert and can reinforce useful stereotypes.
#
# The output of this tool uses cliches parseable by doclifter,
# which should thus be able to recover all the semantic information
# it looks like this thing is losing.
#
# Known bugs:
#  * Ordered lists are smashed into unordered lists
#
# Limitations:
#  * IMG tags are issued as .IMG preceded by a bolded caption containing
#    the alt content.  This will only work if the page is formatted with
#    mwww macros.
#  * Loses summary information from tables.
#  * Only permits one <HR> in the HTML, right before the index.
#
# Use the makeman: passthrough to insert format lines for tables.
#
# By Eric S. Raymond <esr@thyrsus.com>
# Version 1.0, July 26 2004
#
# The syntax used here is valid on both Python 2.6+ and Python 3.

import os
import re
import sys

source = "netpbm documentation"
section = 1

# Map from page title / H1 name / file name to manual section number.
# main() rebuilds it on every run; it is defined here as well so that
# makeman() can be called on its own.
sectmap = {}

# Banner prepended to every generated page.  The backslashes are doubled
# so that each line really starts with the troff comment request .\"
warning = '''\
.\\" This man page was generated by the Netpbm tool 'makeman' from HTML source.
.\\" Do not hand-hack it!  If you have bug fixes or improvements, please find
.\\" the corresponding HTML page on the Netpbm website, generate a patch
.\\" against that, and send it to the Netpbm maintainer.
'''

# Inline-markup tags with the re.sub() replacement templates that open
# and close the corresponding troff highlight.
_HIGHLIGHTS = (
    ("B", r"\\fB", r"\\fP"),
    ("EM", r"\\fI", r"\\fP"),
    ("CITE", r"\\fI", r"\\fP"),
    ("I", r"\\fI", r"\\fP"),
    ("TT", r"\\f(CW", r"\\fP"),
    ("KBD", r"\\f(CW", r"\\fP"),
    ("STRONG", r"\\fB", r"\\fP"),
    ("SUP", r"\\u", r"\\d"),
)

# HTML heading tags and the man macro each one is turned into.
_HEADINGS = (("H2", ".SH"), ("H3", ".SS"), ("H4", ".B"))

_NAME_RE = re.compile("<H1>(.*)</H1>", re.I)
_TITLE_RE = re.compile("<TITLE>(.*)</TITLE>", re.I)
_SECTION_RE = re.compile('(?i)<META NAME="manual_section" CONTENT="([0-9])">')
_HR_RE = re.compile("(?i)<HR>")


class LiftException(Exception):
    """Fatal conversion error; 'retval' is the exit status main() returns."""

    def __init__(self, message, retval=1):
        Exception.__init__(self, message)
        self.message = message
        self.retval = retval


def _open_text(path, mode="r"):
    """Open 'path' so that every input byte survives a read/write round trip.

    Python 2 file objects are byte-transparent already.  On Python 3 the
    file is opened as ISO-8859-1 (the charset the pages declare), which
    maps each byte to one character and back.
    """
    if sys.version_info[0] >= 3:
        return open(path, mode, encoding="iso-8859-1")
    return open(path, mode)


def _read_text(path):
    """Return the whole content of 'path'; the file is always closed."""
    infp = _open_text(path)
    try:
        return infp.read()
    finally:
        infp.close()


def _manpage_path(path, sect):
    """Return the output name for 'path': basename up to its first dot,
    followed by '.<sect>', in the same directory as the input."""
    directory, base = os.path.split(path)
    # Start at 1 so a leading dot does not produce an empty stem.
    dot = base.find(".", 1)
    if dot >= 0:
        base = base[:dot]
    return os.path.join(directory, "%s.%d" % (base, sect))


def makeman(name, file, indoc):
    """Transform a string representing an HTML document into man markup.

    name  -- ignored on entry; it is overwritten with the page's <H1> text.
    file  -- name of the input file, used only in diagnostics.
    indoc -- the HTML text.

    Returns the troff text, starting with the 'warning' banner.  Lines that
    still look like HTML after conversion are reported on sys.stderr.
    Sets the module-level 'section' and reads the module-level 'sectmap'.
    """
    global section, sectmap
    # Dot at left margin confuses troff.  This program generates its own
    # leading dots below, so hide the ones already in the input behind a
    # cookie first; they are turned into \&. at the very end.
    indoc = indoc.replace("\n.", "\n@%@%@")
    # Header-bashing
    indoc = indoc.replace('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 3.2//EN">\n', "")
    indoc = indoc.replace('<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">', "")
    indoc = indoc.replace('<?xml version="1.1" encoding="iso-8859-1" ?>\n', "")
    indoc = indoc.replace('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "DTD/xhtml11.dtd">', "")
    indoc = indoc.replace('<html xmlns="http://www.w3.org/1999/xhtml">', "")
    indoc = indoc.replace("<HEAD>", "").replace("</HEAD>", "")
    indoc = indoc.replace("<head>", "").replace("</head>", "")
    indoc = re.sub('(?i)<A HREF="#index">Table Of Contents</A>', "", indoc)
    # Pull out the "Updated:" date for the .TH line
    datematch = re.compile("Updated: (.*)\n")
    match = datematch.search(indoc)
    if match:
        date = match.group(1)
    else:
        date = ""
    indoc = datematch.sub("", indoc)
    # The page name is the content of the first <H1>
    match = _NAME_RE.search(indoc)
    if match:
        name = match.group(1)
    else:
        name = None
    # An explicit manual_section META tag wins; otherwise fall back on
    # what the first pass recorded for this name (0 if nothing).
    section = 1
    match = _SECTION_RE.search(indoc)
    if match:
        section = int(match.group(1))
        indoc = _SECTION_RE.sub("", indoc)
    else:
        section = sectmap.get(name, 0)
    indoc = _NAME_RE.sub("", indoc)
    indoc = re.sub("(?i)<BODY[^>]*>", "", indoc)
    indoc = re.sub("(?i)<HTML>", "", indoc)
    # Remove more superfluous headers
    titlematch = re.compile("<TITLE>(.*)</TITLE>\n+", re.I)
    match = titlematch.search(indoc)
    if match:
        title = match.group(1)
    else:
        title = None
    indoc = titlematch.sub("", indoc)
    indoc = re.sub("(?i)\n*<BR>\n+", "\n", indoc)
    indoc = ('.TH "%s" %d "%s" "%s"\n' % (title, section, date, source)) + indoc
    # Literal layout
    for tag in ("PRE", "BLOCKQUOTE"):
        indoc = re.sub("(?i)\n *<%s>" % tag, "\n.nf", indoc)
        indoc = re.sub("(?i)\n *</%s>" % tag, "\n.fi", indoc)
    # Highlight processing
    for (tag, start, end) in _HIGHLIGHTS:
        indoc = re.sub("(?i)<%s>" % tag, start, indoc)
        indoc = re.sub("(?i)</%s>" % tag, end, indoc)
    # Paragraph handling
    indoc = re.sub("(?i)\n*<P>\n*", r"\n.PP\n", indoc)
    indoc = re.sub("(?i)</P>", "", indoc)
    # Inside any kind of list a paragraph break becomes .sp, not .PP
    lines = indoc.split("\n")
    listdepth = 0
    for (i, line) in enumerate(lines):
        lowered = line.lower()
        if "<dl" in lowered or "<ol" in lowered or "<ul" in lowered:
            listdepth += 1
        if listdepth:
            lines[i] = line.replace(".PP", ".sp")
        if "</dl>" in lowered or "</ol>" in lowered or "</ul>" in lowered:
            listdepth -= 1
    indoc = "\n".join(lines)
    indoc = re.sub(r"\s*\.sp", "\n.sp", indoc)
    # Format email addresses as italic
    indoc = re.sub('(?i)<A[ \n]+HREF="mailto:[^>]+">([^<]+)</A>', r'\\fI\1\\fP', indoc)

    # Format manual crossreferences
    def xrefmatch(match):
        xrefto = match.group(1)
        xrefsection = sectmap.get(xrefto, 1)
        if xrefsection == 0:
            return "\n.I " + xrefto
        else:
            return "\n.BR %s (%d)" % (xrefto, xrefsection)
    indoc = re.sub(r'(?i)\n* *(?:\\fB)?<A[ \n]+HREF="[^>]+.html">([^<]+)</A>(?:\\fP)?',
                   xrefmatch, indoc)

    # Format URLs (any link the two substitutions above did not consume)
    def urlmatch(match):
        url = match.group(1).replace('\n', ' ')
        txt = match.group(2).replace('\n', ' ')
        return "\n.UR %s\n%s\n.UE\n\\&" % (url, txt)
    indoc = re.sub(r'(?i)\n*(?:&lt;)?<A[ \n]+HREF *= *"([^>]+)">([^<]+)</A>(?:&gt;)?',
                   urlmatch, indoc)
    # Turn some entities into harmless cookies, so the characters they
    # stand for are not mistaken for leftover markup by the check below.
    indoc = indoc.replace("&lt;", "@#!#@").replace("&gt;", "#@!@#").replace("&amp;", "#!@!@!#")
    indoc = indoc.replace("&times;", r"\(mu")
    indoc = indoc.replace("&reg;", r"\*R")
    # Turn anchors into .UN tags
    indoc = re.sub(r'(?i)<A NAME *= *"#?([a-zA-Z][a-zA-Z0-9.-]+)">(?:&nbsp;)*</A>\s*', ".UN \\1\n", indoc)
    # Strip off the index trailer: everything from the first <HR> on
    trailer = re.compile('<HR */*>.*', re.DOTALL | re.IGNORECASE)
    indoc = re.sub(trailer, "", indoc)
    # If there was no index trailer, we still need to strip these
    indoc = indoc.replace("</BODY>", "").replace("</HTML>", "")
    indoc = indoc.replace("</body>", "").replace("</html>", "")
    # Recognize sections with IDs, first on an <A> inside the heading ...
    for (tag, macro) in _HEADINGS:
        indoc = re.sub('(?i)<%s><A (?:ID|NAME)="([a-zA-Z]+)">([^><]*)</A></%s>' % (tag, tag),
                       ".UN \\1\n%s \\2" % macro, indoc)
    # ... then on the heading tag itself
    for (tag, macro) in _HEADINGS:
        indoc = re.sub('(?i)<%s (?:ID|NAME)="([a-zA-Z]+)">([^><]*)</%s>' % (tag, tag),
                       ".UN \\1\n%s \\2" % macro, indoc)
    # Sections without IDs
    for (tag, macro) in _HEADINGS:
        indoc = re.sub('(?i)<%s>([^><]*)</%s>' % (tag, tag), "%s \\1" % macro, indoc)
    #
    # Process definition lists -- just turn them into .TPs
    indoc = re.sub("(?i) *<DL *(COMPACT)?>", "", indoc)
    indoc = re.sub("(?i) *</DL>", "", indoc)
    indoc = re.sub("(?i) *<DT>", ".TP\n", indoc)
    indoc = re.sub("(?i) *</DT>", "", indoc)
    indoc = re.sub("(?i)\n*<DD>\n*", "\n", indoc)
    indoc = re.sub("(?i) *</DD>", "", indoc)
    # Process unordered (and ordered) lists -- turn items into bullet .IPs
    indoc = re.sub("(?i)</?[UO]L *(COMPACT)?>", "", indoc)
    indoc = re.sub("(?i) *<LI>", ".IP \\\\(bu\n", indoc)
    indoc = re.sub("(?i) *</LI>", "", indoc)
    # No-print tags: drop the rest of the line
    indoc = re.sub("<!--no_print-->.*", "", indoc)
    # Passthrough
    indoc = re.sub(r"<\?makeman (.*) \?>", r'\1', indoc)
    # Comments
    # NOTE(review): the group sits inside the repetition, so \1 is only
    # the last character of the comment; the rest of the comment text is
    # dropped.  Left as is because changing it alters generated pages.
    indoc = re.sub("<!--([^-])*-->", r'.\"\1', indoc)
    # Image tags
    indoc = re.sub(' *<img src="([^"]*)" alt="([^"]*)"( *[a-z]*="?[0-9]*"?)*>', ".B \\2\n.IMG -C \\1", indoc)
    # Special characters
    indoc = indoc.replace("&quot;", "'")
    indoc = indoc.replace("&nbsp;", "\\ ")
    # Tables
    indoc = re.sub(' *<table[^>]*>.*', ".TS", indoc)
    indoc = re.sub(" *</table>.*", ".TE", indoc)
    # First the single-line case
    indoc = re.sub("</td> *<td>", "\t", indoc)
    indoc = re.sub("<tr> *<td>", "", indoc)
    indoc = re.sub("</td> *</tr>", "", indoc)
    # Then the multiline case
    indoc = re.sub(r'\s*<t[hd][^>]*>([^<\n]*)</t[dh]>\s*', '\t\\1', indoc)
    indoc = re.sub(r'\s*<t[hd][^>]*>([^<]*)</t[dh]>\s*', '\tT{\n\\1T}', indoc)
    indoc = indoc.replace("\n\\&T}", "\nT}")
    indoc = re.sub(" *</tr>", "", indoc)
    indoc = re.sub(" *<tr[^>]*>\t*", "", indoc)
    indoc = re.sub(r"\.TS\s+<caption>([^<]*)</caption>\s*", ".B \\1\n.TS\n", indoc)
    # Debugging
    #sys.stderr.write("Name: %s, Title: %s, Date: %s\n" % (name, title, date))
    # Time for error checking now: anything that still contains an angle
    # bracket or an entity is markup we failed to translate.
    badlines = [line for line in indoc.split("\n")
                if "<" in line or ">" in line or re.search("&.*;", line)]
    if badlines:
        sys.stderr.write(("Bad lines from %s:\n-----------------\n" % file) + "\n".join(badlines) + "\n-----------------\n")
    # Goes after bad-line check so we don't misinterpret it as an error
    indoc = indoc.replace("@#!#@", "<").replace("#@!@#", ">").replace("#!@!@!#", "&")
    indoc = re.sub("\n+$", "\n", indoc)
    # Single-quote at left margin confuses troff.
    # This program never generates these.
    indoc = indoc.replace("\n'", "\n\\&'")
    # Finish guarding against leading dots.
    indoc = indoc.replace("\n@%@%@", "\n\\&.")
    # Mark these generated pages so people won't hand-hack them.
    indoc = warning + indoc
    return indoc


def main(args, mainout=sys.stdout, mainerr=sys.stderr):
    """Convert each HTML file named in 'args' to a man page next to it.

    args    -- command-line arguments; -v (repeatable) raises verbosity.
    mainout -- unused; kept for interface compatibility.
    mainerr -- stream that receives diagnostics.

    For input DIR/STEM.html the output is DIR/STEM.<section>.  Files that
    cannot be opened are reported and skipped.

    Returns the exit status: 0 on success, 1 if a temporary file cannot be
    created, the LiftException's retval (default 1) when a page lacks an
    <H1> or <TITLE>, 2 for bad options, 3 for an I/O error, 4 on keyboard
    interrupt, 5 for any other error (re-raised instead when -v is given).
    """
    global sectmap
    import getopt
    try:
        (options, arguments) = getopt.getopt(args, "v")
    except getopt.GetoptError as e:
        mainerr.write("makeman: %s\n" % e)
        return 2
    verbosity = 0
    for (switch, val) in options:
        if switch == '-v':
            verbosity += 1
    try:
        # First pass: gather locations for crossreferences:
        sectmap = {}
        scanned = []        # (path, name) of every page read successfully
        for path in arguments:
            try:
                indoc = _read_text(path)
            except (IOError, OSError):
                mainerr.write("makeman: can't open %s\n" % path)
                continue
            namematch = _NAME_RE.search(indoc)
            titlematch = _TITLE_RE.search(indoc)
            if not namematch:
                raise LiftException("name missing from %s" % path)
            if not titlematch:
                raise LiftException("title missing from %s" % path)
            title = titlematch.group(1)
            # The name is the <H1> text: that is what makeman() and the
            # crossreference formatter look up in sectmap.
            name = namematch.group(1)
            match = _SECTION_RE.search(indoc)
            if match:
                pagesection = int(match.group(1))
            else:
                pagesection = 1
            sectmap[title] = sectmap[path] = sectmap[name] = pagesection
            # makeman() discards everything after the first <HR>, so a
            # second one means part of the page is going to be lost.
            firsthr = _HR_RE.search(indoc)
            if firsthr and _HR_RE.search(indoc[firsthr.start(0)+4:]):
                mainerr.write("makeman: warning: %s has two <HR> tags!\n" % path)
        # Second pass: do formatting
        for (path, name) in scanned_pages(scanned, arguments, sectmap):
            try:
                indoc = _read_text(path)
            except (IOError, OSError):
                mainerr.write("makeman: can't open %s\n" % path)
                continue
            # Write to a scratch file and rename it into place, so a
            # failed conversion never leaves a truncated man page behind.
            tempfile = path + ".~makeman-%d~" % os.getpid()
            try:
                outfp = _open_text(tempfile, "w")
            except (IOError, OSError):
                mainerr.write("makeman: %s: can't open tempfile\n" % path)
                return 1
            try:
                try:
                    if verbosity:
                        mainerr.write("makeman: %s\n" % path)
                    outfp.write(makeman(name, path, indoc))
                finally:
                    # under Windows you can't rename or remove an open file
                    outfp.close()
            except:
                os.remove(tempfile)
                # Pass the exception upwards
                raise
            os.rename(tempfile, _manpage_path(path, sectmap[path]))
    except LiftException as e:
        mainerr.write("makeman: " + e.message + "\n")
        return e.retval
    except (IOError, OSError) as e:
        mainerr.write("makeman: file I/O error: %s\n" % e)
        return 3
    except KeyboardInterrupt:
        mainerr.write("makeman: bailing out...\n")
        return 4
    except Exception:
        if verbosity:
            raise
        mainerr.write("makeman: internal error (rerun with -v for a traceback)\n")
        return 5
    return 0


def scanned_pages(scanned, arguments, sections):
    """Return (path, name) pairs for the second pass.

    'scanned' is filled here from 'arguments': a path qualifies when the
    first pass recorded a section for it in 'sections'.  The name paired
    with each path is not used by makeman() (it re-derives the name from
    the page), so the path itself is passed along.
    """
    del scanned[:]
    for path in arguments:
        if path in sections:
            scanned.append((path, path))
    return scanned


if __name__ == "__main__":
    # Run the main sequence
    raise SystemExit(main(sys.argv[1:]))

# The following sets edit modes for GNU EMACS
# Local Variables:
# mode:python
# End: