#!/bin/env python
#
# makeman -- compile netpbm's stereotyped HTML to troff markup
#
# This approach works because we control the entire document universe
# this is going to convert and can reinforce useful stereotypes.
#
# The output of this tool uses cliches parseable by doclifter,
# which should thus be able to recover all the semantic information
# it looks like this thing is losing.
#
# Known bugs:
# * Ordered lists are smashed into unordered lists
#
# Limitations:
# * IMG tags are issued as .IMG preceded by a bolded caption containing
# the alt content. This will only work if the page is formatted with
# mwww macros.
# * Loses summary information from tables.
# * Only permits one <HR> in the HTML, right before the index.
#
# You can use the <?makeman ?> PI to pass text directly through to the
# generated manual page.  A major use is to insert format lines for tables.
#
# By Eric S. Raymond
# Version 1.0, July 26 2004
import os, sys, exceptions, re
# String interpolated as the last (fourth) quoted field of the .TH header
# line that makeman() prepends to each converted page.
source = "netpbm documentation"
# Default manual section number.  makeman() declares this global and
# reassigns it for every document it converts.
section = 1
# Block of troff comment lines (each starts with .\") telling readers that
# the page is generated and where fixes should be sent.
# NOTE(review): this is a raw literal, so the backslash and newline directly
# after the opening quotes are part of the string rather than a line
# continuation -- confirm that the leading "\" line is intended.
warning = r'''\
.\" This man page was generated by the Netpbm tool 'makeman' from HTML source.
.\" Do not hand-hack it! If you have bug fixes or improvements, please find
.\" the corresponding HTML page on the Netpbm website, generate a patch
.\" against that, and send it to the Netpbm maintainer.
'''
class LiftException(Exception):
    """Exception carrying a human-readable message and a numeric return value.

    Attributes:
        message: the text describing what went wrong.
        retval: integer associated with the failure; defaults to 1
            (presumably used as the process exit status -- confirm
            against the code that catches this exception).

    The class derives from the builtin ``Exception`` (the same object that
    the Python 2 ``exceptions`` module exposes), so it does not depend on
    that module being importable.
    """

    def __init__(self, message, retval=1):
        # Initialise the base class explicitly so that ``args`` is always
        # (message, retval), regardless of whether retval was passed
        # positionally, by keyword, or left at its default.
        Exception.__init__(self, message, retval)
        self.message = message
        self.retval = retval

    def __str__(self):
        # Without this, str() of an instance built with two positional
        # arguments would render the whole args tuple instead of the message.
        return str(self.message)
def makeman(name, file, indoc):
"Transform a string representing an HTML document into man markup."
global section, sectmap
# Dot at left margin confuses troff.
# This program generates these,
indoc = indoc.replace("\n.", "\n@%@%@")
# Protect escapes before we try generating font changes.
indoc = indoc.replace("\\", r"\e")
# Header-bashing
indoc = indoc.replace('', "")
indoc = indoc.replace('', "")
indoc = indoc.replace('\n',"")
indoc = indoc.replace('', "")
indoc = indoc.replace('', "")
indoc = indoc.replace('\n',"")
indoc = indoc.replace('', "")
indoc = indoc.replace("", "").replace("", "")
indoc = indoc.replace("", "").replace("", "")
indoc = re.sub('(?i)Table Of Contents', "", indoc)
datematch = re.compile("Updated: (.*)\n")
match = datematch.search(indoc)
if match:
date = match.group(1)
else:
date = ""
indoc = datematch.sub("", indoc)
namematch = re.compile("(.*)
", re.I)
match = namematch.search(indoc)
if match:
name = match.group(1)
else:
name = None
section = 1
meta = re.compile('(?i)')
match = meta.search(indoc)
if match:
section = int(match.group(1))
indoc = meta.sub("", indoc)
else:
section = sectmap.get(name, 0)
indoc = namematch.sub("", indoc)
indoc = re.sub("(?i)]*>", "", indoc)
indoc = re.sub("(?i)", "", indoc)
# Remove more superfluous headers
titlematch = re.compile("(.*)\n+", re.I)
match = titlematch.search(indoc)
if match:
title = match.group(1)
else:
title = None
indoc = titlematch.sub("", indoc)
indoc = re.sub("(?i)\n*
\n+", "\n", indoc)
indoc = ('.TH "%s" %d "%s" "%s"\n' % (title,section,date,source)) + indoc
# Literal layout
indoc = re.sub("(?i)\n *", "\n.nf", indoc)
indoc = re.sub("(?i)\n *
", "\n.fi", indoc)
indoc = re.sub("(?i)\n *", "\n.RS", indoc)
indoc = re.sub("(?i)\n *
", "\n.RE", indoc)
# Highlight processing
indoc = re.sub("(?i)", r"\\fB", indoc)
indoc = re.sub("(?i)", r"\\fP", indoc)
indoc = re.sub("(?i)", r"\\fI", indoc)
indoc = re.sub("(?i)", r"\\fP", indoc)
indoc = re.sub("(?i)", r"\\fI", indoc)
indoc = re.sub("(?i)", r"\\fP", indoc)
indoc = re.sub("(?i)", r"\\fI", indoc)
indoc = re.sub("(?i)", r"\\fP", indoc)
indoc = re.sub("(?i)", r"\\f(CW", indoc)
indoc = re.sub("(?i)", r"\\fP", indoc)
indoc = re.sub("(?i)", r"\\f(CW", indoc)
indoc = re.sub("(?i)", r"\\fP", indoc)
indoc = re.sub("(?i)", r"\\f(CW", indoc)
indoc = re.sub("(?i)
", r"\\fP", indoc)
indoc = re.sub("(?i)", r"\\fB", indoc)
indoc = re.sub("(?i)", r"\\fP", indoc)
indoc = re.sub("(?i)", r"\\u", indoc)
indoc = re.sub("(?i)", r"\\d", indoc)
# Paragraph handling
indoc = re.sub("(?i)\n*\n*", r"\n.PP\n", indoc)
indoc = re.sub("(?i)
", "", indoc)
lines = indoc.split("\n")
listdepth = 0
for i in range(len(lines)):
lowered = lines[i].lower()
if "" in lowered or "" in lowered or "" in lowered:
listdepth -= 1
indoc = "\n".join(lines)
indoc = re.sub(r"\s*\.sp", "\n.sp", indoc)
# Format email addresses as italic
indoc = re.sub('(?i)]+">([^<]+)', r'\\fI\1\\fP', indoc)
# Format manual crossreferences
def xrefmatch(match):
xrefto = match.group(1)
xrefsection = sectmap.get(xrefto, 1)
if xrefsection == 0:
return "\n.I " + xrefto
else:
return "\n.BR %s (%d)" % (xrefto, xrefsection)
indoc = re.sub(r'(?i)\n* *(?:\\fB)?]+.html">([^<]+)(?:\\fP)?',
xrefmatch, indoc)
# Format URLs
def urlmatch(match):
url = match.group(1).replace('\n', ' ')
txt = match.group(2).replace('\n', ' ')
return "\n.UR %s\n%s\n.UE\n\\&" % (url, txt)
indoc = re.sub(r'(?i)\n*(?:<)?]+)">([^<]+)(?:>)?',
urlmatch, indoc)
# Turn some entities into harmless cookies
indoc = indoc.replace("<", "@#!#@").replace(">", "#@!@#").replace("&", "#!@!@!#")
indoc = indoc.replace("×", r"\(mu")
indoc = indoc.replace("®", r"\*R")
indoc = indoc.replace("©", r"\(co")
# Turn anchors into .UN tags
indoc = re.sub('(?i)(?: )*\s*', ".UN \\1\n", indoc)
# Strip off the index trailer
trailer = re.compile('
.*', re.DOTALL | re.IGNORECASE)
indoc = re.sub(trailer, "", indoc)
# If there was no index trailer, we still need to strip these
indoc = indoc.replace("", "").replace("", "")
indoc = indoc.replace("", "").replace("