docfixer.py 36.4 KB
Newer Older
1 2
#! /usr/bin/env python

Fred Drake's avatar
Fred Drake committed
3 4
"""Perform massive transformations on a document tree created from the LaTeX
of the Python documentation, and dump the ESIS data for the transformed tree.
"""


import errno
Fred Drake's avatar
Fred Drake committed
9 10
import esistools
import re
11
import sys
12 13
import xml.dom
import xml.dom.minidom
14

15 16 17
# Short aliases for the DOM node-type codes compared against node.nodeType
# throughout this module.
ELEMENT, ENTITY_REFERENCE, TEXT = (
    xml.dom.Node.ELEMENT_NODE,
    xml.dom.Node.ENTITY_REFERENCE_NODE,
    xml.dom.Node.TEXT_NODE,
)
18 19


20 21 22 23
class ConversionError(Exception):
    """Raised when the document tree cannot be converted as expected."""


24 25 26 27 28 29 30 31 32 33 34 35 36 37
# ewrite() writes plain text to stderr; bwrite() writes it in bold when the
# terminal supports that, and is simply ewrite() otherwise.
ewrite = sys.stderr.write
try:
    # We can only do this trick on Unix (if tput is on $PATH)!
    # sys.platform is never "posix" (it is "linux2", "darwin", "win32", ...),
    # so the POSIX test has to look at os.name instead.
    import os
    if os.name != "posix" or not sys.stderr.isatty():
        raise ImportError
    # The commands module only exists on Python 2; where it is missing the
    # ImportError handler below selects the plain writer.
    import commands
except ImportError:
    bwrite = ewrite
else:
    def bwrite(s, BOLDON=commands.getoutput("tput bold"),
               BOLDOFF=commands.getoutput("tput sgr0")):
        """Write s to stderr wrapped in the terminal's bold on/off codes.

        The control sequences are fetched from tput once, when the function
        is defined, and reused through the default arguments.
        """
        ewrite("%s%s%s" % (BOLDON, s, BOLDOFF))


38 39
# Tag name given to the paragraph elements created by build_para().
PARA_ELEMENT = "para"

# Set to a true value to make para_msg() report to stderr and to enable the
# debugging exit in fixup_paras_helper().
DEBUG_PARA_FIXER = 0

if not DEBUG_PARA_FIXER:
    def para_msg(s):
        """Discard s; paragraph-fixer debugging is switched off."""
        return None
else:
    def para_msg(s):
        """Write s to stderr as a '***'-prefixed line."""
        ewrite("*** %s\n" % s)

49

50 51
def get_first_element(doc, gi):
    """Return the first direct child of doc whose nodeName is gi, or None."""
    matches = [child for child in doc.childNodes if child.nodeName == gi]
    if matches:
        return matches[0]
    return None

def extract_first_element(doc, gi):
    """Detach and return the first direct child of doc named gi.

    Returns None, leaving doc untouched, when there is no such child.
    """
    found = get_first_element(doc, gi)
    if found is None:
        return None
    doc.removeChild(found)
    return found


62 63 64 65 66 67 68 69 70 71 72 73
def get_documentElement(node):
    """Return the last element-type child of node, or None if it has none."""
    elements = [kid for kid in node.childNodes if kid.nodeType == ELEMENT]
    if not elements:
        return None
    return elements[-1]


def set_tagName(elem, gi):
    """Rename elem in place by setting both its tagName and nodeName to gi."""
    elem.tagName = gi
    elem.nodeName = gi


Fred Drake's avatar
Fred Drake committed
74 75
def find_all_elements(doc, gi):
    """Return a list of the elements named gi at or below doc.

    doc itself comes first when its nodeName is gi; after that, each element
    child contributes itself (if it matches) followed by whatever its
    getElementsByTagName(gi) returns.
    """
    found = []
    if doc.nodeName == gi:
        found.append(doc)
    for kid in doc.childNodes:
        if kid.nodeType != ELEMENT:
            continue
        if kid.tagName == gi:
            found.append(kid)
        found.extend(kid.getElementsByTagName(gi))
    return found

86 87 88
def find_all_child_elements(doc, gi):
    """Return a list of the direct children of doc whose nodeName is gi."""
    return [kid for kid in doc.childNodes if kid.nodeName == gi]

93

94 95 96 97
def find_all_elements_from_set(doc, gi_set):
    """Return a new list of the nodes at or below doc named by gi_set.

    gi_set is any container supporting 'in' tests on node names.
    """
    collected = []
    return __find_all_elements_from_set(doc, gi_set, collected)

def __find_all_elements_from_set(doc, gi_set, nodes):
    """Append doc and its matching element descendants to nodes.

    Nodes are visited parent-first; only element children are descended
    into.  The same nodes list is returned for convenience.
    """
    if doc.nodeName in gi_set:
        nodes.append(doc)
    element_kids = [kid for kid in doc.childNodes if kid.nodeType == ELEMENT]
    for kid in element_kids:
        __find_all_elements_from_set(kid, gi_set, nodes)
    return nodes
Fred Drake's avatar
Fred Drake committed
104 105


106
def simplify(doc, fragment):
    """Reshape a multi-rooted fragment so it has one document element.

    The <documentclass> child is removed and its 'classname' attribute
    becomes the tag name of the <document> child.  Everything that preceded
    <document> is moved inside it, the <title> and any <input> children are
    re-inserted at its start, and leading text nodes of the fragment are
    dropped.
    """
    # Try to rationalize the document a bit, since these things are simply
    # not valid SGML/XML documents as they stand, and need a little work.
    documentclass = "document"
    inputs = []
    classnode = extract_first_element(fragment, "documentclass")
    if classnode is not None:
        documentclass = classnode.getAttribute("classname")
    titlenode = extract_first_element(fragment, "title")
    if titlenode is not None:
        inputs.append(titlenode)
    # update the name of the root element
    root = get_first_element(fragment, "document")
    if root is not None:
        set_tagName(root, documentclass)
        # Move everything that comes before this node into this node;
        # this will be the document element.
        siblings = fragment.childNodes
        anchor = root.firstChild
        while not siblings[0].isSameNode(root):
            root.insertBefore(siblings[0], anchor)
    # Pull out every <input> child, in document order.
    inputnode = extract_first_element(fragment, "input")
    while inputnode is not None:
        inputs.append(inputnode)
        inputnode = extract_first_element(fragment, "input")
    if inputs:
        docelem = get_documentElement(fragment)
        # Insert back to front so the nodes end up in their collected order.
        inputs.reverse()
        for moved in inputs:
            newline = doc.createTextNode("\n")
            docelem.insertBefore(newline, docelem.firstChild)
            docelem.insertBefore(moved, newline)
        docelem.insertBefore(doc.createTextNode("\n"), docelem.firstChild)
    while fragment.firstChild and fragment.firstChild.nodeType == TEXT:
        fragment.removeChild(fragment.firstChild)
142 143 144 145 146 147 148 149


def cleanup_root_text(doc):
    """Remove the text children of doc, except one directly after a COMMENT.

    A text node is kept only when the child immediately before it has the
    nodeName "COMMENT".
    """
    doomed = []
    after_comment = 0
    for n in doc.childNodes:
        if n.nodeType == TEXT and not after_comment:
            doomed.append(n)
            after_comment = 0
        else:
            after_comment = n.nodeName == "COMMENT"
    # Remove only after the scan so the child list is not changed while it
    # is being iterated.
    for n in doomed:
        doc.removeChild(n)


158 159 160 161 162 163 164
# Tag names that find_and_fix_descriptors() hands to rewrite_descriptor().
DESCRIPTOR_ELEMENTS = (
    "cfuncdesc",
    "cvardesc",
    "ctypedesc",
    "classdesc",
    "memberdesc",
    "memberdescni",
    "methoddesc",
    "methoddescni",
    "excdesc",
    "funcdesc",
    "funcdescni",
    "opcodedesc",
    "datadesc",
    "datadescni",
    )

165 166
def fixup_descriptors(doc, fragment):
    """Rewrite the descriptor elements found in every <section> of fragment."""
    for section in find_all_elements(fragment, "section"):
        find_and_fix_descriptors(doc, section)


def find_and_fix_descriptors(doc, container):
    """Rewrite descriptor children of container, descending into subsections.

    Element children named in DESCRIPTOR_ELEMENTS are passed to
    rewrite_descriptor(); <subsection> children are processed recursively.
    Every other child is left alone.
    """
    for child in container.childNodes:
        if child.nodeType != ELEMENT:
            continue
        tag = child.tagName
        if tag == "subsection":
            find_and_fix_descriptors(doc, child)
        elif tag in DESCRIPTOR_ELEMENTS:
            rewrite_descriptor(doc, child)

181 182 183 184

def rewrite_descriptor(doc, descriptor):
    """Rebuild a descriptor element as <signature>s plus a <description>.

    The descriptor's 'name' attribute (and, for <opcodedesc>, its 'var'
    attribute) is turned into a leading <signature>.  A leading <args>
    child and any following <*line> children become part of / additional
    signatures, a <versionadded> child becomes the 'added' attribute, and
    all remaining children are moved into a new <description> element.

    Raises RuntimeError when a 'var' attribute is present on anything other
    than an <opcodedesc>.
    """
    #
    # Do these things:
    #   1. Work out the name of the <*line{,ni}> elements that carry
    #      supplemental signatures for this descriptor.
    #   2. Create a <signature> from the name attribute
    #   2a.Create an <args> if it appears to be available.
    #   3. Create additional <signature>s from <*line{,ni}> elements,
    #      if found.
    #   4. If a <versionadded> is found, move it to an attribute on the
    #      descriptor.
    #   5. Move remaining child nodes to a <description> element.
    #   6. Put it back together.
    #
    # 1.
    descname = descriptor.tagName
    # NOTE(review): this tests the "name" attribute against "no"; it looks
    # like an "index" attribute may have been intended -- confirm against
    # the element names the converter actually emits before changing it.
    index = descriptor.getAttribute("name") != "no"
    desctype = descname[:-4] # remove 'desc'
    linename = desctype + "line"
    if not index:
        linename = linename + "ni"
    # 2.
    signature = doc.createElement("signature")
    name = doc.createElement("name")
    signature.appendChild(doc.createTextNode("\n    "))
    signature.appendChild(name)
    name.appendChild(doc.createTextNode(descriptor.getAttribute("name")))
    descriptor.removeAttribute("name")
    # 2a.
    if descriptor.hasAttribute("var"):
        if descname != "opcodedesc":
            # Call syntax instead of "raise X, msg" so the statement is
            # valid on both Python 2 and Python 3.
            raise RuntimeError(
                "got 'var' attribute on descriptor other than opcodedesc")
        variable = descriptor.getAttribute("var")
        if variable:
            args = doc.createElement("args")
            args.appendChild(doc.createTextNode(variable))
            signature.appendChild(doc.createTextNode("\n    "))
            signature.appendChild(args)
        descriptor.removeAttribute("var")
    newchildren = [signature]
    # children is the live child list, so it shrinks as nodes are removed.
    children = descriptor.childNodes
    pos = skip_leading_nodes(children)
    if pos < len(children):
        child = children[pos]
        if child.nodeName == "args":
            # move <args> to <signature>, or remove if empty:
            child.parentNode.removeChild(child)
            if len(child.childNodes):
                signature.appendChild(doc.createTextNode("\n    "))
                signature.appendChild(child)
    signature.appendChild(doc.createTextNode("\n  "))
    # 3, 4.
    pos = skip_leading_nodes(children, pos)
    while pos < len(children) \
          and children[pos].nodeName in (linename, "versionadded"):
        if children[pos].tagName == linename:
            # this is really a supplemental signature, create <signature>
            oldchild = children[pos].cloneNode(1)
            try:
                sig = methodline_to_signature(doc, children[pos])
            except KeyError:
                # Show the untouched element before re-raising; written as
                # a call so it parses on both Python 2 and Python 3.
                print(oldchild.toxml())
                raise
            newchildren.append(sig)
        else:
            # <versionadded added=...>
            descriptor.setAttribute(
                "added", children[pos].getAttribute("version"))
        pos = skip_leading_nodes(children, pos + 1)
    # 5.
    description = doc.createElement("description")
    description.appendChild(doc.createTextNode("\n"))
    newchildren.append(description)
    move_children(descriptor, description, pos)
    # description always holds at least the "\n" text node added above.
    last = description.childNodes[-1]
    if last.nodeType == TEXT:
        last.data = last.data.rstrip() + "\n  "
    # 6.
    # should have nothing but whitespace and signature lines in <descriptor>;
    # discard them
    while descriptor.childNodes:
        descriptor.removeChild(descriptor.childNodes[0])
    for node in newchildren:
        descriptor.appendChild(doc.createTextNode("\n  "))
        descriptor.appendChild(node)
    descriptor.appendChild(doc.createTextNode("\n"))
269

Fred Drake's avatar
Fred Drake committed
270 271 272 273 274 275

def methodline_to_signature(doc, methodline):
    """Build and return a <signature> element from a <*line> element.

    The 'name' attribute of methodline is removed and becomes the text of a
    <name> child.  If methodline has any children they are moved into an
    <args> child of the new signature.
    """
    indent = "\n    "
    signature = doc.createElement("signature")
    signature.appendChild(doc.createTextNode(indent))
    namenode = doc.createElement("name")
    nametext = doc.createTextNode(methodline.getAttribute("name"))
    namenode.appendChild(nametext)
    methodline.removeAttribute("name")
    signature.appendChild(namenode)
    if len(methodline.childNodes) > 0:
        argsnode = doc.createElement("args")
        signature.appendChild(doc.createTextNode(indent))
        signature.appendChild(argsnode)
        move_children(methodline, argsnode)
    signature.appendChild(doc.createTextNode("\n  "))
    return signature
285 286


287 288 289 290 291 292 293 294
def move_children(origin, dest, start=0):
    """Move the children of origin from index start onward to the end of dest.

    The moved nodes keep their relative order.
    """
    # Each removal shifts the remaining children down, so the next node to
    # move is always found at index start.
    while len(origin.childNodes) > start:
        moving = origin.childNodes[start]
        origin.removeChild(moving)
        dest.appendChild(moving)


295
def handle_appendix(doc, fragment):
    """Move everything after the <appendix> marker into a <back-matter>.

    The first top-level child of the document element that contains an
    <appendix> element has that marker removed; every later top-level child
    is moved into a new <back-matter> element appended to the document
    element.  Nothing changes when no later children exist.
    """
    # must be called after simplify() if document is multi-rooted to begin with
    docelem = get_documentElement(fragment)
    appendices = 0
    nodes = []
    for node in docelem.childNodes:
        if appendices:
            nodes.append(node)
        elif node.nodeType == ELEMENT:
            appnodes = node.getElementsByTagName("appendix")
            if appnodes:
                appendices = 1
                parent = appnodes[0].parentNode
                parent.removeChild(appnodes[0])
                parent.normalize()
    if nodes:
        # Explicit loops rather than map(): on Python 3 map() is lazy, so
        # using it only for its side effects would do nothing at all.
        for node in nodes:
            docelem.removeChild(node)
        docelem.appendChild(doc.createTextNode("\n\n\n"))
        back = doc.createElement("back-matter")
        docelem.appendChild(back)
        back.appendChild(doc.createTextNode("\n"))
        # Drop leading whitespace-only text so the back matter starts with
        # real content.
        while nodes and nodes[0].nodeType == TEXT \
              and not nodes[0].data.strip():
            del nodes[0]
        for node in nodes:
            back.appendChild(node)
        docelem.appendChild(doc.createTextNode("\n"))
322 323


324 325
def handle_labels(doc, fragment):
    """Turn <label id="..."/> elements into 'id' attributes.

    The id is set on the label's parent, or on the grandparent when the
    parent is a <title>; the <label> is then removed.  Labels without an id
    are left in place.
    """
    for label in find_all_elements(fragment, "label"):
        label_id = label.getAttribute("id")
        if not label_id:
            continue
        parent = label.parentNode
        parentTagName = parent.tagName
        if parentTagName == "title":
            parent.parentNode.setAttribute("id", label_id)
        else:
            parent.setAttribute("id", label_id)
        # now, remove <label id="..."/> from parent:
        parent.removeChild(label)
        if parentTagName == "title":
            parent.normalize()
            children = parent.childNodes
            # The title may be empty once the label is gone, so check
            # before indexing the last child.
            if len(children) and children[-1].nodeType == TEXT:
                children[-1].data = children[-1].data.rstrip()
342 343


Fred Drake's avatar
Fred Drake committed
344 345 346
def fixup_trailing_whitespace(doc, fragment, wsmap):
    """Regularize the whitespace at the end of, and after, selected elements.

    wsmap maps element names to (before, after) string pairs.  For each
    matching node, 'before' replaces the trailing whitespace of its last
    text child and 'after' replaces the leading whitespace of the text that
    follows the node (a new text node is inserted if none follows).
    """
    queue = [fragment]
    fixups = []
    while queue:
        node = queue.pop(0)
        # 'in' rather than has_key(), which no longer exists on Python 3.
        if node.nodeName in wsmap:
            fixups.append(node)
        for child in node.childNodes:
            if child.nodeType == ELEMENT:
                queue.append(child)

    # reverse the list to process from the inside out
    fixups.reverse()
    for node in fixups:
        node.parentNode.normalize()
        lastchild = node.lastChild
        before, after = wsmap[node.tagName]
        # lastChild is None for an element without children.
        if lastchild is not None and lastchild.nodeType == TEXT:
            lastchild.data = lastchild.data.rstrip() + before
        # The original guarded the next step with "if wsmap[node.tagName]",
        # which is always true for a value that unpacked into two items.
        nextnode = node.nextSibling
        if nextnode and nextnode.nodeType == TEXT:
            nextnode.data = after + nextnode.data.lstrip()
        else:
            wsnode = doc.createTextNode(after)
            node.parentNode.insertBefore(wsnode, nextnode)
        # hack to get the title in place:
        if node.tagName == "title" \
           and node.parentNode.firstChild.nodeType == ELEMENT:
            node.parentNode.insertBefore(doc.createTextNode("\n  "),
                                         node.parentNode.firstChild)
            node.parentNode.normalize()

380 381 382

def normalize(doc):
    """Call normalize() on every element child of doc."""
    element_kids = [kid for kid in doc.childNodes if kid.nodeType == ELEMENT]
    for kid in element_kids:
        kid.normalize()


def cleanup_trailing_parens(doc, element_names):
    """Strip a trailing "()" from the text of the named elements.

    The tree below doc is walked breadth-first.  An element whose tag is in
    element_names has "()" removed from the end of its last text child and
    is not descended into; other elements are searched further.
    """
    # A dict gives constant-time membership tests; "in" is used because
    # dict.has_key() no longer exists on Python 3.
    rewrite = {}
    for gi in element_names:
        rewrite[gi] = gi
    queue = [node for node in doc.childNodes if node.nodeType == ELEMENT]
    while queue:
        node = queue.pop(0)
        if node.tagName in rewrite:
            lastchild = node.lastChild
            if lastchild and lastchild.nodeType == TEXT:
                data = lastchild.data
                if data.endswith("()"):
                    lastchild.data = data[:-2]
        else:
            for child in node.childNodes:
                if child.nodeType == ELEMENT:
                    queue.append(child)


408 409 410 411 412 413 414 415 416
def contents_match(left, right):
    """Return 1 if left and right have equivalent child content, else 0.

    Children are compared pairwise: elements must have the same tag name
    and recursively matching content, text nodes must have equal data.
    Attributes are not compared, and any other node type counts as a
    mismatch.
    """
    left_children = left.childNodes
    right_children = right.childNodes
    if len(left_children) != len(right_children):
        return 0
    # The lengths are equal here, so zip() pairs every child; the original
    # map(None, ...) idiom raises TypeError on Python 3.
    for l, r in zip(left_children, right_children):
        nodeType = l.nodeType
        if nodeType != r.nodeType:
            return 0
        if nodeType == ELEMENT:
            if l.tagName != r.tagName:
                return 0
            # should check attributes, but that's not a problem here
            if not contents_match(l, r):
                return 0
        elif nodeType == TEXT:
            if l.data != r.data:
                return 0
        else:
            # not quite right, but good enough
            return 0
    return 1


def create_module_info(doc, section):
    """Gather a section's module metadata into a <moduleinfo> element.

    Does nothing unless section has a <modulesynopsis> child.  Otherwise
    the synopsis, <moduleauthor> and <platform> children are extracted; when
    section is a <section>, they are combined with data from
    <declaremodule>, <versionadded> and (when it names the module) the
    <title> into a new <moduleinfo> inserted near the start of section.
    """
    # Heavy.
    node = extract_first_element(section, "modulesynopsis")
    if node is None:
        return
    set_tagName(node, "synopsis")
    # lastChild is None for an empty synopsis; indexing childNodes[-1]
    # would raise IndexError in that case.
    lastchild = node.lastChild
    if lastchild is not None and lastchild.nodeType == TEXT \
       and lastchild.data[-1:] == ".":
        lastchild.data = lastchild.data[:-1]
    modauthor = extract_first_element(section, "moduleauthor")
    if modauthor:
        set_tagName(modauthor, "author")
        modauthor.appendChild(doc.createTextNode(
            modauthor.getAttribute("name")))
        modauthor.removeAttribute("name")
    platform = extract_first_element(section, "platform")
    if section.tagName == "section":
        modinfo_pos = 2
        modinfo = doc.createElement("moduleinfo")
        moddecl = extract_first_element(section, "declaremodule")
        name = None
        if moddecl:
            modinfo.appendChild(doc.createTextNode("\n    "))
            name = moddecl.attributes["name"].value
            namenode = doc.createElement("name")
            namenode.appendChild(doc.createTextNode(name))
            modinfo.appendChild(namenode)
            modtype = moddecl.attributes.get("type")
            if modtype:
                modtype = modtype.value
                modinfo.appendChild(doc.createTextNode("\n    "))
                typenode = doc.createElement("type")
                typenode.appendChild(doc.createTextNode(modtype))
                modinfo.appendChild(typenode)
        versionadded = extract_first_element(section, "versionadded")
        if versionadded:
            modinfo.setAttribute("added", versionadded.getAttribute("version"))
        title = get_first_element(section, "title")
        if title:
            children = title.childNodes
            if len(children) >= 2 \
               and children[0].nodeName == "module" \
               and children[0].childNodes[0].data == name:
                # this is it; morph the <title> into <short-synopsis>
                first_data = children[1]
                if first_data.data[:4] == " ---":
                    first_data.data = first_data.data[4:].lstrip()
                set_tagName(title, "short-synopsis")
                if children[-1].nodeType == TEXT \
                   and children[-1].data[-1:] == ".":
                    children[-1].data = children[-1].data[:-1]
                section.removeChild(title)
                section.removeChild(section.childNodes[0])
                title.removeChild(children[0])
                modinfo_pos = 0
            else:
                ewrite("module name in title doesn't match"
                       " <declaremodule/>; no <short-synopsis/>\n")
        else:
            ewrite("Unexpected condition: <section/> without <title/>\n")
        modinfo.appendChild(doc.createTextNode("\n    "))
        modinfo.appendChild(node)
        if title and not contents_match(title, node):
            # The short synopsis is actually different,
            # and needs to be stored:
            modinfo.appendChild(doc.createTextNode("\n    "))
            modinfo.appendChild(title)
        if modauthor:
            modinfo.appendChild(doc.createTextNode("\n    "))
            modinfo.appendChild(modauthor)
        if platform:
            modinfo.appendChild(doc.createTextNode("\n    "))
            modinfo.appendChild(platform)
        modinfo.appendChild(doc.createTextNode("\n  "))
        section.insertBefore(modinfo, section.childNodes[modinfo_pos])
        section.insertBefore(doc.createTextNode("\n  "), modinfo)
        #
        # The rest of this removes extra newlines from where we cut out
        # a lot of elements.  A lot of code for minimal value, but it
        # keeps the generated *ML from being too funny looking.
        #
        section.normalize()
        for child in section.childNodes:
            if child.nodeName == "moduleinfo":
                # nextSibling is None when <moduleinfo> is the last child,
                # where indexing the following position would fail.
                nextnode = child.nextSibling
                if nextnode is not None and nextnode.nodeType == TEXT:
                    data = nextnode.data
                    s = data.lstrip()
                    if len(s) < (len(data) - 4):
                        nextnode.data = "\n\n\n" + s
525 526


527 528
def cleanup_synopses(doc, fragment):
    """Run create_module_info() on every <section> found in fragment."""
    sections = find_all_elements(fragment, "section")
    for section in sections:
        create_module_info(doc, section)
530 531


532 533
def fixup_table_structures(doc, fragment):
    """Run fixup_table() on every <table> found in fragment."""
    tables = find_all_elements(fragment, "table")
    for table in tables:
        fixup_table(doc, table)

536 537 538 539 540 541 542 543 544 545 546 547 548 549 550

def fixup_table(doc, table):
    """Restructure table into a <tgroup> holding a <thead> and a <tbody>.

    The table's direct <entry> children form the single header row and its
    <row> children form the body.  A <row> followed later by an <hline>
    gets rowsep="1".  Anything left over other than whitespace text and
    <hline> elements raises ConversionError.
    """
    # create the table head
    head = doc.createElement("thead")
    headrow = doc.createElement("row")
    move_elements_by_name(doc, table, headrow, "entry")
    head.appendChild(doc.createTextNode("\n    "))
    head.appendChild(headrow)
    head.appendChild(doc.createTextNode("\n    "))
    # create the table body
    body = doc.createElement("tbody")
    prev_row = None
    # Live child list: it empties as the leftovers are removed below.
    leftovers = table.childNodes
    for child in leftovers:
        if child.nodeType != ELEMENT:
            continue
        tag = child.tagName
        if tag == "hline" and prev_row is not None:
            prev_row.setAttribute("rowsep", "1")
        elif tag == "row":
            prev_row = child
    # save the rows:
    body.appendChild(doc.createTextNode("\n    "))
    move_elements_by_name(doc, table, body, "row", sep="\n    ")
    # and toss the rest:
    while len(leftovers):
        child = leftovers[0]
        kind = child.nodeType
        if kind == TEXT:
            if child.data.strip():
                raise ConversionError("unexpected free data in <%s>: %r"
                                      % (table.tagName, child.data))
        elif kind == ELEMENT:
            if child.tagName != "hline":
                raise ConversionError(
                    "unexpected <%s> in table" % child.tagName)
        else:
            raise ConversionError(
                "unexpected %s node in table" % child.__class__.__name__)
        table.removeChild(child)
    # nothing left in the <table>; add the <thead> and <tbody>
    group = doc.createElement("tgroup")
    for part in (head, body):
        group.appendChild(doc.createTextNode("\n  "))
        group.appendChild(part)
    group.appendChild(doc.createTextNode("\n  "))
    table.appendChild(group)
    # now make the <entry>s look nice:
    for row in table.getElementsByTagName("row"):
        fixup_row(doc, row)


def fixup_row(doc, row):
    """Insert a newline-plus-indent text node before every child of row
    except the first.
    """
    # Snapshot the children first, because insertBefore() changes the live
    # child list.  A real list is built here: map() used only for its side
    # effects is lazy on Python 3 and would collect nothing.
    entries = list(row.childNodes[1:])
    for entry in entries:
        row.insertBefore(doc.createTextNode("\n         "), entry)


def move_elements_by_name(doc, source, dest, name, sep=None):
    """Move the direct children of source named name to the end of dest.

    When sep is a non-empty string, a text node containing it is appended
    to dest after each moved node.
    """
    # Collect first: removing nodes while scanning the live child list
    # would skip entries.
    matching = [kid for kid in source.childNodes if kid.nodeName == name]
    for kid in matching:
        source.removeChild(kid)
        dest.appendChild(kid)
        if sep:
            dest.appendChild(doc.createTextNode(sep))


Fred Drake's avatar
Fred Drake committed
611
# Elements whose children are themselves scanned for paragraph material
# rather than being wrapped in a paragraph.
RECURSE_INTO_PARA_CONTAINERS = (
    "chapter",
    "abstract",
    "enumerate",
    "section",
    "subsection",
    "subsubsection",
    "paragraph",
    "subparagraph",
    "back-matter",
    "howto",
    "manual",
    "item",
    "itemize",
    "fulllineitems",
    "enumeration",
    "descriptionlist",
    "definitionlist",
    "definition",
    )
619 620

# Elements that sit at paragraph level: they end the paragraph being
# collected by build_para() and are skipped by skip_leading_nodes().
PARA_LEVEL_ELEMENTS = (
    "moduleinfo", "title", "verbatim", "enumerate", "item",
    "interpreter-session", "back-matter", "interactive-session",
    "opcodedesc", "classdesc", "datadesc",
    "cfuncdesc", "ctypedesc", "cvardesc",
    # "memberdescni" was misspelled "membderdescni", which matches no
    # element name used elsewhere in this module.
    "funcdesc", "methoddesc", "excdesc", "memberdesc", "memberdescni",
    "funcdescni", "methoddescni", "excdescni",
    "tableii", "tableiii", "tableiv", "localmoduletable",
    "sectionauthor", "seealso", "itemize",
    # include <para>, so we can just do it again to get subsequent paras:
    PARA_ELEMENT,
    )

# Elements that skip_leading_nodes() steps over when looking for the start
# of paragraph material.
PARA_LEVEL_PRECEEDERS = (
    "setindexsubitem",
    "author",
    "stindex",
    "obindex",
    "COMMENT",
    "label",
    "xi:include",
    "title",
    "versionadded",
    "versionchanged",
    "declaremodule",
    "modulesynopsis",
    "moduleauthor",
    "indexterm",
    "leader",
    )

Fred Drake's avatar
Fred Drake committed
640

641 642
def fixup_paras(doc, fragment):
    """Build paragraphs in fragment's containers and in all <description>s.

    Direct children of fragment named in RECURSE_INTO_PARA_CONTAINERS are
    processed first, then every <description> element found in fragment.
    """
    for child in fragment.childNodes:
        if child.nodeName not in RECURSE_INTO_PARA_CONTAINERS:
            continue
        fixup_paras_helper(doc, child)
    for description in find_all_elements(fragment, "description"):
        fixup_paras_helper(doc, description)
648 649


Fred Drake's avatar
Fred Drake committed
650
def fixup_paras_helper(doc, container, depth=0):
    """Wrap the loose content of container in paragraph elements.

    Children named in RECURSE_INTO_PARA_CONTAINERS are processed
    recursively; other material is handed to build_para().  depth is only
    consulted by the DEBUG_PARA_FIXER exit check.
    """
    # document is already normalized
    children = container.childNodes
    pos = skip_leading_nodes(children)
    while pos < len(children):
        current = children[pos]
        if current.nodeName in RECURSE_INTO_PARA_CONTAINERS:
            # Something to recurse into:
            fixup_paras_helper(doc, current)
        else:
            # Paragraph material:
            build_para(doc, container, pos, len(children))
            if DEBUG_PARA_FIXER and depth == 10:
                sys.exit(1)
        pos = skip_leading_nodes(children, pos + 1)
664 665 666 667 668 669


def build_para(doc, parent, start, i):
    """Wrap a run of parent's children, beginning at start, in a paragraph.

    Children from index start up to (not including) i are collected until a
    text node containing a blank line ("\\n\\n") or an element named in
    PARA_LEVEL_ELEMENTS / RECURSE_INTO_PARA_CONTAINERS is met.  The
    collected nodes are moved into a new PARA_ELEMENT element, which takes
    their place in parent.

    Returns the new child count of parent when the paragraph was appended
    at its end, otherwise the index just past the inserted paragraph.
    Raises ConversionError when no content could be collected.
    """
    children = parent.childNodes
    after = start + 1
    have_last = 0
    BREAK_ELEMENTS = PARA_LEVEL_ELEMENTS + RECURSE_INTO_PARA_CONTAINERS
    # Collect all children until \n\n+ is found in a text node or a
    # member of BREAK_ELEMENTS is found.
    for j in range(start, i):
        after = j + 1
        child = children[j]
        nodeType = child.nodeType
        if nodeType == ELEMENT:
            if child.tagName in BREAK_ELEMENTS:
                after = j
                break
        elif nodeType == TEXT:
            pos = child.data.find("\n\n")
            if pos == 0:
                after = j
                break
            if pos >= 1:
                # Keep the text before the blank line; the remainder is
                # split off into a following sibling.
                child.splitText(pos)
                break
    else:
        # Ran off the end of the range: the paragraph takes everything.
        have_last = 1
    if (start + 1) > after:
        raise ConversionError(
            "build_para() could not identify content to turn into a paragraph")
    if children[after - 1].nodeType == TEXT:
        # we may need to split off trailing white space:
        child = children[after - 1]
        data = child.data
        if data.rstrip() != data:
            have_last = 0
            child.splitText(len(data.rstrip()))
    para = doc.createElement(PARA_ELEMENT)
    prev = None
    # Move the nodes last to first, inserting each before the one moved
    # previously.  A descending range() is used because a range object has
    # no reverse() method on Python 3.
    for j in range(after - 1, start - 1, -1):
        node = parent.childNodes[j]
        parent.removeChild(node)
        para.insertBefore(node, prev)
        prev = node
    if have_last:
        parent.appendChild(para)
        parent.appendChild(doc.createTextNode("\n\n"))
        return len(parent.childNodes)
    else:
        nextnode = parent.childNodes[start]
        if nextnode.nodeType == TEXT:
            if nextnode.data and nextnode.data[0] != "\n":
                nextnode.data = "\n" + nextnode.data
        else:
            newnode = doc.createTextNode("\n")
            parent.insertBefore(newnode, nextnode)
            nextnode = newnode
            start = start + 1
        parent.insertBefore(para, nextnode)
        return start + 1

727

728
def skip_leading_nodes(children, start=0):
    """Return index into children of a node at which paragraph building should
    begin or a recursive call to fixup_paras_helper() should be made (for
    subsections, etc.).

    When the return value >= len(children), we've built all the paras we can
    from this list of children.
    """
    total = len(children)
    for pos in range(start, total):
        # skip over leading comments and whitespace:
        child = children[pos]
        kind = child.nodeType
        if kind == TEXT:
            data = child.data
            stripped = data.lstrip()
            if not stripped:
                # all whitespace, just skip
                continue
            if data == stripped:
                return pos
            # break into two nodes: whitespace and non-whitespace
            child.splitText(len(data) - len(stripped))
            return pos + 1
        if kind == ELEMENT:
            tag = child.tagName
            if tag in RECURSE_INTO_PARA_CONTAINERS \
               or tag not in PARA_LEVEL_ELEMENTS + PARA_LEVEL_PRECEEDERS:
                return pos
    # Nothing usable found: report the end of the list, or start itself if
    # it was already past the end.
    return max(start, total)
759 760


761
def fixup_rfc_references(doc, fragment):
    """Append display text such as "RFC 822" to each <rfc> and <pep> element.

    The text is the upper-cased tag name, a space, and the element's 'num'
    attribute.
    """
    for ref in find_all_elements_from_set(fragment, ("pep", "rfc")):
        label = ref.tagName.upper() + " " + ref.getAttribute("num")
        ref.appendChild(doc.createTextNode(label))
765 766


767 768 769
def fixup_signatures(doc, fragment):
    """Rewrite every argument list below the element children of `fragment`.

    For each element that is a direct child of `fragment`, all descendant
    <args> elements are passed to rewrite_args(), followed by all descendant
    <constructor-args> elements.
    """
    for child in fragment.childNodes:
        if child.nodeType == ELEMENT:
            # Same treatment for both kinds of argument list; <args> first,
            # as before.
            for gi in ("args", "constructor-args"):
                for arg in child.getElementsByTagName(gi):
                    rewrite_args(doc, arg)
def rewrite_args(doc, arglist):
    """Flatten <optional> markup in `arglist` and tidy the resulting text.

    After fixup_args() has replaced the <optional> elements and adjacent
    text nodes have been merged, an argument list that consists of exactly
    one text node has every run of whitespace collapsed to a single space
    (leading and trailing whitespace is dropped).
    """
    fixup_args(doc, arglist)
    arglist.normalize()
    if arglist.childNodes.length != 1:
        return
    only = arglist.firstChild
    if only.nodeType != TEXT:
        return
    words = only.data.split()
    only.data = ' '.join(words)

def fixup_args(doc, arglist):
    """Replace each <optional> child of `arglist` with bracketed content.

    An <optional> element is replaced, in place, by a "[" text node, the
    element's own children, and a "]" text node.  <optional> elements that
    were nested inside another one become children of `arglist` in the
    process and are handled on a later pass, so nesting to any depth is
    flattened.  `doc` is used only to create the bracket text nodes.

    Returns None.
    """
    # Iterate instead of recursing once per <optional>, so a long signature
    # cannot run into the recursion limit.
    while 1:
        optional = None
        for child in arglist.childNodes:
            if child.nodeName == "optional":
                optional = child
                break
        if optional is None:
            return
        arglist.insertBefore(doc.createTextNode("["), optional)
        # insertBefore() detaches the node from <optional> first, so
        # firstChild advances until the element is empty.
        while optional.firstChild is not None:
            arglist.insertBefore(optional.firstChild, optional)
        arglist.insertBefore(doc.createTextNode("]"), optional)
        arglist.removeChild(optional)


def fixup_sectionauthors(doc, fragment):
    """Convert each <sectionauthor name="..."/> into an <author> element.

    The element is renamed to "author", the value of its "name" attribute
    becomes its text content, and the attribute is removed.  It is then
    re-inserted into its parent, preceded by a "\\n  " text node:

    - before the third child, when the parent's second child is a <title>
      (that is, right after the title);
    - otherwise before the parent's first child.

    When the parent has too few children for those positions to exist, the
    author is appended at the end instead of raising IndexError.
    """
    for sectauth in find_all_elements(fragment, "sectionauthor"):
        section = sectauth.parentNode
        section.removeChild(sectauth)
        set_tagName(sectauth, "author")
        sectauth.appendChild(doc.createTextNode(
            sectauth.getAttribute("name")))
        sectauth.removeAttribute("name")
        siblings = section.childNodes
        # `after` is the reference node for insertBefore(); None means
        # "append at the end".
        after = None
        if len(siblings) > 1 and siblings[1].nodeName == "title":
            if len(siblings) > 2:
                after = siblings[2]
        elif len(siblings) > 0:
            after = siblings[0]
        section.insertBefore(doc.createTextNode("\n  "), after)
        section.insertBefore(sectauth, after)


def fixup_verbatims(doc):
    """Rename <verbatim> elements that look like interpreter sessions.

    A <verbatim> element whose first child is a text node starting with
    ">>>" (after leading whitespace is ignored) is renamed to
    <interactive-session>.  Empty <verbatim> elements are left alone.
    """
    for verbatim in find_all_elements(doc, "verbatim"):
        # firstChild is None for an empty element, where childNodes[0]
        # would raise IndexError.
        child = verbatim.firstChild
        if child is not None and child.nodeType == TEXT \
           and child.data.lstrip().startswith(">>>"):
            set_tagName(verbatim, "interactive-session")


def add_node_ids(fragment, counter=0):
    """Set a `node_id` attribute on `fragment` and on every node below it.

    `fragment` itself receives `counter`.  Its descendants receive larger
    values that increase in document order; the values are unique but not
    necessarily consecutive.

    Returns an integer greater than every id assigned by this call.
    """
    fragment.node_id = counter
    for node in fragment.childNodes:
        counter = counter + 1
        if node.nodeType == ELEMENT:
            # The recursive call labels `node` with `counter` and returns
            # a value past everything used inside that subtree.
            counter = add_node_ids(node, counter)
        else:
            node.node_id = counter
    return counter + 1


def fixup_ulink(doc, fragment):
    """Turn the two-child form of <ulink> into an element with an href.

    Each <ulink> is expected to have exactly two children: the first wraps
    the link content, the second wraps the URL as a single text node.  The
    URL becomes the "href" attribute of the <ulink>, and the children of
    the first wrapper become the direct children of the <ulink>; both
    wrappers are removed.

    Raises AssertionError when a <ulink> does not have that shape.
    """
    for ulink in find_all_elements(fragment, "ulink"):
        children = ulink.childNodes
        assert len(children) == 2, \
               "<ulink> must have exactly two children"
        text = children[0]
        href = children[1]
        # Merge adjacent text nodes so the URL is a single node.
        href.normalize()
        assert len(href.childNodes) == 1, \
               "<ulink> URL part must have exactly one child"
        assert href.childNodes[0].nodeType == TEXT, \
               "<ulink> URL part must contain only text"
        url = href.childNodes[0].data
        ulink.setAttribute("href", url)
        ulink.removeChild(href)
        # appendChild() detaches each node from `text`, so the first child
        # changes on every pass until `text` is empty.
        content = text.childNodes
        while len(content):
            ulink.appendChild(content[0])
        ulink.removeChild(text)


REFMODINDEX_ELEMENTS = ('refmodindex', 'refbimodindex',
                        'refexmodindex', 'refstmodindex')

def fixup_refmodindexes(fragment):
    """Fold <ref*modindex> elements into nearby <module> elements.

    Locate <ref*modindex>...</> co-located with <module>...</>, and
    remove the <ref*modindex>, replacing it with index=index on the
    <module> element.

    Requires the `node_id` attributes set by add_node_ids(); they are used
    to make sure that each parent is only processed once.
    """
    nodes = find_all_elements_from_set(fragment, REFMODINDEX_ELEMENTS)
    d = {}
    for node in nodes:
        parent = node.parentNode
        d[parent.node_id] = parent
    del nodes
    # An explicit loop: map() is lazy on Python 3 and would never call the
    # function when its result is discarded.
    for parent in d.values():
        fixup_refmodindexes_chunk(parent)


def fixup_refmodindexes_chunk(container):
    """Match the <ref*modindex> entries in `container` to <module> elements.

    An index entry whose "module" attribute equals the text of at least one
    <module> element in `container` causes every such <module> to get
    index="yes", and the entry itself is removed.  Index entries that have
    children are reported on stderr and left in place.  <module> elements
    whose content is not a single text node are ignored.
    """
    # node is probably a <para>; let's see how often it isn't:
    # (nodeName rather than tagName, so that a container which is not an
    # element, such as a document fragment, is reported instead of raising
    # AttributeError)
    if container.nodeName != PARA_ELEMENT:
        bwrite("--- fixup_refmodindexes_chunk(%s)\n" % container)
    module_entries = find_all_elements(container, "module")
    if not module_entries:
        return
    index_entries = find_all_elements_from_set(container, REFMODINDEX_ELEMENTS)
    removes = []
    for entry in index_entries:
        children = entry.childNodes
        if len(children) != 0:
            bwrite("--- unexpected number of children for %s node:\n"
                   % entry.tagName)
            ewrite(entry.toxml() + "\n")
            continue
        found = 0
        module_name = entry.getAttribute("module")
        for node in module_entries:
            if len(node.childNodes) != 1:
                continue
            name_node = node.childNodes[0]
            if name_node.nodeType != TEXT:
                # an element child has no .data to compare against
                continue
            if name_node.data == module_name:
                found = 1
                node.setAttribute("index", "yes")
        if found:
            removes.append(entry)
    for node in removes:
        # NOTE(review): the search helper may also return entries nested
        # deeper than direct children (TODO confirm); removing through the
        # entry's own parent is correct in either case.
        node.parentNode.removeChild(node)


def fixup_bifuncindexes(fragment):
    """Process the parent of every <bifuncindex> element in `fragment`.

    Each distinct parent is handed to fixup_bifuncindexes_chunk() once.
    Requires the `node_id` attributes set by add_node_ids().
    """
    nodes = find_all_elements(fragment, 'bifuncindex')
    d = {}
    # make sure that each parent is only processed once:
    for node in nodes:
        parent = node.parentNode
        d[parent.node_id] = parent
    del nodes
    # An explicit loop: map() is lazy on Python 3 and would never call the
    # function when its result is discarded.
    for parent in d.values():
        fixup_bifuncindexes_chunk(parent)


def fixup_bifuncindexes_chunk(container):
    """Match <bifuncindex> children of `container` to <function> children.

    A <function> element matches an entry when its first child is a text
    node of the form "NAME()" and NAME equals the entry's "name" attribute.
    Every matching <function> gets index="yes" and module="__builtin__";
    an entry with at least one match is removed from `container`.
    <function> elements that are empty or do not start with text are
    ignored.
    """
    removes = []
    entries = find_all_child_elements(container, "bifuncindex")
    function_entries = find_all_child_elements(container, "function")
    for entry in entries:
        function_name = entry.getAttribute("name")
        found = 0
        for func_entry in function_entries:
            first = func_entry.firstChild
            if first is None or first.nodeType != TEXT:
                # nothing to compare: empty element or markup first
                continue
            t2 = first.data
            if t2[-2:] != "()":
                continue
            if t2[:-2] == function_name:
                func_entry.setAttribute("index", "yes")
                func_entry.setAttribute("module", "__builtin__")
                found = 1
        if found:
            removes.append(entry)
    for entry in removes:
        container.removeChild(entry)


def join_adjacent_elements(container, gi):
    """Merge runs of directly adjacent `gi` elements below `container`.

    Whenever two sibling nodes named `gi` follow each other with nothing in
    between, the children of the second are appended to the first and the
    second is removed; a message is written to stderr for each merge.  The
    whole subtree under `container` is processed, using an explicit work
    list rather than recursion.
    """
    queue = [container]
    while queue:
        parent = queue.pop()
        children = parent.childNodes
        i = 0
        # The length is re-read on every pass because a merge removes a
        # sibling; a length cached before the loop leads to an IndexError
        # once something has been merged.  The loop also covers the last
        # child, so that its subtree is searched too.
        while i < len(children):
            child = children[i]
            if child.nodeName == gi and i + 1 < len(children):
                nextchild = children[i+1]
                if nextchild.nodeName == gi:
                    ewrite("--- merging two <%s/> elements\n" % gi)
                    while nextchild.firstChild is not None:
                        node = nextchild.firstChild
                        nextchild.removeChild(node)
                        child.appendChild(node)
                    parent.removeChild(nextchild)
                    # stay on `child`: its new neighbour may be another
                    # `gi` element
                    continue
            if child.nodeType == ELEMENT:
                queue.append(child)
            i = i + 1


# Attribute values that match this pattern are written with type TOKEN,
# all others with type CDATA.
_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")

def write_esis(doc, ofp, knownempty):
    """Write the children of `doc` to `ofp` as ESIS event lines.

    For each element, in this order: an "e" line if knownempty(tagName) is
    true, one "A<name> <type> <value>" line per attribute, a "(<tag>" line,
    the element's children (recursively), and a ")<tag>" line.  Text nodes
    are written as "-<data>" and entity references as "&<name>".  Text and
    attribute values are passed through esistools.encode().

    `knownempty` is a callable that takes a tag name and returns a true
    value for elements declared empty.

    Raises ValueError if a declared-empty element has children, and
    RuntimeError for any other node type.
    """
    for node in doc.childNodes:
        nodeType = node.nodeType
        if nodeType == ELEMENT:
            gi = node.tagName
            if knownempty(gi):
                if node.hasChildNodes():
                    raise ValueError(
                        "declared-empty node <%s> has children" % gi)
                ofp.write("e\n")
            for k, value in node.attributes.items():
                if _token_rx.match(value):
                    dtype = "TOKEN"
                else:
                    dtype = "CDATA"
                ofp.write("A%s %s %s\n" % (k, dtype, esistools.encode(value)))
            ofp.write("(%s\n" % gi)
            write_esis(node, ofp, knownempty)
            ofp.write(")%s\n" % gi)
        elif nodeType == TEXT:
            ofp.write("-%s\n" % esistools.encode(node.data))
        elif nodeType == ENTITY_REFERENCE:
            ofp.write("&%s\n" % node.nodeName)
        else:
            raise RuntimeError("unsupported node type: %s" % nodeType)


def convert(ifp, ofp):
    """Read an ESIS stream from `ifp`, fix up the tree, write it to `ofp`.

    The input is parsed with esistools into a document fragment, the fixup
    passes below are applied in order, and the result is written with
    write_esis().  An IOError with errno EPIPE raised while writing is
    ignored; any other IOError propagates.
    """
    events = esistools.parse(ifp)
    toktype, doc = events.getEvent()
    fragment = doc.createDocumentFragment()
    events.expandNode(fragment)

    normalize(fragment)
    simplify(doc, fragment)
    handle_labels(doc, fragment)
    handle_appendix(doc, fragment)
    fixup_trailing_whitespace(doc, fragment, {
        # element -> (before-end-tag, after-end-tag)
        "abstract": ("\n", "\n"),
        "title": ("", "\n"),
        "chapter": ("\n", "\n\n\n"),
        "section": ("\n", "\n\n\n"),
        "subsection": ("\n", "\n\n"),
        "subsubsection": ("\n", "\n\n"),
        "paragraph": ("\n", "\n\n"),
        "subparagraph": ("\n", "\n\n"),
        "description": ("\n", "\n\n"),
        "enumeration": ("\n", "\n\n"),
        "item": ("\n", "\n\n"),
        })
    cleanup_root_text(doc)
    cleanup_trailing_parens(fragment, ["function", "method", "cfunction"])
    cleanup_synopses(doc, fragment)
    fixup_descriptors(doc, fragment)
    fixup_verbatims(fragment)
    normalize(fragment)
    fixup_paras(doc, fragment)
    fixup_sectionauthors(doc, fragment)
    fixup_table_structures(doc, fragment)
    fixup_rfc_references(doc, fragment)
    fixup_signatures(doc, fragment)
    fixup_ulink(doc, fragment)
    # The two index passes below rely on the node_id attributes.
    add_node_ids(fragment)
    fixup_refmodindexes(fragment)
    fixup_bifuncindexes(fragment)
    # Take care of ugly hacks in the LaTeX markup to avoid LaTeX and
    # LaTeX2HTML screwing with GNU-style long options (the '--' problem).
    join_adjacent_elements(fragment, "option")
    # Attempt to avoid trailing blank lines:
    fragment.normalize()
    last = fragment.lastChild
    # Only a trailing text node can be trimmed; an empty fragment has no
    # last child and an element has no .data.
    if last is not None and last.nodeType == TEXT \
       and last.data[-1:] == "\n":
        last.data = last.data.rstrip() + "\n"
    #
    empties = {}
    for gi in events.parser.get_empties():
        empties[gi] = gi
    # These elements are given text content by the fixup passes above, so
    # they must not be treated as empty by write_esis().
    for key in ("author", "pep", "rfc"):
        if key in empties:
            del empties[key]
    knownempty = empties.__contains__
    #
    try:
        write_esis(fragment, ofp, knownempty)
    except IOError:
        # Ignore EPIPE; it just means that whoever we're writing to stopped
        # reading.  The rest of the output would be ignored.  All other errors
        # should still be reported.  The errno attribute is used rather than
        # unpacking the exception, which fails for one-argument IOErrors.
        if sys.exc_info()[1].errno != errno.EPIPE:
            raise


def main():
    """Command-line entry point.

    With no arguments, convert stdin to stdout.  With one argument, read
    from the named file and write to stdout.  With two arguments, read from
    the first file and write to the second; the output is collected in
    memory and the output file is only opened after the conversion has
    finished.  With any other number of arguments, call usage() and exit
    with status 2.
    """
    nargs = len(sys.argv)
    if nargs == 1:
        ifp = sys.stdin
        ofp = sys.stdout
    elif nargs == 2:
        ifp = open(sys.argv[1])
        ofp = sys.stdout
    elif nargs == 3:
        ifp = open(sys.argv[1])
        import StringIO
        ofp = StringIO.StringIO()
    else:
        # NOTE(review): usage() is not defined in the part of the file
        # visible here -- confirm that it exists.
        usage()
        sys.exit(2)
    try:
        convert(ifp, ofp)
    finally:
        # Close the input file even when the conversion fails; stdin is
        # not ours to close.
        if ifp is not sys.stdin:
            ifp.close()
    if nargs == 3:
        fp = open(sys.argv[2], "w")
        try:
            fp.write(ofp.getvalue())
        finally:
            fp.close()
        ofp.close()


# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()