Traversing minidom's tree (Parsing XML to text)

Wednesday, Dec 26, 2007

"""
Traverses the minidom tree and prints out the text nodes.
"""
from StringIO import StringIO
import xml.dom.minidom as dom
import sys
def xmltotxt(node, indentationLevel=0):
    """Print an element's tag name, then its text and child elements.

    The ``TAG:`` line is indented by ``indentationLevel`` spaces. Text nodes
    that are direct children of ``node`` are printed unindented and exactly
    as stored, so whitespace-only text between tags is printed too. Child
    elements are handled recursively with four more spaces of indentation.
    Children of any other node type are skipped.

    Args:
        node: a DOM element node (it must provide ``tagName`` and
            ``childNodes``), e.g. ``doc.documentElement``.
        indentationLevel: number of spaces placed before the ``TAG:`` line.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(indentationLevel * ' ' + 'TAG:' + node.tagName)
    for child in node.childNodes:
        # A node has exactly one type, so the two cases are exclusive.
        if child.nodeType == dom.Node.TEXT_NODE:
            print(child.data)
        elif child.nodeType == dom.Node.ELEMENT_NODE:
            xmltotxt(child, indentationLevel + 4)
if __name__ == '__main__':
    # Usage: <script> ARTICLE_XML [DTD_PATH]
    if len(sys.argv) < 2:
        sys.exit('usage: %s ARTICLE_XML [DTD_PATH]' % sys.argv[0])

    # My XML was missing a doctype reference which the parser needs...
    default_dtd = '/home/hendry/inex/inex-1.4/dtd/xmlarticle.dtd'
    # An optional second argument overrides the hard-coded DTD location.
    dtd = sys.argv[2] if len(sys.argv) > 2 else default_dtd

    # The template previously read '\n\n' with no %s conversion, so
    # "'\n\n' % dtd" raised TypeError before anything was parsed.
    # NOTE(review): the DOCTYPE markup looks like it was stripped from the
    # published listing; the root name 'article' is assumed from the DTD
    # file name -- confirm against the real documents.
    front = '<!DOCTYPE article SYSTEM "%s">\n\n' % dtd

    # ... and is added to the front of the article
    # 'with' closes the file even if read() raises.
    with open(sys.argv[1]) as article_file:
        article = front + article_file.read()

    doc = dom.parseString(article)
    xmltotxt(doc.documentElement)