# Published: Wednesday, Dec 26, 2007. Last modified: Thursday, Nov 14, 2024.
"""
Traverses the minidom tree and prints out the text nodes.
"""
from StringIO import StringIO
import xml.dom.minidom as dom
import sys
def xmltotxt(node, indentationLevel=0):
    """Print the tag names and text content of a minidom element tree.

    The element's tag name is printed as ``TAG:<name>``, prefixed with
    ``indentationLevel`` spaces.  Its children are then visited in document
    order: the data of each text node is printed as-is (without any
    indentation), and each child element is handled recursively with the
    indentation increased by four.  Nodes of any other type are ignored.

    Args:
        node: The element to start from; it must provide ``tagName`` and
            ``childNodes``, as ``xml.dom.minidom`` elements do.
        indentationLevel: Number of spaces printed before the ``TAG:`` line.
    """
    # The single-argument print(...) form produces identical output on
    # Python 2 and Python 3, unlike the Python-2-only print statement.
    print(indentationLevel * ' ' + 'TAG:' + node.tagName)
    for child in node.childNodes:
        if child.nodeType == dom.Node.TEXT_NODE:
            print(child.data)
        elif child.nodeType == dom.Node.ELEMENT_NODE:
            xmltotxt(child, indentationLevel + 4)
if __name__ == '__main__':
    # Usage: pass the path of the XML article as the only argument.
    if len(sys.argv) < 2:
        sys.exit('usage: %s ARTICLE.xml' % sys.argv[0])
    # My XML was missing a doctype reference which the parser needs...
    dtd = '/home/hendry/inex/inex-1.4/dtd/xmlarticle.dtd'
    # NOTE(review): the markup inside this literal was lost (it read
    # '\n\n' % dtd, which raises TypeError because there is no %s
    # placeholder).  The declaration below is a reconstruction; the root
    # element name "article" is assumed from the DTD file name -- confirm
    # against the input documents.
    front = '<!DOCTYPE article SYSTEM "%s">\n' % dtd
    # The with-block closes the file even if reading fails.
    with open(sys.argv[1]) as source:
        # ... and is added to the front of the article
        article = front + source.read()
    # parseString takes the document text directly, so no file-like
    # wrapper is needed around the string.
    doc = dom.parseString(article)
    xmltotxt(doc.documentElement)