ipodderx-core/opmlparser.py

295 lines
8.3 KiB
Python

#!/bin/python
"""
opmlparser.py
David Janes
BlogMatrix
2004.01.18
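A pre-order depth-first OPML walker. Each outline element becomes an
Outline node; its XML attributes are stored as attributes of that node,
with unicode string values. Several helper methods for reading those
values (with defaults, inheritance and parent lookup) are provided.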
See the __main__ section below for an example of how to use this.
"""
# Copyright (c) 2004 David P. Janes. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Redistributions in any form must be accompanied by information on how
# to obtain complete source code for this software and any accompanying
# software that uses this software. The source code must either be
# included in the distribution or be available for no more than the cost
# of distribution plus a nominal fee, and must be freely redistributable
# under reasonable conditions. For an executable file, complete source
# code means the source code for all modules it contains. It does not
# include source code for modules or files that typically accompany the
# major components of the operating system on which the executable file
# runs.
# - Redistributions in any form must not delete, modify or
# otherwise alter the code in "BlogWelcome.py" and/or "BlogManagerMixinSources.py"
# - Redistributions in any form must not
# disable or otherwise circumvent calls to code in "BlogWelcome.py"
# and/or "BlogManagerMixinSources.py"
#
# THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT,
# ARE DISCLAIMED. IN NO EVENT SHALL SLEEPYCAT SOFTWARE BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import xml.parsers.expat
import re
import types
import sys
import traceback
# Script mode only: extend the module search path with "../generic".
# The entry is relative, so it is resolved against the current working
# directory rather than against this file's location.
if __name__ == '__main__':
    sys.path.append("../generic")
# Every character that is NOT allowed in a stored tag name.
pack_re = "[^-_a-zA-Z0-9]"
pack_rex = re.compile(pack_re)


def _pack_tag(tag):
    """Strip 'tag' down to the characters permitted by pack_rex.

    Byte strings are decoded as latin-1 first (every byte maps to one code
    point, so this cannot fail). The result is a native str; that is safe
    because only ASCII characters survive the substitution.
    """
    if isinstance(tag, bytes):
        tag = tag.decode('latin-1')
    return str(pack_rex.sub("", tag))


class Outline:
    """A single node in a tree of outline elements.

    Values handed to set() are stored as attributes of the node, so
    'node.title' and 'node.get("title")' both work. A tag whose name
    collides with something defined on the class (for example 'get' or
    'children') is kept in a private table instead, so that it cannot
    replace a method on the node.
    """

    def __init__(self, parent = None):
        self._parent = parent
        self._children = []
        # values whose tag names would otherwise shadow a class attribute
        self._reserved = {}

    def set(self, tag, value):
        """Store 'value' under 'tag'.

        The tag is reduced to letters, digits, '-' and '_'. Tags that end
        up empty or that start with '_' are ignored, which keeps the
        node's own private fields out of reach.
        """
        tag = _pack_tag(tag)
        if not tag or tag[0] == '_':
            return

        if hasattr(self.__class__, tag):
            # e.g. an attribute literally called "get": storing it with
            # setattr would hide the method of the same name on this node
            self._reserved[tag] = value
        else:
            setattr(self, tag, value)

    def get(self, tag, otherwise = None, inherit = False, parent = False):
        """Return the value for 'tag' at this node

        otherwise - if the tag isn't found (at all), use this value
        inherit - if the tag isn't found, get it from a parent
        parent - use the parent of this node rather than this node

        Only values stored on the node itself are considered; methods and
        other class-level attributes are never returned.
        """
        if parent:
            if not self._parent:
                return otherwise
            return self._parent.get(tag, otherwise, inherit)

        key = _pack_tag(tag)

        own = vars(self)
        if key in own:
            return own[key]
        if key in self._reserved:
            return self._reserved[key]

        if inherit and self._parent:
            return self._parent.get(tag, otherwise, inherit)
        return otherwise

    def hierarchy(self, tag, otherwise = None):
        """Return the values of 'tag' along the path down to this node.

        The list is ordered outermost first and ends with this node's own
        value. The top node of the tree (the one without a parent) is not
        included. A node that lacks the tag contributes 'otherwise',
        unless that is None, in which case the node is skipped.
        """
        result = []
        node = self
        while node and node._parent:
            value = node.get(tag, otherwise)
            if value is not None:
                result.append(value)
            node = node._parent

        # collected from the inside out; callers expect outside in
        result.reverse()
        return result

    def children(self):
        """Return the list of direct child nodes."""
        return self._children

    def __iter__(self):
        return self.outline_iter()

    def outline_iter(self):
        """Yield this node, then every descendant, in pre-order."""
        yield self
        for child in self._children:
            for node in child.outline_iter():
                yield node
# An '&' that does NOT start one of the five predefined XML entities or a
# decimal/hexadecimal character reference, i.e. a bare ampersand that makes
# the document ill-formed until it is escaped.
amp_re = """&(?!(?:amp|gt|lt|quot|apos|#[0-9]+|#x[0-9a-f]+);)"""
amp_rex = re.compile(amp_re, re.I|re.MULTILINE|re.DOTALL)


class OPMLParser:
    """Expat-driven OPML reader.

    After feed() succeeds:
      head       - dict of the direct children of <head>: name -> text
      additional - list of (path, attributes, text) for elements nested
                   more deeply inside <head>
      iteration  - yields one Outline node per <outline> element in <body>,
                   in document (pre-order) sequence

    NOTE(review): Log() is called by feed() but is neither defined nor
    imported in this file; presumably it is provided by a module under
    "../generic" - confirm before relying on the retry paths.
    """

    def __init__(self):
        self.Reset()
        self.translations = {}

    def Reset(self):
        """Install a fresh expat parser and discard all parse results.

        'translations' is not touched, so renames registered through
        translate() stay in effect across the retries done by feed().
        """
        self.parser = xml.parsers.expat.ParserCreate()
        self.parser.StartElementHandler = self.on_start_element
        self.parser.EndElementHandler = self.on_end_element
        self.parser.CharacterDataHandler = self.on_char_data

        self.stack = []         # (name, attrs) of every currently open element
        self.head = {}
        self.additional = []    # ( key, attributes, data )
        self.outline = Outline()
        self.in_head = False
        self.in_body = False
        self.capture = None     # text being collected, or None when idle

    def translate(self, f, t):
        """Store <outline> attribute 'f' under the name 't' instead."""
        self.translations[f] = t

    def feed(self, data):
        """Parse a complete OPML document.

        If the document does not parse as given, progressively more
        forgiving repairs are tried: escaping bare '&' characters, then
        (for byte strings only) decoding as UTF-8 and finally as latin-1.
        Each failed attempt prints its traceback to stderr.

        Raises ValueError if 'data' is empty or if no attempt succeeds.
        (These used to be string exceptions, which no longer work on
        Python 2.6 or later.)
        """
        if not data:
            Log("ignoring empty OPML file")
            raise ValueError("Empty OPML File")

        # The first try uses the parser that is already installed.
        if self._parse(data, reset = False):
            return

        Log("fixing OPML file", attempt = 1, method = "trying to fix '&'")
        data = self._escape_ampersands(data)
        if self._parse(data):
            return

        if isinstance(data, bytes):
            original_data = data

            Log("fixing OPML file", attempt = 2, method = "flattening to UTF-8")
            try:
                decoded = original_data.decode('utf-8')
            except UnicodeError:
                traceback.print_exc(file = sys.stderr)
            else:
                if self._parse(decoded):
                    return

            Log("fixing OPML file", attempt = 3, method = "flattening to LATIN-1")
            if self._parse(original_data.decode('latin-1', 'replace')):
                return

        Log("broken OPML file ... nothing could parse it further")
        raise ValueError("Broken OPML File")

    # the rest of this class is internal

    def _parse(self, data, reset = True):
        """Run one parse attempt; return True on success, False on failure.

        A failure is reported on stderr and otherwise swallowed so that
        feed() can move on to its next repair strategy.
        """
        try:
            if reset:
                self.Reset()
            self.parser.Parse(data, 1)
        except Exception:
            traceback.print_exc(file = sys.stderr)
            return False
        return True

    @staticmethod
    def _escape_ampersands(data):
        """Return 'data' with every bare '&' rewritten as '&amp;'.

        Byte strings take a latin-1 round trip so the text pattern can be
        applied; latin-1 maps each byte to one code point, so all other
        bytes come back unchanged.
        """
        if isinstance(data, bytes):
            text = data.decode('latin-1')
            return amp_rex.sub("&amp;", text).encode('latin-1')
        return amp_rex.sub("&amp;", data)

    @staticmethod
    def _native_str(text):
        """Return 'text' as a native str limited to latin-1 characters."""
        encoded = text.encode('latin-1', 'ignore')
        if isinstance(encoded, str):
            # Python 2: the encoded byte string already is the native str
            return encoded
        return encoded.decode('latin-1')

    def on_start_element(self, name, attrs):
        self.stack.append((name, attrs))
        depth = len(self.stack)

        if depth == 2 and name == "head":
            self.in_head = True

        # Not chained to the test above: on <head> itself the 'in_head'
        # branch below still runs and opens a capture buffer.
        if depth == 2 and name == "body":
            self.in_body = True
        elif self.in_head:
            self.capture = ""
        elif self.in_body and name == "outline":
            node = Outline(self.outline)
            for key, value in attrs.items():
                node.set(self.translations.get(key, key), value)

            self.outline._children.append(node)
            self.outline = node

    def on_end_element(self, name):
        last_attrs = self.stack.pop()[1]

        if self.in_head:
            # number of elements still open once this one has closed
            depth = len(self.stack)
            if depth == 2:
                # a direct child of <head>
                self.head[self._native_str(name)] = self.capture
                self.capture = None
            elif depth == 1:
                # </head> itself; drop the buffer so that text in <body>
                # is not accumulated for nothing
                self.in_head = False
                self.capture = None
            elif depth > 2:
                # nested deeper: key is the path below <head>, '/'-joined
                key = [open_name for open_name, open_attrs in self.stack[2:]]
                key.append(name)
                key = "/".join(key)

                self.additional.append((key, dict(last_attrs), self.capture))
        elif self.in_body:
            if name == "outline":
                self.outline = self.outline._parent

    def on_char_data(self, data):
        if self.capture is not None:
            self.capture += data

    def __iter__(self):
        """
        This skips the leading element, which really isn't an outline.
        """
        ii = self.outline.outline_iter()
        next(ii)
        return ii
if __name__ == '__main__':
    # Demo: parse the OPML file named on the command line (or stdin when
    # no argument is given) and pretty-print what was found.
    import sys
    import pprint

    # here's a few OPML samples. You'll have to download these yourself
    # - http://opml.scripting.com/discuss/reader$19.opml
    # - http://radio.weblogs.com/0001000/gems/blogrollFlat.opml

    if len(sys.argv) > 1:
        source = open(sys.argv[1], 'rb')
    else:
        source = sys.stdin
    document = source.read()
    source.close()

    walker = OPMLParser()
    # fold the attribute spellings used by different producers together
    for old_name, new_name in (
        ('text', 'title'),
        ('url', 'htmlUrl'),
        ('rssUrl', 'xmlUrl'),
    ):
        walker.translate(old_name, new_name)
    walker.feed(document)

    pprint.pprint(walker.head)
    pprint.pprint(walker.additional)

    for entry in walker:
        summary = {}
        summary['title'] = entry.get('title')
        summary['htmlUrl'] = entry.get('htmlUrl')
        summary['category'] = entry.get('title', otherwise = '', parent = True)
        summary['class'] = entry.get('bm:class', otherwise = 'unknown', inherit = True)
        summary['hierarchy'] = entry.hierarchy('title', otherwise = '')
        pprint.pprint(summary)