#!/bin/python

"""
opmlparser.py

David Janes
BlogMatrix
2004.01.18

A pre-order depth-first OPML walker. Tags are stored as instance attributes
on each Outline node, values as unicode strings. Several useful functions
for returning values are provided.

See the __main__ section below for an example of how to use this.
"""

# Copyright (c) 2004 David P. Janes. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# - Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the distribution.
# - Redistributions in any form must be accompanied by information on how
#   to obtain complete source code for this software and any accompanying
#   software that uses this software. The source code must either be
#   included in the distribution or be available for no more than the cost
#   of distribution plus a nominal fee, and must be freely redistributable
#   under reasonable conditions. For an executable file, complete source
#   code means the source code for all modules it contains. It does not
#   include source code for modules or files that typically accompany the
#   major components of the operating system on which the executable file
#   runs.
# - Redistributions in any form must not delete, modify or
#   otherwise alter the code in "BlogWelcome.py" and/or "BlogManagerMixinSources.py"
# - Redistributions in any form must not
#   disable or otherwise circumvent calls to code in "BlogWelcome.py"
#   and/or "BlogManagerMixinSources.py"
#
# THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT,
# ARE DISCLAIMED. IN NO EVENT SHALL SLEEPYCAT SOFTWARE BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import re
import sys
import traceback
import types
import xml.parsers.expat

# When run as a script (see the demo at the bottom of this file), make the
# sibling "generic" directory importable. The path is relative to the current
# working directory, not to this file.
if __name__ == '__main__':
    sys.path.append("../generic")

# Matches every character that may NOT appear in a stored tag name; tags are
# "packed" by deleting these characters (so 'bm:class' becomes 'bmclass').
pack_re = "[^-_a-zA-Z0-9]"
pack_rex = re.compile(pack_re)


class Outline:
    """One node of an outline tree.

    Each tag handed to set() is stored as an instance attribute named after
    the packed tag (see pack_rex). '_parent' is the enclosing node (None for
    the root) and '_children' is the ordered list of nested nodes.
    """

    def __init__(self, parent = None):
        self._parent = parent
        self._children = []

    def set(self, tag, value):
        """Store 'value' on this node under the packed form of 'tag'.

        The tag is silently ignored when it
        - is empty once packed,
        - starts with '_' (reserved for this class's own bookkeeping), or
        - has the name of something defined on the class (e.g. 'get' or
          'children'), because storing it would replace that method on
          this node.
        """

        # Pack first, then coerce: what is left is pure ASCII, so str() gives
        # a native attribute name on both Python 2 (unicode input) and 3.
        tag = str(pack_rex.sub("", tag))
        if not tag or tag[0] == '_':
            return

        if hasattr(self.__class__, tag):
            return

        setattr(self, tag, value)

    def get(self, tag, otherwise = None, inherit = False, parent = False):
        """Return the value for 'tag' at this node

        otherwise - if the tag isn't found (at all), use this value
        inherit - if the tag isn't found, get it from a parent
        parent - use the parent of this node rather than this node
        """

        if parent:
            if self._parent is None:
                return otherwise

            return self._parent.get(tag, otherwise, inherit)

        try:
            # Look only at what is stored on the instance, so a tag that
            # happens to be named like a method never yields that method.
            return self.__dict__[pack_rex.sub("", tag)]
        except (KeyError, TypeError):
            # KeyError: nothing stored under this tag.
            # TypeError: 'tag' is not a string; treated as "not found".
            if inherit and self._parent is not None:
                return self._parent.get(tag, otherwise, inherit)

            return otherwise

    def hierarchy(self, tag, otherwise = None):
        """Return the values of 'tag' from the outermost ancestor down to
        this node.

        The root node (the one without a parent) is never included. A node
        where the tag is missing contributes 'otherwise'; values that are
        None are left out.
        """

        result = []

        node = self
        while node is not None and node._parent is not None:
            value = node.get(tag, otherwise)
            if value is not None:
                result.append(value)

            node = node._parent

        # Collected innermost-first; callers expect outermost-first.
        result.reverse()
        return result

    def children(self):
        """Return the list of direct child nodes (the live list, not a copy)."""
        return self._children

    def __iter__(self):
        return self.outline_iter()

    def outline_iter(self):
        """Yield this node, then every descendant, in pre-order."""
        yield self

        for child in self._children:
            for node in child.outline_iter():
                yield node

# Matches a '&' that does NOT begin an entity or character reference, i.e. one
# that has to be escaped before the document is well-formed XML. Recognised
# references are the five predefined XML entities plus decimal (&#38;) and
# hexadecimal (&#x26;) character references; matching is case-insensitive.
amp_re = r"&(?!(?:amp|lt|gt|quot|apos|#[0-9]+|#x[0-9a-f]+);)"
amp_rex = re.compile(amp_re, re.I)

class OPMLParser:
    """Parse an OPML document into a tree of Outline nodes using expat.

    After feed() has returned:

    head       - dict: name of each direct child element of <head> -> its text
    additional - list of ( key, attributes, data ) for elements nested deeper
                 inside <head>; key is the "/"-joined path of element names
                 below <head>
    outline    - the root Outline; one node per <outline> element found in
                 <body> hangs below it

    Iterating the parser yields the outline nodes in pre-order, skipping the
    root.
    """

    # NOTE(review): 'Log' is called below but is neither defined nor imported
    # anywhere in this file; presumably the surrounding project provides
    # it -- confirm.

    def __init__(self):
        self.Reset()
        self.translations = {}

    def Reset(self):
        """Create a fresh expat parser and clear everything parsed so far.

        Translations registered with translate() are kept.
        """
        self.parser = xml.parsers.expat.ParserCreate()

        self.parser.StartElementHandler = self.on_start_element
        self.parser.EndElementHandler = self.on_end_element
        self.parser.CharacterDataHandler = self.on_char_data

        self.stack = []
        self.head = {}
        self.additional = [] # ( key, attributes, data )

        self.outline = Outline()

        self.in_head = False
        self.in_body = False
        self.capture = None

    def translate(self, f, t):
        """Store <outline> attribute 'f' under the tag name 't' instead."""
        self.translations[f] = t

    def feed(self, data):
        """Parse a complete OPML document.

        data - the whole document, as a byte string or a text string

        If the document does not parse as-is, repairs are tried in turn,
        restarting from a clean state each time:

        1. escape every '&' that does not start an entity reference
        2. (byte strings only) decode the data as UTF-8 first
        3. (byte strings only) decode the data as LATIN-1 first

        The traceback of every failed attempt is printed to stderr.

        Raises ValueError when 'data' is empty or no attempt succeeded.
        """
        if not data:
            Log("ignoring empty OPML file")
            raise ValueError("Empty OPML File")

        # The parser made by __init__ / Reset() is still unused here.
        if self._parse(data, reset = False):
            return

        Log("fixing OPML file", attempt = 1, method = "trying to fix '&'")
        data = self._escape_ampersands(data)

        if self._parse(data):
            return

        # On Python 2 'bytes' is 'str', so this is the plain-string case.
        if isinstance(data, bytes):
            Log("fixing OPML file", attempt = 2, method = "flattening to UTF-8")
            if self._parse(data, encoding = 'utf-8'):
                return

            Log("fixing OPML file", attempt = 3, method = "flattening to LATIN-1")
            if self._parse(data, encoding = 'latin-1', errors = 'replace'):
                return

        Log("broken OPML file ... nothing could parse it further")
        raise ValueError("Broken OPML File")

    # the rest of this class is internal
    def _parse(self, data, reset = True, encoding = None, errors = 'strict'):
        """Run one parse attempt; return True on success, False on failure.

        reset    - start from a fresh parser and empty results
        encoding - when given, decode 'data' with it (and 'errors') first
        """
        try:
            if encoding is not None:
                data = data.decode(encoding, errors)

            if reset:
                self.Reset()

            self.parser.Parse(data, 1)
        except Exception:
            traceback.print_exc(file = sys.stderr)
            return False

        return True

    def _escape_ampersands(self, data):
        """Return 'data' with every '&' matched by amp_rex turned into '&amp;'."""
        if isinstance(data, bytes) and not isinstance(data, str):
            # Python 3 bytes: amp_rex is a text pattern, so round-trip through
            # LATIN-1, which maps every byte to one character and back.
            return amp_rex.sub("&amp;", data.decode('latin-1')).encode('latin-1')

        return amp_rex.sub("&amp;", data)

    def on_start_element(self, name, attrs):
        self.stack.append((name, attrs))
        depth = len(self.stack)

        if depth == 2 and name == "head":
            self.in_head = True

        if depth == 2 and name == "body":
            self.in_body = True
        elif self.in_head:
            # Start collecting the text of this element (this also runs for
            # <head> itself).
            self.capture = ""
        elif self.in_body and name == "outline":
            node = Outline(self.outline)
            for key, value in attrs.items():
                node.set(self.translations.get(key, key), value)

            self.outline._children.append(node)
            self.outline = node

    def on_end_element(self, name):
        last_name, last_attrs = self.stack.pop()
        depth = len(self.stack)

        if self.in_head:
            if depth == 2:
                # A direct child of <head>. Keep a native-string key: text
                # names are encoded on Python 2 and left alone on Python 3.
                if isinstance(name, str):
                    key = name
                else:
                    key = name.encode('latin-1', 'ignore')

                self.head[key] = self.capture
                self.capture = None
            elif depth == 1:
                # That was </head> itself.
                self.in_head = False
            elif depth > 2:
                # Nested below a child of <head>: record it under its path.
                path = [ancestor for ancestor, _ in self.stack[2:]]
                path.append(name)

                self.additional.append(("/".join(path), dict(last_attrs), self.capture))
        elif self.in_body:
            if name == "outline":
                self.outline = self.outline._parent

    def on_char_data(self, data):
        if self.capture is not None:
            self.capture += data

    def __iter__(self):
        """
        This skips the leading element, which really isn't an outline.
        """
        ii = self.outline.outline_iter()
        next(ii)

        return ii

# Example / smoke test: parse the OPML file named on the command line (or
# read from stdin) and pretty-print what was found.
if __name__ == '__main__':
    import sys
    import pprint

    # here's a few OPML samples. You'll have to download these yourself
    # - http://opml.scripting.com/discuss/reader$19.opml
    # - http://radio.weblogs.com/0001000/gems/blogrollFlat.opml

    if len(sys.argv) > 1:
        # 'with' closes the file even if read() fails.
        with open(sys.argv[1], 'rb') as fin:
            data = fin.read()
    else:
        data = sys.stdin.read()

    opml_parser = OPMLParser()
    # Normalise attribute names that different OPML producers use.
    opml_parser.translate('text', 'title')
    opml_parser.translate('url', 'htmlUrl')
    opml_parser.translate('rssUrl', 'xmlUrl')
    opml_parser.feed(data)

    pprint.pprint(opml_parser.head)
    pprint.pprint(opml_parser.additional)

    for node in opml_parser:
        pprint.pprint({
            'title' : node.get('title'),
            'htmlUrl' : node.get('htmlUrl'),
            'category' : node.get('title', otherwise = '', parent = True),
            'class' : node.get('bm:class', otherwise = 'unknown', inherit = True),
            'hierarchy' : node.hierarchy('title', otherwise = ''),
        })