#!/bin/python

"""
opmlparser.py

David Janes
BlogMatrix
2004.01.18

A pre-order depth-first OPML walker. Tags are stored as class attributes,
values as unicode strings. Several useful functions for returning values
are provided.

See the __main__ section below for an example of how to use this.
"""

# Copyright (c) 2004 David P. Janes. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# - Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the distribution.
# - Redistributions in any form must be accompanied by information on how
#   to obtain complete source code for this software and any accompanying
#   software that uses this software. The source code must either be
#   included in the distribution or be available for no more than the cost
#   of distribution plus a nominal fee, and must be freely redistributable
#   under reasonable conditions. For an executable file, complete source
#   code means the source code for all modules it contains. It does not
#   include source code for modules or files that typically accompany the
#   major components of the operating system on which the executable file
#   runs.
# - Redistributions in any form must not delete, modify or
#   otherwise alter the code in "BlogWelcome.py" and/or "BlogManagerMixinSources.py"
# - Redistributions in any form must not
#   disable or otherwise circumvent calls to code in "BlogWelcome.py"
#   and/or "BlogManagerMixinSources.py"
#
# THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT,
# ARE DISCLAIMED. IN NO EVENT SHALL SLEEPYCAT SOFTWARE BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import xml.parsers.expat
import re
import types
import sys
import traceback

if __name__ == '__main__':
    sys.path.append("../generic")

# Every character that is NOT allowed in an attribute name stored on an
# Outline node; such characters are stripped from tags (so "bm:class" is
# stored and looked up as "bmclass").
pack_re = r"[^-_a-zA-Z0-9]"
pack_rex = re.compile(pack_re)


class OPMLError(Exception):
    """Base class for the errors raised by OPMLParser.feed()."""


class EmptyOPMLError(OPMLError):
    """Raised when feed() is given no data at all."""


class BrokenOPMLError(OPMLError):
    """Raised when none of the repair strategies yields parseable XML."""


def _log(message, **fields):
    """Report 'message' through a host-supplied Log() function, if any.

    This module does not import or define Log itself. When no global or
    builtin of that name exists, the message (with its keyword fields) is
    written to stderr instead of failing with NameError.
    """
    try:
        log = Log  # noqa: F821 - optionally provided by the embedding application
    except NameError:
        details = ", ".join(["%s=%s" % (key, fields[key]) for key in sorted(fields)])
        if details:
            message = "%s (%s)" % (message, details)
        sys.stderr.write(message + "\n")
    else:
        log(message, **fields)


def _to_native_str(text):
    """Return 'text' as the interpreter's native str, limited to Latin-1.

    Characters outside Latin-1 are dropped. On Python 2 the result is a byte
    string; on Python 3 it is a text string, so it can be used as an
    attribute name or dictionary key on either version.
    """
    if str is bytes:
        # Python 2: the native str type is the byte string.
        return text.encode('latin-1', 'ignore')
    if isinstance(text, bytes):
        return text.decode('latin-1')
    return text.encode('latin-1', 'ignore').decode('latin-1')


class Outline:
    """One node of the outline tree.

    Values stored with set() become plain instance attributes; the parent
    node and the list of child nodes are kept in '_parent' and '_children'.
    """

    def __init__(self, parent = None):
        self._parent = parent
        self._children = []

    def set(self, tag, value):
        """Store 'value' under 'tag' as an attribute of this node.

        Characters matched by 'pack_rex' are removed from the tag first.
        The value is silently dropped when the cleaned tag is empty, starts
        with '_' (reserved for the node's own bookkeeping) or would replace
        an attribute of the class such as the get() or set() methods.
        """
        tag = pack_rex.sub("", _to_native_str(tag))
        if not tag or tag[0] == '_':
            return

        # An input attribute called e.g. "get" must not overwrite the method.
        if hasattr(type(self), tag):
            return

        setattr(self, tag, value)

    def get(self, tag, otherwise = None, inherit = False, parent = False):
        """Return the value for 'tag' at this node

        otherwise - if the tag isn't found (at all), use this value
        inherit - if the tag isn't found, get it from a parent
        parent - use the parent of this node rather than this node
        """
        if parent:
            if not self._parent:
                return otherwise
            return self._parent.get(tag, otherwise, inherit)

        try:
            return getattr(self, pack_rex.sub("", tag))
        except (AttributeError, TypeError):
            # AttributeError: no such value on this node.
            # TypeError: 'tag' is not a string at all.
            if inherit and self._parent:
                return self._parent.get(tag, otherwise, inherit)
            return otherwise

    def hierarchy(self, tag, otherwise = None):
        """Return the values of 'tag' from the outermost ancestor to this node.

        The walk stops before the node that has no parent, so that node
        never contributes. A node without the tag contributes 'otherwise',
        or nothing at all when 'otherwise' is None.
        """
        values = []
        node = self
        while node is not None and node._parent:
            value = node.get(tag, otherwise)
            if value is not None:
                values.append(value)
            node = node._parent

        # Collected innermost-first; callers expect outermost-first.
        values.reverse()
        return values

    def children(self):
        """Return the list of direct child nodes."""
        return self._children

    def __iter__(self):
        return self.outline_iter()

    def outline_iter(self):
        """Yield this node and then all of its descendants, depth first."""
        yield self
        for child in self._children:
            for node in child.outline_iter():
                yield node


# An '&' that does not start one of XML's predefined entities or a numeric
# character reference, i.e. an ampersand the document forgot to escape.
amp_re = r"&(?!(?:amp|lt|gt|quot|apos|#[0-9]+|#x[0-9a-f]+);)"
amp_rex = re.compile(amp_re, re.I)
# Same pattern for byte strings (Python 3 cannot mix str patterns and bytes).
_amp_rex_bytes = re.compile(amp_re.encode('ascii'), re.I)


def _escape_bare_ampersands(data):
    """Return 'data' with every unescaped '&' rewritten as '&amp;'."""
    if isinstance(data, bytes):
        return _amp_rex_bytes.sub(b"&amp;", data)
    return amp_rex.sub(u"&amp;", data)


class OPMLParser:
    """Expat-based OPML reader.

    After feed():
      head       - dict mapping each direct child of <head> to its text
      additional - list of (path, attributes, text) for elements nested
                   deeper inside <head>
    Iterating over the parser yields the Outline nodes built from the
    <outline> elements in document order.
    """

    def __init__(self):
        self.Reset()
        self.translations = {}

    def Reset(self):
        """Create a fresh expat parser and clear all parse results.

        Translations registered with translate() are kept.
        """
        self.parser = xml.parsers.expat.ParserCreate()
        self.parser.StartElementHandler = self.on_start_element
        self.parser.EndElementHandler = self.on_end_element
        self.parser.CharacterDataHandler = self.on_char_data

        self.stack = []
        self.head = {}
        self.additional = []  # ( key, attributes, data )
        self.outline = Outline()
        self.in_head = False
        self.in_body = False
        self.capture = None

    def translate(self, f, t):
        """Store outline attribute 'f' under the name 't' instead."""
        self.translations[f] = t

    def feed(self, data):
        """Parse a complete OPML document.

        If the document does not parse as given, progressively more
        aggressive repairs are tried: escaping bare '&' characters, then
        (for byte strings only) decoding as UTF-8 and finally as Latin-1
        with replacement. The traceback of each failed attempt is printed
        to stderr.

        Raises EmptyOPMLError when 'data' is empty and BrokenOPMLError when
        every attempt fails.
        """
        if not data:
            _log("ignoring empty OPML file")
            raise EmptyOPMLError("Empty OPML File")

        if self._try_parse(data):
            return

        _log("fixing OPML file", attempt = 1, method = "trying to fix '&'")
        data = _escape_bare_ampersands(data)
        if self._try_parse(data):
            return

        # Re-decoding only makes sense for byte strings; text input has
        # nothing left to try.
        if isinstance(data, bytes):
            fallbacks = (
                (2, "flattening to UTF-8", ('utf-8',)),
                (3, "flattening to LATIN-1", ('latin-1', 'replace')),
            )
            for attempt, method, decode_args in fallbacks:
                _log("fixing OPML file", attempt = attempt, method = method)
                try:
                    text = data.decode(*decode_args)
                except UnicodeError:
                    traceback.print_exc(file = sys.stderr)
                    continue

                if self._try_parse(text):
                    return

        _log("broken OPML file ... nothing could parse it further")
        raise BrokenOPMLError("Broken OPML File")

    # the rest of this class is internal

    def _try_parse(self, data):
        """Parse 'data' with a fresh parser; return True on success.

        A failure is reported on stderr and answered with False so that
        feed() can move on to its next repair strategy.
        """
        self.Reset()
        try:
            self.parser.Parse(data, 1)
        except Exception:
            traceback.print_exc(file = sys.stderr)
            return False

        return True

    def on_start_element(self, name, attrs):
        self.stack.append((name, attrs))
        depth = len(self.stack)

        if depth == 2 and name == "head":
            self.in_head = True

        if depth == 2 and name == "body":
            self.in_body = True
        elif self.in_head:
            # Start collecting the text of this <head> element.
            self.capture = ""
        elif self.in_body and name == "outline":
            node = Outline(self.outline)
            for key, value in attrs.items():
                node.set(self.translations.get(key, key), value)

            self.outline._children.append(node)
            self.outline = node

    def on_end_element(self, name):
        last_name, last_attrs = self.stack.pop(-1)
        depth = len(self.stack)  # depth of the parent of the closed element

        if self.in_head:
            if depth == 2:
                # A direct child of <head> has closed.
                self.head[_to_native_str(name)] = self.capture
                self.capture = None
            elif depth == 1:
                # <head> itself has closed.
                self.in_head = False
            elif depth > 2:
                # Nested deeper inside <head>: key it by its path below <head>.
                path = [ancestor for ancestor, _ in self.stack[2:]]
                path.append(name)
                self.additional.append(("/".join(path), dict(last_attrs), self.capture))
        elif self.in_body:
            if name == "outline":
                self.outline = self.outline._parent

    def on_char_data(self, data):
        if self.capture is not None:
            self.capture += data

    def __iter__(self):
        """
        This skips the leading element, which really isn't an outline.
        """
        nodes = self.outline.outline_iter()
        next(nodes)
        return nodes


def main(argv = None):
    """Parse the OPML file named on the command line (or stdin) and dump it."""
    import pprint

    # here's a few OPML samples. You'll have to download these yourself
    # - http://opml.scripting.com/discuss/reader$19.opml
    # - http://radio.weblogs.com/0001000/gems/blogrollFlat.opml

    if argv is None:
        argv = sys.argv

    if len(argv) > 1:
        with open(argv[1], 'rb') as fin:
            data = fin.read()
    else:
        # Read raw bytes where possible so feed() can try its encoding
        # fallbacks; stdin is deliberately left open.
        data = getattr(sys.stdin, 'buffer', sys.stdin).read()

    opml_parser = OPMLParser()
    opml_parser.translate('text', 'title')
    opml_parser.translate('url', 'htmlUrl')
    opml_parser.translate('rssUrl', 'xmlUrl')
    opml_parser.feed(data)

    pprint.pprint(opml_parser.head)
    pprint.pprint(opml_parser.additional)

    for node in opml_parser:
        pprint.pprint({
            'title' : node.get('title'),
            'htmlUrl' : node.get('htmlUrl'),
            'category' : node.get('title', otherwise = '', parent = True),
            'class' : node.get('bm:class', otherwise = 'unknown', inherit = True),
            'hierarchy' : node.hierarchy('title', otherwise = ''),
        })


if __name__ == '__main__':
    main()