# ipodderx-core/opmlparser.py

#!/bin/python

"""
opmlparser.py

David Janes
BlogMatrix
2004.01.18

A pre-order, depth-first OPML walker. Each outline's tags (its XML
attributes) are stored as attributes of its Outline instance, with unicode
string values. Several helper methods for retrieving those values are provided.

See the __main__ section below for an example of how to use this.
"""

# Copyright (c) 2004 David P. Janes. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# - Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the distribution.
# - Redistributions in any form must be accompanied by information on how
#   to obtain complete source code for this software and any accompanying
#   software that uses this software. The source code must either be
#   included in the distribution or be available for no more than the cost
#   of distribution plus a nominal fee, and must be freely redistributable
#   under reasonable conditions. For an executable file, complete source
#   code means the source code for all modules it contains. It does not
#   include source code for modules or files that typically accompany the
#   major components of the operating system on which the executable file
#   runs.
# - Redistributions in any form must not delete, modify or
#   otherwise alter the code in "BlogWelcome.py" and/or "BlogManagerMixinSources.py"
# - Redistributions in any form must not
#   disable or otherwise circumvent calls to code in "BlogWelcome.py"
#   and/or "BlogManagerMixinSources.py"
#
# THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT,
# ARE DISCLAIMED. IN NO EVENT SHALL SLEEPYCAT SOFTWARE BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import xml.parsers.expat
import re
import types
import sys
import traceback

# Only when this file is executed directly (not when it is imported): add
# "../generic" -- resolved against the current working directory -- to the
# module search path.
# NOTE(review): nothing in this file imports from that directory; presumably
# it is where project helpers such as Log live -- confirm.
if __name__ == "__main__":
	sys.path.append("../generic")

# Every character that may NOT appear in a stored attribute name; tags are
# "packed" by deleting these characters (so "bm:class" becomes "bmclass").
pack_re = r"[^-_a-zA-Z0-9]"
pack_rex = re.compile(pack_re)

class Outline:
	"""One node of an outline tree.

	Tag values are stored as plain instance attributes under their packed
	name (see pack_rex), so ``node.title`` and ``node.get('title')`` both
	work. ``_parent`` is the enclosing node (None for the root) and
	``_children`` the list of nested nodes, in insertion order.
	"""

	def __init__(self, parent = None):
		self._parent = parent
		self._children = []

	def _attr_name(self, tag):
		"""Return the attribute name used to store 'tag', or None.

		None means the tag cannot be stored: it packs to nothing, starts
		with '_' (reserved for internal fields), or names something defined
		on the class. The last check stops an XML attribute called e.g.
		"get" or "children" from overwriting that method on the instance.

		Raises TypeError if 'tag' is not a string.
		"""
		name = pack_rex.sub("", tag)
		if not isinstance(name, str):
			# Python 2 unicode: only ASCII survives packing, so this just
			# converts to the native string type.
			name = name.encode('ascii', 'ignore')

		if not name or name[0] == '_' or hasattr(self.__class__, name):
			return None

		return name

	def set(self, tag, value):
		"""Store 'value' under 'tag' on this node.

		Tags that cannot be stored (see _attr_name) are silently ignored.
		"""
		name = self._attr_name(tag)
		if name is None:
			return

		setattr(self, name, value)

	def get(self, tag, otherwise = None, inherit = False, parent = False):
		"""Return the value for 'tag' at this node

		otherwise - if the tag isn't found (at all), use this value
		inherit - if the tag isn't found, get it from a parent
		parent - use the parent of this node rather than this node

		Tags that set() would refuse to store (and non-string tags) are
		never found, so 'otherwise' is returned for them.
		"""
		try:
			name = self._attr_name(tag)
		except TypeError:
			# non-string tag: treat it as "not found" rather than crashing
			name = None

		if name is None:
			return otherwise

		node = self
		if parent:
			node = self._parent

		# Walk upwards; without 'inherit' only the starting node is checked.
		while node is not None:
			try:
				return getattr(node, name)
			except AttributeError:
				pass

			if not inherit:
				break

			node = node._parent

		return otherwise

	def hierarchy(self, tag, otherwise = None):
		"""Return the values of 'tag' along the path down to this node.

		The list is ordered outermost first and ends with this node's own
		value. The root node (the one without a parent) is never included.
		Nodes lacking the tag contribute 'otherwise'; None values are
		skipped, so with the default 'otherwise' such nodes are omitted.
		"""
		values = []

		node = self
		while node is not None and node._parent is not None:
			value = node.get(tag, otherwise)
			if value is not None:
				values.append(value)

			node = node._parent

		# collected innermost-first; callers expect outermost-first
		values.reverse()
		return values

	def children(self):
		"""Return the list of direct child nodes (the live list, not a copy)."""
		return self._children

	def __iter__(self):
		return self.outline_iter()

	def outline_iter(self):
		"""Yield this node, then all of its descendants, in pre-order."""
		yield self

		for child in self._children:
			for node in child.outline_iter():
				yield node

# Matches a bare '&', i.e. one that does not start a predefined XML entity
# (&amp; &gt; &lt; &quot; &apos;) or a decimal/hexadecimal character
# reference (&#38; &#x26;). Used to escape stray ampersands as "&amp;"
# without double-escaping references that are already valid.
amp_re = r"""&(?!(amp|gt|lt|quot|apos|#[0-9]+|#x[0-9a-f]+);)"""
amp_rex = re.compile(amp_re, re.I|re.MULTILINE|re.DOTALL)

class OPMLError(Exception):
	"""Raised by OPMLParser.feed() when the input is empty or unparseable."""


class OPMLParser:
	"""Expat-based OPML reader.

	After a successful feed():
	- head: dict mapping each direct child element of <head> to its text
	- additional: list of (path, attributes, text) tuples for elements
	  nested deeper inside <head>; path is "/"-joined element names
	  relative to <head>
	- iterating the parser yields an Outline node for every <outline>
	  element, in document (pre-order) order

	Written to run unchanged on Python 2.6+ and Python 3.
	"""

	def __init__(self):
		self.Reset()
		self.translations = {}

	def Reset(self):
		"""Discard all parse state and create a fresh expat parser.

		Translations registered with translate() are kept.
		"""
		self.parser = xml.parsers.expat.ParserCreate()

		self.parser.StartElementHandler = self.on_start_element
		self.parser.EndElementHandler = self.on_end_element
		self.parser.CharacterDataHandler = self.on_char_data

		self.stack = []	# (name, attrs) of every currently open element
		self.head = {}
		self.additional = [] # ( key, attributes, data )

		# root placeholder; while parsing this tracks the innermost open outline
		self.outline = Outline()

		self.in_head = False
		self.in_body = False
		self.capture = None	# text being collected, or None when not collecting

	def translate(self, f, t):
		"""Store outline attribute 'f' under the tag name 't' instead."""
		self.translations[f] = t

	def feed(self, data):
		"""Parse a complete OPML document.

		If the document does not parse as given, progressively more lenient
		repairs are tried, each on a freshly Reset() parser:
		1. escape bare '&' characters
		2. (byte strings only) decode as UTF-8
		3. (byte strings only) decode as Latin-1, replacing errors

		Every failed attempt prints its traceback to stderr.

		Raises OPMLError if 'data' is empty or no attempt succeeds.
		"""
		if not data:
			self._log("ignoring empty OPML file")
			raise OPMLError("Empty OPML File")

		# The first attempt uses the existing parser as is.
		if self._try_parse(data, reset = False):
			return

		self._log("fixing OPML file", attempt = 1, method = "trying to fix '&'")
		data = self._escape_bare_ampersands(data)
		if self._try_parse(data):
			return

		if isinstance(data, bytes):
			# The decoding attempts work on the '&'-escaped bytes.
			self._log("fixing OPML file", attempt = 2, method = "flattening to UTF-8")
			if self._try_parse(data, decode_args = ('utf-8',)):
				return

			self._log("fixing OPML file", attempt = 3, method = "flattening to LATIN-1")
			if self._try_parse(data, decode_args = ('latin-1', 'replace')):
				return

		self._log("broken OPML file ... nothing could parse it further")
		raise OPMLError("Broken OPML File")

	# the rest of this class is internal
	def _log(self, message, **details):
		"""Report 'message' through Log(), or to stderr if Log is unavailable.

		NOTE(review): Log is not defined or imported in this module;
		presumably the host application provides it -- confirm.
		"""
		try:
			log = Log
		except NameError:
			log = None

		if log is not None:
			log(message, **details)
			return

		extras = "".join([" %s=%r" % item for item in sorted(details.items())])
		sys.stderr.write("%s%s\n" % (message, extras))

	def _try_parse(self, data, reset = True, decode_args = None):
		"""Make one parse attempt; return True on success, False on failure.

		decode_args - if given, 'data' is first decoded with these arguments
		reset - start from a fresh parser and empty state
		"""
		try:
			if decode_args is not None:
				data = data.decode(*decode_args)
			if reset:
				self.Reset()
			self.parser.Parse(data, 1)
		except Exception:
			# best effort: report the failure and let the caller try the next repair
			traceback.print_exc(file = sys.stderr)
			return False

		return True

	def _escape_bare_ampersands(self, data):
		"""Return 'data' with every '&' matched by amp_rex replaced by '&amp;'."""
		if isinstance(data, bytes) and bytes is not str:
			# Python 3 bytes: amp_rex is a text pattern, so round-trip through
			# Latin-1, which maps every byte value to itself.
			text = data.decode('latin-1')
			return amp_rex.sub("&amp;", text).encode('latin-1')

		return amp_rex.sub("&amp;", data)

	def _head_key(self, name):
		"""Return 'name' as a native string for use as a key in self.head."""
		if isinstance(name, str):
			return name

		# Python 2 unicode
		return name.encode('latin-1', 'ignore')

	def on_start_element(self, name, attrs):
		self.stack.append((name, attrs))
		depth = len(self.stack)

		if depth == 2 and name == "head":
			self.in_head = True

		if depth == 2 and name == "body":
			self.in_body = True
		elif self.in_head:
			# every element start inside <head> (and <head> itself) restarts
			# text collection
			self.capture = ""
		elif self.in_body and name == "outline":
			node = Outline(self.outline)
			for key, value in attrs.items():
				node.set(self.translations.get(key, key), value)

			self.outline._children.append(node)
			self.outline = node

	def on_end_element(self, name):
		last_attrs = self.stack.pop()[1]
		depth = len(self.stack)	# depth of the parent of the element just closed

		if self.in_head:
			if depth == 2:
				# direct child of <head>. capture is not restored when a nested
				# element ends, so for an element with children this is the
				# text collected since the last nested element started.
				self.head[self._head_key(name)] = self.capture
				self.capture = None
			elif depth == 1:
				# </head> itself
				self.in_head = False
			elif depth > 2:
				path = [ancestor for ancestor, _ in self.stack[2:]]
				path.append(name)

				self.additional.append(("/".join(path), dict(last_attrs), self.capture))
		elif self.in_body:
			if name == "outline":
				self.outline = self.outline._parent

	def on_char_data(self, data):
		if self.capture is not None:
			self.capture += data

	def __iter__(self):
		"""
		This skips the leading element, which really isn't an outline.
		"""
		nodes = self.outline.outline_iter()
		next(nodes)

		return nodes

# Demo: parse the OPML file named on the command line (or read from stdin)
# and pretty-print its head data and every outline node.
if __name__ == '__main__':
	import pprint

	# here's a few OPML samples. You'll have to download these yourself
	# - http://opml.scripting.com/discuss/reader$19.opml
	# - http://radio.weblogs.com/0001000/gems/blogrollFlat.opml

	if len(sys.argv) > 1:
		fin = open(sys.argv[1], 'rb')
		try:
			data = fin.read()
		finally:
			# close the file even if the read fails
			fin.close()
	else:
		# stdin is not ours to close
		data = sys.stdin.read()

	opml_parser = OPMLParser()
	# store these common attribute names under one canonical tag each
	opml_parser.translate('text', 'title')
	opml_parser.translate('url', 'htmlUrl')
	opml_parser.translate('rssUrl', 'xmlUrl')
	opml_parser.feed(data)

	pprint.pprint(opml_parser.head)
	pprint.pprint(opml_parser.additional)

	for node in opml_parser:
		pprint.pprint({
			'title' : node.get('title'),
			'htmlUrl' : node.get('htmlUrl'),
			'category' : node.get('title', otherwise = '', parent = True),
			'class' : node.get('bm:class', otherwise = 'unknown', inherit = True),
			'hierarchy' : node.hierarchy('title', otherwise = ''),
		})