#!/usr/bin/env python
"""
SAX-like API for HTML. (SAH)
Implementation of some SAX-like tools for processing HTML.
SAHContentHandler, SAHFilter and HTMLReader are equivalent to
their SAX counterparts. SAHSerializer is used to collect the
stream at the end of the SAH pipeline.
KeepElementsFilter and MarkupFilter are samples, but can be
useful nonetheless.
"""
__version__ = '0.5'
__author__ = 'Eugene Shumulinsky (exshum@gmail.com)'
__copyright__ = 'Copyright (c) 2005 Eugene Shumulinsky'
__license__ = 'Python'
import HTMLParser
from xml.sax import saxutils
class SAHContentHandler:
    """
    Base interface for SAH content handlers.

    Declares the same handle_* callbacks as HTMLParser.HTMLParser(),
    plus reset().  Every callback in this base class does nothing and
    returns None, so subclasses override only the signals they need.
    """

    def __init__(self):
        """Create the handler; the base interface keeps no state."""

    def reset(self):
        """Called when processing should start over; no-op here."""

    def handle_starttag(self, tag, attrs):
        """Receive an opening tag and its attribute list; no-op here."""

    def handle_startendtag(self, tag, attrs):
        """Receive an empty (self-closing) tag; no-op here."""

    def handle_endtag(self, tag):
        """Receive a closing tag; no-op here."""

    def handle_data(self, data):
        """Receive character data; no-op here."""

    def handle_charref(self, ref):
        """Receive a character reference; no-op here."""

    def handle_entityref(self, name):
        """Receive a named entity reference; no-op here."""

    def handle_comment(self, data):
        """Receive the text of a comment; no-op here."""

    def handle_decl(self, decl):
        """Receive a declaration; no-op here."""

    def handle_pi(self, data):
        """Receive a processing instruction; no-op here."""
class SAHFilter(SAHContentHandler):
    """
    Base content filter.

    Forwards every signal unchanged to the delegate content handler
    if one is set; with no delegate the signal is dropped.  Subclasses
    override individual handle_* methods and call the SAHFilter version
    of the method to pass a signal on.
    """

    def __init__(self, content_handler=None):
        """Create the filter, optionally attached to content_handler."""
        SAHContentHandler.__init__(self)
        self._content_handler = None
        self.set_content_handler(content_handler)

    def set_content_handler(self, content_handler):
        """
        Set the delegate content handler.

        content_handler must be None (no delegate) or an instance of
        SAHContentHandler; anything else raises
        HTMLParser.HTMLParseError.
        """
        if not isinstance(content_handler,
                          (SAHContentHandler, type(None))):
            raise HTMLParser.HTMLParseError(
                '%s not instance of SAHContentHandler'
                % (content_handler.__class__,))
        self._content_handler = content_handler

    # The forwarders below test "is not None" rather than truthiness:
    # a delegate that defines __len__ or __nonzero__ (for example a
    # handler that collects items and is still empty) must still
    # receive its signals.

    def reset(self):
        """Forward reset() to the delegate."""
        if self._content_handler is not None:
            self._content_handler.reset()

    def handle_starttag(self, tag, attrs):
        """Forward an opening tag to the delegate."""
        if self._content_handler is not None:
            self._content_handler.handle_starttag(tag, attrs)

    def handle_startendtag(self, tag, attrs):
        """Forward an empty (self-closing) tag to the delegate."""
        if self._content_handler is not None:
            self._content_handler.handle_startendtag(tag, attrs)

    def handle_endtag(self, tag):
        """Forward a closing tag to the delegate."""
        if self._content_handler is not None:
            self._content_handler.handle_endtag(tag)

    def handle_data(self, data):
        """Forward character data to the delegate."""
        if self._content_handler is not None:
            self._content_handler.handle_data(data)

    def handle_charref(self, ref):
        """Forward a character reference to the delegate."""
        if self._content_handler is not None:
            self._content_handler.handle_charref(ref)

    def handle_entityref(self, name):
        """Forward a named entity reference to the delegate."""
        if self._content_handler is not None:
            self._content_handler.handle_entityref(name)

    def handle_comment(self, data):
        """Forward a comment to the delegate."""
        if self._content_handler is not None:
            self._content_handler.handle_comment(data)

    def handle_decl(self, decl):
        """Forward a declaration to the delegate."""
        if self._content_handler is not None:
            self._content_handler.handle_decl(decl)

    def handle_pi(self, data):
        """Forward a processing instruction to the delegate."""
        if self._content_handler is not None:
            self._content_handler.handle_pi(data)
class SAHSerializer(SAHContentHandler):
    """
    Serialize SAH signals to the file-like object "output".

    Each handle_* callback writes the markup for its signal with
    output.write(), re-adding the delimiters that the parser strips
    before invoking the callback.
    """

    def __init__(self, output):
        """Remember output; it only needs a write() method."""
        SAHContentHandler.__init__(self)
        self._out = output

    def handle_starttag(self, tag, attrs):
        """Write an opening tag with its attributes."""
        self.__handle_opentag(tag, attrs, is_empty=False)

    def handle_startendtag(self, tag, attrs):
        """Write an empty tag, closed with ' />'."""
        self.__handle_opentag(tag, attrs, is_empty=True)

    def __handle_opentag(self, tag, attrs, is_empty):
        """
        Write the tag named tag with the (name, value) pairs in attrs.

        Values are escaped and quoted with saxutils.quoteattr().  A
        value of None, which HTMLParser reports for an attribute
        written without a value, is emitted as the bare attribute name
        instead of being passed to quoteattr(), which only accepts
        strings.
        """
        pieces = ['<', tag]
        for attr_name, attr_value in attrs:
            if attr_value is None:
                pieces.append(' %s' % (attr_name,))
            else:
                pieces.append(' %s=%s' % (attr_name,
                                          saxutils.quoteattr(attr_value)))
        if is_empty:
            pieces.append(' /')
        pieces.append('>')
        self._out.write(''.join(pieces))

    def handle_endtag(self, name):
        """Write the closing tag for name."""
        self._out.write('</%s>' % (name,))

    def handle_data(self, data):
        """Write character data as received, without escaping."""
        self._out.write(data)

    def handle_charref(self, ref):
        """Write a character reference; ref is the part after '&#'."""
        self._out.write('&#%s;' % (ref,))

    def handle_entityref(self, name):
        """Write a named entity reference."""
        self._out.write('&%s;' % (name,))

    def handle_comment(self, data):
        """Write a comment; data is the text between the delimiters."""
        self._out.write('<!--%s-->' % (data,))

    def handle_decl(self, decl):
        """Write a declaration; decl is the text inside '<!' and '>'."""
        self._out.write('<!%s>' % (decl,))

    def handle_pi(self, data):
        """Write a processing instruction; data follows the '<?'."""
        self._out.write('<?%s>' % (data,))
class HTMLReader(SAHFilter, HTMLParser.HTMLParser):
    """
    Head of a SAH pipeline: an HTML parser that emits SAH signals.

    SAHFilter is listed first among the bases so that its forwarding
    handle_* methods are the ones found on the instance; whatever is
    parsed is therefore passed on to content_handler.
    """

    def __init__(self, content_handler=None):
        """Create the reader, optionally attached to content_handler."""
        # The filter part is initialised first so that the delegate
        # attribute exists before the parser part is set up.
        SAHFilter.__init__(self, content_handler)
        HTMLParser.HTMLParser.__init__(self)

    def reset(self):
        """Reset the delegate chain first, then the parser state."""
        for base in (SAHFilter, HTMLParser.HTMLParser):
            base.reset(self)

    def parse(self, s):
        """Feed the single string s to the reader and then close it."""
        self.feed(s)
        self.close()
class MarkupFilter(SAHFilter):
    """
    Ignore various markup.

    Tags listed in omit_tags are dropped (both their opening and
    closing signals), and attributes listed in omit_attrs are removed
    from the tags that are passed on.  Each omit_pi, omit_decl,
    omit_comment, omit_charref, omit_entityref and omit_data flag
    drops the signal of the same name when true.  The omit_all_markup
    flag forces everything but data, charref and entityref to be
    ignored.

    omit_tags and omit_attrs default to "omit nothing".  None is used
    as the default so that instances never share one list object.
    """

    def __init__(self, content_handler=None,
                 omit_tags=None, omit_attrs=None, omit_all_markup=False,
                 omit_pi=False, omit_decl=False, omit_comment=False,
                 omit_charref=False, omit_entityref=False, omit_data=False):
        SAHFilter.__init__(self, content_handler)
        # A caller-supplied container is stored as given, not copied.
        if omit_tags is None:
            omit_tags = []
        if omit_attrs is None:
            omit_attrs = []
        self._omit_tags = omit_tags
        self._omit_attrs = omit_attrs
        self._omit_all_markup = omit_all_markup
        self._omit_pi = omit_pi
        self._omit_decl = omit_decl
        self._omit_comment = omit_comment
        self._omit_charref = omit_charref
        self._omit_entityref = omit_entityref
        self._omit_data = omit_data

    def _passes_tag(self, tag):
        """Return True if signals for tag should be forwarded."""
        return tag not in self._omit_tags and not self._omit_all_markup

    def _kept_attrs(self, attrs):
        """Return the attrs pairs whose name is not in omit_attrs."""
        return [pair for pair in attrs if pair[0] not in self._omit_attrs]

    def handle_starttag(self, tag, attrs):
        """Forward an opening tag, minus omitted attributes."""
        if self._passes_tag(tag):
            SAHFilter.handle_starttag(self, tag, self._kept_attrs(attrs))

    def handle_startendtag(self, tag, attrs):
        """Forward an empty tag, minus omitted attributes."""
        if self._passes_tag(tag):
            SAHFilter.handle_startendtag(self, tag, self._kept_attrs(attrs))

    def handle_endtag(self, tag):
        """Forward a closing tag unless the tag is omitted."""
        if self._passes_tag(tag):
            SAHFilter.handle_endtag(self, tag)

    def handle_data(self, data):
        """Forward character data unless omit_data is set."""
        if not self._omit_data:
            SAHFilter.handle_data(self, data)

    def handle_charref(self, ref):
        """Forward a character reference unless omit_charref is set."""
        if not self._omit_charref:
            SAHFilter.handle_charref(self, ref)

    def handle_entityref(self, name):
        """Forward an entity reference unless omit_entityref is set."""
        if not self._omit_entityref:
            SAHFilter.handle_entityref(self, name)

    def handle_comment(self, data):
        """Forward a comment unless comments or all markup are omitted."""
        if not self._omit_comment and not self._omit_all_markup:
            SAHFilter.handle_comment(self, data)

    def handle_decl(self, decl):
        """Forward a declaration unless omitted individually or globally."""
        if not self._omit_decl and not self._omit_all_markup:
            SAHFilter.handle_decl(self, decl)

    def handle_pi(self, data):
        """Forward a processing instruction unless omitted."""
        if not self._omit_pi and not self._omit_all_markup:
            SAHFilter.handle_pi(self, data)
class KeepElementsFilter(SAHFilter):
    """
    Keep only the contents of tags from list keep_tags.

    Depending on where the HTML comes from, it may not be well-formed.
    Use tidy to balance the tags first for better luck with this class.

    content_handler is as for SAHFilter.  keep_tags is the list of tags
    to pass through; defaults to ['html'].  omit_wrapper, if True,
    omits the wrapping tag (the one from keep_tags); defaults to False.

    NOTE: nesting is not tracked.  The first closing keep tag ends the
    kept region, even if a keep tag was opened more than once.
    """

    def __init__(self, content_handler=None,
                 keep_tags=None, omit_wrapper=False):
        SAHFilter.__init__(self, content_handler)
        # True while outside every kept element.
        self._ignore = True
        self._omit_wrapper = omit_wrapper
        # None stands for the documented default; a fresh list is made
        # per instance so the default is never shared between filters.
        if keep_tags is None:
            keep_tags = ['html']
        self._tags = keep_tags

    def reset(self):
        """Return to the "outside kept elements" state, then forward."""
        self._ignore = True
        SAHFilter.reset(self)

    def handle_starttag(self, tag, attrs):
        """Enter a kept region on a keep tag; forward tags inside one."""
        if tag in self._tags:
            self._ignore = False
            if self._omit_wrapper:
                return
        if self._ignore:
            return
        SAHFilter.handle_starttag(self, tag, attrs)

    def handle_startendtag(self, tag, attrs):
        """
        Forward an empty tag that is inside a kept region or is itself
        a keep tag.

        An empty keep tag has no contents, so it never changes the
        kept/ignored state; it is forwarded unless omit_wrapper is set.
        """
        if tag in self._tags:
            if self._omit_wrapper:
                return
        elif self._ignore:
            return
        SAHFilter.handle_startendtag(self, tag, attrs)

    def handle_endtag(self, tag):
        """Leave the kept region on a keep tag; forward tags inside one."""
        if tag in self._tags:
            was_inside = not self._ignore
            self._ignore = True
            # The wrapper's closing tag is forwarded only when its
            # region was actually open and wrappers are not omitted,
            # so a stray closing keep tag produces no output.
            if self._omit_wrapper or not was_inside:
                return
        elif self._ignore:
            return
        SAHFilter.handle_endtag(self, tag)

    def handle_data(self, data):
        """Forward character data only inside a kept region."""
        if self._ignore:
            return
        SAHFilter.handle_data(self, data)

    def handle_charref(self, ref):
        """Forward a character reference only inside a kept region."""
        if self._ignore:
            return
        SAHFilter.handle_charref(self, ref)

    def handle_entityref(self, name):
        """Forward an entity reference only inside a kept region."""
        if self._ignore:
            return
        SAHFilter.handle_entityref(self, name)

    def handle_comment(self, data):
        """Forward a comment only inside a kept region."""
        if self._ignore:
            return
        SAHFilter.handle_comment(self, data)

    def handle_decl(self, decl):
        """Forward a declaration only inside a kept region."""
        if self._ignore:
            return
        SAHFilter.handle_decl(self, decl)

    def handle_pi(self, data):
        """Forward a processing instruction only inside a kept region."""
        if self._ignore:
            return
        SAHFilter.handle_pi(self, data)
def assemble_pipeline(parts):
    """
    Convenience function to build a SAH pipeline.

    parts is an iterable of SAHContentHandler, in pipeline order.  Each
    part is attached to its predecessor by calling
    set_content_handler() on the predecessor, so every part except the
    last must provide that method (i.e. be a filter).

    Returns the tuple (first handler, last handler), or (None, None)
    when parts is empty.  Any exception raised by set_content_handler()
    propagates to the caller.
    """
    # Materialise once so that generators and other non-indexable
    # iterables are accepted as well as lists and tuples.
    handlers = list(parts)
    if not handlers:
        return None, None
    for upstream, downstream in zip(handlers, handlers[1:]):
        upstream.set_content_handler(downstream)
    return handlers[0], handlers[-1]