#!/usr/bin/env python
"""
SAX-like API for HTML. (SAH)

Implementation of some SAX-like tools for processing HTML.

SAHContentHandler, SAHFilter and HTMLReader are equivalent to their SAX
counterparts. SAHSerializer is used to collect the stream at the end of
the SAH pipeline. KeepElementsFilter and MarkupFilter are samples, but can
be useful nonetheless.
"""

__version__ = '0.5'
__author__ = 'Eugene Shumulinsky (exshum@gmail.com)'
__copyright__ = 'Copyright (c) 2005 Eugene Shumulinsky'
__license__ = 'Python'

try:
    import HTMLParser  # Python 2 module name
except ImportError:
    # Python 3 ships the same parser class in html.parser; bind it to the
    # historical module name so the rest of the file is unchanged.
    import html.parser as HTMLParser
from xml.sax import saxutils

# HTMLParseError only exists in older standard libraries; fall back to
# TypeError (the error is always about a wrongly-typed content handler).
_ParseError = getattr(HTMLParser, 'HTMLParseError', TypeError)


class SAHContentHandler:
    """
    Base SAH content handler interface.

    Same handle_* functions as HTMLParser.HTMLParser() plus reset().
    Every method is a no-op; subclasses override the signals they need.
    """

    def __init__(self):
        pass

    def reset(self):
        pass

    def handle_starttag(self, tag, attrs):
        pass

    def handle_startendtag(self, tag, attrs):
        pass

    def handle_endtag(self, tag):
        pass

    def handle_data(self, data):
        pass

    def handle_charref(self, ref):
        pass

    def handle_entityref(self, name):
        pass

    def handle_comment(self, data):
        pass

    def handle_decl(self, decl):
        pass

    def handle_pi(self, data):
        pass


class SAHFilter(SAHContentHandler):
    """
    Base content filter.

    Forward signals to the delegate content_handler, if there is one.
    """

    def __init__(self, content_handler=None):
        SAHContentHandler.__init__(self)
        self._content_handler = None
        self.set_content_handler(content_handler)

    def set_content_handler(self, content_handler):
        """
        Set delegate content_handler.

        content_handler must be None or an instance of SAHContentHandler;
        anything else raises HTMLParser.HTMLParseError (TypeError when the
        standard library no longer provides HTMLParseError).
        """
        if not isinstance(content_handler, (SAHContentHandler, type(None))):
            raise _ParseError(
                '%s not instance of SAHContentHandler'
                % (content_handler.__class__,))
        self._content_handler = content_handler

    def reset(self):
        if self._content_handler is not None:
            self._content_handler.reset()

    def handle_starttag(self, tag, attrs):
        if self._content_handler is not None:
            self._content_handler.handle_starttag(tag, attrs)

    def handle_startendtag(self, tag, attrs):
        if self._content_handler is not None:
            self._content_handler.handle_startendtag(tag, attrs)

    def handle_endtag(self, tag):
        if self._content_handler is not None:
            self._content_handler.handle_endtag(tag)

    def handle_data(self, data):
        if self._content_handler is not None:
            self._content_handler.handle_data(data)

    def handle_charref(self, ref):
        if self._content_handler is not None:
            self._content_handler.handle_charref(ref)

    def handle_entityref(self, name):
        if self._content_handler is not None:
            self._content_handler.handle_entityref(name)

    def handle_comment(self, data):
        if self._content_handler is not None:
            self._content_handler.handle_comment(data)

    def handle_decl(self, decl):
        if self._content_handler is not None:
            self._content_handler.handle_decl(decl)

    def handle_pi(self, data):
        if self._content_handler is not None:
            self._content_handler.handle_pi(data)


class SAHSerializer(SAHContentHandler):
    """Serializes SAH signals to the file-like-object "output"."""

    def __init__(self, output):
        SAHContentHandler.__init__(self)
        self._out = output

    def handle_starttag(self, tag, attrs):
        self.__handle_opentag(tag, attrs, is_empty=False)

    def handle_startendtag(self, tag, attrs):
        self.__handle_opentag(tag, attrs, is_empty=True)

    def __handle_opentag(self, tag, attrs, is_empty):
        """Write an opening tag; is_empty selects the ' />' form."""
        write = self._out.write
        write('<')
        write(tag)
        for n, v in attrs:
            if v is None:
                # The parser reports valueless attributes (<input disabled>)
                # as (name, None); quoteattr() cannot handle None.
                write(' %s' % (n,))
            else:
                write(' %s=%s' % (n, saxutils.quoteattr(v)))
        if is_empty:
            write(' /')
        write('>')

    def handle_endtag(self, name):
        self._out.write('</%s>' % (name,))

    def handle_data(self, data):
        self._out.write(data)

    def handle_charref(self, ref):
        self._out.write('&#%s;' % (ref,))

    def handle_entityref(self, name):
        self._out.write('&%s;' % (name,))

    def handle_comment(self, data):
        self._out.write('<!--%s-->' % (data,))

    def handle_decl(self, decl):
        self._out.write('<!%s>' % (decl,))

    def handle_pi(self, data):
        self._out.write('<?%s>' % (data,))


class HTMLReader(SAHFilter, HTMLParser.HTMLParser):
    """
    Parses HTML to generate SAHContentHandler callbacks.

    content_handler is the recipient of the signals.
    """

    def __init__(self, content_handler=None):
        SAHFilter.__init__(self, content_handler)
        try:
            # Newer parsers fold character references into the data stream
            # by default; turn that off so handle_charref() and
            # handle_entityref() signals are still produced.
            HTMLParser.HTMLParser.__init__(self, convert_charrefs=False)
        except TypeError:
            # Older parsers have no such option and always emit the signals.
            HTMLParser.HTMLParser.__init__(self)

    def reset(self):
        SAHFilter.reset(self)
        HTMLParser.HTMLParser.reset(self)

    def parse(self, s):
        """Helper to feed() a single string and close() the reader."""
        self.feed(s)
        self.close()


class MarkupFilter(SAHFilter):
    """
    Ignore various markup.

    Ignore tags in omit_tags list and attributes in omit_attrs list.
    Also ignore any signal where omit_<signal> is true. The omit_all_markup
    flag forces everything but data, charref and entityref to be ignored.
    """

    def __init__(self, content_handler=None, omit_tags=None, omit_attrs=None,
                 omit_all_markup=False, omit_pi=False, omit_decl=False,
                 omit_comment=False, omit_charref=False,
                 omit_entityref=False, omit_data=False):
        SAHFilter.__init__(self, content_handler)
        # None stands for "nothing omitted"; a fresh list per instance avoids
        # sharing one mutable default between all filters.
        self._omit_tags = [] if omit_tags is None else omit_tags
        self._omit_attrs = [] if omit_attrs is None else omit_attrs
        self._omit_all_markup = omit_all_markup
        self._omit_pi = omit_pi
        self._omit_decl = omit_decl
        self._omit_comment = omit_comment
        self._omit_charref = omit_charref
        self._omit_entityref = omit_entityref
        self._omit_data = omit_data

    def _tag_allowed(self, tag):
        """Return True if signals for tag should be passed on."""
        return not self._omit_all_markup and tag not in self._omit_tags

    def _filter_attrs(self, attrs):
        """Return the (name, value) pairs whose name is not omitted."""
        return [a for a in attrs if a[0] not in self._omit_attrs]

    def handle_starttag(self, tag, attrs):
        if self._tag_allowed(tag):
            SAHFilter.handle_starttag(self, tag, self._filter_attrs(attrs))

    def handle_startendtag(self, tag, attrs):
        if self._tag_allowed(tag):
            SAHFilter.handle_startendtag(
                self, tag, self._filter_attrs(attrs))

    def handle_endtag(self, tag):
        if self._tag_allowed(tag):
            SAHFilter.handle_endtag(self, tag)

    def handle_data(self, data):
        if not self._omit_data:
            SAHFilter.handle_data(self, data)

    def handle_charref(self, ref):
        if not self._omit_charref:
            SAHFilter.handle_charref(self, ref)

    def handle_entityref(self, name):
        if not self._omit_entityref:
            SAHFilter.handle_entityref(self, name)

    def handle_comment(self, data):
        if not self._omit_comment and not self._omit_all_markup:
            SAHFilter.handle_comment(self, data)

    def handle_decl(self, decl):
        if not self._omit_decl and not self._omit_all_markup:
            SAHFilter.handle_decl(self, decl)

    def handle_pi(self, data):
        if not self._omit_pi and not self._omit_all_markup:
            SAHFilter.handle_pi(self, data)


class KeepElementsFilter(SAHFilter):
    """
    Keep only the contents of tags from list keep_tags.

    Depending on where the HTML comes from, it may not be well-formed.
    Use tidy to balance the tags first for better luck with this class.
    Nested occurrences of a kept tag are not tracked: the first closing
    kept tag ends the kept region.

    content_handler is as for SAHFilter.
    keep_tags is the list of tags to pass through; defaults to ['html'].
    omit_wrapper, if True, omits the wrapping tag (the one from keep_tags);
    defaults to False.
    """

    def __init__(self, content_handler=None, keep_tags=None,
                 omit_wrapper=False):
        SAHFilter.__init__(self, content_handler)
        self._ignore = True
        self._omit_wrapper = omit_wrapper
        # Fresh list per instance instead of a shared mutable default.
        self._tags = ['html'] if keep_tags is None else keep_tags

    def handle_starttag(self, tag, attrs):
        if tag in self._tags:
            self._ignore = False
            if self._omit_wrapper:
                return
        if self._ignore:
            return
        SAHFilter.handle_starttag(self, tag, attrs)

    def handle_startendtag(self, tag, attrs):
        if self._ignore:
            # Outside a kept region only a kept (empty) element itself may
            # pass, and only when its wrapper tag is wanted.
            is_kept_wrapper = tag in self._tags and not self._omit_wrapper
            if not is_kept_wrapper:
                return
        SAHFilter.handle_startendtag(self, tag, attrs)

    def handle_endtag(self, tag):
        was_inside = not self._ignore
        if tag in self._tags:
            self._ignore = True
            # Emit the closing wrapper only if its opening tag was emitted:
            # wrappers are wanted and we actually were inside the region.
            if self._omit_wrapper or not was_inside:
                return
        elif self._ignore:
            return
        SAHFilter.handle_endtag(self, tag)

    def handle_data(self, data):
        if self._ignore:
            return
        SAHFilter.handle_data(self, data)

    def handle_charref(self, ref):
        if self._ignore:
            return
        SAHFilter.handle_charref(self, ref)

    def handle_entityref(self, name):
        if self._ignore:
            return
        SAHFilter.handle_entityref(self, name)

    def handle_comment(self, data):
        if self._ignore:
            return
        SAHFilter.handle_comment(self, data)

    def handle_decl(self, decl):
        if self._ignore:
            return
        SAHFilter.handle_decl(self, decl)

    def handle_pi(self, data):
        if self._ignore:
            return
        SAHFilter.handle_pi(self, data)


def assemble_pipeline(parts):
    """
    Convenience function to build SAH pipeline.

    parts is a list of SAHContentHandler; each element except the last must
    provide set_content_handler() (i.e. be a SAHFilter) and gets the next
    element installed as its delegate.

    Returns tuple of first SAHContentHandler, last SAHContentHandler;
    (None, None) when parts is empty.
    """
    if not parts:
        return None, None
    # Chain every handler to its successor.
    for upstream, downstream in zip(parts, parts[1:]):
        upstream.set_content_handler(downstream)
    return parts[0], parts[-1]