Source code for assembl.lib.clean_input

from lxml import html
from lxml.html.clean import Cleaner
from html import unescape as _html_unescape

VALID_TAGS = ['a',
              'b',
              'blockquote',
              'code',
              'del',
              'dd',
              'dl',
              'dt',
              'em',
              # We do not allow Hx tags, which would cause layout problems
              # (manageable, however)
              'i',
              # We do not allow img tags: either the reference is a local
              # file (which we don't support yet), or we could be linking to a
              # bunch of outside scripts.
              'li',
              'ol',
              'p',
              'pre',
              's',
              'sup',
              'sub',
              'strike',
              'strong',
              'table',
              'td',
              'th',
              'tr',
              'ul',
              'br',
              'hr',
              ]

VALID_ATTRIBUTES = ['href',  # For hyperlinks
                    'alt',  # For accessibility
                    'colspan', 'headers', 'abbr',
                    'scope', 'sorted'  # For tables
                    ]


def unescape(text):
    # HTMLParser.unescape() was deprecated in Python 3.4 and removed in 3.9,
    # so delegate to the stdlib html.unescape() instead.
    return _html_unescape(text)
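# Usage sketch (illustrative, not part of the original module): unescape()
# turns HTML character references back into plain text, e.g.
#
#   unescape('Tom &amp; Jerry &gt; cartoons')  # -> 'Tom & Jerry > cartoons'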


def _make_cleaner(tags):
    return Cleaner(
        allow_tags=tags, safe_attrs_only=True, remove_unknown_tags=False,
        add_nofollow=True)

_BASE_CLEANER = _make_cleaner(VALID_TAGS)


def _clean_html(html_value, cleaner):
    fragments = html.fragments_fromstring(html_value)
    for f in fragments:
        if isinstance(f, html.HtmlElement):
            cleaner(f)
            yield html.tostring(f, encoding="unicode")
        else:
            yield f
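# Behaviour sketch (illustrative, not part of the original module):
# html.fragments_fromstring() yields plain strings for bare text and
# HtmlElement objects for markup, so only the element fragments go through
# the cleaner, e.g.
#
#   list(_clean_html('hello <p>world</p>', _BASE_CLEANER))
#   # -> ['hello ', '<p>world</p>']  (approximately; serialization may vary)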


def _lxml_remove_tag(tree, index):
    # remove a tag from a tree fragment without removing its content
    children = tree.getchildren()
    element = children[index]
    tree.remove(element)
    if element.text:
        if index:
            text_target = children[index - 1]
            text_target.tail = (text_target.tail or '') + element.text
        else:
            tree.text = (tree.text or '') + element.text
    for sub in element.iterchildren():
        tree.insert(index, sub)
        index += 1
    tail = element.tail or ''
    if element.tag in ('p', 'br'):
        tail += '\n'
    if tail:
        if index:
            text_target = tree.getchildren()[index - 1]
            text_target.tail = (text_target.tail or '') + tail
        else:
            tree.text = (tree.text or '') + tail
    return index
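# Behaviour sketch (illustrative, not part of the original module):
# _lxml_remove_tag() unwraps a child element in place, splicing its text,
# children and tail back into the parent. A trailing newline is only appended
# when the removed tag is a <p> or <br>.
#
#   parent = html.fragment_fromstring('<div>a <span>b</span> c</div>')
#   _lxml_remove_tag(parent, 0)
#   html.tostring(parent, encoding="unicode")  # -> '<div>a b c</div>'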


def _clean_attributes(node, valid_attributes):
    for attname in node.attrib.keys():
        if attname not in valid_attributes:
            del node.attrib[attname]


def _sanitize_html_rec(fragment, valid_tags, valid_attributes):
    i = 0
    while i < len(fragment):
        child = fragment.getchildren()[i]
        _sanitize_html_rec(child, valid_tags, valid_attributes)
        if child.tag in valid_tags:
            _clean_attributes(child, valid_attributes)
            i += 1
        else:
            i = _lxml_remove_tag(fragment, i)


def _sanitize_html_frags(html_value, valid_tags, valid_attributes):
    fragments = html.fragments_fromstring(html_value)
    for f in fragments:
        if isinstance(f, html.HtmlElement):
            _sanitize_html_rec(f, valid_tags, valid_attributes)
            if f.tag in valid_tags:
                _clean_attributes(f, valid_attributes)
                yield html.tostring(f, encoding="unicode")
            else:
                if f.text:
                    yield f.text
                for sub in f:
                    yield html.tostring(sub, encoding="unicode")
                if f.tail:
                    yield f.tail
                if f.tag in ('p', 'br'):
                    yield '\n'
        else:
            yield f


def _sanitize_html_keep(html_value, valid_tags=VALID_TAGS, valid_attributes=VALID_ATTRIBUTES):
    return ''.join(_sanitize_html_frags(html_value, valid_tags, valid_attributes))


def sanitize_html(html_value, valid_tags=VALID_TAGS, valid_attributes=VALID_ATTRIBUTES, keep_tag_content=True):
    """Clean an HTML string, keeping only a subset of tags and attributes.

    :param [string] valid_tags: The names of the tags that will be kept.
    :param [string] valid_attributes: The names of the attributes that will be
        kept. Only used if keep_tag_content is true.
    :param bool keep_tag_content: Keep the content of tags that are removed.
    """
    if keep_tag_content:
        return _sanitize_html_keep(html_value, valid_tags, valid_attributes)
    if valid_tags is not None:
        cleaner = _make_cleaner(valid_tags)
    else:
        cleaner = _BASE_CLEANER
    return ''.join(_clean_html(html_value, cleaner))
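
# Usage sketch (illustrative, not part of the original module): with the
# default keep_tag_content=True, disallowed tags are unwrapped but their text
# is kept, and attributes outside VALID_ATTRIBUTES are dropped, e.g.
#
#   sanitize_html('<p onclick="x()">Hi <span>there</span></p>')
#   # -> '<p>Hi there</p>'  (approximately; exact serialization may vary)
#
# With keep_tag_content=False the work is delegated to lxml's Cleaner, which
# also applies its own defaults (e.g. stripping script content and adding
# rel="nofollow" to links).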


def sanitize_text(text):
    """Clean an HTML string, keeping only the text."""
    if '<' in text:
        return html.fromstring(text).text_content()
    return unescape(text)
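
# Usage sketch (illustrative, not part of the original module):
#
#   sanitize_text('<p>Hi <b>there</b></p>')  # -> 'Hi there'
#   sanitize_text('Tom &amp; Jerry')         # -> 'Tom & Jerry'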