# -*- coding: utf-8 -*-
# Copyright (C) 2010  Michał Masłowski  <mtjm@mtjm.eu>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


"""
Functions for XML and HTML parsing.
"""


try:
    import lxml.etree as etree
    assert etree
except ImportError:
    import xml.etree.ElementTree as etree

CAN_READ_HTML = True
"""`True` if `read_html` method may return non-`None` objects.

Currently this requires the ``lxml`` or ``html5lib`` package.
"""


_USE_LXML = True

try:
    from lxml.html.html5parser import HTMLParser, parse
    assert HTMLParser and parse
except ImportError:
    try:
        from lxml.html import HTMLParser, parse
    except ImportError:
        _USE_LXML = False
        try:
            import html5lib
        except ImportError:
            CAN_READ_HTML = False

from getmediumurl.compat import StringIO
from getmediumurl.reader import URLReader


def read_xml(string):
    """Parse the XML document `string` and return its ``ElementTree``.

    The document is wrapped in a file-like object before being handed
    to the parser.
    """
    try:
        source = StringIO(string)
    except TypeError:
        # The buffer class refused the text as given; retry with its
        # UTF-8 encoded form.
        source = StringIO(bytes(string, "utf-8"))
    tree = etree.parse(source)
    return tree


def get_encoding(content_type):
    """Get ``charset`` of MIME ``Content-Type`` header or `None`.

    `content_type` is the header value, e.g. ``text/html;
    charset=utf-8``.  Parameters may be separated by semicolons and/or
    whitespace.  The parameter name is matched case-insensitively and
    quotes around the value are removed.

    Returns `None` if `content_type` is empty or `None`, or if it has
    no non-empty ``charset`` parameter.
    """
    if not content_type:
        return None
    # Treat ";" like whitespace so that both "a/b; charset=x" and
    # "a/b;charset=x" yield a separate "charset=x" fragment.
    for fragment in content_type.replace(";", " ").split():
        name, separator, value = fragment.partition("=")
        if not separator or name.lower() != "charset":
            continue
        # The value may be a quoted string: charset="utf-8".
        value = value.strip("\"'")
        if value:
            return value
    return None


def read_html(string, encoding=None):
    """Parse the HTML document `string` into an ``ElementTree`` object.

    `string` may also be a `URLReader` instance; its content is then
    parsed, using the ``charset`` from its content type as the
    encoding.  In that case a `ValueError` is raised if an explicit
    `encoding` is given that differs from the one reported by the
    reader.

    Returns `None` if `CAN_READ_HTML` is `False`.
    """
    if not CAN_READ_HTML:
        return None

    if isinstance(string, URLReader):
        reader = string
        declared = get_encoding(reader.content_type)
        if not (encoding is None or encoding == declared):
            raise ValueError("specified encoding of %s "
                             "but URLReader says %s"
                             % (encoding, declared))
        string, encoding = reader.content, declared

    try:
        source = StringIO(string)
    except TypeError:
        # The buffer class refused the text as given; retry with its
        # UTF-8 encoded form.
        source = StringIO(bytes(string, "utf-8"))

    if not _USE_LXML:
        # No lxml parser was importable; use html5lib directly.
        return html5lib.parse(source,
                              treebuilder="etree",
                              namespaceHTMLElements=False,
                              encoding=encoding)

    return parse(source, parser=HTMLParser(encoding=encoding))


def unescape(string):
    """Unescape a string using some HTML entities.

    Only ``&quot;`` is handled: each occurrence is replaced with a
    double quote character.
    """
    pieces = string.split(u"&quot;")
    return u'"'.join(pieces)
