# -*- coding: utf-8 -*-
# Copyright (C) 2010, 2011  Michał Masłowski  <mtjm@mtjm.eu>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


"""
Module containing base class for typical plugins.
"""


from urlreader.propcache import cachedproperty

from getmediumurl.compat import urljoin, unicode
from getmediumurl.clsutils import overrides
from getmediumurl.getsubstring import get_substring, set_get_substring_defaults
from getmediumurl.medium import Medium
from getmediumurl.plugin import Plugin
from getmediumurl import xmlhtml


# Names exported by a star-import of this module; the underscore-prefixed
# mixin and constants defined below are intentionally left out.
__all__ = ("HTMLPlugin", "HTMLMedium")


#: ``name`` attribute values of ``meta`` tags whose content is taken as
#: the medium's title.
_TITLE_METAS = frozenset([
    "DC.title",
    "title",
    "og:title",
])


#: ``rel`` attribute values of ``link`` tags commonly pointing at a
#: thumbnail image.
_THUMBNAIL_LINK_RELS = frozenset([
    "videothumbnail",
    "image_src",
])


class _PageMixin(object):

    """Helpers for classes which read their data from an HTML page.

    The class mixing this in is expected to provide the `mediumid`,
    `url_format`, `url` and `urlreader` attributes used below; none
    of them is defined here.
    """

    #: Format string applied to `mediumid` to build the URL fetched
    #: internally, used in preference to `url`.
    #:
    #: Some sites allow the page to be reached through a simpler URL
    #: built from a short `mediumid`, while the value of `url` is only
    #: known after reading the page content.
    #:
    #: When `url_format` already gives the canonical URL, set this
    #: attribute to the same value.
    #:
    #: With `None` here, `url_format` is tried next and the `url`
    #: property is the last resort.
    raw_url_format = None

    @cachedproperty
    def raw_page_url(self):
        """Return an easily generated URL of the page.

        The first of `raw_url_format` and `url_format` which is not
        `None` is formatted with `mediumid`; if both are `None`, the
        `url` property is returned.
        """
        template = self.raw_url_format
        if template is None:
            template = self.url_format
        if template is None:
            return self.url
        return template % self.mediumid

    @cachedproperty
    def page_data(self):
        """Return a ``(content, encoding)`` tuple for the page.

        The content comes from the reader opened on `raw_page_url`,
        the encoding is derived from the reader's content type.
        """
        response = self.urlreader(self.raw_page_url)
        body = response.content
        charset = xmlhtml.get_encoding(response.content_type)
        return body, charset

    @cachedproperty
    def page(self):
        """Return the DOM tree parsed from `page_data`."""
        return xmlhtml.read_html(*self.page_data)

    @property
    def page_content(self):
        """Return the page content, i.e. the first item of `page_data`."""
        body, _ = self.page_data
        return body

    def get_content_substring(self, start, end,
                              include_start=None, include_end=None):
        """Return the part of the page between `start` and `end`.

        :Parameters:
          `start`
            beginning of the substring to be returned
          `end`
            end of the substring to be returned, or an iterable of
            such substrings of which the first one found will be used
          `include_start`
            if true, then the returned value will begin with `start`
          `include_end`
            if true, then the returned value will end with `end`

        :Return:
          the substring, or `None` if not found

        An `include_start` or `include_end` left as `None` (the
        default) is resolved by `set_get_substring_defaults`: it
        becomes true when the boundary looks like the start or the
        end of an URL.
        """
        raw, charset = self.page_data
        if charset is None:
            # No declared encoding: fall back to an 8-bit one.
            text = raw.decode("latin_1")
        else:
            # Undecodable bytes are dropped: Dailymotion serves
            # advertisements which are not valid UTF-8.
            text = raw.decode(charset, "ignore")
        bounds = set_get_substring_defaults(start, end,
                                            include_start, include_end)
        return get_substring(text, *bounds)

    @cachedproperty
    def _base(self):
        """Return the base against which URLs on the page are resolved.

        This is the ``href`` of the first ``head/base`` element
        having one, or `raw_page_url` if there is no such element.
        """
        candidates = (node.get("href", None)
                      for node in self.page.findall("head/base"))
        for candidate in candidates:
            if candidate is not None:
                return candidate
        return self.raw_page_url

    def make_absolute(self, url):
        """Return `url` found on the page as an absolute URL.

        The URL is joined with `_base`.  A result which has a
        ``decode`` method is decoded as ASCII, any other result is
        returned unchanged.
        """
        joined = urljoin(self._base, url)
        try:
            return joined.decode("ascii")
        except AttributeError:
            # No ``decode`` method on the value: use it as it is.
            return joined


class HTMLMedium(Medium, _PageMixin):

    """Base class for media using an HTML page.

    Every property reads the parsed page (`page`) and looks for
    metadata in its ``head``: ``meta`` and ``link`` tags, including
    the Open Graph ``og:*`` properties.
    """

    @cachedproperty
    def website(self):
        """Name of the website containing medium.

        It can be overridden by the `website_name` attribute.
        Otherwise the ``og:site_name`` meta property is used, and the
        class name if the page does not have it.
        """
        if self.website_name is not None:
            return self.website_name
        try:
            return self.page.xpath("/html/head/meta[@property='og:site_name']"
                                   "/@content")[0]
        except IndexError:
            return unicode(self.__class__.__name__)

    @cachedproperty
    def title(self):
        """Medium's title, or `None` if untitled.

        The first ``head/meta`` tag with an ``og:title`` property or
        with a name listed in `_TITLE_METAS`, and having a
        ``content`` attribute, is used.  The ``<title>`` tag is the
        fallback.
        """
        for tag in self.page.findall("head/meta"):
            if tag.get("property") == "og:title" \
                    or tag.get("name") in _TITLE_METAS:
                content = tag.get("content")
                # A title tag without content carries no title; keep
                # looking instead of handing `None` to the unescaper.
                if content is not None:
                    return xmlhtml.unescape(content)
        # Return the <title> tag content.  It usually contains both
        # medium title and website name, so other tags are preferred.
        text = self.page.findtext("head/title")
        if text is None:
            return None
        return xmlhtml.unescape(text)

    @cachedproperty
    def url(self):
        """URL which was matched to the plugin, or equivalent.

        The ``og:url`` meta property is preferred, then the
        ``canonical`` link; both are made absolute.  Without any of
        them `url_format` is formatted with `mediumid`.
        """
        try:
            candidate = self.page.xpath("/html/head/meta[@property='og:url']"
                                        "/@content")[0]
        except IndexError:
            try:
                candidate = self.page.xpath("/html/head/link[@rel='canonical']"
                                            "/@href")[0]
            except IndexError:
                return self.url_format % self.mediumid
        # It can be a relative URL, make it an absolute one.
        return self.make_absolute(candidate)

    @cachedproperty
    def thumbnail(self):
        """URL of a thumbnail image or `None` if not found.

        The first ``head/link`` tag with ``rel`` listed in
        `_THUMBNAIL_LINK_RELS` is tried first, then the ``og:image``
        meta property.
        """
        thumbnail_url = None
        for tag in self.page.findall("head/link"):
            if tag.get("rel") in _THUMBNAIL_LINK_RELS:
                thumbnail_url = tag.get("href")
                break
        # For some reason Dailymotion has a different URL here, so try
        # it only if the above fails.
        if thumbnail_url is None:
            try:
                thumbnail_url = self.page.xpath("/html/head/meta[@property"
                                                "='og:image']/@content")[0]
            except IndexError:
                # Neither source is present on the page.
                return None
        try:
            return thumbnail_url.decode("ascii")
        except AttributeError:
            # No ``decode`` method on the value: use it as it is.
            return thumbnail_url

    @cachedproperty
    def description(self):
        """Medium description or `None` if unknown."""
        try:
            return self.page.xpath("/html/head/meta[@name='description']"
                                   "/@content")[0]
        except IndexError:
            return None

    @cachedproperty
    def license(self):
        """Medium license URL or `None` if unknown.

        A ``license`` link in the page head is preferred; otherwise
        the first ``a`` element with ``rel="license"`` is used.
        """
        try:
            return self.page.xpath("/html/head/link[@rel='license']/@href")[0]
        except IndexError:
            # ``iter`` replaces the deprecated ``getiterator``.
            for tag in self.page.iter("a"):
                if tag.get("rel") == "license":
                    return tag.get("href")
        return None

    @cachedproperty
    def language(self):
        """Medium language code or `None` if unknown.

        The value is the ``lang`` attribute of the root element; a
        missing or empty attribute is reported as `None`.
        """
        lang = self.page.getroot().get("lang")
        if not lang:
            return None
        try:
            return lang.decode("ascii")
        except AttributeError:
            # No ``decode`` method on the value: use it as it is.
            return lang

    @cachedproperty
    def author_name(self):
        """Name or user identification of medium's author.

        Taken from the ``author`` meta tag; `None` if the page does
        not have it.
        """
        try:
            return self.page.xpath("/html/head/meta[@name='author']"
                                   "/@content")[0]
        except IndexError:
            return None


class HTMLPlugin(Plugin, _PageMixin):

    """Base class for plugins which take their data from HTML pages.

    Such a plugin works on a *page*: the data found at the URL from
    which the `mediumid` was made.  The methods of this class assume
    that the page is an HTML document.

    `getmediumurl.xmlhtml.CAN_READ_HTML` must be `True` for it to
    work.
    """

    #: The `HTMLMedium` subclass instantiated, with the plugin as its
    #: only argument, to make the plugin's single medium.
    #:
    #: `HTMLMedium` itself is only a placeholder default, chosen so
    #: that ``pylint`` does not warn about calling a `None` object.
    #: A plugin keeping it without overriding `__iter__` is reported
    #: as disabled.
    medium_class = HTMLMedium

    def __iter__(self):
        """Iterate media: yield one `medium_class` instance."""
        # Plugins still using the placeholder are reported by
        # `disabled`, so reaching this point with it is a bug.
        assert self.medium_class is not HTMLMedium
        yield self.medium_class(self)

    @classmethod
    def disabled(cls):
        """Yield the reasons for which the derived plugin is unusable.

        A reason specific to this class is given when `cls` neither
        replaces `medium_class` nor, as checked by `overrides`,
        `__iter__`; the reasons given by the parent class follow.
        """
        keeps_default_iter = not overrides("__iter__", cls, HTMLPlugin)
        if keeps_default_iter and cls.medium_class is HTMLMedium:
            yield "does not override medium_class or __iter__"
        parent_reasons = super(HTMLPlugin, cls).disabled()
        for parent_reason in parent_reasons:
            yield parent_reason
