Source code for rp2epub.document

"""
The :py:class:`Document` Class encapsulates the original source document, plus the various metadata that can and should be
extracted: short name, dated URI, editors, document type, etc. These data are extracted from the file,
usually trying to interpret the content of the file as well as the referenced CSS files. The metadata also includes information on whether there
is scripting, whether it contains svg or MathML: these should be added to the book's package file (per the specification of EPUB).

The class instance collects the various external references that must be, eventually, added to the final book
(images, CSS files, etc.).

Finally, the HTML content (ie, the DOM tree) is also modified on the fly: HTML namespace is added, some metadata
is changed a bit to fit the HTML5 requirements, the HTML is output in XHTML, etc.

The class is invoked (and "controlled") by a `:py:class:`.DocWrapper` instance.

.. :class::

Module content
--------------
"""


from urlparse import urlparse, urljoin
import json
import sys
import traceback
from xml.etree.ElementTree import SubElement
from StringIO import StringIO
from datetime import date, datetime

from .utils import HttpSession, Utils, Logger
from .cssurls import CSSList
from .config import TO_TRANSFER
import config


# Massage the core document
[docs]class Document(object):
	"""
	Encapsulation of the top level document.

	:param driver: the caller instance
	:type driver: :py:class:`.DocWrapper`
	"""

	# noinspection PyPep8,PyPep8
	def __init__(self, driver):
		self._additional_resources = []
		self._index                = 0
		self._driver               = driver
		self._download_targets     = []

		self._title          = None
		self._properties     = None
		self._short_name     = None
		self._doc_type       = "base"
		self._dated_uri      = None
		self._date           = None
		# TODO: default should be arrays, not strings
		self._editors        = []
		self._authors        = []
		self._respec_config  = None
		self._toc		     = []
		self._nav_toc	     = []
		self._css_tr_version = 2015
		self._subtitle       = None
		self._get_document_metadata()

		css_list = self._collect_downloads()
		self._css_references       = css_list.get_download_list()
		self._css_change_patterns  = css_list.change_patterns


	@property
	def download_targets(self):
		"""Array of resources to be downloaded and added to the final book. Entries of the array are (:py:class:`xml.etree.ElementTree.Element`, attribute) pairs, referring to the element and the attribute that identifies the URL of the resources to be downloaded."""
		return self._download_targets

	@property
	def driver(self):
		"""The caller: a :py:class:`.doc2epub.DocToEpub` instance."""
		return self._driver

	@property
	def html(self):
		"""The parsed version of the top level HTML element; an :py:class:`xml.etree.ElementTree.Element` instance  """
		return self._driver.html

	@property
	def additional_resources(self):
		"""List of additional resources that must be added to the book eventually. A list of tuples, containing the internal
        reference to the resource and the media type. Built up during processing, it is used in when creating the manifest
        file of the book.
        """
		return self._additional_resources

[docs]	def add_additional_resource(self, local_name, media_type):
		"""Add a pair of local name and media type to the list of additional resources. Appends to the :py:attr:`additional_resources` list.
        :param local_name: name of the resource within the final book
        :param media_type: media type (used when the resource is added to the package file)
        """
		self._additional_resources.append((local_name, media_type))

	@property
	def css_references(self):
		"""Set of `(local_name, absolute_url)` pairs for resources gathered recursively from CSS files. These
        are CSS files themselves, or other media like logos, background images, etc, referred to via a `url` statement
        in CSS.
        """
		return self._css_references

	@property
	def css_change_patterns(self):
		"""List if `(from, to)` pairs that must be used to replace strings in the CSS files on the fly. Typically
        used to adjust the values used in `url` statements.
        """
		return self._css_change_patterns

	# noinspection PyPep8
[docs]	def extract_external_references(self):
		"""Handle the external references (images, etc) in the core file, and copy them to the book. If the content referred to is

        - has a URL is a relative one, begins with the same base, or refers to the `www.w3.org` domain (the latter is for official CSS files and logos)
        - is one of the 'accepted' media types for epub

        then the file is copied and stored in the book, the reference is changed in the document,
        and the resource is marked to be added to the manifest file. HTML files are copied as XHTML files, with a ``.xhtml`` suffix.
        """

		def final_target_media(f_session, f_target):
			if f_session.media_type == 'text/html':
				return 'application/xhtml+xml', f_target.replace('.html', '.xhtml', 1)
			else:
				return f_session.media_type, f_target

		# Retrieve the value of the reference. By making a urljoin, relative URI-s are also turned into absolute one;
		# this simplifies the issue
		# Look at generic external references like images, and, possibly copy the content
		for (element, attr) in self._download_targets:
			attr_value = element.get(attr)
			# The TO_TRANSFER array collects the 'system' references collected in Assets. Although some
			# earlier manipulation on the DOM may have already set the external references to those, the
			# HTTPSession is unnecessary (and sometimes leads to 404 anyway)
			# Bottom line: those references must be filtered out
			if attr_value is not None and all(map(lambda x: x[1] != attr_value, TO_TRANSFER)):
				# Remove the possible fragment ID. This may happen if the document refers to a fragment of another
				# file locally, for example; that should not be relevant for what follows
				attr_value = attr_value.split('#')[0]

				# The following artifact is necessary to treat the WWW level, official URIs and local ones
				ref = urljoin(self.driver.base, attr_value)

				# In some cases, primarily in the case of editors' drafts, the reference is simply on the file
				# itself; that should be forgotten
				if ref == self.driver.top_uri or ref == self.driver.base:
					continue

				parsed_ref = urlparse(ref)
				# Genuine local, relative URI
				local = True if ref.startswith(self.driver.base) else False
				# Official WWW URI-s, mainly for style sheets or possibly javascript
				www_level = True if parsed_ref.netloc == "www.w3.org" else False
				if local or www_level:
					session = HttpSession(ref, check_media_type=True)
					if session.success:
						# Find/set the right name for the target document
						path = parsed_ref.path
						if path[-1] == '/':
							# This should not really happen, but may: relying on some WWW mechanism that we cannot
							# rely on in a a book
							target = 'Assets/extras/data%s.%s' % (self._index, config.ACCEPTED_MEDIA_TYPES[session.media_type])
							self._index += 1
						elif www_level:
							# This is, mainly, for official CSS files as well as W3C logos/icons; reproducing the same path as for W3C
							target = path if path[0] != '/' else path[1:]
						elif local:
							# This is for local references, reproducing the same path as in the origin
							# Removing a possible, though erroneous, first character, just to be on the safe side
							target = attr_value if attr_value[0] != '/' else attr_value[1:]
						else:
							# In fact, this should not happen...
							target = attr_value.split('/')[-1]

						# other complication: if the target is an html file, it will have to become xhtml :-(
						# this means that the target and the media types should receive a local name, to
						# be stored and used below
						final_media_type, final_target = final_target_media(session, target)

						# We can now copy the content into the final book.
						# Note that some of the media types are not to be compressed; this is taken care in the
						# "Book" instance
						self.driver.book.write_session(target, session, self.css_change_patterns)

						# Add information about the new entry; this has to be added to the manifest file
						self._additional_resources.append((final_target, final_media_type))

						# Change the original reference
						element.set(attr, final_target)
					else:
						# That resource is not available
						# Typical situation where it happens: the document is generated from respec
						# on the fly but from a place where the diff file is not yet
						# generated (but referenced from content)
						# Take out those situations that are under the control of this script
						if not element.get(attr).startswith("Assets/"):
							element.tag = "span"
							element.attrib.pop(attr)
							if element.get("rel") is not None:
								element.attrib.pop("rel")
							Logger.warning("Link to '%s' removed (non-existing local resource or of non acceptable type)" % ref)

	###################################################################################################
	# noinspection PyPep8
[docs]	def _collect_downloads(self):
		"""
        Process a document looking for (and possibly copying) external references and making some minor
        modifications on the fly. ``(Element, attribute)`` pairs are added on the fly to the internal array of downloads
        (see :py:attr:`download_targets`).

        :returns: a :py:class:`.cssurls.CSSList` instance, with all the CSS references
        """
		# To collect the CSS references and data
		css_list = CSSList(self.driver.base)

		# Do the necessary massaging on the DOM tree to make the XHTML output o.k.
		Utils.html_to_xhtml(self.html)

		# Change the value of @about to the dated URI, which is what counts...
		self.html.set("about", self.dated_uri)

		# handle stylesheet references
		for lnk in self.html.findall(".//link[@rel='stylesheet']"):
			ref = lnk.get("href")
			if urlparse(ref).netloc == "www.w3.org":
				if not ref.endswith(".css"):
					lnk.set("href",ref + ".css")
			self._download_targets.append((lnk, 'href'))
			# The CSS reference should be stored as a possible source of further references
			css_list.add_css(lnk.get("href"))

		# Handle built-in style sheet statements; this should be added to the CSS handler, too
		for style in self.html.findall(".//style"):
			# there may be cases, though not probable, that that the type attribute is set to something different
			# then text/css
			if style.get("type") is not None and style.get("type") != "text/css":
				continue
			content = " ".join([k.strip() for k in style.itertext()]).strip()
			css_list.add_css(self.driver.base, is_file=False, content=content)

		head = self.html.find(".//head")
		book_css = SubElement(head, "link")
		book_css.set("rel", "stylesheet")
		book_css.set("href", "StyleSheets/TR/book.css")

		# This is an ugly issue which comes up very very rarely: the base element screws up things
		for element in self.html.findall(".//base"):
			head.remove(element)

		# Change the HTTP equivalent value
		Utils.set_html_meta(self.html, head)

		# change the DOM
		Utils.change_DOM(self.html)

		# Collect the additional download targets
		for (tag_name, attr) in config.EXTERNAL_REFERENCES:
			for element in self.html.findall(".//%s" % tag_name):
				self._download_targets.append((element, attr))

		# Extra care should be taken with <a> elements: only local, relative URI-s should be considered,
		# excluding the pure fragment id. Ie, it should refer to another file in the local package.
		# As a pathological case, the href == "." should also be excluded to avoid self-reference
		for element in self.html.findall(".//a[@href]"):
			ref  = element.get("href")
			pref = urlparse(ref)
			if len(pref.netloc) == 0 and len(pref.scheme) == 0 and len(pref.path) != 0 and ref != ".":
				self._download_targets.append((element, 'href'))

		return css_list

	###################################################################################################
	# Metadata; all these are filled with value through the _get_document_metadata method, called at
	# initialization time
	@property
	def title(self):
		"""The ``title`` element content."""
		return self._title

	@property
	def properties(self):
		"""The properties of the document, to be added to the manifest entry"""
		return self._properties

	@property
	def respec_config(self):
		"""The full respec configuration as a Python mapping type. This is available for newer releases
        of ReSpec, but not in older. And, of course, not available for Bikeshed sources. The value is None
        if was not made available.

        Note that the rest of the code retrieves some of the common properties (e.g., short_name), i.e.,
        the rest of the code does not make use of this property. But it may be used in the future.
        """
		return self._respec_config

	@property
	def short_name(self):
		"""'Short Name', in W3C jargon"""
		return self._short_name if self._short_name is not None else "index"

	@property
	def dated_uri(self):
		"""'Dated URI', in the W3C jargon. As a fallback, this may be set to the top URI of the document if the dated uri has not been set """
		return self._dated_uri if self._dated_uri is not None else self.driver.top_uri

	@property
	def doc_type(self):
		"""Document type, eg, one of ``REC``, ``NOTE``, ``PR``, ``PER``, ``CR``, ``WD``, or ``ED``, or the values set in ReSpec"""
		return self._doc_type

	@property
	def doc_type_info(self):
		"""Structure reflecting the various aspects of documents by doc type. This is just a shorthand for ``config.DOCTYPE_INFO[self.doc_type]``"""
		return config.DOCTYPE_INFO[self.doc_type] if self.doc_type is not None else None

	@property
	def date(self):
		"""Date of publication"""
		return self._date

	@property
	def editors(self):
		"""List of editors (name + affiliation per element)"""
		return self._editors

	@property
	def authors(self):
		"""List of authors (name + affiliation per element)"""
		return self._authors

	@property
	def toc(self):
		"""Table of content, an array of :py:class:`.utils.TOC_Item` instances. It is only the top level TOC structures;
        used for the old-school TOC file as well as for the EPUB3 navigation document in case the
        original document does not have the appropriate structures in its TOC."""
		return self._toc

	@property
	def nav_toc(self):
		"""Table of content extracted from a ``<nav>`` element (if any), that is copied almost verbatim into the
        EPUB3 navigation document. It may be empty, though, because the source does not contain the required TOC
        structure, in which case the simple TOC structure is (see :py:attr:`toc`)."""
		return self._nav_toc

	@property
	def css_tr_version(self):
		"""Version (as an integer number denoting the year) of the CSS TR version. The value is 2015 or higher"""
		return self._css_tr_version

	@property
	def subtitle(self):
		""" "W3C Note/Recommendation/Draft/ etc.": the text to be reused as a subtitle on the cover page. """
		return self._subtitle

	# noinspection PyPep8
[docs]	def _get_metadata_from_respec(self, dict_config):
		"""
		Extract metadata (date, title, editors, etc.) making use of the stored ReSpec configuration structure (this
		structure includes the data set by the user plus some data added by the ReSpec process itself).

		:returns: True or False, depending on whether the right keys are available or not
		"""
		def _get_people(key):
			def _get_person(person_struct):
				retval = person_struct["name"]
				return retval + (", %s" % person_struct["company"]) if "company" in person_struct else retval

			return [_get_person(p) for p in dict_config[key]]

		# store the full configuration for possible later reuse
		self._respec_config = dict_config

		if "specStatus" in dict_config :
			self._doc_type   = dict_config["specStatus"]
			self._short_name = dict_config["shortName"] if "shortName" in dict_config else None
			self._editors    = [] if "editors" not in dict_config else _get_people("editors")
			self._authors    = [] if "authors" not in dict_config else _get_people("authors")
			if "publishDate" in dict_config:
				self._date = datetime.strptime(dict_config["publishDate"], "%Y-%m-%d").date()
			else:
				self._date = date.today()

			aref = self.html.find(".//a[@class='u-url']")
			if aref is not None:
				self._dated_uri = aref.get('href')
			return True
		else:
			Logger.warning("Spec Status is not in the ReSpec config; falling back to generated content for metadata")
			return False

	# noinspection PyBroadException
[docs]	def _get_metadata_from_source(self):
		"""
        Extract metadata (date, title, editors, etc.) 'scraping' the source, i.e., by
        extracting the data based on class names, URI patterns, etc.

        :raises R2EError: if the content is not recognized as one of the W3C document types (WD, ED, CR, PR, PER, REC, Note, or ED)
        """
		# Short name of the document
		# Find the official short name of the document
		for aref in self.html.findall(".//a[@class='u-url']"):
			self._dated_uri = aref.get('href')
			dated_name = self._dated_uri[:-1] if self._dated_uri[-1] == '/' else self._dated_uri
			self._doc_type, self._short_name = Utils.create_shortname(dated_name.split('/')[-1])
			break

		self._date = Utils.retrieve_date(self.dated_uri)

		# Extract the editors
		self._editors = Utils.extract_editors(self.html)

		# Add the right subtitle to the cover page
		for issued in self.html.findall(".//h2[@property='dcterms:issued']"):
			self._subtitle = ""
			for t in issued.itertext():
				self._subtitle += t

	# noinspection PyPep8Naming
[docs]	def _get_CSS_TR_version(self):
		"""
		Set the CSS TR version based on the document.
		Note: at the moment this is very ugly: the path of the CSS URL is checked for a date. Hopefully, there will
		be some more 'standard' way of doing this, eventually.
		"""
		self._css_tr_version = 2015

		for lnk in self.html.findall(".//link[@rel='stylesheet']"):
			ref_details = urlparse(lnk.get("href"))
			# TODO: THIS IS TEMPORARY, SHOULD BE FIXED WHEN THINGS BECOME FINAL!!!!
			if ref_details.netloc == "www.w3.org" and "2016" in ref_details.path:
				self._css_tr_version = 2016
				return
		return

[docs]	def _get_document_metadata(self):
		"""
        Extract metadata (date, title, editors, etc.)
        """
		# noinspection PyBroadException
		def _retrieve_from_respec_config():
			"""
            :return: True or False, depending on whether the metadata could be extracted via the respec config or not
            """
			head = self.html.find(".//head")
			respec_config_element = head.find(".//script[@id='initialUserConfig']")
			if respec_config_element is not None:
				try:
					respec_config = json.loads(" ".join([j for j in respec_config_element.itertext()]))
					# The respec config extracted from the file may have been overwritten on the URL
					for key in self.driver.url_respec_setting:
						respec_config[key] = self.driver.url_respec_setting[key]
				except:
					# The error message of the parse does not seem to be all to useful:-(
					# Logger.warning("Embedded ReSpec Configuration could not be parsed as JSON\n%s" % err.getvalue())
					# exc_type, exc_value, exc_traceback = sys.exc_info()
					# err = StringIO()
					# traceback.print_exception(exc_type, exc_value, exc_traceback, file=err)
					# err.close()
					# Logger.warning("Falling back to generated content for metadata")
					Logger.warning("Embedded ReSpec Configuration could not be parsed as JSON; Falling back to generated content for metadata")
					return False

				try:
					if self._get_metadata_from_respec(respec_config):
						head.remove(respec_config_element)
						Logger.info("Using the embedded ReSpec Configuration")
						return True
					else:
						return False
				except:
					exc_type, exc_value, exc_traceback = sys.exc_info()
					err = StringIO()
					traceback.print_exception(exc_type, exc_value, exc_traceback, file=err)
					Logger.warning("Embedded ReSpec Configuration couldn't be handled due to an error \n%s" % err.getvalue())
					Logger.warning("Falling back to generated content for metadata")
					err.close()
					return False
			else:
				Logger.warning("No embedded ReSpec configuration; falling back to generated content for metadata")
				return False

		# Get the title of the document
		for title_element in self.html.findall(".//title"):
			self._title = ""
			for t in title_element.itertext():
				self._title += t
			break

		# Get the CSS version of the document
		self._get_CSS_TR_version()

		# Properties to be added to the manifest
		props = Utils.get_document_properties(self.html)
		props.add("remote-resources")
		if len(props) > 0:
			self._properties = reduce(lambda x, y: x + ' ' + y, props)

		# see if the embedded config is in the file, if so, retrieve it in the form of a directory, and then
		# remove the script from the DOM tree not to pollute the output unnecessarily
		if _retrieve_from_respec_config() is not True:
			self._get_metadata_from_source()

		# Get the 'issued_as' text that will be used as a subtitle
		self._subtitle = config.DOCTYPE_INFO[self._doc_type]["subtitle"] if self._doc_type in config.DOCTYPE_INFO else ""
		self._subtitle += ", " + self.date.strftime("%d %B, %Y")

		# Extract the table of content
		(self._toc, self._nav_toc) = Utils.extract_toc(self.html, self.short_name)