#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import (absolute_import, division, print_function, unicode_literals)

"""This is the definition of readers"""

import xmltodict
from .doc import Document, PlainTextDocument, RCV1Document, Reuters21578Document
from .helper import current_date_time, raw_text_from_xmltodict_obj


def get_readers():
    """
    Lists the names of the reader functions offered by this module.

    Returns
    ========
    result: a list of strings.
        A fresh list on every call, so callers may modify it freely.
    """
    reader_names = (
        "read_plain",
        "read_xml",
        "read_reut21578_xml",
        "read_reut21578_xml_as_plain",
        "read_rcv1",
        "read_rcv1_as_plain",
    )
    return list(reader_names)


def read_plain(doc_full_string, doc_constructer=PlainTextDocument, name="", *args, **kwargs):
    """
    Builds a document straight from a string, without any parsing.

    Parameters
    ===========
    doc_full_string: a string.
        The full content of the document.

    doc_constructer: a Document constructor.
        Any callable taking the content and the name as its first two
        positional arguments, e.g. Document, PlainTextDocument, RCV1Document,
        Reuters21578Document, or a user-defined one.

    name: a string.
        The name of the document, passed to doc_constructer as the second
        positional argument.

    Any additional positional and keyword arguments are forwarded to
    doc_constructer after the content and the name.

    Returns
    ========
    result: whatever doc_constructer returns (normally a Document).
    """
    positional = (doc_full_string, name) + args
    return doc_constructer(*positional, **kwargs)


def _select_child_by_attribute(node_value, step):
    """Resolve one ``tag[@attr='value']`` path step against ``node_value``.

    Returns the first child stored under ``tag`` whose ``@attr`` entry equals
    ``value``. Raises KeyError, IndexError or TypeError when the step is not
    written in predicate form, the tag cannot be looked up, or no child matches.
    """
    pieces = step.split('[')
    tag = pieces[0].strip()
    condition = pieces[1].strip().rstrip(']').split('=')
    attr_name = condition[0]
    attr_value = condition[1][1:-1]  # drop the quote characters around the value
    candidates = node_value[tag]
    if isinstance(candidates, dict):
        # xmltodict yields a bare dict (not a list) for a non-repeated element.
        candidates = [candidates]
    for candidate in candidates:
        if isinstance(candidate, dict) and candidate.get(attr_name) == attr_value:
            return candidate
    raise KeyError(step)


def _resolve_xml_path(parsed_content, path):
    """Walk a '/'-separated ``path`` through an xmltodict result; "" if it cannot be resolved."""
    lookup_errors = (KeyError, IndexError, TypeError)
    node_value = parsed_content
    for step in path.split('/'):
        if step == "":
            continue
        try:
            node_value = node_value[step]
        except lookup_errors:
            # Plain lookup failed: try the "/codes[@class='bip:countries:1.0']/"
            # predicate syntax before giving up on the whole path.
            try:
                node_value = _select_child_by_attribute(node_value, step)
            except lookup_errors:
                return ""
    return node_value


def read_xml(doc_full_string, doc_constructer=Document, name="", spec=None, *args, **kwargs):
    """
    Reads an XML string and builds a document from it according to a spec.

    Parameters
    ===========
    doc_full_string: a string.
        The XML source of the document. It is parsed with xmltodict and, unless
        the spec defines a "Content" entry, also passed to doc_constructer as
        the document content.

    doc_constructer: a Document constructor.
        One of the built-in Document, PlainTextDocument, RCV1Document,
        Reuters21578Document, or a user-defined one. It is called with the
        content as the only positional argument and the predefined metadata
        as keyword arguments.

    name: a string.
        The name of the document, a special predefined meta in the generated
        Document. A "Name" entry in the spec overrides it.

    spec: a dict like object.
        The specification of the metadata to extract. Every value should be a
        list whose 0th item is the type, chosen from ["node", "function",
        "attribute", "unevaluated"], and whose 1st item is the payload:

        * "node" / "attribute": a '/'-separated path into the parsed XML.
          Attributes are addressed as "@name", and a step may select among
          repeated elements with "tag[@attr='value']". A path that cannot be
          resolved yields "". If the path ends on a dict, it is converted with
          raw_text_from_xmltodict_obj.
        * "function": a callable, called without arguments.
        * "unevaluated": a value stored as is. Any unknown type is handled
          the same way.

        Keys among Name, Author, DateTimeStamp, Description, ID, Origin,
        Heading and Language are passed to doc_constructer as lower-cased
        keyword arguments ("ID" becomes "doc_id"). A "Content" key replaces the
        document content. Every other key is stored through
        result.set_localmetadata.

        Here is the example of spec:

        .. code-block:: python

            dict(Author = ["node", "/REUTERS/TEXT/AUTHOR"],
                 DateTimeStamp = ["function", current_date_time],
                 Description = ["unevaluated", ""],
                 Heading = ["node", "/REUTERS/TEXT/TITLE"],
                 ID = ["attribute", "/REUTERS/@NEWID"],
                 TOPICS = ["attribute", "/REUTERS/@TOPICS"],
                 LEWISSPLIT = ["attribute", "/REUTERS/@LEWISSPLIT"],
                 CGISPLIT = ["attribute", "/REUTERS/@CGISPLIT"],
                 OLDID = ["attribute", "/REUTERS/@OLDID"],
                 Origin = ["unevaluated", "Reuters-21578 XML"],
                 Topics = ["node", "/REUTERS/TOPICS/D"],
                 Places = ["node", "/REUTERS/PLACES/D"],
                 People = ["node", "/REUTERS/PEOPLE/D"],
                 Orgs = ["node", "/REUTERS/ORGS/D"],
                 Exchanges = ["node", "/REUTERS/EXCHANGES/D"],
                 Content = ["node", "/REUTERS/TEXT/BODY"]
            )

    Additional positional and keyword arguments are accepted but not used.

    Returns
    =========
    result: a Document.
        The object generated by doc_constructer, with the metadata parsed
        from spec.

    NOTE(review): a plain path step applied to a list of repeated elements
    (e.g. ".../code/@code" when there are several <code> children) cannot be
    resolved and yields "" -- confirm whether that is intended.
    """
    if spec is None:
        spec = {}

    predefinedmetadatas = ("Name", "Author", "DateTimeStamp", "Description",
                           "ID", "Origin", "Heading", "Language")

    doc_pdmd_kwargs = {"name": name}
    doc_lmd = {}
    parsed_content = xmltodict.parse(doc_full_string)

    for key in spec:
        spec_type, spec_info = spec[key][0], spec[key][1]

        if spec_type in ("node", "attribute"):
            meta_value = _resolve_xml_path(parsed_content, spec_info)
            if isinstance(meta_value, dict):
                meta_value = raw_text_from_xmltodict_obj(meta_value)
        elif spec_type == "function":
            meta_value = spec_info()
        else:
            # "unevaluated" and any unrecognised type: keep the payload as is.
            meta_value = spec_info

        if key in predefinedmetadatas:
            # "ID" is renamed to avoid clashing with the Python builtin id.
            kwarg_name = "doc_id" if key == "ID" else key.lower()
            doc_pdmd_kwargs[kwarg_name] = meta_value
        else:
            doc_lmd[key] = meta_value

    # dict.has_key() no longer exists on Python 3; "in" works on 2 and 3 alike.
    if "Content" in doc_lmd:
        doc_full_string = doc_lmd.pop("Content")

    result = doc_constructer(doc_full_string, **doc_pdmd_kwargs)
    for key in doc_lmd:
        result.set_localmetadata(key, doc_lmd[key])

    return result


def read_reut21578_xml(doc_full_string, name="", *args, **kwargs):
    """
    Reads a Reuters-21578 XML string through read_xml.

    Parameters
    ===========
    doc_full_string: a string.
        The XML source of one document, handed to read_xml unchanged.

    name: a string.
        The name of the document, forwarded to read_xml.

    Additional positional and keyword arguments are accepted but not used.

    Returns
    ========
    result: the value returned by read_xml when called with
        doc_constructer=Reuters21578Document and the spec defined below.
    """
    reuters_spec = dict(
        Author = ["node", "/REUTERS/TEXT/AUTHOR"],
        DateTimeStamp = ["function", current_date_time],  # TODO
        Description = ["unevaluated", ""],
        Heading = ["node", "/REUTERS/TEXT/TITLE"],
        # values read from attribute paths on /REUTERS
        ID = ["attribute", "/REUTERS/@NEWID"],
        TOPICS = ["attribute", "/REUTERS/@TOPICS"],
        LEWISSPLIT = ["attribute", "/REUTERS/@LEWISSPLIT"],
        CGISPLIT = ["attribute", "/REUTERS/@CGISPLIT"],
        OLDID = ["attribute", "/REUTERS/@OLDID"],
        Origin = ["unevaluated", "Reuters-21578 XML"],
        # values read from the D children of each category node
        Topics = ["node", "/REUTERS/TOPICS/D"],
        Places = ["node", "/REUTERS/PLACES/D"],
        People = ["node", "/REUTERS/PEOPLE/D"],
        Orgs = ["node", "/REUTERS/ORGS/D"],
        Exchanges = ["node", "/REUTERS/EXCHANGES/D"]
    )
    return read_xml(
        doc_full_string,
        doc_constructer=Reuters21578Document,
        name=name,
        spec=reuters_spec
    )


def read_reut21578_xml_as_plain(doc_full_string, name="", *args, **kwargs):
    """
    Reads a Reuters-21578 XML string through read_xml as a plain text document.

    Parameters
    ===========
    doc_full_string: a string.
        The XML source of one document, handed to read_xml unchanged.

    name: a string.
        The name of the document, forwarded to read_xml.

    Additional positional and keyword arguments are accepted but not used.

    Returns
    ========
    result: the value returned by read_xml when called with
        doc_constructer=PlainTextDocument and the spec defined below, which
        includes a "Content" entry pointing at /REUTERS/TEXT/BODY.
    """
    plain_reuters_spec = dict(
        Content = ["node", "/REUTERS/TEXT/BODY"],
        Author = ["node", "/REUTERS/TEXT/AUTHOR"],
        DateTimeStamp = ["function", current_date_time],  # TODO
        Description = ["unevaluated", ""],
        Heading = ["node", "/REUTERS/TEXT/TITLE"],
        # values read from attribute paths on /REUTERS
        ID = ["attribute", "/REUTERS/@NEWID"],
        TOPICS = ["attribute", "/REUTERS/@TOPICS"],
        LEWISSPLIT = ["attribute", "/REUTERS/@LEWISSPLIT"],
        CGISPLIT = ["attribute", "/REUTERS/@CGISPLIT"],
        OLDID = ["attribute", "/REUTERS/@OLDID"],
        Origin = ["unevaluated", "Reuters-21578 XML"],
        # values read from the D children of each category node
        Topics = ["node", "/REUTERS/TOPICS/D"],
        Places = ["node", "/REUTERS/PLACES/D"],
        People = ["node", "/REUTERS/PEOPLE/D"],
        Orgs = ["node", "/REUTERS/ORGS/D"],
        Exchanges = ["node", "/REUTERS/EXCHANGES/D"]
    )
    return read_xml(
        doc_full_string,
        doc_constructer=PlainTextDocument,
        name=name,
        spec=plain_reuters_spec
    )


def read_rcv1(doc_full_string, name="", *args, **kwargs):
    """
    Reads a Reuters Corpus Volume 1 XML string through read_xml.

    Parameters
    ===========
    doc_full_string: a string.
        The XML source of one news item, handed to read_xml unchanged.

    name: a string.
        The name of the document, forwarded to read_xml.

    Additional positional and keyword arguments are accepted but not used.

    Returns
    ========
    result: the value returned by read_xml when called with
        doc_constructer=RCV1Document and the spec defined below.
    """
    rcv1_spec = dict(
        Author = ["unevaluated", ""],
        DateTimeStamp = ["function", current_date_time],  # TODO
        Description = ["unevaluated", ""],
        Heading = ["node", "/newsitem/title"],
        ID = ["attribute", "/newsitem/@itemid"],
        Origin = ["unevaluated", "Reuters Corpus Volume 1"],
        # paths below pick one of the repeated metadata children by attribute
        Publisher = ["attribute", "/newsitem/metadata/dc[@element='dc.publisher']/@value"],
        Topics = ["attribute", "/newsitem/metadata/codes[@class='bip:topics:1.0']/code/@code"],
        Industries = ["attribute", "/newsitem/metadata/codes[@class='bip:industries:1.0']/code/@code"],
        Countries = ["attribute", "/newsitem/metadata/codes[@class='bip:countries:1.0']/code/@code"]
    )
    return read_xml(
        doc_full_string,
        doc_constructer=RCV1Document,
        name=name,
        spec=rcv1_spec
    )


def read_rcv1_as_plain(doc_full_string, name="", *args, **kwargs):
    """
    Reads a Reuters Corpus Volume 1 XML string through read_xml as a plain
    text document.

    Parameters
    ===========
    doc_full_string: a string.
        The XML source of one news item, handed to read_xml unchanged.

    name: a string.
        The name of the document, forwarded to read_xml.

    Additional positional and keyword arguments are accepted but not used.

    Returns
    ========
    result: the value returned by read_xml when called with
        doc_constructer=PlainTextDocument and the spec defined below, which
        includes a "Content" entry pointing at /newsitem/text.
    """
    plain_rcv1_spec = dict(
        Content = ["node", "/newsitem/text"],
        Author = ["unevaluated", ""],
        DateTimeStamp = ["function", current_date_time],  # TODO
        Description = ["unevaluated", ""],
        Heading = ["node", "/newsitem/title"],
        ID = ["attribute", "/newsitem/@itemid"],
        Origin = ["unevaluated", "Reuters Corpus Volume 1"],
        # paths below pick one of the repeated metadata children by attribute
        Publisher = ["attribute", "/newsitem/metadata/dc[@element='dc.publisher']/@value"],
        Topics = ["attribute", "/newsitem/metadata/codes[@class='bip:topics:1.0']/code/@code"],
        Industries = ["attribute", "/newsitem/metadata/codes[@class='bip:industries:1.0']/code/@code"],
        Countries = ["attribute", "/newsitem/metadata/codes[@class='bip:countries:1.0']/code/@code"]
    )
    return read_xml(
        doc_full_string,
        doc_constructer=PlainTextDocument,
        name=name,
        spec=plain_rcv1_spec
    )
