#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import sys

from urllib import urlopen
#from xml.etree.ElementTree import parse
from lxml.html import parse

# URL template for a dictionary search; the search text is substituted
# for the trailing %s (see lookup()).
dict_url = "http://www.cantonese.sheik.co.uk/dictionary/search/?searchtype=4&text=%s"

# Human-readable description of each tone number (1-6), with example
# romanised syllables ending in that tone number.
# NOTE(review): presumably these are the Jyutping tone numbers - confirm.
tones = {
    1: "high level flat (or falling) - eg: faa1,bing1",
    2: "rising to high level - eg: tou2, hoi2",
    3: "mid level flat - eg: sai3, hei3",
    4: "low level falling - eg: naam4, wu4",
    5: "rising to mid level - eg: ngo5, jyu5",
    6: "low level flat - eg: hai6,din6",
    }


def list_extract(td, texts_selector, tone_selector="span.tone"):
    """
    Pair each romanised syllable found in td with its tone number.

    td is an element offering cssselect(); texts_selector picks the
    span(s) holding the romanisation and tone_selector picks the nested
    tone elements.  Inside a romanisation span the syllables are the
    span's own leading text plus the tail text that follows each child
    element, e.g.

        <span class="listjyutping">jan<span class="tone">4</span> gin<span class="tone">3</span></span>

    yields [("jan", "4"), (" gin", "3")].  Syllable text is returned
    exactly as it appears in the markup (surrounding whitespace is kept).

    Returns a list of (syllable_text, tone_text) tuples, truncated to the
    shorter of the two sequences.
    """
    fragments = []
    for span in td.cssselect(texts_selector):
        fragments.append(span.text)
        # tails of *this* span's children (not always the first span's)
        fragments.extend(child.tail for child in span)

    # The tail after the last tone of a span is normally None or blank;
    # drop such fragments so syllables from a following span still line
    # up with their own tones.
    syllables = [frag for frag in fragments if frag and frag.strip()]

    tone_numbers = [tone.text for tone in td.cssselect(tone_selector)]
    return list(zip(syllables, tone_numbers))


def extract(tr):
    """
    Given tr, a <tr> HtmlElement from a dict_url results page, return its
    Chinese (traditional) character(s) and a dict of information about it.

    Returns a (zh_HK, info) tuple.  info has the keys:

        "utf-8"     the same text as zh_HK
        "jyutping"  list of (syllable, tone) pairs, see list_extract()
        "pinyin"    list of (syllable, tone) pairs, see list_extract()
        "pinying"   same list as "pinyin" (old misspelt key, kept so
                    existing callers keep working)
        "english"   the English meaning, or "none?!" when the cell is empty
        "link"      href of the entry's detail page

    A row that does not have exactly five cells cannot be interpreted: a
    warning is printed and (None, {}) is returned, so callers can rely on
    the second item always being a dict.

    Expects the tr to look like so:

    <tr>
    <td><a href="http://www.cantonese.sheik.co.uk/dictionary/words/48292/"><img src="http://www.cantonese.sheik.co.uk/images/icons/book.gif" border="0" align="absmiddle" hspace="2" vspace="3" alt="desc">
    </a></td>
    <td><span class="chinesemed">人見人愛</span></td>
    <td valign="top"><span class="listjyutping">jan<span class="tone">4</span> gin<span class="tone">3</span> jan<span class="tone">4</span> oi<span class="tone">3</span></span></td>
    <td valign="top"><span class="listpinyin">ren<span class="tone">2</span> jian<span class="tone">4</span> ren<span class="tone">2</span> ai<span class="tone">4</span></span></td>
    <td> &nbsp; everyone will love; lovable; likable</td>
    </tr>
    """

    if len(tr) != 5:
        # single parenthesised argument: prints identically on Python 2
        print("warning: tr %s with text %s was odd" % (tr, tr.text_content()))
        return None, {}

    link_el, char_el, jp_el, py_el, eng_el = tr

    link = link_el[0].attrib["href"]

    zh_HK = char_el[0].text

    if eng_el.text is not None:
        # the cell text starts with a non-breaking space (&nbsp;)
        en = eng_el.text.strip().strip(u"\xa0")
    else:
        en = "none?!"  # usually for mandarin-only stuff

    jp = list_extract(jp_el, "span.listjyutping")

    # the class in the markup is "listpinyin" (see the sample row above)
    py = list_extract(py_el, "span.listpinyin")

    return zh_HK, {"utf-8": zh_HK,
                   "jyutping": jp,
                   "pinyin": py,
                   "pinying": py,
                   "english": en,
                   "link": link,
                   }


def cantodict_scrape(link):
    """
    Scrape a CantoDict entry page for details.

    link is the URL of the entry page; "?full=true" is appended unless
    the link already ends with it.

    Returns a (details, link, html) tuple: details is a dict of the
    scraped values, link is the URL that was actually fetched and html is
    the parsed root element of the page.

    details always contains "link", "jyutping", "pinyin" and "examples"
    (a list of dicts with "zh_HK" and "english", plus "link" and
    "sound_link" when the page provides them).  The keys "english",
    "typedesc", "zh_HK", "stroke_count", "radical", "level" and
    "codepoint" are present only when the matching element exists on the
    page.
    """

    if not link.endswith("?full=true"):
        link += "?full=true"
    data = urlopen(link)
    try:
        html = parse(data).getroot()
    finally:
        # the document is fully read by parse(); release the connection
        data.close()

    html.make_links_absolute()

    details = {"link": link}

    # (details key, css selector, use the text of the whole subtree?)
    # False means only the element's own leading text (el.text) is used.
    simple_scrapes = (
        ("english", "td.wordmeaning", True),
        ("typedesc", "span.typedesc", False),
        ("zh_HK", "td.chinesebigger", True),
        ("stroke_count", "div.charstrokecount", False),
        ("radical", "div.charradical", True),
        ("level", "div.charlevel", False),
        )

    for key, selector, whole_subtree in simple_scrapes:
        for el in html.cssselect(selector):
            text = el.text_content() if whole_subtree else el.text
            # el.text is None when the element starts with a child element
            details[key] = details.get(key, "") + (text or "").strip()

    # english one is not so simple: the cell also holds the stroke count
    # and whatever follows it, so keep only the text before
    # "Stroke count:".
    english = details.get("english")
    if english is not None and "Stroke count" in english:
        kept = []
        for line in english.split("\n"):
            sc_idx = line.find("Stroke count:")
            if sc_idx >= 0:
                kept.append(line[:sc_idx].strip())
                break
            # marker not on this line: keep the line whole (slicing with
            # the -1 from find() would chop off its last character)
            kept.append(line.strip())
        details["english"] = " ".join(kept) + " "

    for key in ("jyutping", "pinyin"):
        details[key] = "".join(
            [el.text_content() for el in html.cssselect("span.card%s" % key)])

    details["examples"] = []
    for example_el in html.cssselect("div.example_in_block"):
        # only blocks with the expected six children are understood
        if len(example_el) != 6:
            continue
        link_el, sound_el, span_wordexample = example_el[:3]
        meaning_el = example_el[5]
        example = {}
        if "href" in link_el.attrib:
            example["link"] = link_el.attrib["href"]
        if "href" in sound_el.attrib:
            example["sound_link"] = sound_el.attrib["href"]
        example["zh_HK"] = span_wordexample.text_content().strip()
        example["english"] = meaning_el.text_content().strip()
        details["examples"].append(example)

    # the codepoint is the text of the last child of the code-links block;
    # skip it rather than crash when the page has no such block
    code_links = html.cssselect("div.wd_code_links")
    if code_links and len(code_links[0]):
        details["codepoint"] = code_links[0][-1].text

    return details, link, html


def lookup(word, full=False):
    """
    Look word up via dict_url and scrape the page of every hit.

    The result rows are located through their span.chinesemed cell and
    grouped by the table that contains them.  By default only the rows of
    the first table holding a hit are used; with full=True the rows of
    the remaining tables follow, in document order.

    Returns (extracted_chars, char_details): extracted_chars is the list
    of extract() results for the chosen rows, and char_details maps each
    row's character text to the cantodict_scrape() result for its link.
    Both are empty when the search page has no hits.

    NOTE(review): word is substituted into the URL as-is, without URL
    quoting - confirm whether callers are expected to quote it.
    """
    data = urlopen(dict_url % word)
    try:
        html = parse(data).getroot()
    finally:
        # the document is fully read by parse(); release the connection
        data.close()

    chars = html.cssselect("span.chinesemed")
    if not chars:
        return [], {}

    # span -> td -> tr -> table; remember tables in order of appearance
    table_order = []
    rows_by_table = {}
    for c in chars:
        row = c.getparent().getparent()
        table = row.getparent()
        if table not in rows_by_table:
            rows_by_table[table] = []
            table_order.append(table)
        rows_by_table[table].append(row)

    wanted_tables = table_order if full else table_order[:1]

    extracted_chars = []
    for table in wanted_tables:
        for row in rows_by_table[table]:
            extracted_chars.append(extract(row))

    char_details = {}
    for zh_HK, info in extracted_chars:
        # rows that extract() could not interpret carry no link
        link = info.get("link")
        if link is not None:
            char_details[zh_HK] = cantodict_scrape(link)

    return extracted_chars, char_details


def _csv_field(value):
    """Render one value as a CSV field, quoting it only when necessary."""
    text = value.strip() if value is not None else ""
    # commas, quotes and line breaks would otherwise corrupt the row
    if any(ch in text for ch in ',"\r\n'):
        text = '"%s"' % text.replace('"', '""')
    return text


def output(word, fd=None):
    """
    Look word up and write the scraped details to fd as CSV.

    fd is any object with a write() method and defaults to sys.stdout.
    A header line with the column names is written first, then one line
    per entry found by lookup().  The columns are a fixed leading set
    (see ordered_keys) followed by any other scraped keys in sorted
    order; "typedesc" is left out.

    The "examples" list of an entry is flattened into the columns
    "incontextzh_HK", "incontextenglish" and "incontextsound_link": the
    first example with a sound link is used if there is one, otherwise
    the last example.

    Fields are stripped, a missing or None value becomes an empty field,
    and fields containing a comma, double quote or line break are
    double-quoted.
    """
    if fd is None:
        fd = sys.stdout

    extracted_chars, char_details = lookup(word)
    all_details = [details for details, link, html in char_details.values()]

    keys = set()
    for details in all_details:
        keys.update(details)

    keys.discard("typedesc")  # uninteresting

    # handle "examples" key specially
    if "examples" in keys:
        for details in all_details:
            for example in details.pop("examples", []):
                has_sound = "sound_link" in example
                wanted = ["zh_HK", "english"]
                if has_sound:
                    wanted.insert(0, "sound_link")
                for k in wanted:
                    details["incontext" + k] = example[k]
                    keys.add("incontext" + k)
                if has_sound:
                    # an example with sound is preferred: stop looking
                    break
        keys.discard("examples")

    ordered_keys = [
        "zh_HK",
        "jyutping",
        "english",
        "sound_link",
        "incontextzh_HK",
        "incontextenglish",
        "incontextsound_link",
        "pinyin",
        "codepoint",
        ]

    ordered_keys.extend(sorted(keys.difference(ordered_keys)))

    fd.write(",".join(ordered_keys))
    fd.write("\n")

    for details in all_details:
        fd.write(",".join([_csv_field(details.get(k)) for k in ordered_keys]))
        fd.write("\n")
        

if __name__ == "__main__":
    import codecs
    import locale

    # fix for the below fix: make sure there is a LANG for the locale
    # lookup to report an encoding from
    if "LANG" not in os.environ:
        os.environ["LANG"] = "en_GB.utf8"
    # Wrap stdout in an encoding writer so non-ASCII output survives being
    # redirected or piped.
    # from http://wjd.nu/notes/2009#unicodeencodeerror-python-redirect-pipe
    # The locale can still report no encoding (None); fall back to UTF-8
    # instead of letting codecs.getwriter() fail.
    encoding = locale.getdefaultlocale()[1] or "utf-8"
    sys.stdout = codecs.getwriter(encoding)(sys.stdout, 'replace')

    # every command-line argument is a word to look up
    for word in sys.argv[1:]:
        output(word, sys.stdout)