#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
from urllib import urlopen
#from xml.etree.ElementTree import parse
from lxml.html import parse
# Search URL of the dictionary site; the "%s" placeholder receives the word
# being looked up.
dict_url = "http://www.cantonese.sheik.co.uk/dictionary/search/?searchtype=4&text=%s"

# Description of each tone number, with example romanisations.
tones = {
    1: "high level flat (or falling) - eg: faa1,bing1",
    2: "rising to high level - eg: tou2, hoi2",
    3: "mid level flat - eg: sai3, hei3",
    4: "low level falling - eg: naam4, wu4",
    5: "rising to mid level - eg: ngo5, jyu5",
    6: "low level flat - eg: hai6,din6",
}
def list_extract(td, texts_selector, tone_selector="span.tone"):
    """Pair the text fragments found in ``td`` with their tone markers.

    ``td`` is an element offering ``cssselect``.  The fragments are the
    ``.text`` of every element matched by ``texts_selector``, followed by the
    ``.tail`` of every child of the first matched element.  The tones are the
    ``.text`` of every element matched by ``tone_selector``.

    Returns a list of ``(text, tone)`` tuples, truncated to the shorter of
    the two sequences.  Returns an empty list when ``texts_selector`` matches
    nothing.
    """
    texts_raw = td.cssselect(texts_selector)
    if not texts_raw:
        # Without a match there is nothing to pair up, and indexing
        # texts_raw[0] below would raise IndexError.
        return []
    texts = [matched.text for matched in texts_raw]
    # Text that follows each child of the first match is stored as that
    # child's tail, not as text of the match itself.
    texts.extend(child.tail for child in texts_raw[0])
    # Named tone_values so the module-level ``tones`` table is not shadowed.
    tone_values = [tone.text for tone in td.cssselect(tone_selector)]
    # list() keeps the return type a real list on every Python version.
    return list(zip(texts, tone_values))
def extract(tr):
    """Turn one search-result row into ``(characters, info)``.

    ``tr`` is a ``tr`` HtmlElement from the ``dict_url`` results page.  It is
    expected to hold exactly five cells, unpacked in this order: link,
    Chinese characters (traditional, per the original author's note),
    jyutping, pinyin and English meaning, e.g.::

        人見人愛 | jan4 gin3 jan4 oi3 | ren2 jian4 ren2 ai4 |
        everyone will love; lovable; likable

    Returns a tuple of the text of the character cell's first child and a
    dict with the keys ``"utf-8"``, ``"jyutping"``, ``"pinying"``,
    ``"english"`` and ``"link"``.  A row that does not have five cells
    produces a warning on stderr and the result ``(None, {})``.
    """
    if len(tr) != 5:
        # Warn on stderr so the message never ends up inside the CSV written
        # to stdout.  The row text is formatted with %r so the message stays
        # writable even when stderr cannot encode the characters.
        sys.stderr.write("warning: tr %s with text %r was odd\n"
                         % (tr, tr.text_content()))
        # Keep the (characters, info-dict) shape of the normal return so
        # callers can always treat the second item as a dict.
        return None, {}
    link_el, char_el, jp_el, py_el, eng_el = tr
    link = link_el[0].attrib["href"]
    zh_HK = char_el[0].text
    if eng_el.text is None:
        # usually for mandarin-only stuff
        en = "none?!"
    else:
        en = eng_el.text.strip().strip(u"\xa0")
    return zh_HK, {
        "utf-8": zh_HK,
        "jyutping": list_extract(jp_el, "span.listjyutping"),
        "pinying": list_extract(py_el, "span.listpingying"),
        "english": en,
        "link": link,
    }
def _english_before_stroke_count(english):
    """Return the part of ``english`` that precedes "Stroke count:"."""
    parts = []
    for line in english.split("\n"):
        sc_idx = line.find("Stroke count:")
        if sc_idx >= 0:
            parts.append(line[:sc_idx].strip())
            break
        # No marker on this line: keep the whole line.  Slicing with the -1
        # returned by find() would silently drop its last character.
        parts.append(line.strip())
    # Each part is followed by a single space, including the last one.
    return " ".join(parts) + " "


def cantodict_scrape(link):
    """Scrape a cantodict dictionary entry page for details.

    ``link`` is the URL of the entry page; ``"?full=true"`` is appended when
    the URL does not already end with it.

    Returns a tuple ``(details, link, html)``: the scraped ``details`` dict,
    the URL that was actually fetched, and the parsed root element.

    ``details`` always holds ``"link"``, ``"jyutping"``, ``"pinyin"``,
    ``"examples"`` (a list of dicts) and ``"codepoint"``.  It holds
    ``"english"``, ``"typedesc"``, ``"zh_HK"``, ``"stroke_count"``,
    ``"radical"`` and ``"level"`` only when the page has matching elements.
    """
    if not link.endswith("?full=true"):
        link += "?full=true"
    data = urlopen(link)
    try:
        html = parse(data).getroot()
    finally:
        # The document is fully parsed at this point; release the connection.
        data.close()
    html.make_links_absolute()
    details = {"link": link}

    # key -> (css selector, use text_content() rather than .text)
    simple_scrapes = {
        "english": ("td.wordmeaning", True),
        "typedesc": ("span.typedesc", False),
        "zh_HK": ("td.chinesebigger", True),
        "stroke_count": ("div.charstrokecount", False),
        "radical": ("div.charradical", True),
        "level": ("div.charlevel", False),
    }
    for key, (selector, whole_text) in simple_scrapes.items():
        for el in html.cssselect(selector):
            if whole_text:
                text = el.text_content()
            else:
                # .text is None when the element has no leading text.
                text = el.text or ""
            details[key] = details.get(key, "") + text.strip()

    # english one is not so simple: the cell also carries the stroke count
    # and whatever follows it, so keep only what precedes that marker.
    if "english" in details and "Stroke count" in details["english"]:
        details["english"] = _english_before_stroke_count(details["english"])

    for key in ("jyutping", "pinyin"):
        details[key] = "".join(
            el.text_content() for el in html.cssselect("span.card%s" % key))

    details["examples"] = []
    for example_el in html.cssselect("div.example_in_block"):
        if len(example_el) != 6:
            continue
        # Children 3 and 4 are not used.
        link_el, sound_el, span_wordexample = example_el[:3]
        meaning_el = example_el[5]
        example = {}
        if "href" in link_el.attrib:
            example["link"] = link_el.attrib["href"]
        if "href" in sound_el.attrib:
            example["sound_link"] = sound_el.attrib["href"]
        example["zh_HK"] = span_wordexample.text_content().strip()
        example["english"] = meaning_el.text_content().strip()
        details["examples"].append(example)

    # The codepoint is the text of the last child of the code-links div.
    # Fall back to "" (as jyutping/pinyin do) when the page has no such div
    # or the div is empty, instead of raising IndexError.
    code_links = html.cssselect("div.wd_code_links")
    if code_links and len(code_links[0]):
        details["codepoint"] = code_links[0][-1].text or ""
    else:
        details["codepoint"] = ""
    return details, link, html
def lookup(word, full=False):
    """Search ``dict_url`` for ``word`` and scrape the page of every hit.

    Hits are the rows containing a ``span.chinesemed`` element, grouped by
    the table they sit in.  By default only rows of the table holding the
    first hit are used; with ``full=True`` the rows of all other tables are
    appended as well, in document order.

    Returns ``(extracted_chars, char_details)``.  ``extracted_chars`` is the
    list of ``extract()`` results.  ``char_details`` maps each row's
    characters to the ``cantodict_scrape()`` result for that row's link, so
    rows sharing the same characters overwrite one another.  Both are empty
    when the search has no hits.
    """
    data = urlopen(dict_url % word)
    try:
        html = parse(data).getroot()
    finally:
        # The document is fully parsed at this point; release the connection.
        data.close()
    chars = html.cssselect("span.chinesemed")
    if not chars:
        # No results: nothing to extract, and chars[0] would raise.
        return [], {}

    # Group hits by the table three levels up, remembering first-seen order
    # so ``full`` mode output does not depend on dict ordering.
    tables = {}
    table_order = []
    for c in chars:
        table = c.getparent().getparent().getparent()
        if table not in tables:
            tables[table] = []
            table_order.append(table)
        tables[table].append(c)

    # The first hit's table is always the first one seen.
    char_table = table_order[0]
    extracted_chars = [extract(c.getparent().getparent())
                       for c in tables[char_table]]
    if full:
        for other_table in table_order[1:]:
            extracted_chars.extend(extract(c.getparent().getparent())
                                   for c in tables[other_table])

    char_details = {}
    for zh_HK, info in extracted_chars:
        link = info.get("link")
        if link is not None:
            char_details[zh_HK] = cantodict_scrape(link)
    return extracted_chars, char_details
def output(word, fd=None):
    """Look up ``word`` and write the scraped details to ``fd`` as rows.

    ``fd`` is any object with a ``write`` method and defaults to
    ``sys.stdout``.  The first line written is a comma-separated header, a
    fixed list of columns followed by any remaining keys in sorted order.
    Then comes one comma-separated line per entry returned by ``lookup``.

    The ``"typedesc"`` key is left out.  The ``"examples"`` list is replaced
    by ``incontext*`` columns taken from the first example that has a sound
    link or, failing that, from the last example.

    NOTE(review): values are joined with "," without any quoting, so a value
    containing a comma or newline spills into neighbouring cells.
    """
    if fd is None:
        fd = sys.stdout
    _, char_details = lookup(word)
    # Only the details dict of each (details, link, html) result is needed.
    all_details = [result[0] for result in char_details.values()]

    keys = set()
    for details in all_details:
        keys.update(details)
    keys.discard("typedesc")

    # handle "examples" key specially
    if "examples" in keys:
        for details in all_details:
            examples = details.pop("examples", [])
            if not examples:
                # Nothing to show in context for this entry.
                continue
            # Prefer the first example with a sound link, else use the last.
            chosen = examples[-1]
            wanted = ["zh_HK", "english"]
            for example in examples:
                if "sound_link" in example:
                    chosen = example
                    wanted = ["sound_link", "zh_HK", "english"]
                    break
            for k in wanted:
                details["incontext" + k] = chosen[k]
                keys.add("incontext" + k)
        keys.discard("examples")

    ordered_keys = [
        "zh_HK",
        "jyutping",
        "english",
        "sound_link",
        "incontextzh_HK",
        "incontextenglish",
        "incontextsound_link",
        "pinyin",
        "codepoint",
    ]
    ordered_keys.extend(sorted(keys.difference(ordered_keys)))

    fd.write(",".join(ordered_keys))
    fd.write("\n")
    for details in all_details:
        # ``or ""`` also covers values stored as None, which have no strip().
        fd.write(",".join([(details.get(k) or "").strip()
                           for k in ordered_keys]))
        fd.write("\n")
if __name__ == "__main__":
    # Only the command-line entry point needs these modules.
    import codecs
    import locale

    # fix for the below fix: give the locale lookup a UTF-8 default when the
    # environment does not set LANG at all.
    if "LANG" not in os.environ:
        os.environ["LANG"] = "en_GB.utf8"
    # from http://wjd.nu/notes/2009#unicodeencodeerror-python-redirect-pipe
    # getdefaultlocale() may still report no encoding (None); fall back to
    # UTF-8 so that codecs.getwriter() is never handed None.
    encoding = locale.getdefaultlocale()[1] or "utf-8"
    sys.stdout = codecs.getwriter(encoding)(sys.stdout, 'replace')
    for word in sys.argv[1:]:
        output(word, sys.stdout)