#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Command-line scraper for the CantoDict dictionary (cantonese.sheik.co.uk).

Every word given on the command line is searched for through ``dict_url``.
Each search hit that links to a detail page is scraped, and the collected
details are written to stdout as a header line followed by one
comma-separated line per hit.
"""
import os
import sys

try:  # Python 2
    from urllib import quote
    from urllib import urlopen
except ImportError:  # Python 3
    from urllib.parse import quote
    from urllib.request import urlopen

#from xml.etree.ElementTree import parse
from lxml.html import parse

dict_url = "http://www.cantonese.sheik.co.uk/dictionary/search/?searchtype=4&text=%s"

tones = {
    1: "high level flat (or falling) - eg: faa1,bing1",
    2: "rising to high level - eg: tou2, hoi2",
    3: "mid level flat - eg: sai3, hei3",
    4: "low level falling - eg: naam4, wu4",
    5: "rising to mid level - eg: ngo5, jyu5",
    6: "low level flat - eg: hai6,din6",
}

# Characters left unescaped when a URL is quoted before fetching.  This is the
# same set Python 2's ``urllib.urlopen`` applied implicitly; "%" is included
# so that already-escaped URLs are not escaped twice.
_URL_SAFE_CHARS = "%/:=&?~#+!$,;'@()*[]|"

# (details key, CSS selector, use text_content() instead of .text)
_SIMPLE_SCRAPES = (
    ("english", "td.wordmeaning", True),
    ("typedesc", "span.typedesc", False),
    ("zh_HK", "td.chinesebigger", True),
    ("stroke_count", "div.charstrokecount", False),
    ("radical", "div.charradical", True),
    ("level", "div.charlevel", False),
)


def _fetch_html(url):
    """Fetch *url* and return the root element of the parsed HTML document.

    The URL is percent-quoted first (non-ASCII text is encoded as UTF-8) and
    the HTTP response is always closed, even if parsing fails.
    """
    if not isinstance(url, bytes):
        url = url.encode("utf-8")
    response = urlopen(quote(url, safe=_URL_SAFE_CHARS))
    try:
        return parse(response).getroot()
    finally:
        response.close()


def _element_text(el, whole):
    """Return ``el.text_content()`` if *whole* else ``el.text``; never None."""
    text = el.text_content() if whole else el.text
    return text or ""


def _trim_english(english):
    """Return *english* cut off at the first "Stroke count:" marker.

    Lines are stripped and joined with single spaces (a trailing space is
    kept, as before).  Lines that do not contain the marker are kept whole.
    """
    parts = []
    for line in english.split("\n"):
        marker_at = line.find("Stroke count:")
        if marker_at >= 0:
            parts.append(line[:marker_at].strip())
            break
        # find() returned -1: slicing with it would chop the last character.
        parts.append(line.strip())
    return " ".join(parts) + " "


def _pick_example(examples):
    """Return the first example with a "sound_link", else the last one.

    Returns None when *examples* is empty.
    """
    for example in examples:
        if "sound_link" in example:
            return example
    return examples[-1] if examples else None


def list_extract(td, texts_selector, tone_selector="span.tone"):
    """Pair up syllable texts and tone texts found inside the cell *td*.

    The texts are the ``.text`` of every element matching *texts_selector*,
    followed by the ``.tail`` of each child of the first match.  The tones
    are the ``.text`` of every element matching *tone_selector*.

    Returns a list of ``(text, tone)`` tuples, truncated to the shorter of
    the two sequences; an empty list if *texts_selector* matches nothing.
    """
    matches = td.cssselect(texts_selector)
    texts = [match.text for match in matches]
    if matches:
        texts.extend(child.tail for child in matches[0])
    tone_texts = [tone.text for tone in td.cssselect(tone_selector)]
    return list(zip(texts, tone_texts))


def extract(tr):
    """
    given tr, a tr HtmlElement from dict_url, returns its chinese (trad)
    character and a dict of information about it

    expects the tr to have five cells, like so:

        desc | 人見人愛 | jan4 gin3 jan4 oi3 | ren2 jian4 ren2 ai4 |
        everyone will love; lovable; likable

    A row with any other number of cells is reported on stdout and returned
    as ``({}, tr)``.
    """
    if len(tr) != 5:
        print("warning: tr %s with text %s was odd" % (tr, tr.text_content()))
        # NOTE(review): this shape differs from the normal (char, dict) pair;
        # lookup() copes because HtmlElement.get("link") yields None.
        return {}, tr
    link_el, char_el, jp_el, py_el, eng_el = tr
    link = link_el[0].attrib["href"]
    zh_HK = char_el[0].text
    if eng_el.text is not None:
        en = eng_el.text.strip().strip(u"\xa0")
    else:
        # usually for mandarin-only stuff
        en = "none?!"
    jp = list_extract(jp_el, "span.listjyutping")
    py = list_extract(py_el, "span.listpingying")
    return zh_HK, {
        "utf-8": zh_HK,
        "jyutping": jp,
        "pinying": py,
        "english": en,
        "link": link,
    }


def cantodict_scrape(link):
    """scrapes cantodict char dict page for details

    "?full=true" is appended to *link* when missing.  Returns a tuple
    ``(details, link, html)`` where *details* is a dict of scraped strings
    plus an "examples" list, *link* is the URL actually used and *html* is
    the parsed document root.
    """
    if not link.endswith("?full=true"):
        link += "?full=true"
    html = _fetch_html(link)
    html.make_links_absolute()

    details = {"link": link}
    for key, selector, whole in _SIMPLE_SCRAPES:
        for el in html.cssselect(selector):
            details[key] = details.get(key, "") + _element_text(el, whole).strip()

    #english one is not so simple
    if "english" in details and "Stroke count" in details["english"]:
        details["english"] = _trim_english(details["english"])

    for key in ("jyutping", "pinyin"):
        details[key] = "".join(
            el.text_content() for el in html.cssselect("span.card%s" % key))

    details["examples"] = []
    for example_el in html.cssselect("div.example_in_block"):
        if len(example_el) != 6:
            continue
        link_el, sound_el, word_el, _unused, _unused2, meaning_el = example_el
        example = {}
        if "href" in link_el.attrib:
            example["link"] = link_el.attrib["href"]
        if "href" in sound_el.attrib:
            example["sound_link"] = sound_el.attrib["href"]
        example["zh_HK"] = word_el.text_content().strip()
        example["english"] = meaning_el.text_content().strip()
        details["examples"].append(example)

    # The codepoint is the text of the last child of the code-links block;
    # a page without that block simply gets no "codepoint" entry.
    code_links = html.cssselect("div.wd_code_links")
    if code_links and len(code_links[0]):
        details["codepoint"] = code_links[0][-1].text
    return details, link, html


def lookup(word, full=False):
    """uses dict_url to lookup word

    Search hits are the ``span.chinesemed`` elements, grouped by the table
    they sit in.  Only the rows of the first table are used unless *full* is
    true, in which case the remaining tables follow in document order.

    Returns ``(extracted_chars, char_details)``: the list of extract()
    results, and a dict mapping each character to its cantodict_scrape()
    result.  Both are empty when the search has no hits.
    """
    html = _fetch_html(dict_url % word)
    chars = html.cssselect("span.chinesemed")
    if not chars:
        return [], {}

    tables = {}
    table_order = []
    for c in chars:
        table = c.getparent().getparent().getparent()
        if table not in tables:
            tables[table] = []
            table_order.append(table)
        tables[table].append(c)

    selected = list(tables[table_order[0]])
    if full:
        for other_table in table_order[1:]:
            selected.extend(tables[other_table])
    extracted_chars = [extract(c.getparent().getparent()) for c in selected]

    char_details = {}
    for zh_HK, info in extracted_chars:
        link = info.get("link")
        if link is not None:
            char_details[zh_HK] = cantodict_scrape(link)
    return extracted_chars, char_details


def output(word, fd=None):
    """Look up *word* and write its details to *fd* (default: sys.stdout).

    Writes a comma-separated header line, then one line per scraped hit.
    From each hit's examples only one is kept, as "incontext..." columns:
    the first example with a sound link, otherwise the last example.

    NOTE(review): values are joined with "," without any quoting, so a value
    that itself contains a comma produces extra columns.
    """
    if fd is None:
        fd = sys.stdout
    _extracted_chars, char_details = lookup(word)
    rows = [details for details, _link, _html in char_details.values()]

    keys = set()
    for details in rows:
        keys.update(details)
    keys.discard("typedesc")

    #handle "examples" key specially
    for details in rows:
        chosen = _pick_example(details.pop("examples", []))
        if chosen is None:
            continue
        for k in ("sound_link", "zh_HK", "english"):
            if k in chosen:
                details["incontext" + k] = chosen[k]
    keys.discard("examples")

    ordered_keys = [
        "zh_HK",
        "jyutping",
        "english",
        "sound_link",
        "incontextzh_HK",
        "incontextenglish",
        "incontextsound_link",
        "pinyin",
        "codepoint",
    ]
    ordered_keys.extend(sorted(keys.difference(ordered_keys)))

    fd.write(",".join(ordered_keys))
    fd.write("\n")
    for details in rows:
        # "or ''" also covers keys that are present but hold None.
        fd.write(",".join([(details.get(k) or "").strip() for k in ordered_keys]))
        fd.write("\n")


def main(argv=None):
    """Write the details of every word in *argv* (default: sys.argv[1:])."""
    import codecs
    import locale

    if argv is None:
        argv = sys.argv[1:]
    # fix for the below fix
    if "LANG" not in os.environ:
        os.environ["LANG"] = "en_GB.utf8"
    # from http://wjd.nu/notes/2009#unicodeencodeerror-python-redirect-pipe
    try:
        encoding = locale.getdefaultlocale()[1]
    except (AttributeError, ValueError):
        encoding = None
    # The codec writer needs a byte stream: sys.stdout.buffer on Python 3,
    # sys.stdout itself on Python 2.
    byte_stream = getattr(sys.stdout, "buffer", sys.stdout)
    sys.stdout = codecs.getwriter(encoding or "utf-8")(byte_stream, 'replace')
    for word in argv:
        output(word, sys.stdout)


if __name__ == "__main__":
    main()