"""
Extract the text, without infoboxes and interwiki links, from a Wikipedia
article.
"""

import re
from string import punctuation
from collections import defaultdict

# NOTE(review): download_article, strip_info_boxes and
# extract_text_from_article are not defined in this file; presumably they are
# provided by this star import -- confirm against wikialign.
from wikialign import *

# Alternation of the language prefixes recognised in interwiki links.  It is
# interpolated verbatim into the link pattern below, so it must remain a valid
# regular-expression group.
languagecodes = """(aa|ab|af|ak|als|am|an|ang|ar|arc|as|ast|av|ay|az|ba|bar|bat-smg|bcl|be|bg|bh|bi|bm|bn|bo|bpy|br|bs|bug|bxr|ca|cbk-zam|cdo|ce|ceb|ch|cho|chr|chy|closed-zh-tw|co|cr|crh|cs|csb|cu|cv|cy|da|de|diq|dsb|dv|dz|e|ee|el|eml|en|eo|es|et|eu|fa|ff|fi|fiu-vro|fj|fo|fr|frp|fur|fy|ga|gd|gl|glk|gn|got|gu|gv|ha|hak|haw|he|hi|ho|hr|hsb|ht|hu|hy|hz|ia|id|ie|ig|ii|ik|ilo|io|is|it|iu|ja|jbo|jv|ka|kab|kg|ki|kj|kk|kl|km|kn|ko|kr|ks|ksh|ku|kv|kw|ky|la|lad|lb|lbe|lg|li|lij|lmo|ln|lo|lt|lv|map-bms|mg|mh|mi|mk|ml|mn|mo|mr|ms|mt|mus|my|mzn|na|nah|nan|nap|nds|nds-nl|ne|new|ng|nl|nn|no|nov|nrm|nv|ny|oc|om|or|os|pa|pag|pam|pap|pdc|pi|pih|pl|pms|ps|pt|qu|rm|rmy|rn|ro|roa-rup|roa-tara|ru|rw|sa|sc|scn|sco|sd|se|sg|sh|si|simple|sk|sl|sm|sn|so|sq|sr|ss|st|stq|su|sv|sw|ta|te|tet|tg|th|ti|tk|tl|tlh|tn|to|tokipona|tpi|tr|ts|tt|tum|tw|ty|udm|ug|uk|ur|uz|ve|vec|vi|vls|vo|Vosa|wa|war|wo|wuu|xal|xh|yi|yo|za|zea|zh|zh-min-nan|zh-yue|zu)"""

# Matches an interwiki link such as [[es:Israel]].  Compiled once at import
# time (it used to be rebuilt on every call) and written as a raw string so
# the bracket escapes are not treated as (invalid) string escapes.
_LANGUAGE_LINK_RE = re.compile(r'\[\[%s:([^\]]+)\]\]' % languagecodes)


def remove_language_links(text):
    """Return *text* with every ``[[<language code>:<target>]]`` link removed.

    Only links whose prefix is one of the codes in ``languagecodes`` are
    affected; all other text is returned unchanged.
    """
    return _LANGUAGE_LINK_RE.sub('', text)


def remove_images(text):
    """Return *text* without the lines that start with ``[[Image``.

    Image links are awkward to parse because their captions may contain
    nested links, for example:

        [[Image:Joyce.png|right|thumb|upright|A [[Minangkabau]] woman in traditional dress]]

    so the whole line is dropped instead.  Only lines that begin exactly with
    ``[[Image`` are removed; an image link that is indented or appears in the
    middle of a line is left in place.  The remaining lines are re-joined
    with ``'\\n'``.
    """
    return '\n'.join(
        line for line in text.splitlines()
        if not line.startswith('[[Image')
    )


def retrieve(code, title):
    """Download the article *title* for language *code* and clean it up.

    The downloaded bytes are decoded as UTF-8, then interwiki language links
    are removed and the result is passed through ``strip_info_boxes``.
    Returns the cleaned text.
    """
    text = download_article(code, title).decode('utf-8')
    text = remove_language_links(text)
    text = strip_info_boxes(text)
    return text


#id = retrieve('id','Indonesia')
#en = retrieve('en','Indonesia')
#print id
#print remove_images(open('id').read())

# Matches a single punctuation character.  string.punctuation contains
# ``\``, ``]``, ``^`` and ``-``, which are special inside a character class;
# without escaping, ``\]`` was read as an escaped ``]`` and the backslash
# itself was silently left out of the class.
depuncRE = re.compile('[' + re.escape(punctuation) + ']')


class Article:
    """A downloaded Wikipedia article together with its cleaned text.

    Attributes:
        wikitext: the downloaded article, decoded as UTF-8.
        text: the result of ``clean()`` applied to ``wikitext``.
        words: ``text`` split at every punctuation character.
        title: the title the article was requested with.
    """

    def __init__(self, code, title):
        """Download article *title* for language *code* and pre-process it."""
        self.wikitext = download_article(code, title).decode('utf-8')
        self.text = self.clean()
        # NOTE(review): the split is on punctuation only, not on whitespace,
        # so each "word" is the run of text between two punctuation marks --
        # confirm that this is the intended tokenisation.
        self.words = depuncRE.split(self.text)
        self.title = title

    def clean(self):
        """Return ``self.wikitext`` reduced to its text.

        Applies, in order, ``extract_text_from_article``,
        ``remove_language_links`` and ``strip_info_boxes``.
        """
        text = extract_text_from_article(self.wikitext)
        text = remove_language_links(text)
        text = strip_info_boxes(text)
        return text


# Script body: print, in sorted order, every fragment that occurs in both the
# English and the Spanish article about Israel.  This runs at import time, as
# it did originally.
en = Article('en', 'Israel')
es = Article('es', 'Israel')

for w in sorted(set(en.words).intersection(set(es.words))):
    # Parenthesised single-argument form prints the same on Python 2 and is
    # also valid Python 3.
    print(w)