"""Print the distinct words of the Indonesian Wikipedia article on Barack Obama.

The 'id' and 'en' versions of the article are fetched with
``wikialign.download_article``, cleaned with :func:`clean`, and the unique
whitespace-separated tokens of the Indonesian text are printed, shortest
first.  All work happens at import time; the module is meant to be run as a
script.
"""
import re
from string import punctuation
from collections import defaultdict
from wikialign import *

# Matches one ASCII punctuation character.  re.escape keeps every character
# literal inside the class; unescaped, the backslash in `punctuation` would
# escape the following ']' and silently drop out of the set.
puncRE = re.compile('[' + re.escape(punctuation) + ']')

# NOTE(review): arguments look like (language code, article title) and the
# return value is decoded as UTF-8 bytes -- confirm against wikialign.
id_article = download_article('id', 'Barack_Obama').decode('utf-8')
en_article = download_article('en', 'Barack_Obama').decode('utf-8')


def chunks(text):
    """Return the pieces of *text* between ASCII punctuation characters."""
    return puncRE.split(text)


# Prefixes of the ``[[prefix:target]]`` links removed by clean(): every key
# of wikialign's codes2names plus a few extra prefixes listed by hand.
# list(...) is required because dict views cannot be concatenated on
# Python 3; each prefix is escaped so it is matched literally.
_link_prefixes = list(codes2names.keys()) + ['simple', 'zh-yue', 'Image', 'pdc']
code = '(' + '|'.join(re.escape(prefix) for prefix in _link_prefixes) + ')'
linkRE = re.compile(r'\[\[%s:([^\]]+)\]\]' % code)


def clean(text):
    """Remove ``[[prefix:...]]`` links from *text*, then apply strip_info_boxes."""
    return strip_info_boxes(linkRE.sub('', text))


def _words_by_length(text):
    """Return the unique whitespace-separated tokens of *text*, shortest first.

    Ties are broken alphabetically so the output is reproducible regardless
    of set iteration order.
    """
    return sorted(set(text.split()), key=lambda word: (len(word), word))


id_article = clean(id_article)
en_article = clean(en_article)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('INDONESIAN WORDS')
print(' '.join(_words_by_length(id_article)))
# Uncomment to list the English words as well:
# print('ENGLISH WORDS')
# print(' '.join(_words_by_length(en_article)))