# -*- coding: utf-8 -*-
"""Exploration of the UDHR text samples, reconstructed from a shell history.

The original content was an interactive-session log whose newlines had been
collapsed, so it could not be parsed.  This version keeps the working steps of
that session, grouped into functions, and drops exact-duplicate retries and
attempts that failed (misspelled imports, names used before import, a
half-pasted string literal).

Nothing here runs at import time; execute the file as a script to replay the
session.  ``print(...)`` is always called with a single argument so the code
behaves the same on Python 2 (which the session used) and Python 3.
"""
import math
import os
import random
import re
import unicodedata

# Directory holding the UDHR sample files, exactly as used in the session.
UDHR_DATA_DIR = '/home/pat/l/udhr/data/'

# Sample text pasted into the session (bound to ``s`` there).  Only one of the
# original line breaks survived the collapse of the history file.
SAMPLE_TEXT = u"""MåshelƆ Ma AŋfƏm ŋa Råru Ta Åmari Ma Råwuni KƏpet KÅWASƏR ÅŊKAŊ Ɔwa ta salata kåsƆthnƐ åyiki a komånƐ aŋ fƏm akƏpet, Ɔwa yi åmari mƏthƏnånƐ a komånƐ ŋa-e, ŋa yi åŋgbeth ŋa råwankom, målompi, yi måthƆfƏl ka nƆru. Ɔwa, kå kålå agbåp yi kåsay åmari ma aŋfƏm akƏpet må po kårå kådifƏthånƐ åke po lƏsƏr tåtemp ta aŋfƏm akƏpet, Ɔwa kƏ po kårå ka råru åre, pƏ yi naŋrå aŋfƏm akƏpet maŋba naŋ råwankom ra kåfƆf, kåtåŋ Ɛdina, kƏte ba rƏnes kƏpa ye, åŋe ŋå tha yi måfela ma aŋfƏm akƏpet thƆŋ bƐ nƆru. Ɔwa Ɔ fisa ti, kåmå a te gbƆndar aŋfƏm rƏ tåy o tåy bƐ mƏ yƆ ŋa aŋ thånthonƐ tƏ kƏfumpƏr aŋe bƏt åŋfƆsƆ mƏ thålƏr ŋa-e, Ɔwa tåsoma, a yi tƏkƏ bot ƐthƆ mƏkunkƏla ŋa-e. Ɔwa a yi tƏkƏ sakƏthi åkƐra åke dƐr o dƐr, tƏkƏ yƆ tåthƆf ta råru bƐ tƏmanƐ. Ɔwa tåsoma aŋfƏm ŋa tåthƆf bƐ aŋ wopånƐ ka råru, aŋ bot ka måsekrånƐ maŋ tƏkƏ beŋ kåmå aŋ gbasi Ɔfu fƏlen malane ma kåsƆŋ aŋfƏm akƏpet åmari akomånƐ ŋa, ka ayiki yi to wuni kƏpet Ɔ yi-e, kƏpa runiŋaŋi, yi bom ŋaŋ a thƏnanƐ. 
Ɔwa aŋ bƐŋ tƏkƏ"""

# Lowercase letters inspected in the session (bound to ``t`` there).  The
# session had just listed the capitals Ɔ (OPEN O), Ə (SCHWA) and Ɛ (OPEN E).
LETTERS_OF_INTEREST = u"əɛɔ"


def getmysterytext():
    """Return the decoded contents of a randomly chosen UDHR sample file.

    A file name is picked at random from ``UDHR_DATA_DIR`` and its raw bytes
    are passed to BeautifulSoup's ``UnicodeDammit``; the ``.unicode``
    attribute of the result is returned.

    Raises:
        OSError / IOError: if the directory or the chosen file is unreadable.
        IndexError: if the directory is empty (from ``random.choice``).
    """
    # Imported here, as in the original, so the module loads without
    # BeautifulSoup installed.
    from BeautifulSoup import UnicodeDammit

    lang = random.choice(os.listdir(UDHR_DATA_DIR))
    # The original never closed the file; ``with`` releases it promptly.
    with open(os.path.join(UDHR_DATA_DIR, lang), 'rb') as handle:
        raw = handle.read()
    return UnicodeDammit(raw).unicode


def foo(seq):
    """Return the root mean square (quadratic mean) of ``seq``.

    Sums the squares of the elements, divides by the number of elements and
    returns the square root of that.

    Args:
        seq: a sized collection of numbers (``len(seq)`` must work).

    Raises:
        ZeroDivisionError: if ``seq`` is empty.
    """
    total = sum(elem * elem for elem in seq)
    return math.sqrt(float(total) / len(seq))


def named_characters(text):
    """Return sorted ``(character, unicode_name)`` pairs for ``text``.

    Each distinct non-whitespace character appears once.  The session settled
    on the ``\\S+`` filter after trying ``\\w`` and ``\\w+``.

    Raises:
        ValueError: if a character has no Unicode name (``unicodedata.name``).
    """
    return sorted(
        (char, unicodedata.name(char))
        for char in set(text)
        if re.match(r'\S+', char)
    )


def language_code(filename):
    """Strip the ``udhr_`` prefix and ``.txt`` suffix from a sample file name.

    The session used ``'(udhr_|.txt)'``, whose unescaped dot also removed any
    character followed by ``txt``; the pattern is now escaped and anchored.
    """
    return re.sub(r'^udhr_|\.txt$', '', filename)


def random_sample_path():
    """Return the path of a randomly chosen file in ``UDHR_DATA_DIR``."""
    # The session called ``random.choice`` after importing only ``choice``;
    # the module is imported at the top of this file instead.
    return os.path.join(UDHR_DATA_DIR, random.choice(os.listdir(UDHR_DATA_DIR)))


def ranked_model_entries(path, count=None):
    """Return the ``(value, key)`` pairs of ``model(path)`` in ascending order.

    Args:
        path: file handed to ``vectorsearch.model``.  Its result is only
            assumed to offer ``.items()``, as the session relied on.
        count: if given, keep only the last ``count`` (largest) pairs.  The
            session wrote ``[-10]``, which selects one tuple, not a slice.
    """
    from vectorsearch import model  # project-local; the session first tried ``vectorspace``

    ranked = sorted((value, key) for key, value in model(path).items())
    if count is None:
        return ranked
    return ranked[-count:]


def explore_characters():
    """Print the sample text and the Unicode names of its characters."""
    print(SAMPLE_TEXT)
    for char, char_name in named_characters(SAMPLE_TEXT):
        print("%s %s" % (char, char_name))
    # The session printed ``t, name(t)`` inside the loop, passing the whole
    # string instead of the loop variable.
    for char in LETTERS_OF_INTEREST:
        print("%s %s" % (char, unicodedata.name(char)))


def explore_udhr_module(sample='udhr_mag.txt'):
    """Replay the calls made against the project's ``udhr`` and ``text`` modules."""
    import udhr  # project-local
    from text import freq, top  # project-local

    print(udhr.getmysterytext())
    print(udhr.getmysterytext(sample))
    sample_text = udhr.getsampletext(sample)
    print(sample_text)
    frequencies = freq(sample_text)
    print(frequencies)
    for entry in top(3, frequencies):
        print(entry)


def explore_models():
    """Replay the ``vectorsearch.model`` experiments on the UDHR samples."""
    from vectorsearch import model  # project-local

    for code in ('jpn', 'mli', 'swa', 'eng'):
        print(model(UDHR_DATA_DIR + 'udhr_%s.txt' % code))

    print(os.listdir(UDHR_DATA_DIR))
    for key, value in model(random_sample_path()).items():
        print("%s %s" % (key, value))
    for value, key in ranked_model_entries(random_sample_path()):
        print("%s %s" % (key, value))
    for entry in ranked_model_entries(random_sample_path(), count=10):
        print(entry)


def main():
    """Replay the whole session in a sensible order."""
    print(getmysterytext())
    for limit in (400, 1000, 1200):
        print(getmysterytext()[:limit])

    explore_characters()
    explore_udhr_module()

    print(foo([1, 2, 34, 5, 99]))

    print(language_code('udhr_auv.txt'))
    print(os.listdir('.'))
    for entry in os.listdir('.'):
        print(type(entry))

    explore_models()


if __name__ == "__main__":
    main()