#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Suppose you want to learn a new writing system.

Ingredients:

* Some text (in Unicode, of course)
* Python2.5

Usage: pass the path of the text file as the first command-line argument.

The helper functions are defined first; the step-by-step walkthrough runs
only when the file is executed as a script, so the helpers can be imported
without reading a file or needing BeautifulSoup.
"""

import sys
import codecs
import re
from collections import defaultdict
from random import choice

# Runs of "word" characters; re.UNICODE makes \w match non-ASCII letters too.
wordRE = re.compile(r'\w+', re.UNICODE)


def read_text(path):
    """Return the contents of the file at *path* converted to unicode.

    The encoding is guessed by BeautifulSoup's UnicodeDammit.
    If the encoding is known to be UTF-8, an alternative is
    ``codecs.getreader('utf-8')(open(path))``.
    """
    # Imported here rather than at module level so that the pure helpers
    # below stay usable when BeautifulSoup is not installed.
    from BeautifulSoup import UnicodeDammit

    # try/finally instead of ``with`` so this still runs on Python 2.5.
    source = open(path)
    try:
        raw = source.read()
    finally:
        source.close()
    return UnicodeDammit(raw).unicode


def train(features):
    """Count the items of *features* (modeling code generalized from Norvig).

    Returns a defaultdict mapping each feature to its number of occurrences
    plus one: every count starts at 1, so a feature seen n times maps to
    n + 1 and a feature never seen reads as 1.
    """
    model = defaultdict(lambda: 1)
    for f in features:
        model[f] += 1
    return model


def lettermodel(text):
    """Build a histogram of all the letters (characters) in *text*.

    Same counting rule as train(): each character maps to its number of
    occurrences plus one.
    """
    return train(text)


def words(text):
    """Return the list of lower-cased words found in *text*, in order."""
    return wordRE.findall(text.lower())


def letterscore(word, model=None):
    """Return the sum of the letter counts of every letter in *word*.

    *model* is a letter model as built by lettermodel(). When omitted, the
    module-level ``histogram`` built by the script below is used, so
    ``letterscore(word)`` keeps working when the file is run as a script.
    """
    if model is None:
        model = histogram
    return sum(model[letter] for letter in word)


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("usage: %s TEXTFILE" % sys.argv[0])

    # First, read the text:
    text = read_text(sys.argv[1])
    print(text)
    print("^text")

    # Now, build a model of all the letters in the text.
    # defaultdict powers, activate.
    histogram = lettermodel(text)

    # So, the model has a histogram of all the letters. Let's look up the
    # frequencies of a random word from the text.
    wordlist = words(text)
    if wordlist:  # choice() raises IndexError on an empty list
        aword = choice(wordlist)
        for letter in aword:
            print("%s %d" % (letter, histogram[letter]))

    # That gives us results such as:
    #
    #   κ 1196
    #   α 2760
    #   ὶ 650
    #
    #   β 260
    #   α 2760
    #   σ 1411
    #   ι 1341
    #   λ 1015
    #   έ 486
    #   ω 423
    #   ν 3039
    #
    # So what do we do with these?
    #
    # The idea is: there is an efficient way to learn a new language: every
    # word that you learn should "buy" as much as possible. So, you should
    # learn words that:
    #
    # * are common
    # * contain common letters
    #
    # So, let's naively rank words by multiplying these two values.
    #
    # The letter model is the histogram built above; now build the word
    # model with the same generalized train() function:
    wordmodel = train(wordlist)

    ranking = {}
    for word in wordmodel:
        ranking[word] = letterscore(word, histogram) * wordmodel[word]

    # Print the 100 best-scoring words, lowest score first.
    for score, word in sorted((v, k) for k, v in ranking.items())[-100:]:
        print(word)