#!/usr/bin/env python # -*- coding: utf-8 -*- from math import sqrt from text import freq, trigrams, tokenize, depunctuate def model(filename): text = open(filename).read() text = unicode(text) cleaned = ' '.join(tokenize(depunctuate(text))) print cleaned return freq(trigrams(cleaned)) def randmodel(): import os from random import choice data= '/home/pat/l/udhr/data/' randlang = choice(os.listdir('/home/pat/l/udhr/data')) return model(data + os.sep + randlang) def scalar(vec): total = 0 for elem in vec: total += vec[elem] * vec[elem] return sqrt(total) def sim(v, w): total = 0 for elem in v: if elem in w: total += v[elem] * w[elem] return float(total) / (scalar(v) * scalar(w)) if __name__ == "__main__": import sys a, b = sys.argv[1:3] #for k,v in model(a).items(): print k, v print sim(model(a),model(b))