#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Compare character histograms of two randomly chosen ``udhr/*.txt`` files.

Run as a script, this picks two files matching ``udhr/*.txt`` (relative to
the current working directory), builds a histogram for each with the
project-local ``text`` helpers, and prints one tab-separated row per key
present in both histograms.
"""
import io
import os
import random
from fileinput import input
from glob import glob

from text import freq, depunctuate, tokenize


def count(txt):
    """Return the histogram of ``txt`` as computed by ``text.freq``.

    The text is tokenized, re-joined with single spaces, depunctuated and
    the resulting sequence of items is handed to ``freq``.

    NOTE(review): ``tokenize``, ``depunctuate`` and ``freq`` live in the
    project-local ``text`` module; ``comparehist`` indexes the result by
    key, so ``freq`` presumably returns a dict-like mapping -- confirm.
    """
    normalized = ' '.join(tokenize(txt))
    return freq(list(depunctuate(normalized)))


def randudhr(encoding='utf-8'):
    """Return the decoded text of a random file matching ``udhr/*.txt``.

    Args:
        encoding: Codec used to decode the file.  Assumes the corpus is
            UTF-8 by default -- TODO confirm against the actual files.

    Returns:
        The file contents as a text (unicode) string.

    Raises:
        IndexError: If no file matches ``udhr/*.txt`` (raised by
            ``random.choice`` on an empty list).
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    candidates = glob('udhr/*.txt')
    chosen = random.choice(candidates)
    # io.open decodes explicitly on both Python 2 and 3.  The previous
    # ``unicode(open(...).read())`` used the implicit ASCII codec, which
    # fails on any non-ASCII text, and it never closed the file.
    with io.open(chosen, encoding=encoding) as handle:
        return handle.read()


def comparehist(h1, h2):
    """Pair up the values of two histograms for the keys they share.

    As a side effect, the sorted union of all keys from both histograms is
    printed on a single space-separated line.

    Args:
        h1: First mapping of key -> value.
        h2: Second mapping of key -> value.

    Returns:
        A list of ``(h1[key], key, h2[key])`` tuples, in sorted key order,
        for every key present in both mappings.
    """
    all_keys = sorted(set(h1) | set(h2))
    if all_keys:
        # Terminate the listing with a newline so that whatever the caller
        # prints next does not end up on the same line.
        print(" ".join("%s" % key for key in all_keys))
    return [(h1[key], key, h2[key])
            for key in all_keys
            if key in h1 and key in h2]


def main():
    """Print a tab-separated comparison of two random ``udhr`` histograms."""
    first = count(randudhr())
    second = count(randudhr())
    # Distinct loop names: the histograms are not rebound while iterating.
    for first_value, letter, second_value in comparehist(first, second):
        print("%s\t%s\t%s" % (first_value, letter, second_value))


if __name__ == "__main__":
    import sys

    sys.exit(main())