#!/usr/bin/env python # -*- coding: utf-8 -*- # Make a guess at a transliteration mapping, given a lexicon. from unicodedata import name import sys, codecs from textual import freq sys.stdout = codecs.getwriter('utf-8')(sys.stdout) sys.stdin = codecs.getreader('utf-8')(sys.stdin) # '/media/SEAGATE/pat/l/lex/en2hi.txt' lexfile = sys.argv[1] # This lexicon is extracted from the Hindi Wikipedia en2hi = open(lexfile).read() en2hi = en2hi.decode('utf-8') e2h = {} for line in en2hi.splitlines(): # lines look like: # en बिहार Bihar try: code, hi, en = line.split('\t') en = en.lower() e2h[en] = hi except ValueError: continue # now e2h contains a lexicon: # { u'bihar' : u'बिहार' , ... } correspondences = [] # this will hold all letter-letter correspondences: [('b', 'ब'), ('h', 'ह') ... ] for e,h in e2h.items(): all = [] all.extend(zip(list(e),list(h))) # because letter-letter correspondences may be better at the end of the word than the # beginning, we zip reversed versions of the words too: all.extend(zip(reversed(list(e)), reversed(list(h)))) correspondences.extend(all) fq = freq(correspondences) qf = [(fq,pair) for pair,fq in freq(correspondences).items()] sqf = sorted(qf) for f, pair in sqf[-100:]: print name(pair[0]), pair[0], pair[1]