#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Print the words that two fetched pages have in common.

Each command-line query is passed to ``snag()``; the returned text is split
on whitespace, tokens that look like interwiki links or contain markup
punctuation are dropped, and the sorted intersection of the two resulting
word sets is printed, one word per line.

A crude implementation of part of:

AUTHOR: Koehn, Philipp and Knight, Kevin
TITLE: Learning a Translation Lexicon from Monolingual Corpora
Abstract: This paper presents work on the task of constructing a
    word-level translation lexicon purely from unrelated monolingual
    corpora. We combine various clues such as cognates, similar context,
    preservation of word similarity, and word frequency. Experimental
    results for the construction of a German-English noun lexicon are
    reported.
Booktitle: Proceedings of ACL Workshop on Unsupervised Lexical Acquisition
Citeulike-article-id: 2324020
Keywords: lexical_acquisition, nlp
Priority: 4
Year: 2002
URL: http://citeseer.ist.psu.edu/509449.html
"""

import codecs  # only needed by the optional stdout wrapper noted in main()
import sys

# Location of the checkout that provides ``extract_names``.
WIKIPEDIA_REPO = '/home/pat/repo/wikipedia'

# Characters that mark a token as markup rather than a word.  Built once
# instead of on every nopunc() call.
_PUNCTUATION = frozenset(u":<>{}/?=[]")


def nopunc(w):
    """Return True when *w* contains none of the markup characters.

    The rejected characters are ``: < > { } / ? = [ ]``.  An empty string
    contains none of them and therefore yields True.
    """
    return _PUNCTUATION.isdisjoint(w)


def _unique(items):
    """Return *items* as a list with duplicates removed, first occurrence kept."""
    seen = set()
    ordered = []
    for item in items:
        if item not in seen:
            seen.add(item)
            ordered.append(item)
    return ordered


def _page_words(text, link_re):
    """Return the set of plain words found in *text*.

    *text* is split on whitespace; a token is kept only when *link_re* does
    not match it and nopunc() accepts it.  Byte strings are decoded as UTF-8
    first; text that is already decoded is used as is, so the helper works
    whichever of the two snag() hands back.
    """
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    return set(w for w in text.split() if not link_re.match(w) and nopunc(w))


def main(argv=None):
    """Fetch two pages and print the words they share, one per line.

    *argv* is the list of query arguments and defaults to ``sys.argv[1:]``.
    The arguments are joined and re-split on whitespace, so a quoted
    ``"a b"`` counts as two queries.  Only the first two distinct queries
    are fetched and compared, in command-line order; any further ones are
    ignored.

    Returns 0 on success and 2 when fewer than two distinct queries were
    supplied (a usage message goes to stderr in that case).
    """
    if argv is None:
        argv = sys.argv[1:]
    queries = _unique(' '.join(argv).split())
    if len(queries) < 2:
        sys.stderr.write(
            "usage: need at least two distinct page queries to compare\n")
        return 2

    # Project-local imports are deferred until the arguments are known to be
    # usable, so importing this module (or a bad invocation) does not require
    # the wikipedia checkout.  ``snag`` is imported before the path tweak and
    # ``extract_names`` after it, matching the original import order.
    from snag import snag
    if WIKIPEDIA_REPO not in sys.path:
        sys.path.append(WIKIPEDIA_REPO)
    from extract_names import interwiki_re

    # NOTE(review): under Python 2, printing non-ASCII words to a pipe may
    # need the wrapper the original left disabled -- confirm before enabling:
    # sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

    first, second = [_page_words(snag(q), interwiki_re) for q in queries[:2]]
    for word in sorted(first & second):
        print(word)
    return 0


if __name__ == "__main__":
    sys.exit(main())