#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Print sentence pairs from two Wikipedia articles that share wiki links.

Usage:
    script SOURCE_CODE:SOURCE_TITLE TARGET_CODE:TARGET_TITLE

Each argument is a Wikipedia language code and an article title joined by a
colon (for example ``en:Python``).  Both articles are downloaded, wrapped in
``wpalign.Article`` and every source/target sentence pair sharing more than
a fixed number of links is printed.
"""

import codecs
import sys
from contextlib import closing

try:  # Python 2
    from urllib2 import Request, quote, urlopen
except ImportError:  # Python 3 moved these names
    from urllib.parse import quote
    from urllib.request import Request, urlopen


def wikiurl(code, topic):
    """Return the Special:Export URL of article *topic* on Wikipedia *code*.

    *code* is the language subdomain (``"en"``, ``"de"``, ...).  *topic* is
    the article title as text; a UTF-8 byte string is accepted as well.
    """
    # Encode BEFORE quoting: quote() works on bytes, and on Python 2 it
    # raises KeyError when handed a unicode string with non-ASCII characters.
    if not isinstance(topic, bytes):
        topic = topic.encode('utf-8')
    quoted = quote(topic)
    return u"http://%s.wikipedia.org/wiki/Special:Export/%s" % (code, quoted)


def download_article(code, topic, userAgent="innocentrobot"):
    """Fetch the export page for *topic* and return its body as text.

    The request is sent with *userAgent* as the ``User-agent`` header and
    the response body is decoded as UTF-8.  Errors raised by ``urlopen``
    propagate to the caller.
    """
    url = wikiurl(code, topic)
    req = Request(url, None, {'User-agent': userAgent})
    # closing() guarantees the connection is released even if read() fails
    # (Python 2 responses are not context managers themselves).
    with closing(urlopen(req)) as response:
        wikitext = response.read()
    return wikitext.decode('utf-8')


def _parse_article_spec(spec):
    """Split a ``CODE:TITLE`` argument into ``(code, title)`` text values.

    Only the first colon separates the two parts, so titles that contain
    colons themselves are kept intact.  Raises ValueError when *spec* has
    no colon at all.
    """
    if isinstance(spec, bytes):  # Python 2 argv entries are byte strings
        spec = spec.decode('utf-8')
    code, sep, title = spec.partition(u':')
    if not sep:
        raise ValueError("expected CODE:TITLE, got %r" % (spec,))
    return code, title


def _find_similar(source, target, threshold=2):
    """Return ``(shared_link_count, s, t)`` for sufficiently similar pairs.

    Every sentence of *source* is compared with every sentence of *target*;
    a pair is kept when the two ``links`` sets have strictly more than
    *threshold* members in common.
    """
    similar = []
    for s in source.sentences:
        for t in target.sentences:
            common = len(s.links.intersection(t.links))
            if common > threshold:
                similar.append((common, s, t))
    return similar


def main(argv=None):
    """Run the command-line tool; *argv* defaults to ``sys.argv``."""
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        sys.exit("usage: %s SOURCE_CODE:TITLE TARGET_CODE:TITLE" % argv[0])

    # Imported here so the URL/download helpers above stay importable
    # without the wpalign package being installed.
    import wpalign

    if sys.version_info[0] < 3:
        # Python 2's stdout is a byte stream: wrap it so unicode text is
        # written as UTF-8 regardless of the terminal/pipe encoding.
        sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

    source_code, source_title = _parse_article_spec(argv[1])
    target_code, target_title = _parse_article_spec(argv[2])

    source = wpalign.Article(download_article(source_code, source_title))
    target = wpalign.Article(download_article(target_code, target_title))

    similar = _find_similar(source, target)

    # Raw list of matches first, then the readable report.
    print(similar)
    # Sort on the shared-link count only: comparing whole tuples would fall
    # back to comparing sentence objects whenever two counts tie.
    for common, s, t in sorted(similar, key=lambda match: match[0]):
        print(s.text)
        print('')
        print(t.text)
        print('\n-------\n')


if __name__ == "__main__":
    main()