#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Fetch a Wikipedia article together with its counterpart in another
language, and compare sentences by the wiki links they contain.
"""

import re
import sys
import urllib2
from collections import defaultdict

from BeautifulSoup import BeautifulStoneSoup

from languages import names2codes, codes2names


def wikiurl(code, topic):
    """Return the Special:Export URL of ``topic`` on the ``code`` Wikipedia.

    ``topic`` may be a byte string or a unicode string.  The result is a
    unicode string.
    """
    # quote() must see bytes: handed a non-ASCII unicode title it raises
    # KeyError on Python 2.  Encode first, then quote; the quoted text is
    # plain ASCII and interpolates safely into the unicode template.
    if isinstance(topic, unicode):
        topic = topic.encode('utf-8')
    quoted = urllib2.quote(topic)
    return u"http://%s.wikipedia.org/wiki/Special:Export/%s" % (code, quoted)


def download_article(code, topic, userAgent="innocentrobot",
                     dump_path='/tmp/foo.txt'):
    """Download the export XML of ``topic`` from the ``code`` Wikipedia.

    ``userAgent`` is sent as the User-agent header.  The raw response is
    also written to ``dump_path`` as a debugging aid; pass ``None`` (or an
    empty string) to skip the dump.  Returns the response body.
    """
    url = wikiurl(code, topic)
    req = urllib2.Request(url, None, {'User-agent': userAgent})
    response = urllib2.urlopen(req)
    try:
        wikitext = response.read()
    finally:
        # urllib2 responses are not context managers; close explicitly.
        response.close()
    if dump_path:
        with open(dump_path, 'w') as dump:
            dump.write(wikitext)
    return wikitext


def extract_target_topic(code, wikitext):
    """Return the title named by the first ``[[code:Title]]`` link.

    Exits the program if ``code`` is not a key of ``codes2names``.
    Raises ValueError if ``wikitext`` holds no link for ``code``.
    """
    # A membership test never raises KeyError, so it has to be checked
    # directly for the unknown-code branch to be reachable.
    if code not in codes2names:
        sys.stderr.write('language code for pattern unknown.\n')
        sys.exit(1)
    # Escape the code so characters such as '-' are matched literally.
    linkRE = re.compile(r'''\[\[ %s: ([^\]]+) \]\]''' % re.escape(code),
                        re.VERBOSE)
    match = linkRE.search(wikitext)
    if match is None:
        raise ValueError('no [[%s:...]] link found in article' % code)
    return match.group(1)


def download_article_pair(sourcecode, targetcode, sourcetopic):
    """given a request like 'en it Italy', get the en:Italy article;
    from that discover and then download it:Italia

    Returns a dict mapping each language code to its downloaded article.
    """
    sourcearticle = download_article(sourcecode, sourcetopic)
    targettopic = extract_target_topic(targetcode, sourcearticle)
    targetarticle = download_article(targetcode, targettopic)
    return {sourcecode: sourcearticle, targetcode: targetarticle}


# 5. Extract sentences from both pages.

_TEMPLATE_RE = re.compile(r'\{\{[^}]+\}\}')


def strip_info_boxes(wikitext):
    """infoboxes are templates in wikitext {{like this}}

    Returns ``wikitext`` with every such template removed.
    """
    # One pass only removes innermost templates ({{a {{b}} c}} would leave
    # {{a  c}} behind), so repeat until nothing changes.  Each pass that
    # matches shortens the text, so the loop terminates.
    previous = None
    while previous != wikitext:
        previous = wikitext
        wikitext = _TEMPLATE_RE.sub('', wikitext)
    return wikitext


def extract_text_from_article(article):
    """Return the rendered contents of the page/revision/text element of
    an export XML document."""
    soup = BeautifulStoneSoup(article)
    # NOTE(review): find('text') is used instead of attribute access because
    # later BeautifulSoup 3 releases appear to define Tag.text as a
    # text-extracting property rather than a child lookup -- confirm against
    # the installed version.
    return soup.page.revision.find('text').renderContents()


_SENTENCE_END_RE = re.compile(r'([\.\!\?])\s')


def sentences(text):
    """Split ``text`` after '.', '!' or '?' followed by whitespace.

    Because the punctuation is a capturing group, each terminator comes
    back as its own list element between the pieces it separated.
    """
    # @@TODO desuckulate
    return _SENTENCE_END_RE.split(text)


_WIKILINK_RE = re.compile(r"""
    \[\[
    (([ \w\(\)]+)\|)?   # optional target-plus-pipe prefix of a piped link
    ([ \w]+)            # label, or the target itself when there is no pipe
    \]\]""", re.VERBOSE)


def extract_links(sentence):
    """Return the set of link targets found in ``sentence``.

    For a piped link ``[[target|label]]`` the target is used; for a plain
    ``[[target]]`` link the bracketed text is used.
    """
    found = _WIKILINK_RE.findall(sentence)
    # findall yields '' for the optional prefix group when it did not match.
    return set(target if piped else label for piped, target, label in found)


def annotate_sentences_with_links(sentence):
    """Map each piece of ``sentence`` to the set of links it contains.

    Despite its name, ``sentence`` is the text to be split with
    ``sentences()``; the name is kept for compatibility.  Every element
    returned by ``sentences()`` becomes a key, including the bare
    punctuation elements.
    """
    links = {}
    for piece in sentences(sentence):
        links[piece] = extract_links(piece)
    return links


#def remove_wikitext_quoting(wikitext):
#    """wikipedia markup uses the notation '''bold''' and ''italic'',
#    which conflicts with Python's commenting syntax, just nuke it"""
#    multiplequoteRE = re.compile(('""+'|"''+")
#    multiplequoteRE.sub(

anotherlinkRE = re.compile(r''' \[\[ ([^\]]+) \]\] ''', re.VERBOSE)


def compare_sentences(source, target):
    """Score every sentence in ``source`` against every one in ``target``.

    Both arguments are iterables of LinkedSentence objects.  Returns a
    defaultdict mapping each source sentence to the list of its
    ``similarity`` scores, in ``target`` order.
    """
    target = list(target)  # iterated once per source sentence
    ranks = defaultdict(list)
    for s in source:
        for t in target:
            ranks[s].append(similarity(s, t))
    return ranks


class LinkedSentence:
    """a sentence from Wikipedia together with a list of its links"""

    def __init__(self, sentence):
        self.sentence = sentence
        self.links = extract_links(self.sentence)

    def similarity(self, target):
        """Return the number of links shared with ``target``."""
        # Delegates to the module-level function so that both call styles,
        # similarity(a, b) and a.similarity(b), are available.
        return similarity(self, target)


def similarity(source, target):
    """Return how many links two LinkedSentence objects have in common."""
    return len(source.links.intersection(target.links))


wikisentence = "Here is a typical sentence in Wikipedia with a [[link]] and [[another|some other link]]."
wikisentence2 = "Here is a different one with a [[link]] and [[Bruce Lee]]."
wikisentence3 = "[[Bruce Lee]] was never in a film with [[Bambi]]."

linked = LinkedSentence(wikisentence)
linked2 = LinkedSentence(wikisentence2)
linked3 = LinkedSentence(wikisentence3)

samples = linked, linked2, linked3

#terminal_punctuationRE = re.compile(u"""[,,﹐ ، ߸ ᠂ ᠈ 、﹑、 ;;;﹔ ؛ \::﹕ ։ ܃-܈ ፡ ፣-፦ ᠄ ᠅ ៖ ᭝ ᛫-᛭ !!﹗ ‼ ⁉ ߹ ᥄ ??﹖ ⁈ ⁇ ؟ ܉ ፧ ᥅ ‽ ..﹒ ۔ ܁ ܂ ። ᠃ ᠉ ᙮ 。。 · । ॥ ꡶ ꡷ ၊ ။ ។ ៕ ᭞ ᭟ ܀ ፨ ᭚ ᭛ 𐎟 𐏐 𐤟 𒑰-𒑳 ׃ ܊ ܌ ๚ ๛ ༈ །-༒ ៚ ᙭]""")

# Disabled command-line driver, kept in a string literal so it does not run.
# NOTE(review): the arguments are forwarded positionally, so the working
# invocation is "<sourcecode> <targetcode> <topic>" (e.g. "en it Italy");
# the local names `topic` and `targetcode` below are swapped relative to
# what they actually hold.
"""
if __name__ == "__main__":
    (sourcecode, topic, targetcode) = sys.argv[1:4]
    for x in (sourcecode, topic, targetcode):
        print x
    pair = download_article_pair(sourcecode, topic, targetcode)
    print pair.keys()
    for v in pair.values():
        print v[:1000]
"""