#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
We wish to extract a list of translated sentences from two Wikipedia articles.

1. Spider the articles
2. Extract sentences
3. Extract links
4. Represent the sentences as sets of bilingual links
5. Return pairs of sentences ordered by rank intersection
"""

import re


def spider_article(language_code, title):
    """Placeholder for step 1 (spidering an article).

    Not implemented yet: does nothing and returns None.
    """
    pass


class Sentence:
    """A piece of wikitext together with the set of link titles found in it.

    Attributes:
        text: the raw wikitext passed to the constructor.
        links: set of link titles returned by extract_wikitext_links(text).
    """

    def __init__(self, text):
        self.text = text
        # A set, so that sentences can be compared by set intersection.
        self.links = set(extract_wikitext_links(self.text))


def strip_info_boxes(wikitext):
    """Return wikitext with templates written {{like this}} removed.

    Only templates whose body contains no '}' are matched, so a template
    that contains another template is not removed as a whole.

    >>> strip_info_boxes("a{{x}}b")
    'ab'
    """
    # Braces are escaped so they read unambiguously as literal characters.
    return re.sub(r'\{\{[^}]+\}\}', '', wikitext)


class Article:
    """Wikitext of an article, stripped of templates and cut into sentences.

    Attributes:
        text: the wikitext after strip_info_boxes() has been applied.
        sentences: list of Sentence objects, one per chunk returned by
            split_into_sentences(text).
    """

    def __init__(self, text):
        self.text = strip_info_boxes(text)
        self.sentences = [Sentence(sentence)
                          for sentence in split_into_sentences(self.text)]


def split_into_sentences(markup):
    """Split markup on every '.' and return the resulting list of strings.

    The chunks are returned untrimmed, and empty chunks are kept.
    """
    # @@TODO replace this naive split with real sentence segmentation
    sentences = markup.split('.')
    return sentences


# Raw string: the pattern contains backslash escapes (\[ \] \| \:) that are
# not valid Python string escapes and must reach the regex engine untouched.
link_pattern = r"""\[\[ # [[
( # title
[^\]\|\:]+ # several of anything but ]|:
) # end title
\|? # bar?
( # anchor?
[^\]\[\|\:]+
)? # end anchor?
\]\] # ]]
"""
linkRE = re.compile(link_pattern, re.VERBOSE)


def extract_wikitext_links(sentence):
    """Return the titles of the [[title]] / [[title|anchor]] links in sentence.

    Titles are returned in order of appearance, duplicates included. Links
    whose target contains ':' are not matched by the pattern.

    >>> extract_wikitext_links("[[Paris|the capital]] of [[France]]")
    ['Paris', 'France']
    """
    # findall() yields (title, anchor) tuples; only the title is kept.
    return [match[0] for match in linkRE.findall(sentence)]
    # @@TODO handle right to left languages


def number_of_links_in_common(source, target):
    """Return how many link titles source and target share.

    Both arguments must expose a ``links`` set, as Sentence does.

    >>> number_of_links_in_common(Sentence("[[A]] [[B]]"), Sentence("[[B]] [[C]]"))
    1
    """
    # set has no ``intersect`` method; ``intersection`` is the correct name.
    return len(source.links.intersection(target.links))


def extract_links(sentence):
    """Placeholder; not implemented yet. Does nothing and returns None."""
    pass


def enrich_links(links):
    """Placeholder; not implemented yet. Does nothing and returns None."""
    pass


def compare_sentences(source, target):
    """Placeholder; not implemented yet. Does nothing and returns None."""
    pass


def _test():
    """Run the doctests contained in this module."""
    import doctest
    doctest.testmod()


if __name__ == "__main__":
    _test()