#!/usr/bin/env ruby
# -*- coding: utf-8 -*-
#
# Fetches a Wikipedia article through the Special:Export endpoint and prints
# its language code, title, raw text, and the text split into sentence-like
# fragments.

require 'cgi'
require 'rubygems'
require 'hpricot'
require 'open-uri'

# Converts an article title into the path segment used in the export URL.
#
# The title is CGI-escaped; the "+" characters that CGI.escape emits for
# spaces are then turned into underscores.
#
# @param title [String] article title, e.g. "New York"
# @return [String] escaped path segment, e.g. "New_York"
def wikipedia_article_path(title)
  CGI.escape(title).tr('+', '_')
end

# Builds the Special:Export URL for an article.
#
# @param language [String] Wikipedia subdomain, e.g. "en"
# @param title [String] article title
# @return [String] the export URL
def wiki_url(language, title)
  format('http://%s.wikipedia.org/wiki/Special:Export/%s',
         language, wikipedia_article_path(title))
end

# Downloads the export document for an article and returns the inner HTML of
# its "text" elements.
#
# @param code [String] Wikipedia subdomain, e.g. "en"
# @param title [String] article title
# @return [String] inner HTML of the matched "text" elements
def spider_wikipedia_text(code, title)
  url = wiki_url(code, title)
  # URI#read (mixed in by open-uri) is used instead of Kernel#open: passing a
  # URL to Kernel#open is no longer supported on Ruby 3.0+, and #read also
  # closes the underlying stream once the body has been read.
  xml = URI.parse(url).read('User-Agent' => 'Innocentrobot')
  Hpricot(xml).search('text').inner_html
end

# Adds a naive sentence splitter to String.
class String
  # Splits the string after every period.
  #
  # Each fragment keeps its terminating "." (a trailing fragment without a
  # period is returned as-is), so joining the result reproduces the original
  # string. Whitespace is left untouched.
  #
  # @return [Array<String>] sentence-like fragments; empty for an empty string
  def sentences
    scan(/[^.]*\.|[^.]+\z/)
  end
end

# An article fetched from Wikipedia together with its sentence fragments.
class WikipediaArticle
  attr_accessor :code, :title, :text, :sentences

  # Downloads the article text immediately and splits it into sentences.
  #
  # @param code [String] Wikipedia subdomain, e.g. "en"
  # @param title [String] article title
  def initialize(code, title)
    @code = code
    @title = title
    @text = spider_wikipedia_text(@code, @title)
    @sentences = @text.sentences
  end
end

en = WikipediaArticle.new('en', 'House')
puts en.code, en.title
puts en.text
en.sentences.each { |sent| puts sent }