# encoding: utf-8
require 'rubygems'
require 'nokogiri'
require 'guess_html_encoding'
module Readability
class Document
# Baseline settings; #initialize merges caller-supplied options over these.
DEFAULT_OPTIONS = {
  # #content retries with looser settings while the article text is shorter than this.
  :retry_length               => 250,
  :min_text_length            => 25,
  # Initial values of the three heuristics that #content switches off, one per retry.
  :remove_unlikely_candidates => true,
  :weight_classes             => true,
  :clean_conditionally        => true,
  # When true, #sanitize removes <p> elements that have no text content.
  :remove_empty_nodes         => true,
  # Thresholds and excluded formats checked by #image_meets_criteria?.
  :min_image_width            => 130,
  :min_image_height           => 80,
  :ignore_image_format        => [],
  # Both are handed to handle_exclusions! by #initialize.
  :blacklist                  => nil,
  :whitelist                  => nil
}.freeze
# Patterns shared by the extraction heuristics. Names follow the original
# Readability.js regexps this class was ported from.
REGEXES = {
  # class/id hints that an element is probably not article content ...
  :unlikelyCandidatesRe   => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
  # ... unless the same class/id also matches one of these.
  :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
  # class/id fragments that raise (positive) or lower (negative) an element's weight.
  :positiveRe             => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
  :negativeRe             => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
  # Opening tags of block-level children.
  :divToPElementsRe       => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
  # Two or more consecutive <br> tags, optionally separated by whitespace.
  :replaceBrsRe           => /(<br[^>]*>[ \n\r\t]*){2,}/i,
  # Opening or closing <font> tag; group 1 captures the optional slash.
  :replaceFontsRe         => /<(\/?)font[^>]*>/i,
  :trimRe                 => /^\s+|\s+$/,
  :normalizeRe            => /\s{2,}/,
  # One or more <br> tags together with trailing whitespace / &nbsp; entities.
  :killBreaksRe           => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
  # NOTE(review): only matches http:// URLs; https:// video URLs are not recognised.
  :videoRe                => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
}
# Merged option hash built in #initialize.
attr_accessor :options
# Document that the css/xpath queries in this class run against
# (presumably populated by make_html - confirm against its definition).
attr_accessor :html
# Scoring state consumed by #content and #images.
attr_accessor :best_candidate
attr_accessor :candidates
# Set to true by #initialize and to false when #images is called with reload.
attr_accessor :best_candidate_has_image
def initialize(input, options = {})
@options = DEFAULT_OPTIONS.merge(options)
@input = input
if RUBY_VERSION =~ /^(1\.9|2)/ && !@options[:encoding]
@input = GuessHtmlEncoding.encode(@input, @options[:html_headers]) unless @options[:do_not_guess_encoding]
@options[:encoding] = @input.encoding.to_s
end
@input = @input.gsub(REGEXES[:replaceBrsRe], '
').gsub(REGEXES[:replaceFontsRe], '<\1span>') @remove_unlikely_candidates = @options[:remove_unlikely_candidates] @weight_classes = @options[:weight_classes] @clean_conditionally = @options[:clean_conditionally] @best_candidate_has_image = true make_html handle_exclusions!(@options[:whitelist], @options[:blacklist]) end def images(content=nil, reload=false) begin require 'fastimage' rescue LoadError raise "Please install fastimage in order to use the #images feature." end @best_candidate_has_image = false if reload prepare_candidates list_images = [] tested_images = [] content = @best_candidate[:elem] unless reload return list_images if content.nil? elements = content.css("img").map(&:attributes) elements.each do |element| next unless element["src"] url = element["src"].value height = element["height"].nil? ? 0 : element["height"].value.to_i width = element["width"].nil? ? 0 : element["width"].value.to_i if url =~ /\Ahttps?:\/\//i && (height.zero? || width.zero?) image = get_image_size(url) next unless image else image = {:width => width, :height => height} end image[:format] = File.extname(url).gsub(".", "") if tested_images.include?(url) debug("Image was tested: #{url}") next end tested_images.push(url) if image_meets_criteria?(image) list_images << url else debug("Image discarded: #{url} - height: #{image[:height]} - width: #{image[:width]} - format: #{image[:format]}") end end (list_images.empty? and content != @html) ? 
images(@html, true) : list_images end def images_with_fqdn_uris!(source_uri) images_with_fqdn_uris(@html, source_uri) end def images_with_fqdn_uris(document = @html.dup, source_uri) uri = URI.parse(source_uri) host = uri.host scheme = uri.scheme port = uri.port # defaults to 80 base = "#{scheme}://#{host}:#{port}/" images = [] document.css("img").each do |elem| begin elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil images << elem['src'].to_s rescue URI::InvalidURIError => exc elem.remove end end images(document,true) end def get_image_size(url) w, h = FastImage.size(url) raise "Couldn't get size." if w.nil? || h.nil? {:width => w, :height => h} rescue => e debug("Image error: #{e}") nil end def image_meets_criteria?(image) return false if options[:ignore_image_format].include?(image[:format].downcase) image[:width] >= (options[:min_image_width] || 0) && image[:height] >= (options[:min_image_height] || 0) end def title title = @html.css("title").first title ? title.text : nil end # Look through the @html document looking for the author # Precedence Information here on the wiki: (TODO attach wiki URL if it is accepted) # Returns nil if no author is detected def author # Let's grab this author: # author_elements = @html.xpath('//meta[@name = "dc.creator"]') unless author_elements.empty? author_elements.each do |element| return element['content'].strip if element['content'] end end # Now let's try to grab this #
# author_elements = @html.xpath('//*[contains(@class, "vcard")]//*[contains(@class, "fn")]') unless author_elements.empty? author_elements.each do |element| return element.text.strip if element.text end end # Now let's try to grab this # Danny Banks (rel) # TODO: strip out the (rel)? author_elements = @html.xpath('//a[@rel = "author"]') unless author_elements.empty? author_elements.each do |element| return element.text.strip if element.text end end author_elements = @html.xpath('//*[@id = "author"]') unless author_elements.empty? author_elements.each do |element| return element.text.strip if element.text end end end def content(remove_unlikely_candidates = :default) @remove_unlikely_candidates = false if remove_unlikely_candidates == false prepare_candidates article = get_article(@candidates, @best_candidate) cleaned_article = sanitize(article, @candidates, options) if article.text.strip.length < options[:retry_length] if @remove_unlikely_candidates @remove_unlikely_candidates = false elsif @weight_classes @weight_classes = false elsif @clean_conditionally @clean_conditionally = false else # nothing we can do return cleaned_article end make_html content else cleaned_article end end def get_article(candidates, best_candidate) # Now that we have the top candidate, look through its siblings for content that might also be related. # Things like preambles, content split by ads that we removed, etc. 
sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max output = Nokogiri::XML::Node.new('div', @html) best_candidate[:elem].parent.children.each do |sibling| append = false append = true if sibling == best_candidate[:elem] append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold if sibling.name.downcase == "p" link_density = get_link_density(sibling) node_content = sibling.text node_length = node_content.length append = if node_length > 80 && link_density < 0.25 true elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/ true end end if append sibling_dup = sibling.dup # otherwise the state of the document in processing will change, thus creating side effects sibling_dup.name = "div" unless %w[div p].include?(sibling.name.downcase) output << sibling_dup end end output end def select_best_candidate(candidates) sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] } debug("Top 5 candidates:") sorted_candidates[0...5].each do |candidate| debug("Candidate #{candidate[:elem].name}##{candidate[:elem][:id]}.#{candidate[:elem][:class]} with score #{candidate[:content_score]}") end best_candidate = sorted_candidates.first || { :elem => @html.css("body").first, :content_score => 0 } debug("Best candidate #{best_candidate[:elem].name}##{best_candidate[:elem][:id]}.#{best_candidate[:elem][:class]} with score #{best_candidate[:content_score]}") best_candidate end def get_link_density(elem) link_length = elem.css("a").map(&:text).join("").length text_length = elem.text.length link_length / text_length.to_f end def class_weight(e) weight = 0 return weight unless @weight_classes if e[:class] && e[:class] != "" weight -= 25 if e[:class] =~ REGEXES[:negativeRe] weight += 25 if e[:class] =~ REGEXES[:positiveRe] end if e[:id] && e[:id] != "" weight -= 25 if e[:id] =~ REGEXES[:negativeRe] weight += 25 if e[:id] =~ REGEXES[:positiveRe] end weight end ELEMENT_SCORES = { 
'div' => 5, 'blockquote' => 3, 'form' => -3, 'th' => -5 }.freeze def score_node(elem) content_score = class_weight(elem) content_score += ELEMENT_SCORES.fetch(elem.name.downcase, 0) { :content_score => content_score, :elem => elem } end def debug(str) puts str if options[:debug] end def sanitize(node, candidates, options = {}) node.css("h1, h2, h3, h4, h5, h6").each do |header| header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33 end node.css("form, object, iframe, embed").each do |elem| elem.remove end if @options[:remove_empty_nodes] # removetags that have no text content - this will also remove p tags that contain only images. node.css("p").each do |elem| elem.remove if elem.content.strip.empty? end end # Conditionally clean