(\s| ?)*){1,}/, + :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i + } + + attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image + + def initialize(input, options = {}) + @options = DEFAULT_OPTIONS.merge(options) + @input = input + + if RUBY_VERSION =~ /^(1\.9|2)/ && !@options[:encoding] + @input = GuessHtmlEncoding.encode(@input, @options[:html_headers]) unless @options[:do_not_guess_encoding] + @options[:encoding] = @input.encoding.to_s + end + + @input = @input.gsub(REGEXES[:replaceBrsRe], '

').gsub(REGEXES[:replaceFontsRe], '<\1span>') + @remove_unlikely_candidates = @options[:remove_unlikely_candidates] + @weight_classes = @options[:weight_classes] + @clean_conditionally = @options[:clean_conditionally] + @best_candidate_has_image = true + make_html + handle_exclusions!(@options[:whitelist], @options[:blacklist]) + end + + def images(content=nil, reload=false) + begin + require 'fastimage' + rescue LoadError + raise "Please install fastimage in order to use the #images feature." + end + + @best_candidate_has_image = false if reload + + prepare_candidates + list_images = [] + tested_images = [] + content = @best_candidate[:elem] unless reload + + return list_images if content.nil? + elements = content.css("img").map(&:attributes) + + elements.each do |element| + next unless element["src"] + + url = element["src"].value + height = element["height"].nil? ? 0 : element["height"].value.to_i + width = element["width"].nil? ? 0 : element["width"].value.to_i + + if url =~ /\Ahttps?:\/\//i && (height.zero? || width.zero?) + image = get_image_size(url) + next unless image + else + image = {:width => width, :height => height} + end + + image[:format] = File.extname(url).gsub(".", "") + + if tested_images.include?(url) + debug("Image was tested: #{url}") + next + end + + tested_images.push(url) + if image_meets_criteria?(image) + list_images << url + else + debug("Image discarded: #{url} - height: #{image[:height]} - width: #{image[:width]} - format: #{image[:format]}") + end + end + + (list_images.empty? and content != @html) ? images(@html, true) : list_images + end + + def images_with_fqdn_uris!(source_uri) + images_with_fqdn_uris(@html, source_uri) + end + + def images_with_fqdn_uris(document = @html.dup, source_uri) + uri = URI.parse(source_uri) + host = uri.host + scheme = uri.scheme + port = uri.port # defaults to 80 + + base = "#{scheme}://#{host}:#{port}/" + + images = [] + document.css("img").each do |elem| + begin + elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil + images << elem['src'].to_s + rescue URI::InvalidURIError => exc + elem.remove + end + end + + images(document,true) + end + + def get_image_size(url) + w, h = FastImage.size(url) + raise "Couldn't get size." if w.nil? || h.nil? + {:width => w, :height => h} + rescue => e + debug("Image error: #{e}") + nil + end + + def image_meets_criteria?(image) + return false if options[:ignore_image_format].include?(image[:format].downcase) + image[:width] >= (options[:min_image_width] || 0) && image[:height] >= (options[:min_image_height] || 0) + end + + def title + title = @html.css("title").first + title ? title.text : nil + end + + # Look through the @html document looking for the author + # Precedence Information here on the wiki: (TODO attach wiki URL if it is accepted) + # Returns nil if no author is detected + def author + # Let's grab this author: + # + author_elements = @html.xpath('//meta[@name = "dc.creator"]') + unless author_elements.empty? + author_elements.each do |element| + return element['content'].strip if element['content'] + end + end + + # Now let's try to grab this + # ByAustin Fonacier + #

+ author_elements = @html.xpath('//*[contains(@class, "vcard")]//*[contains(@class, "fn")]') + unless author_elements.empty? + author_elements.each do |element| + return element.text.strip if element.text + end + end + + # Now let's try to grab this + # Danny Banks (rel) + # TODO: strip out the (rel)? + author_elements = @html.xpath('//a[@rel = "author"]') + unless author_elements.empty? + author_elements.each do |element| + return element.text.strip if element.text + end + end + + author_elements = @html.xpath('//*[@id = "author"]') + unless author_elements.empty? + author_elements.each do |element| + return element.text.strip if element.text + end + end + end + + def content(remove_unlikely_candidates = :default) + @remove_unlikely_candidates = false if remove_unlikely_candidates == false + + prepare_candidates + article = get_article(@candidates, @best_candidate) + + cleaned_article = sanitize(article, @candidates, options) + if article.text.strip.length < options[:retry_length] + if @remove_unlikely_candidates + @remove_unlikely_candidates = false + elsif @weight_classes + @weight_classes = false + elsif @clean_conditionally + @clean_conditionally = false + else + # nothing we can do + return cleaned_article + end + + make_html + content + else + cleaned_article + end + end + + def get_article(candidates, best_candidate) + # Now that we have the top candidate, look through its siblings for content that might also be related. + # Things like preambles, content split by ads that we removed, etc. + + sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max + output = Nokogiri::XML::Node.new('div', @html) + best_candidate[:elem].parent.children.each do |sibling| + append = false + append = true if sibling == best_candidate[:elem] + append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold + + if sibling.name.downcase == "p" + link_density = get_link_density(sibling) + node_content = sibling.text + node_length = node_content.length + + append = if node_length > 80 && link_density < 0.25 + true + elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/ + true + end + end + + if append + sibling_dup = sibling.dup # otherwise the state of the document in processing will change, thus creating side effects + sibling_dup.name = "div" unless %w[div p].include?(sibling.name.downcase) + output << sibling_dup + end + end + + output + end + + def select_best_candidate(candidates) + sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] } + + debug("Top 5 candidates:") + sorted_candidates[0...5].each do |candidate| + debug("Candidate #{candidate[:elem].name}##{candidate[:elem][:id]}.#{candidate[:elem][:class]} with score #{candidate[:content_score]}") + end + + best_candidate = sorted_candidates.first || { :elem => @html.css("body").first, :content_score => 0 } + debug("Best candidate #{best_candidate[:elem].name}##{best_candidate[:elem][:id]}.#{best_candidate[:elem][:class]} with score #{best_candidate[:content_score]}") + + best_candidate + end + + def get_link_density(elem) + link_length = elem.css("a").map(&:text).join("").length + text_length = elem.text.length + link_length / text_length.to_f + end + + def class_weight(e) + weight = 0 + return weight unless @weight_classes + + if e[:class] && e[:class] != "" + weight -= 25 if e[:class] =~ REGEXES[:negativeRe] + weight += 25 if e[:class] =~ REGEXES[:positiveRe] + end + + if e[:id] && e[:id] != "" + weight -= 25 if e[:id] =~ REGEXES[:negativeRe] + weight += 25 if e[:id] =~ REGEXES[:positiveRe] + end + + weight + end + + ELEMENT_SCORES = { + 'div' => 5, + 'blockquote' => 3, + 'form' => -3, + 'th' => -5 + }.freeze + + def score_node(elem) + content_score = class_weight(elem) + content_score += ELEMENT_SCORES.fetch(elem.name.downcase, 0) + { :content_score => content_score, :elem => elem } + end + + def debug(str) + puts str if options[:debug] + end + + def sanitize(node, candidates, options = {}) + node.css("h1, h2, h3, h4, h5, h6").each do |header| + header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33 + end + + node.css("form, object, iframe, embed").each do |elem| + elem.remove + end + + if @options[:remove_empty_nodes] + # remove

tags that have no text content - this will also remove p tags that contain only images. + node.css("p").each do |elem| + elem.remove if elem.content.strip.empty? + end + end + + # Conditionally clean s,
s, and
s + clean_conditionally(node, candidates, "table, ul, div") + + # We'll sanitize all elements using a whitelist + base_whitelist = @options[:tags] || %w[div p] + # We'll add whitespace instead of block elements, + # so a
b will have a nice space between them + base_replace_with_whitespace = %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center] + + # Use a hash for speed (don't want to make a million calls to include?) + whitelist = Hash.new + base_whitelist.each {|tag| whitelist[tag] = true } + replace_with_whitespace = Hash.new + base_replace_with_whitespace.each { |tag| replace_with_whitespace[tag] = true } + + ([node] + node.css("*")).each do |el| + # If element is in whitelist, delete all its attributes + if whitelist[el.node_name] + el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) } + + # Otherwise, replace the element with its contents + else + # If element is root, replace the node as a text node + if el.parent.nil? + node = Nokogiri::XML::Text.new(el.text, el.document) + break + else + if replace_with_whitespace[el.node_name] + el.swap(Nokogiri::XML::Text.new(' ' << el.text << ' ', el.document)) + else + el.swap(Nokogiri::XML::Text.new(el.text, el.document)) + end + end + end + + end + + s = Nokogiri::XML::Node::SaveOptions + save_opts = s::NO_DECLARATION | s::NO_EMPTY_TAGS | s::AS_XHTML + html = node.serialize(:save_with => save_opts) + + # Get rid of duplicate whitespace + return html.gsub(/[\r\n\f]+/, "\n" ) + end + + def clean_conditionally(node, candidates, selector) + return unless @clean_conditionally + node.css(selector).each do |el| + weight = class_weight(el) + content_score = candidates[el] ? candidates[el][:content_score] : 0 + name = el.name.downcase + + if weight + content_score < 0 + el.remove + debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.") + elsif el.text.count(",") < 10 + counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m } + counts["li"] -= 100 + + # For every img under a noscript tag discount one from the count to avoid double counting + counts["img"] -= el.css("noscript").css("img").length + + content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace + link_density = get_link_density(el) + + reason = clean_conditionally_reason?(name, counts, content_length, options, weight, link_density) + if reason + debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.") + el.remove + end + end + end + end + + def clean_conditionally_reason?(name, counts, content_length, options, weight, link_density) + if (counts["img"] > counts["p"]) && (counts["img"] > 1) + "too many images" + elsif counts["li"] > counts["p"] && name != "ul" && name != "ol" + "more
s than
s" + elsif counts["input"] > (counts["p"] / 3).to_i + "less than 3x
s than s" + elsif (content_length < options[:min_text_length]) && (counts["img"] != 1) + "too short a content length without a single image" + elsif weight < 25 && link_density > 0.2 + "too many links for its weight (#{weight})" + elsif weight >= 25 && link_density > 0.5 + "too many links for its weight (#{weight})" + elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1 + "s with too short a content length, or too many s" + else + nil + end + end + + private + + # 제거항목 추가항목을 지정한다. + def handle_exclusions!(whitelist, blacklist) + return unless whitelist || blacklist + + if blacklist + elems = @html.css(blacklist) + if elems + elems.each do |e| + e.remove + end + end + end + + if whitelist + elems = @html.css(whitelist).to_s + + if body = @html.at_css('body') + body.inner_html = elems + end + end + + @input = @html.to_s + end + + # 코멘트가 제거된 기본 html 노드 반환 + def make_html(whitelist=nil, blacklist=nil) + @html = Nokogiri::HTML(@input, nil, @options[:encoding]) + # In case document has no body, such as from empty string or redirect + @html = Nokogiri::HTML('', nil, @options[:encoding]) if @html.css('body').length == 0 + # Remove html comment tags + @html.xpath('//comment()').each { |i| i.remove } + end + + + def prepare_candidates + @html.css("script, style").each { |i| i.remove } + remove_unlikely_candidates! if @remove_unlikely_candidates + transform_misused_divs_into_paragraphs! + + @candidates = score_paragraphs(options[:min_text_length]) + @best_candidate = select_best_candidate(@candidates) + end + + # 가망없는 후보자를 제거한다. (명확한 후보자는 제외하고 제거한다.) + def remove_unlikely_candidates! + @html.css("*").each do |elem| + str = "#{elem[:class]}#{elem[:id]}" + if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe] && (elem.name.downcase != 'html') && (elem.name.downcase != 'body') + debug("Removing unlikely candidate - #{str}") + elem.remove + end + end + end + + # 잘못 사용되고 있는 DIV를 p로 변환한다. + def transform_misused_divs_into_paragraphs! + @html.css("*").each do |elem| + if elem.name.downcase == "div" + # transform
s that do not contain other block elements into
s + if elem.inner_html !~ REGEXES[:divToPElementsRe] + debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p"); + elem.name = "p" + end + else + # wrap text nodes in p tags +# elem.children.each do |child| +# if child.text? +# debug("wrapping text node with a p") +# child.swap("
#{child.text}
") +# end +# end + end + end + end + + # 가능노드에 점수를 매긴다. + def score_paragraphs(min_text_length) + candidates = {} + @html.css("p,td").each do |elem| + parent_node = elem.parent + grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil + inner_text = elem.text + + # If this paragraph is less than 25 characters, don't even count it. + next if inner_text.length < min_text_length + + candidates[parent_node] ||= score_node(parent_node) + candidates[grand_parent_node] ||= score_node(grand_parent_node) if grand_parent_node + + content_score = 1 + content_score += inner_text.split(',').length + content_score += [(inner_text.length / 100).to_i, 3].min + + candidates[parent_node][:content_score] += content_score + candidates[grand_parent_node][:content_score] += content_score / 2.0 if grand_parent_node + end + + # Scale the final candidates score based on link density. Good content should have a + # relatively small link density (5% or less) and be mostly unaffected by this operation. + candidates.each do |elem, candidate| + candidate[:content_score] = candidate[:content_score] * (1 - get_link_density(elem)) + end + + candidates + end + end +end diff --git a/lib/title_finder.ex b/lib/title_finder.ex new file mode 100644 index 0000000..8a1305f --- /dev/null +++ b/lib/title_finder.ex @@ -0,0 +1,64 @@ +defmodule Readability.TitleFinder do + @moduledoc """ + The TitleFinder engine traverse the HTML tree searching for finding title. + """ + + @title_suffix ~r/(\-)|(\:\:)|(\|)/ + @h_tag_selector "h1, h2, h3" + + @type html_tree :: tuple | list + + def title(html_tree) do + maybe_title = tag_title(html_tree) + if length(String.split(maybe_title, " ")) <= 4 do + maybe_title = og_title(html_tree) + end + maybe_title || h_tag_title(html_tree) + end + + @doc """ + Find title from title tag + """ + + @spec tag_title(html_tree) :: binary + + def tag_title(html_tree) do + html_tree + |> Floki.find("title") + |> to_clean_text + end + + @doc """ + Find title from og:title property of meta tag + """ + + @spec og_title(html_tree) :: binary + + def og_title(html_tree) do + html_tree + |> Floki.find("meta[property=og:title]") + |> Floki.attribute("content") + |> to_clean_text + end + + @doc """ + Find title from h tag + """ + + @spec h_tag_title(html_tree, String.t) :: binary + + def h_tag_title(html_tree, selector \\@h_tag_selector) do + html_tree + |> Floki.find(selector) + |> hd + |> to_clean_text + end + + defp to_clean_text(html_tree) do + title_text = html_tree + |> Floki.text + |> String.split(@title_suffix) + |> hd + |> String.strip + end +end diff --git a/mix.exs b/mix.exs new file mode 100644 index 0000000..2720fad --- /dev/null +++ b/mix.exs @@ -0,0 +1,34 @@ +defmodule Readability.Mixfile do + use Mix.Project + + def project do + [app: :readability, + version: "0.0.1", + elixir: "~> 1.2", + build_embedded: Mix.env == :prod, + start_permanent: Mix.env == :prod, + deps: deps] + end + + # Configuration for the OTP application + # + # Type "mix help compile.app" for more information + def application do + [applications: [:logger, + :floki + ]] + end + + # Dependencies can be Hex packages: + # + # {:mydep, "~> 0.3.0"} + # + # Or git/path repositories: + # + # {:mydep, git: "https://github.com/elixir-lang/mydep.git", tag: "0.1.0"} + # + # Type "mix help deps" for more examples and options + defp deps do + [{:floki, "~> 0.8.0"}] + end +end diff --git a/mix.lock b/mix.lock new file mode 100644 index 0000000..7874674 --- /dev/null +++ b/mix.lock @@ -0,0 +1,2 @@ +%{"floki": {:hex, :floki, "0.8.0"}, + "mochiweb_html": {:hex, :mochiweb_html, "2.13.0"}} diff --git a/test/features/nytimes.html b/test/features/nytimes.html new file mode 100644 index 0000000..a46d649 --- /dev/null +++ b/test/features/nytimes.html @@ -0,0 +1,1198 @@ + + + + + + + + + + + +Pence questions Obama at House GOP conference - washingtonpost.com + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+

Change Preferences | Sign Out

Sign In | Register Now

TODAY'S NEWSPAPER
Subscribe | PostPoints

+ + + +

SEARCH:

| Search Archives

+washingtonpost.com + > Politics +

+ + + +

+ +

+ + + + + +

Pence questions Obama at House GOP conference

+ +

+ + + + + + + + + + + + +

+ + +

Discussion Policy

+ +

+ Comments that include profanity or personal attacks or other inappropriate comments or material will be removed from the site. Additionally, entries that are unsigned or contain "signatures" by someone other than the actual author will be removed. Finally, we will take steps to block users who violate any of our posting standards, terms of use or privacy policies or any other policies governing this site. Please review the full rules governing commentaries and discussions. You are fully responsible for the content that you post. +

+ +

+ + + +

+ +

+ + +

+ +Friday, January 29, 2010; 1:47 PM +

+ +

+After addressing the GOP House Issues Conference in Baltimore on Friday, President Obama took a series questions from the lawmakers. Here is a transcript of one of the questions posed to the president: +

REP. MIKE PENCE (R-Ind.): We are pleased to have you return (inaudible) a year ago. House Republicans said then we would make you two promises. Number one, that most people in this room and their families would pray for you and your beautiful family just about every day for the four years. I want to assure you we're keeping that promise.

+OBAMA: I appreciate that. +

+PENCE: Number two, (inaudible) to you, Mr. President, was that door (ph) was always open. And we hope that by evidence of our invitation to you that we can demonstrate that (inaudible). +

+Mr. President, (inaudible) us in this conference yesterday, on the way into Baltimore, stopped by the Salvation Army homeless facility here in Baltimore yesterday. +

+I met a little boy, an African-American boy, in the 8th grade, named David Carter Jr. +

+When he heard that I would be seeing you today, his eyes lit up like I haven't seen. And I told him if he wrote you a letter, I'd give it to you. And I have. +

+But I had a conversation with little David Jr. and David Sr. And their families are struggling in this economy. His dad said words to me, Mr. President, that I'll never forget. About my age, and he said -- he said, "Congressman, it's not like it was when we were coming up." He said, "There's just no jobs." +

+Now, last year, about the time you met with us, unemployment was 7.5 percent in this country. Your administration and your party in Congress told us that we'd have to borrow more than $700 billion to pay for a so-called stimulus bill that was a piecemeal list of projects and boutique tax cuts, all of which we were told had to be passed or unemployment would go to 8 percent, as your administration said. +

+Well, unemployment is 10 percent now, as you well know, Mr. President. Here in Baltimore, it's considerably higher. +

+Now, Republicans offered a stimulus bill at the same time. It cost half as much as the Democratic proposal in Congress. And using your economic analyst models, it would have created twice the jobs at half the cost. It essentially was across-the-board tax relief, Mr. President. +

+Now, we know you've come to Baltimore today and you've -- you've raised this -- a tax credit which was last promoted by President Jimmy Carter. +

+ +

+ +
+

+CONTINUED + +1 + + + + + + + + > +

+ + +

+ + + +

+ +

More in the Politics Section

+ +

2008 Fundraising

See who is giving to the '08 presidential candidates.

Research Donations ... By Name, Occupation, Employer or Zip Code
...By State

+ +

Latest Politics Blog Updates

The Fix: President Obama takes on House Republicans (VIDEO) (updated 13 minutes ago)
The Trail: Senate ethics panel has four probes underway (updated 1 hour ago)
Index of Politics Columns and Blogs | Opinions Section

+ +

+ + +

+ +

+ + + + + + + + + +

+ + + + + + + + + + +

+ + + + +

+ + + +

Jobs | Cars | Real Estate | Rentals | Classifieds

SEARCH:

Search Archives

The Washington Post Company: Information and Other Post Co. Websites

Help | Contact Us

+ + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/test/readability_test.exs b/test/readability_test.exs new file mode 100644 index 0000000..7623e59 --- /dev/null +++ b/test/readability_test.exs @@ -0,0 +1,8 @@ +defmodule ReadabilityTest do + use ExUnit.Case + doctest Readability + + test "the truth" do + assert 1 + 1 == 2 + end +end diff --git a/test/test_helper.exs b/test/test_helper.exs new file mode 100644 index 0000000..869559e --- /dev/null +++ b/test/test_helper.exs @@ -0,0 +1 @@ +ExUnit.start() diff --git a/test/title_finder_test.exs b/test/title_finder_test.exs new file mode 100644 index 0000000..e9a8b2d --- /dev/null +++ b/test/title_finder_test.exs @@ -0,0 +1,45 @@ +defmodule Readability.TitleFinderTest do + use ExUnit.Case, async: true + + doctest Readability + + @html """ + + + Tag title - test + + + +

h1 title

h2 title

+ + + """ + + test "extract og title" do + title = Readability.TitleFinder.og_title(@html) + assert title == "og title" + end + + test "extract tag title" do + title = Readability.TitleFinder.tag_title(@html) + assert title == "Tag title" + end + + test "extract h1 tag title" do + title = Readability.TitleFinder.h_tag_title(@html) + assert title == "h1 title" + end + + test "extrat h2 tag title" do + title = Readability.TitleFinder.h_tag_title(@html, "h2") + assert title == "h2 title" + end + + test "extract most proper title" do + title = Readability.TitleFinder.title(@html) + assert title == "og title" + end +end