initial commit
This commit is contained in:
commit
d91604a519
|
@ -0,0 +1,5 @@
|
||||||
|
/_build
|
||||||
|
/cover
|
||||||
|
/deps
|
||||||
|
erl_crash.dump
|
||||||
|
*.ez
|
|
@ -0,0 +1,20 @@
|
||||||
|
# Readability
|
||||||
|
|
||||||
|
**TODO: Add description**
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
If [available in Hex](https://hex.pm/docs/publish), the package can be installed as:
|
||||||
|
|
||||||
|
1. Add readability to your list of dependencies in `mix.exs`:
|
||||||
|
|
||||||
|
def deps do
|
||||||
|
[{:readability, "~> 0.0.1"}]
|
||||||
|
end
|
||||||
|
|
||||||
|
2. Ensure readability is started before your application:
|
||||||
|
|
||||||
|
def application do
|
||||||
|
[applications: [:readability]]
|
||||||
|
end
|
||||||
|
|
|
@ -0,0 +1,30 @@
|
||||||
|
# This file is responsible for configuring your application
# and its dependencies with the aid of the Mix.Config module.
use Mix.Config

# This configuration is loaded before any dependency and is restricted
# to this project. If another project depends on this project, this
# file won't be loaded nor affect the parent project. For this reason,
# if you want to provide default values for your application for
# 3rd-party users, it should be done in your "mix.exs" file.

# You can configure for your application as:
#
#     config :readability, key: :value
#
# And access this configuration in your application as:
#
#     Application.get_env(:readability, :key)
#
# Or configure a 3rd-party app:
#
#     config :logger, level: :info
#

# It is also possible to import configuration files, relative to this
# directory. For example, you can emulate configuration per environment
# by uncommenting the line below and defining dev.exs, test.exs and such.
# Configuration from the imported file will override the ones defined
# here (which is why it is important to import them last).
#
#     import_config "#{Mix.env}.exs"
|
|
@ -0,0 +1,58 @@
|
||||||
|
defmodule Readability.Document do
  @moduledoc """
  Experimental port of the Ruby readability `Document` class.

  Reads a fixture HTML page from disk and exposes helpers that normalize
  it with `Floki` before content extraction.
  """

  # Extraction defaults, ported verbatim from the Ruby readability gem's
  # DEFAULT_OPTIONS hash.
  @default_options [retry_length: 250,
                    min_text_length: 25,
                    remove_unlikely_candidates: true,
                    weight_classes: true,
                    clean_conditionally: true,
                    remove_empty_nodes: true,
                    min_image_width: 130,
                    min_image_height: 80,
                    ignore_image_format: [],
                    blacklist: nil,
                    whitelist: nil
                   ]

  # Heuristic patterns ported from the Ruby readability gem's REGEXES hash.
  @regexes [ unlikelyCandidatesRe: ~r/combx|comment|community|disqus|extra|foot|header|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
             okMaybeItsACandidateRe: ~r/and|article|body|column|main|shadow/i,
             positiveRe: ~r/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
             negativeRe: ~r/combx|comment|com-|contact|foot|footer|footnote|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|utility|widget/i,
             divToPElementsRe: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
             replaceBrsRe: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i,
             replaceFontsRe: ~r/<(\/?)font[^>]*>/i,
             trimRe: ~r/^\s+|\s+$/,
             normalizeRe: ~r/\s{2,}/,
             killBreaksRe: ~r/(<br\s*\/?>(\s| ?)*){1,}/,
             videoRe: ~r/http:\/\/(www\.)?(youtube|vimeo)\.com/i
           ]

  @doc """
  Returns the fixture page parsed into a Floki tree, with runs of `<br>`
  converted to paragraph breaks, `<font>` tags downgraded to `<span>`,
  and HTML comments stripped.
  """
  def html do
    page
    |> String.replace(@regexes[:replaceBrsRe], "</p><p>")
    # BUG FIX: the replacement must be "\\1" to reference capture group 1
    # (the optional "/"); the previous "<\1span>" never produced a literal
    # backslash-one, so closing </font> tags were not rewritten to </span>.
    # This mirrors the Ruby original's single-quoted '<\1span>'.
    |> String.replace(@regexes[:replaceFontsRe], "<\\1span>")
    |> Floki.find("html")
    |> Floki.filter_out(:comment)
  end

  @doc """
  Returns the text of the document's `<title>` tag.
  """
  def title do
    html |> Floki.find("title") |> Floki.text
  end

  @doc """
  Returns the parsed tree with `<script>` and `<style>` elements removed.
  """
  def content do
    html
    |> Floki.filter_out("script")
    |> Floki.filter_out("style")
  end

  @doc """
  Reads the raw fixture HTML from disk. Crashes (match error) if the
  fixture file is missing, which is intentional in this prototype.
  """
  def page do
    {:ok, f} = File.read("test/features/nytimes.html")
    f
  end

  @doc """
  Exposes the compile-time default options.
  """
  def default_options do
    @default_options
  end

  @doc """
  Exposes the compile-time heuristic regexes.
  """
  def regexes do
    @regexes
  end
end
|
|
@ -0,0 +1,10 @@
|
||||||
|
defmodule Readability do
  @moduledoc """
  Public entry points for extracting information from an HTML document.
  """

  alias Readability.TitleFinder

  # Either a single Floki node (tuple) or a list of nodes.
  @type html_tree :: tuple | list

  @doc """
  Extracts the document title.

  Accepts either a raw HTML binary (parsed first) or an
  already-parsed Floki tree.
  """
  def title(html) when is_binary(html) do
    html
    |> parse
    |> title
  end

  def title(html_tree) do
    TitleFinder.title(html_tree)
  end

  # Parses raw HTML into a Floki tree.
  defp parse(raw_html), do: Floki.parse(raw_html)
end
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,522 @@
|
||||||
|
# encoding: utf-8
|
||||||
|
|
||||||
|
require 'rubygems'
|
||||||
|
require 'nokogiri'
|
||||||
|
require 'guess_html_encoding'
|
||||||
|
|
||||||
|
module Readability
  # Port of Arc90's readability algorithm: scores candidate nodes in an
  # HTML document and extracts the main article content, title, author
  # and images. (Vendored reference implementation for the Elixir port.)
  class Document
    DEFAULT_OPTIONS = {
      :retry_length               => 250,
      :min_text_length            => 25,
      :remove_unlikely_candidates => true,
      :weight_classes             => true,
      :clean_conditionally        => true,
      :remove_empty_nodes         => true,
      :min_image_width            => 130,
      :min_image_height           => 80,
      :ignore_image_format        => [],
      :blacklist                  => nil,
      :whitelist                  => nil
    }.freeze

    # Heuristic patterns used throughout candidate selection and cleanup.
    REGEXES = {
      :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
      :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
      :positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
      :negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
      :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
      :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
      :replaceFontsRe => /<(\/?)font[^>]*>/i,
      :trimRe => /^\s+|\s+$/,
      :normalizeRe => /\s{2,}/,
      :killBreaksRe => /(<br\s*\/?>(\s| ?)*){1,}/,
      :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
    }

    attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image

    # input   - raw HTML string.
    # options - overrides merged over DEFAULT_OPTIONS.
    # Normalizes <br> runs and <font> tags, parses the document, and
    # applies whitelist/blacklist exclusions.
    def initialize(input, options = {})
      @options = DEFAULT_OPTIONS.merge(options)
      @input = input

      # Guess the encoding on Ruby >= 1.9 unless the caller supplied one.
      if RUBY_VERSION =~ /^(1\.9|2)/ && !@options[:encoding]
        @input = GuessHtmlEncoding.encode(@input, @options[:html_headers]) unless @options[:do_not_guess_encoding]
        @options[:encoding] = @input.encoding.to_s
      end

      @input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
      @remove_unlikely_candidates = @options[:remove_unlikely_candidates]
      @weight_classes = @options[:weight_classes]
      @clean_conditionally = @options[:clean_conditionally]
      @best_candidate_has_image = true
      make_html
      handle_exclusions!(@options[:whitelist], @options[:blacklist])
    end

    # Returns the list of image URLs in the best candidate (or the given
    # content node) that satisfy the size/format criteria. Falls back to
    # scanning the whole document when nothing qualifies.
    def images(content=nil, reload=false)
      begin
        require 'fastimage'
      rescue LoadError
        raise "Please install fastimage in order to use the #images feature."
      end

      @best_candidate_has_image = false if reload

      prepare_candidates
      list_images = []
      tested_images = []
      content = @best_candidate[:elem] unless reload

      return list_images if content.nil?
      elements = content.css("img").map(&:attributes)

      elements.each do |element|
        next unless element["src"]

        url = element["src"].value
        height = element["height"].nil? ? 0 : element["height"].value.to_i
        width = element["width"].nil? ? 0 : element["width"].value.to_i

        # Only hit the network (FastImage) when the markup lacks dimensions.
        if url =~ /\Ahttps?:\/\//i && (height.zero? || width.zero?)
          image = get_image_size(url)
          next unless image
        else
          image = {:width => width, :height => height}
        end

        image[:format] = File.extname(url).gsub(".", "")

        if tested_images.include?(url)
          debug("Image was tested: #{url}")
          next
        end

        tested_images.push(url)
        if image_meets_criteria?(image)
          list_images << url
        else
          debug("Image discarded: #{url} - height: #{image[:height]} - width: #{image[:width]} - format: #{image[:format]}")
        end
      end

      (list_images.empty? and content != @html) ? images(@html, true) : list_images
    end

    # Like #images, but rewrites relative srcs to fully-qualified URIs first.
    def images_with_fqdn_uris!(source_uri)
      images_with_fqdn_uris(@html, source_uri)
    end

    # Rewrites relative image srcs against source_uri, removing images whose
    # src cannot be parsed, then delegates to #images.
    def images_with_fqdn_uris(document = @html.dup, source_uri)
      uri = URI.parse(source_uri)
      host = uri.host
      scheme = uri.scheme
      port = uri.port # defaults to 80

      base = "#{scheme}://#{host}:#{port}/"

      images = []
      document.css("img").each do |elem|
        begin
          elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
          images << elem['src'].to_s
        rescue URI::InvalidURIError => exc
          elem.remove
        end
      end

      images(document,true)
    end

    # Fetches remote image dimensions; returns nil (after logging) on any error.
    def get_image_size(url)
      w, h = FastImage.size(url)
      raise "Couldn't get size." if w.nil? || h.nil?
      {:width => w, :height => h}
    rescue => e
      debug("Image error: #{e}")
      nil
    end

    # True when the image's format is not blacklisted and it meets the
    # configured minimum width/height.
    def image_meets_criteria?(image)
      return false if options[:ignore_image_format].include?(image[:format].downcase)
      image[:width] >= (options[:min_image_width] || 0) && image[:height] >= (options[:min_image_height] || 0)
    end

    # Returns the <title> text, or nil when the document has no title tag.
    def title
      title = @html.css("title").first
      title ? title.text : nil
    end

    # Look through the @html document looking for the author
    # Precedence Information here on the wiki: (TODO attach wiki URL if it is accepted)
    # Returns nil if no author is detected
    def author
      # Let's grab this author:
      # <meta name="dc.creator" content="Finch - http://www.getfinch.com" />
      author_elements = @html.xpath('//meta[@name = "dc.creator"]')
      unless author_elements.empty?
        author_elements.each do |element|
          return element['content'].strip if element['content']
        end
      end

      # Now let's try to grab this
      # <span class="byline author vcard"><span>By</span><cite class="fn">Austin Fonacier</cite></span>
      # <div class="author">By</div><div class="author vcard"><a class="url fn" href="http://austinlivesinyoapp.com/">Austin Fonacier</a></div>
      author_elements = @html.xpath('//*[contains(@class, "vcard")]//*[contains(@class, "fn")]')
      unless author_elements.empty?
        author_elements.each do |element|
          return element.text.strip if element.text
        end
      end

      # Now let's try to grab this
      # <a rel="author" href="http://dbanksdesign.com">Danny Banks (rel)</a>
      # TODO: strip out the (rel)?
      author_elements = @html.xpath('//a[@rel = "author"]')
      unless author_elements.empty?
        author_elements.each do |element|
          return element.text.strip if element.text
        end
      end

      # Last resort: any element explicitly id'd "author".
      author_elements = @html.xpath('//*[@id = "author"]')
      unless author_elements.empty?
        author_elements.each do |element|
          return element.text.strip if element.text
        end
      end
    end

    # Extracts the sanitized article HTML. If the result is shorter than
    # options[:retry_length], progressively relaxes one heuristic at a time
    # (unlikely-candidate removal, class weighting, conditional cleaning)
    # and retries on a freshly parsed document.
    def content(remove_unlikely_candidates = :default)
      @remove_unlikely_candidates = false if remove_unlikely_candidates == false

      prepare_candidates
      article = get_article(@candidates, @best_candidate)

      cleaned_article = sanitize(article, @candidates, options)
      if article.text.strip.length < options[:retry_length]
        if @remove_unlikely_candidates
          @remove_unlikely_candidates = false
        elsif @weight_classes
          @weight_classes = false
        elsif @clean_conditionally
          @clean_conditionally = false
        else
          # nothing we can do
          return cleaned_article
        end

        make_html
        content
      else
        cleaned_article
      end
    end

    # Builds a <div> containing the best candidate plus any of its siblings
    # that score above the threshold or look like real paragraphs.
    def get_article(candidates, best_candidate)
      # Now that we have the top candidate, look through its siblings for content that might also be related.
      # Things like preambles, content split by ads that we removed, etc.

      sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
      output = Nokogiri::XML::Node.new('div', @html)
      best_candidate[:elem].parent.children.each do |sibling|
        append = false
        append = true if sibling == best_candidate[:elem]
        append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold

        if sibling.name.downcase == "p"
          link_density = get_link_density(sibling)
          node_content = sibling.text
          node_length = node_content.length

          # Long low-link paragraphs, or short sentence-like ones with no
          # links at all, are kept.
          append = if node_length > 80 && link_density < 0.25
            true
          elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/
            true
          end
        end

        if append
          sibling_dup = sibling.dup # otherwise the state of the document in processing will change, thus creating side effects
          sibling_dup.name = "div" unless %w[div p].include?(sibling.name.downcase)
          output << sibling_dup
        end
      end

      output
    end

    # Returns the highest-scoring candidate, defaulting to <body> with a
    # zero score when no candidate was found.
    def select_best_candidate(candidates)
      sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }

      debug("Top 5 candidates:")
      sorted_candidates[0...5].each do |candidate|
        debug("Candidate #{candidate[:elem].name}##{candidate[:elem][:id]}.#{candidate[:elem][:class]} with score #{candidate[:content_score]}")
      end

      best_candidate = sorted_candidates.first || { :elem => @html.css("body").first, :content_score => 0 }
      debug("Best candidate #{best_candidate[:elem].name}##{best_candidate[:elem][:id]}.#{best_candidate[:elem][:class]} with score #{best_candidate[:content_score]}")

      best_candidate
    end

    # Fraction of the element's text that lives inside <a> tags.
    # NOTE(review): returns NaN (0.0/0.0) when the element has no text.
    def get_link_density(elem)
      link_length = elem.css("a").map(&:text).join("").length
      text_length = elem.text.length
      link_length / text_length.to_f
    end

    # Scores an element +/-25 based on whether its class/id match the
    # positive/negative patterns. Returns 0 when class weighting is off.
    def class_weight(e)
      weight = 0
      return weight unless @weight_classes

      if e[:class] && e[:class] != ""
        weight -= 25 if e[:class] =~ REGEXES[:negativeRe]
        weight += 25 if e[:class] =~ REGEXES[:positiveRe]
      end

      if e[:id] && e[:id] != ""
        weight -= 25 if e[:id] =~ REGEXES[:negativeRe]
        weight += 25 if e[:id] =~ REGEXES[:positiveRe]
      end

      weight
    end

    # Per-tag base scores applied on top of class weight.
    ELEMENT_SCORES = {
      'div' => 5,
      'blockquote' => 3,
      'form' => -3,
      'th' => -5
    }.freeze

    # Initial candidate entry for an element: class weight + tag score.
    def score_node(elem)
      content_score = class_weight(elem)
      content_score += ELEMENT_SCORES.fetch(elem.name.downcase, 0)
      { :content_score => content_score, :elem => elem }
    end

    # Prints str only when the :debug option is set.
    def debug(str)
      puts str if options[:debug]
    end

    # Strips low-value headers, interactive elements and empty paragraphs,
    # conditionally cleans containers, then flattens every element outside
    # the tag whitelist into plain text. Returns the resulting HTML string.
    def sanitize(node, candidates, options = {})
      node.css("h1, h2, h3, h4, h5, h6").each do |header|
        header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33
      end

      node.css("form, object, iframe, embed").each do |elem|
        elem.remove
      end

      if @options[:remove_empty_nodes]
        # remove <p> tags that have no text content - this will also remove p tags that contain only images.
        node.css("p").each do |elem|
          elem.remove if elem.content.strip.empty?
        end
      end

      # Conditionally clean <table>s, <ul>s, and <div>s
      clean_conditionally(node, candidates, "table, ul, div")

      # We'll sanitize all elements using a whitelist
      base_whitelist = @options[:tags] || %w[div p]
      # We'll add whitespace instead of block elements,
      # so a<br>b will have a nice space between them
      base_replace_with_whitespace = %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center]

      # Use a hash for speed (don't want to make a million calls to include?)
      whitelist = Hash.new
      base_whitelist.each {|tag| whitelist[tag] = true }
      replace_with_whitespace = Hash.new
      base_replace_with_whitespace.each { |tag| replace_with_whitespace[tag] = true }

      ([node] + node.css("*")).each do |el|
        # If element is in whitelist, delete all its attributes
        if whitelist[el.node_name]
          el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }

        # Otherwise, replace the element with its contents
        else
          # If element is root, replace the node as a text node
          if el.parent.nil?
            node = Nokogiri::XML::Text.new(el.text, el.document)
            break
          else
            if replace_with_whitespace[el.node_name]
              el.swap(Nokogiri::XML::Text.new(' ' << el.text << ' ', el.document))
            else
              el.swap(Nokogiri::XML::Text.new(el.text, el.document))
            end
          end
        end

      end

      s = Nokogiri::XML::Node::SaveOptions
      save_opts = s::NO_DECLARATION | s::NO_EMPTY_TAGS | s::AS_XHTML
      html = node.serialize(:save_with => save_opts)

      # Get rid of duplicate whitespace
      return html.gsub(/[\r\n\f]+/, "\n" )
    end

    # Removes container elements that look like boilerplate: negative total
    # score, too many images/lists/inputs, too little text, or too many links.
    def clean_conditionally(node, candidates, selector)
      return unless @clean_conditionally
      node.css(selector).each do |el|
        weight = class_weight(el)
        content_score = candidates[el] ? candidates[el][:content_score] : 0
        name = el.name.downcase

        if weight + content_score < 0
          el.remove
          debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
        elsif el.text.count(",") < 10
          counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
          # NOTE(review): the -100 discount effectively exempts <li> counts
          # from the "more <li>s than <p>s" check; matches upstream port.
          counts["li"] -= 100

          # For every img under a noscript tag discount one from the count to avoid double counting
          counts["img"] -= el.css("noscript").css("img").length

          content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace
          link_density = get_link_density(el)

          reason = clean_conditionally_reason?(name, counts, content_length, options, weight, link_density)
          if reason
            debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
            el.remove
          end
        end
      end
    end

    # Returns a human-readable removal reason for clean_conditionally,
    # or nil when the element should be kept.
    def clean_conditionally_reason?(name, counts, content_length, options, weight, link_density)
      if (counts["img"] > counts["p"]) && (counts["img"] > 1)
        "too many images"
      elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
        "more <li>s than <p>s"
      elsif counts["input"] > (counts["p"] / 3).to_i
        "less than 3x <p>s than <input>s"
      elsif (content_length < options[:min_text_length]) && (counts["img"] != 1)
        "too short a content length without a single image"
      elsif weight < 25 && link_density > 0.2
        "too many links for its weight (#{weight})"
      elsif weight >= 25 && link_density > 0.5
        "too many links for its weight (#{weight})"
      elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
        "<embed>s with too short a content length, or too many <embed>s"
      else
        nil
      end
    end

    private

    # Applies the caller-supplied exclusion lists: removes every node
    # matching the blacklist selector, then (if a whitelist selector is
    # given) keeps only matching nodes as the new <body> contents.
    # (Comment translated from Korean.)
    def handle_exclusions!(whitelist, blacklist)
      return unless whitelist || blacklist

      if blacklist
        elems = @html.css(blacklist)
        if elems
          elems.each do |e|
            e.remove
          end
        end
      end

      if whitelist
        elems = @html.css(whitelist).to_s

        if body = @html.at_css('body')
          body.inner_html = elems
        end
      end

      @input = @html.to_s
    end

    # Parses @input into @html with comments stripped.
    # (Comment translated from Korean: "returns the base html node with
    # comments removed".)
    def make_html(whitelist=nil, blacklist=nil)
      @html = Nokogiri::HTML(@input, nil, @options[:encoding])
      # In case document has no body, such as from empty string or redirect
      @html = Nokogiri::HTML('<body />', nil, @options[:encoding]) if @html.css('body').length == 0
      # Remove html comment tags
      @html.xpath('//comment()').each { |i| i.remove }
    end


    # Runs the full candidate pipeline: strip scripts/styles, drop unlikely
    # candidates, fix misused divs, score paragraphs, pick the winner.
    def prepare_candidates
      @html.css("script, style").each { |i| i.remove }
      remove_unlikely_candidates! if @remove_unlikely_candidates
      transform_misused_divs_into_paragraphs!

      @candidates = score_paragraphs(options[:min_text_length])
      @best_candidate = select_best_candidate(@candidates)
    end

    # Removes unlikely candidates, keeping anything that also matches the
    # "maybe it's a candidate" pattern. (Comment translated from Korean.)
    def remove_unlikely_candidates!
      @html.css("*").each do |elem|
        str = "#{elem[:class]}#{elem[:id]}"
        if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe] && (elem.name.downcase != 'html') && (elem.name.downcase != 'body')
          debug("Removing unlikely candidate - #{str}")
          elem.remove
        end
      end
    end

    # Converts misused <div>s into <p>s. (Comment translated from Korean.)
    def transform_misused_divs_into_paragraphs!
      @html.css("*").each do |elem|
        if elem.name.downcase == "div"
          # transform <div>s that do not contain other block elements into <p>s
          if elem.inner_html !~ REGEXES[:divToPElementsRe]
            debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p");
            elem.name = "p"
          end
        else
          # wrap text nodes in p tags
          # elem.children.each do |child|
          #   if child.text?
          #     debug("wrapping text node with a p")
          #     child.swap("<p>#{child.text}</p>")
          #   end
          # end
        end
      end
    end

    # Scores the candidate nodes: each qualifying <p>/<td> credits its
    # parent (and half-credits its grandparent) based on commas and length.
    # (Comment translated from Korean.)
    def score_paragraphs(min_text_length)
      candidates = {}
      @html.css("p,td").each do |elem|
        parent_node = elem.parent
        grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil
        inner_text = elem.text

        # If this paragraph is less than 25 characters, don't even count it.
        next if inner_text.length < min_text_length

        candidates[parent_node] ||= score_node(parent_node)
        candidates[grand_parent_node] ||= score_node(grand_parent_node) if grand_parent_node

        content_score = 1
        content_score += inner_text.split(',').length
        content_score += [(inner_text.length / 100).to_i, 3].min

        candidates[parent_node][:content_score] += content_score
        candidates[grand_parent_node][:content_score] += content_score / 2.0 if grand_parent_node
      end

      # Scale the final candidates score based on link density. Good content should have a
      # relatively small link density (5% or less) and be mostly unaffected by this operation.
      candidates.each do |elem, candidate|
        candidate[:content_score] = candidate[:content_score] * (1 - get_link_density(elem))
      end

      candidates
    end
  end
end
|
|
@ -0,0 +1,64 @@
|
||||||
|
defmodule Readability.TitleFinder do
  @moduledoc """
  The TitleFinder engine traverses the HTML tree searching for the most
  proper title.
  """

  # Separators commonly used to append a site name to a page title.
  @title_suffix ~r/(\-)|(\:\:)|(\|)/
  @h_tag_selector "h1, h2, h3"

  @type html_tree :: tuple | list

  @doc """
  Finds the most proper title: prefers the og:title meta property when the
  `<title>` tag looks generic (four words or fewer), and falls back to the
  first heading tag when neither yields a title.
  """
  @spec title(html_tree) :: binary
  def title(html_tree) do
    maybe_title = tag_title(html_tree)

    # BUG FIX: the original rebound `maybe_title` inside the `if` block;
    # in Elixir such a rebinding does not escape the block's scope, so the
    # og:title result was silently discarded. Bind the `if` expression's
    # value instead.
    maybe_title =
      if length(String.split(maybe_title, " ")) <= 4 do
        og_title(html_tree)
      else
        maybe_title
      end

    maybe_title || h_tag_title(html_tree)
  end

  @doc """
  Find title from title tag
  """
  @spec tag_title(html_tree) :: binary
  def tag_title(html_tree) do
    html_tree
    |> Floki.find("title")
    |> to_clean_text
  end

  @doc """
  Find title from og:title property of meta tag
  """
  @spec og_title(html_tree) :: binary
  def og_title(html_tree) do
    html_tree
    |> Floki.find("meta[property=og:title]")
    |> Floki.attribute("content")
    |> to_clean_text
  end

  @doc """
  Find title from h tag
  """
  @spec h_tag_title(html_tree, String.t) :: binary
  def h_tag_title(html_tree, selector \\ @h_tag_selector) do
    html_tree
    |> Floki.find(selector)
    |> hd
    |> to_clean_text
  end

  # Extracts the node's text, drops any site-name suffix after the first
  # separator, and trims surrounding whitespace.
  # (Also removes the original's unused `title_text` binding.)
  defp to_clean_text(html_tree) do
    html_tree
    |> Floki.text
    |> String.split(@title_suffix)
    |> hd
    |> String.strip
  end
end
|
|
@ -0,0 +1,34 @@
|
||||||
|
defmodule Readability.Mixfile do
  use Mix.Project

  # Mix project definition for the readability package.
  def project do
    [app: :readability,
     version: "0.0.1",
     elixir: "~> 1.2",
     build_embedded: Mix.env == :prod,
     start_permanent: Mix.env == :prod,
     deps: deps]
  end

  # Configuration for the OTP application.
  #
  # Type "mix help compile.app" for more information
  def application do
    [applications: [:logger,
                    :floki
                   ]]
  end

  # Dependencies can be Hex packages:
  #
  #     {:mydep, "~> 0.3.0"}
  #
  # Or git/path repositories:
  #
  #     {:mydep, git: "https://github.com/elixir-lang/mydep.git", tag: "0.1.0"}
  #
  # Type "mix help deps" for more examples and options
  defp deps do
    [{:floki, "~> 0.8.0"}]
  end
end
|
|
@ -0,0 +1,2 @@
|
||||||
|
%{"floki": {:hex, :floki, "0.8.0"},
|
||||||
|
"mochiweb_html": {:hex, :mochiweb_html, "2.13.0"}}
|
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,8 @@
|
||||||
|
defmodule ReadabilityTest do
  use ExUnit.Case
  doctest Readability

  # Placeholder smoke test generated by `mix new`.
  test "the truth" do
    assert 2 == 1 + 1
  end
end
|
|
@ -0,0 +1 @@
|
||||||
|
# Boots the ExUnit test runner before any test files are loaded.
ExUnit.start()
|
|
@ -0,0 +1,45 @@
|
||||||
|
defmodule Readability.TitleFinderTest do
  use ExUnit.Case, async: true

  doctest Readability

  # Fixture exercising every title source the finder knows about:
  # <title>, og:title, and heading tags.
  @html """
  <html>
  <head>
  <title>Tag title - test</title>
  <meta property='og:title' content='og title | test'>
  </head>
  <body>
  <p>
  <h1>h1 title</h1>
  <h2>h2 title</h2>
  </p>
  </body>
  </html>
  """

  test "extract og title" do
    title = Readability.TitleFinder.og_title(@html)
    assert title == "og title"
  end

  test "extract tag title" do
    title = Readability.TitleFinder.tag_title(@html)
    assert title == "Tag title"
  end

  test "extract h1 tag title" do
    title = Readability.TitleFinder.h_tag_title(@html)
    assert title == "h1 title"
  end

  # Typo fixed in the test name: "extrat" -> "extract".
  test "extract h2 tag title" do
    title = Readability.TitleFinder.h_tag_title(@html, "h2")
    assert title == "h2 title"
  end

  test "extract most proper title" do
    title = Readability.TitleFinder.title(@html)
    assert title == "og title"
  end
end
|
Loading…
Reference in New Issue