95 lines
3.5 KiB
Elixir
95 lines
3.5 KiB
Elixir
|
defmodule Readability.ContentFinder do
  @moduledoc """
  ContentFinder uses a variety of metrics for finding the content
  that is most likely to be the stuff a user wants to read.

  The current pipeline strips `script`/`style` nodes, removes nodes whose
  tag name, `id` or `class` look like page chrome, and rewrites `div`s that
  contain no block-level markup into `p` elements. Candidate scoring and
  relative-URI fixing are placeholders that return the tree unchanged.
  """

  # Regexes used to classify nodes. Not every entry is referenced in this
  # module yet.
  @regexes [
    unlikelyCandidatesRe:
      ~r/combx|comment|community|disqus|extra|foot|header|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
    okMaybeItsACandidateRe: ~r/and|article|body|column|main|shadow/i,
    positiveRe:
      ~r/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
    negativeRe:
      ~r/combx|comment|com-|contact|foot|footer|footnote|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|utility|widget/i,
    divToPElementsRe: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
    replaceBrsRe: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i,
    replaceFontsRe: ~r/<(\/?)font[^>]*>/i,
    trimRe: ~r/^\s+|\s+$/,
    normalizeRe: ~r/\s{2,}/,
    killBreaksRe: ~r/(<br\s*\/?>(\s| ?)*){1,}/,
    videoRe: ~r/http:\/\/(www\.)?(youtube|vimeo)\.com/i
  ]

  @type html_tree :: tuple | list

  @doc """
  Extracts the readable content from a parsed HTML tree.

  `html_tree` is passed to `Floki.filter_out/2`, so it is expected to be a
  tree in the shape Floki works with. The second argument is accepted for
  forward compatibility and is currently ignored.

  Returns the cleaned tree.
  """
  @spec content(html_tree) :: html_tree
  @spec content(html_tree, Keyword.t) :: html_tree
  def content(html_tree, _options \\ []) do
    html_tree
    |> prepare_candidates()
    |> select_best_candidate()
    |> fix_relative_uris()
  end

  # Drops script/style nodes and unlikely candidates, then normalises
  # misused divs, in that order.
  defp prepare_candidates(html_tree) do
    html_tree
    |> Floki.filter_out("script")
    |> Floki.filter_out("style")
    |> remove_unlikely_candidates()
    |> transform_misused_divs_into_paragraphs()
  end

  @doc """
  Removes unlikely tag nodes.

  An element is dropped (together with its whole subtree) when its tag name
  concatenated with its `id`/`class` values matches the "unlikely" pattern
  and does not match the "maybe a candidate" pattern. The `html` element is
  never dropped.

  Text (binaries) and any node that is not a `{tag, attrs, children}` element
  with a binary tag are returned unchanged. When the argument itself is an
  element that gets dropped, `nil` is returned.
  """
  @spec remove_unlikely_candidates(html_tree | binary) :: html_tree | binary | nil
  def remove_unlikely_candidates(content) when is_binary(content), do: content

  def remove_unlikely_candidates(nodes) when is_list(nodes) do
    nodes
    |> Enum.map(&remove_unlikely_candidates/1)
    # A dropped element comes back as nil; remove it from the sibling list.
    |> Enum.reject(&is_nil/1)
  end

  def remove_unlikely_candidates({tag_name, attrs, inner_tree}) when is_binary(tag_name) do
    if unlikely_candidate?(tag_name, attrs) do
      nil
    else
      {tag_name, attrs, remove_unlikely_candidates(inner_tree)}
    end
  end

  # Non-element nodes (e.g. comment tuples, if the parser emits them) carry
  # no tag/id/class to judge, so they are kept as they are.
  def remove_unlikely_candidates(other), do: other

  # The root element is always kept, whatever its attributes say.
  defp unlikely_candidate?("html", _attrs), do: false

  defp unlikely_candidate?(tag_name, attrs) do
    # Only the attributes named exactly "id" or "class" (case-insensitive)
    # take part; an unanchored match would also pick up "width", "hidden", ...
    idclass_str =
      for {name, value} <- attrs, name =~ ~r/^(id|class)$/i, into: "", do: value

    str = tag_name <> idclass_str

    str =~ @regexes[:unlikelyCandidatesRe] and
      not (str =~ @regexes[:okMaybeItsACandidateRe])
  end

  @doc """
  Rewrites `div` elements that contain no block-level markup into `p`.

  A `div` counts as misused when the raw HTML of its children does not match
  the block-element pattern (`a`, `blockquote`, `dl`, `div`, `img`, `ol`, `p`,
  `pre`, `table`, `ul`). Attributes and children are preserved; children are
  processed recursively. Text and non-element nodes are returned unchanged.
  """
  @spec transform_misused_divs_into_paragraphs(html_tree | binary) :: html_tree | binary
  def transform_misused_divs_into_paragraphs(content) when is_binary(content), do: content

  def transform_misused_divs_into_paragraphs(nodes) when is_list(nodes) do
    Enum.map(nodes, &transform_misused_divs_into_paragraphs/1)
  end

  def transform_misused_divs_into_paragraphs({tag_name, attrs, inner_tree}) do
    # Bind the result of the `if` expression: a rebinding made inside the
    # `if` body would not be visible after it.
    new_tag = if misused_divs?(tag_name, inner_tree), do: "p", else: tag_name

    {new_tag, attrs, transform_misused_divs_into_paragraphs(inner_tree)}
  end

  def transform_misused_divs_into_paragraphs(other), do: other

  # Only "div" can be misused; the check serialises the children and looks
  # for an opening block-level tag.
  defp misused_divs?("div", inner_tree) do
    !(Floki.raw_html(inner_tree) =~ @regexes[:divToPElementsRe])
  end

  defp misused_divs?(_, _), do: false

  # Placeholder: no scoring is implemented, the tree is returned as is.
  defp select_best_candidate(html_tree) do
    html_tree
  end

  # Placeholder: relative URIs are not rewritten, the tree is returned as is.
  defp fix_relative_uris(html_tree) do
    html_tree
  end
end
|