2016-04-15 11:51:29 +00:00
|
|
|
defmodule Readability do
|
2016-04-17 12:26:51 +00:00
|
|
|
@moduledoc """
|
2016-04-24 07:14:31 +00:00
|
|
|
Readability library for extracting & curating articles.
|
|
|
|
|
|
|
|
## Example
|
|
|
|
|
|
|
|
```elixir
|
|
|
|
@type html :: binary
|
|
|
|
|
2016-05-07 09:23:19 +00:00
|
|
|
# Just pass a URL
|
|
|
|
%Readability.Summary{title: title, authors: authors, article_html: article} = Readability.summarize(url)
|
|
|
|
|
2016-04-24 09:40:35 +00:00
|
|
|
# Extract title
|
2016-04-24 07:14:31 +00:00
|
|
|
Readability.title(html)
|
|
|
|
|
2016-04-28 06:19:11 +00:00
|
|
|
# Extract authors.
|
|
|
|
Readability.authors(html)
|
|
|
|
|
2016-04-24 09:40:35 +00:00
|
|
|
# Extract only text from article
|
|
|
|
article = html
|
|
|
|
|> Readability.article
|
2016-04-24 07:14:31 +00:00
|
|
|
|> Readability.readable_text
|
|
|
|
|
2016-04-24 09:40:35 +00:00
|
|
|
# Extract article with transformed html
|
|
|
|
article = html
|
|
|
|
|> Readability.article
|
2016-04-24 07:14:31 +00:00
|
|
|
|> Readability.raw_html
|
|
|
|
```
|
2016-04-17 12:26:51 +00:00
|
|
|
"""
|
|
|
|
|
2016-04-15 11:51:29 +00:00
|
|
|
alias Readability.TitleFinder
|
2016-04-28 06:13:03 +00:00
|
|
|
alias Readability.AuthorFinder
|
2016-04-17 12:26:51 +00:00
|
|
|
alias Readability.ArticleBuilder
|
2016-05-07 09:23:19 +00:00
|
|
|
alias Readability.Summary
|
2016-04-28 06:13:03 +00:00
|
|
|
alias Readability.Helper
|
2016-04-17 12:26:51 +00:00
|
|
|
|
2018-02-09 03:42:08 +00:00
|
|
|
# Baseline option values. `article/2` merges caller-supplied options over this
# list, and `default_options/0` returns it unchanged.
@default_options [
  retry_length: 250,
  min_text_length: 25,
  remove_unlikely_candidates: true,
  weight_classes: true,
  clean_conditionally: true,
  remove_empty_nodes: true,
  min_image_width: 130,
  min_image_height: 80,
  ignore_image_format: [],
  blacklist: nil,
  whitelist: nil,
  page_url: nil
]
|
|
|
|
|
|
|
|
# Named regular expressions, looked up by key through `regexes/1`.
@regexes [
  unlikely_candidate:
    ~r/combx|comment|community|disqus|extra|foot|header|hidden|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
  ok_maybe_its_a_candidate: ~r/and|article|body|column|main|shadow/i,
  positive: ~r/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
  negative:
    ~r/hidden|^hid|combx|comment|com-|contact|foot|footer|footnote|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|utility|widget/i,
  div_to_p_elements: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
  replace_brs: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i,
  replace_fonts: ~r/<(\/?)font[^>]*>/i,
  replace_xml_version: ~r/<\?xml.*\?>/i,
  normalize: ~r/\s{2,}/,
  video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
  protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i,
  img_tag_src: ~r/(<img.*src=['"])([^'"]+)(['"][^>]*>)/Ui
]
|
2016-04-15 11:51:29 +00:00
|
|
|
|
2018-02-09 03:22:17 +00:00
|
|
|
# Content-Type values treated as markup: type `application` or `text` with a
# subtype ending in "ml" (e.g. text/html, application/xhtml+xml), optionally
# followed by a `; charset=...` parameter. Matching is case-insensitive.
@markup_mimes ~r/^(application|text)\/[a-z\-_\.\+]+ml(;\s*charset=.*)?$/i
|
2017-04-14 13:11:44 +00:00
|
|
|
|
2016-04-15 11:51:29 +00:00
|
|
|
# Shared types used by the specs in this module.
@type html_tree :: tuple | list
@type raw_html :: binary
@type url :: binary
@type options :: list
# A list of header tuples. `list[tuple]` is bracket-access syntax, not a
# list type, so the list-of-tuples type is spelled `[tuple]`.
@type headers :: [tuple]
|
2016-04-15 11:51:29 +00:00
|
|
|
|
2016-05-07 09:23:19 +00:00
|
|
|
@doc """
Summarizes the primary readable content of the webpage at `url`.

The page is fetched with `HTTPoison.get!/3`, using the `:httpoison_options`
configured for the `:readability` application (`[]` when unset). `opts` are
merged over `default_options/0`, and `:page_url` is always set to `url`.

When the response Content-Type is a markup type (see `is_response_markup/1`)
the returned `Readability.Summary` carries the title, authors, article HTML and
article text. For any other Content-Type only `:article_text` is set, to the
raw response body, and the remaining fields are `nil`.

Raises whatever `HTTPoison.get!/3` raises when the request fails.
"""
@spec summarize(url, options) :: Summary.t()
def summarize(url, opts \\ []) do
  # Start from the defaults, exactly as article/2 does, so the article builder
  # never sees an option list with missing keys.
  opts =
    @default_options
    |> Keyword.merge(opts)
    |> Keyword.merge(page_url: url)

  httpoison_options = Application.get_env(:readability, :httpoison_options, [])
  %{body: raw, headers: headers} = HTTPoison.get!(url, [], httpoison_options)

  if is_response_markup(headers) do
    html_tree = Helper.normalize(raw, url: url)
    article_tree = ArticleBuilder.build(html_tree, opts)

    %Summary{
      title: title(html_tree),
      authors: authors(html_tree),
      article_html: readable_html(article_tree),
      article_text: readable_text(article_tree)
    }
  else
    # Non-markup responses are passed through untouched as plain text.
    %Summary{title: nil, authors: nil, article_html: nil, article_text: raw}
  end
end
|
|
|
|
|
|
|
|
@doc """
Returns the value of the Content-Type header found in `headers`.

Header names are compared case-insensitively. The value is returned as is,
including any parameters such as `; charset=utf-8`. When no Content-Type
header is present, `"text/plain"` is returned.

## Example

    iex> mime = Readability.mime(headers_list)
    "text/html"
"""
@spec mime(headers) :: String.t()
def mime(headers \\ []) do
  content_type_header? = fn {name, _value} -> String.downcase(name) == "content-type" end

  # The second argument is the fallback used when no header matches.
  {_name, value} = Enum.find(headers, {"Content-Type", "text/plain"}, content_type_header?)
  value
end
|
|
|
|
|
|
|
|
@doc """
Returns `true` if the Content-Type in the given headers list is a markup
type, otherwise `false`.

The Content-Type is read with `mime/1`, so a missing header is treated as
`"text/plain"`.

## Example

    iex> Readability.is_response_markup([{"Content-Type", "text/html"}])
    true
"""
@spec is_response_markup(headers) :: boolean
def is_response_markup(headers) do
  content_type = mime(headers)
  Regex.match?(@markup_mimes, content_type)
end
|
|
|
|
|
2016-04-24 09:40:35 +00:00
|
|
|
@doc """
Extracts the title.

Accepts either a raw HTML binary or an already parsed HTML tree. A binary is
first passed through `Helper.normalize/1`; the resulting tree is handed to
`TitleFinder.title/1`.

## Example

    iex> title = Readability.title(html_str)
    "Some title in html"
"""
@spec title(binary | html_tree) :: binary
def title(raw_html) when is_binary(raw_html) do
  html_tree = Helper.normalize(raw_html)
  title(html_tree)
end

def title(html_tree) do
  TitleFinder.title(html_tree)
end
|
2016-04-28 06:13:03 +00:00
|
|
|
|
|
|
|
@doc """
Extracts the authors.

Accepts either a raw HTML binary or an already parsed HTML tree. A binary is
parsed with `parse/1` first; the tree is then handed to `AuthorFinder.find/1`.

## Example

    iex> authors = Readability.authors(html_str)
    ["José Valim", "chrismccord"]
"""
# `list[binary]` is bracket-access syntax, not a list type; a list of
# binaries is spelled `[binary]`.
@spec authors(binary | html_tree) :: [binary]
def authors(html) when is_binary(html) do
  html
  |> parse()
  |> authors()
end

def authors(html_tree), do: AuthorFinder.find(html_tree)
|
|
|
|
|
2016-04-17 12:26:51 +00:00
|
|
|
@doc """
Using a variety of metrics (content score, classname, element types), finds
the content that is most likely to be the stuff a user wants to read.

`opts` are merged over `default_options/0`. The raw HTML is normalized with
`Helper.normalize/1` and the article tree is built by `ArticleBuilder.build/2`.

## Example

    iex> article_tree = Readability.article(html_str)
    # returns article that is tuple
"""
@spec article(binary, options) :: html_tree
def article(raw_html, opts \\ []) do
  merged_opts = Keyword.merge(@default_options, opts)
  html_tree = Helper.normalize(raw_html)

  ArticleBuilder.build(html_tree, merged_opts)
end
|
|
|
|
|
|
|
|
@doc """
Returns the HTML of `html_tree` as a binary after its attributes have been
cleaned.

Attributes are removed with `Helper.remove_attrs/2`, driven by the
`:protect_attrs` regex, and the result is rendered with `raw_html/1`.
"""
@spec readable_html(html_tree) :: binary
def readable_html(html_tree) do
  protect_attrs = regexes(:protect_attrs)
  cleaned_tree = Helper.remove_attrs(html_tree, protect_attrs)

  raw_html(cleaned_tree)
end
|
|
|
|
|
|
|
|
@doc """
Returns only the text of `html_tree` as a binary.

A newline is inserted in front of every closing `p`, `div`, `article` and
heading tag before the text is extracted, and the final result is trimmed.
"""
@spec readable_text(html_tree) :: binary
def readable_text(html_tree) do
  # TODO: Remove image caption when extract only text
  closing_block_tag = ~r/<\/(p|div|article|h\d)/i
  html_str = raw_html(html_tree)

  # Prefix each matched closing tag with a line break so blocks stay separated
  # once the markup is stripped.
  with_breaks =
    Regex.replace(closing_block_tag, html_str, fn closing_tag -> "\n" <> closing_tag end)

  with_breaks
  |> Floki.parse()
  |> Floki.text()
  |> String.trim()
end
|
|
|
|
|
2016-04-28 06:13:03 +00:00
|
|
|
@doc """
Returns the raw HTML binary for `html_tree`.

Rendering is delegated to `Floki.raw_html/2` with `encode: false`.
"""
@spec raw_html(html_tree) :: binary
def raw_html(html_tree) do
  Floki.raw_html(html_tree, encode: false)
end
|
|
|
|
|
|
|
|
@doc """
Parses a raw HTML binary with `Floki.parse/1` and returns its result.
"""
def parse(raw_html) when is_binary(raw_html) do
  Floki.parse(raw_html)
end
|
|
|
|
|
2017-02-05 09:48:26 +00:00
|
|
|
@doc """
Returns the regex registered under `key`, or `nil` when there is none.
"""
@spec regexes(atom) :: Regex.t() | nil
def regexes(key) do
  Access.get(@regexes, key)
end
|
2016-04-17 12:26:51 +00:00
|
|
|
|
|
|
|
@doc """
Returns the default options as a keyword list.
"""
@spec default_options() :: keyword
def default_options do
  @default_options
end
|
2016-04-15 11:51:29 +00:00
|
|
|
end
|