readability/lib/readability.ex

155 lines
4.5 KiB
Elixir
Raw Normal View History

2016-04-15 11:51:29 +00:00
defmodule Readability do
  @moduledoc """
  Readability library for extracting & curating articles.

  ## Example

  ```elixir
  @type html :: binary

  # Extract title
  Readability.title(html)

  # Extract authors.
  Readability.authors(html)

  # Extract only text from article
  article = html
            |> Readability.article
            |> Readability.readable_text

  # Extract article with transformed html
  article = html
            |> Readability.article
            |> Readability.raw_html
  ```
  """

  alias Readability.TitleFinder
  alias Readability.AuthorFinder
  alias Readability.ArticleBuilder
  alias Readability.Helper

  # Defaults for `article/2`; caller-supplied options are merged on top of these.
  @default_options [
    retry_length: 250,
    min_text_length: 25,
    remove_unlikely_candidates: true,
    weight_classes: true,
    clean_conditionally: true,
    remove_empty_nodes: true,
    min_image_width: 130,
    min_image_height: 80,
    ignore_image_format: [],
    blacklist: nil,
    whitelist: nil
  ]

  # Regexes shared across the library, exposed through `regexes/0`.
  # `:replace_brs`, `:replace_fonts` and `:normalize` are used by `normalize/1`,
  # `:protect_attrs` by `readable_html/1`; the others are not referenced in this module.
  @regexes [
    unlikely_candidate: ~r/combx|comment|community|disqus|extra|foot|header|hidden|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
    ok_maybe_its_a_candidate: ~r/and|article|body|column|main|shadow/i,
    positive: ~r/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
    negative: ~r/hidden|^hid|combx|comment|com-|contact|foot|footer|footnote|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|utility|widget/i,
    div_to_p_elements: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
    replace_brs: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i,
    replace_fonts: ~r/<(\/?)font[^>]*>/i,
    normalize: ~r/\s{2,}/,
    video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
    protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i
  ]

  @type html_tree :: tuple | list
  @type options :: list

  @doc """
  Extract title.

  Accepts either a binary of HTML (which is normalized with `normalize/1` first)
  or an already parsed html tree, and delegates to `Readability.TitleFinder.title/1`.

  ## Example

      iex> title = Readability.title(html_str)
      "Some title in html"

  """
  @spec title(binary() | html_tree()) :: binary()
  def title(html) when is_binary(html), do: html |> normalize() |> title()
  def title(html_tree), do: TitleFinder.title(html_tree)

  @doc """
  Extract authors.

  Accepts either a binary of HTML (which is parsed with `parse/1` first, without
  normalization) or an already parsed html tree, and delegates to
  `Readability.AuthorFinder.find/1`.

  ## Example

      iex> authors = Readability.authors(html_str)
      ["José Valim", "chrismccord"]

  """
  @spec authors(binary() | html_tree()) :: [binary()]
  def authors(html) when is_binary(html), do: html |> parse() |> authors()
  def authors(html_tree), do: AuthorFinder.find(html_tree)

  @doc """
  Using a variety of metrics (content score, classname, element types), find the content that is
  most likely to be the stuff a user wants to read.

  `opts` is merged over `default_options/0`, so any key given by the caller wins.
  The html is normalized with `normalize/1` and then handed to
  `Readability.ArticleBuilder.build/2`.

  ## Example

      iex> article_tree = Readability.article(html_str)
      # returns article that is tuple

  """
  @spec article(binary(), options()) :: html_tree()
  def article(raw_html, opts \\ []) do
    merged_opts = Keyword.merge(@default_options, opts)

    raw_html
    |> normalize()
    |> ArticleBuilder.build(merged_opts)
  end

  @doc """
  Returns the html of `html_tree` as a binary after cleaning its attributes.

  The tree is passed through `Readability.Helper.remove_attrs/2` together with the
  `:protect_attrs` regex and then rendered with `raw_html/1`.
  """
  @spec readable_html(html_tree()) :: binary()
  def readable_html(html_tree) do
    # NOTE(review): `:protect_attrs` matches names that do NOT start with
    # id/rel/for/summary/title/href/src/alt/srcdoc, so presumably those are the
    # attributes that survive - confirm against Helper.remove_attrs/2.
    html_tree
    |> Helper.remove_attrs(regexes()[:protect_attrs])
    |> raw_html()
  end

  @doc """
  Returns only the text of `html_tree` as a binary.

  A newline is inserted in front of every closing `p`, `div`, `article` and
  heading tag before the text is extracted, and the result is stripped of
  leading and trailing whitespace.
  """
  @spec readable_text(html_tree()) :: binary()
  def readable_text(html_tree) do
    # TODO: Remove image caption when extract only text
    closing_block_tags = ~r/<\/(p|div|article|h\d)/i
    html_str = raw_html(html_tree)

    # NOTE(review): String.strip/1 is deprecated in newer Elixir releases in favour
    # of String.trim/1; kept here so the minimum supported version is unchanged.
    closing_block_tags
    |> Regex.replace(html_str, fn closing_tag -> "\n#{closing_tag}" end)
    |> Floki.parse
    |> Floki.text
    |> String.strip
  end

  @doc """
  Returns the raw html binary rendered from `html_tree`.
  """
  @spec raw_html(html_tree()) :: binary()
  def raw_html(html_tree), do: Floki.raw_html(html_tree)

  @doc """
  Normalizes binary html and parses it into an html tree (tuple or list).

  Steps, in order:

    * runs of two or more `<br>` tags become a paragraph break (`</p><p>`)
    * opening and closing `font` tags become `span` tags
    * runs of two or more whitespace characters collapse to a single space
    * the result is parsed with Floki and comment nodes are filtered out
  """
  @spec normalize(binary()) :: html_tree()
  def normalize(raw_html) do
    patterns = regexes()

    raw_html
    |> String.replace(patterns[:replace_brs], "</p><p>")
    # The backslash must be escaped so the regex engine receives the `\1`
    # backreference (the optional "/" of a closing tag).
    |> String.replace(patterns[:replace_fonts], "<\\1span>")
    |> String.replace(patterns[:normalize], " ")
    |> Floki.parse
    |> Floki.filter_out(:comment)
  end

  @doc """
  Parses binary html into an html tree (tuple or list) without normalizing it.
  """
  @spec parse(binary()) :: html_tree()
  def parse(raw_html) when is_binary(raw_html), do: Floki.parse(raw_html)

  @doc """
  Returns the keyword list of regexes used by the library.
  """
  @spec regexes() :: Keyword.t()
  def regexes, do: @regexes

  @doc """
  Returns the default options that `article/2` merges caller options into.
  """
  @spec default_options() :: options()
  def default_options, do: @default_options
end