readability/lib/readability.ex

defmodule Readability do
  @moduledoc """
  Readability library for extracting & curating articles.

  ## Example

  ```elixir
  # `html` is the raw HTML of a page, as a binary.

  # Extract title
  Readability.title(html)

  # Extract authors.
  Readability.authors(html)

  # Extract only text from article
  article =
    html
    |> Readability.article()
    |> Readability.readable_text()

  # Extract article with transformed html
  article =
    html
    |> Readability.article()
    |> Readability.raw_html()
  ```
  """

  alias Readability.TitleFinder
  alias Readability.AuthorFinder
  alias Readability.ArticleBuilder
  alias Readability.Helper

  @type html_tree :: tuple | list
  @type raw_html :: binary
  @type url :: binary
  @type options :: list
  # `mime/1` pattern-matches every header as a 2-tuple and downcases its key,
  # so headers are expected to be `{name, value}` pairs of binaries.
  @type headers :: [{binary, binary}]

  # Defaults merged underneath the caller's options in `article/2`.
  @default_options [
    retry_length: 250,
    min_text_length: 25,
    remove_unlikely_candidates: true,
    weight_classes: true,
    clean_conditionally: true,
    remove_empty_nodes: true,
    min_image_width: 130,
    min_image_height: 80,
    ignore_image_format: [],
    blacklist: nil,
    whitelist: nil,
    page_url: nil
  ]

  # Shared regular expressions, exposed to the rest of the library via `regexes/1`.
  @regexes [
    unlikely_candidate:
      ~r/combx|comment|community|disqus|extra|foot|header|hidden|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
    ok_maybe_its_a_candidate: ~r/and|article|body|column|main|shadow/i,
    positive: ~r/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
    negative:
      ~r/hidden|^hid|combx|comment|com-|contact|foot|footer|footnote|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|utility|widget/i,
    div_to_p_elements: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
    replace_brs: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i,
    replace_fonts: ~r/<(\/?)font[^>]*>/i,
    replace_xml_version: ~r/<\?xml.*\?>/i,
    normalize: ~r/\s{2,}/,
    video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
    protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i,
    img_tag_src: ~r/(<img.*src=['"])([^'"]+)(['"][^>]*>)/Ui
  ]

  # Matches `application/*ml` and `text/*ml` content types, with an optional
  # `;charset=...` suffix (zero or more spaces after the semicolon).
  @markup_mimes ~r/^(application|text)\/[a-z\-_\.\+]+ml(;\s*charset=.*)?$/i

  @doc """
  Extract the MIME type from a list of `{name, value}` headers.

  The header name is matched case-insensitively against `"content-type"` and
  the header value is returned as-is (including any `; charset=...` suffix).
  When no such header is present, `"text/plain"` is returned.

  ## Example

      iex> Readability.mime([{"Content-Type", "text/html"}])
      "text/html"

      iex> Readability.mime([])
      "text/plain"
  """
  @spec mime(headers) :: String.t()
  def mime(headers \\ []) do
    # Used when the headers carry no Content-Type at all.
    default = {"Content-Type", "text/plain"}

    headers
    |> Enum.find(default, fn {key, _value} -> String.downcase(key) == "content-type" end)
    |> elem(1)
  end

  @doc """
  Return true if Content-Type in provided headers list is a markup type,
  else false.

  ## Example

      iex> Readability.is_response_markup([{"Content-Type", "text/html"}])
      true

      iex> Readability.is_response_markup([{"Content-Type", "application/json"}])
      false
  """
  @spec is_response_markup(headers) :: boolean
  def is_response_markup(headers) do
    mime(headers) =~ @markup_mimes
  end

  @doc """
  Extract title.

  A binary is first passed through `Readability.Helper.normalize/1`; any other
  value is handed directly to `Readability.TitleFinder.title/1`.

  ## Example

      iex> title = Readability.title(html_str)
      "Some title in html"
  """
  @spec title(binary | html_tree) :: binary
  def title(raw_html) when is_binary(raw_html) do
    raw_html
    |> Helper.normalize()
    |> title()
  end

  def title(html_tree), do: TitleFinder.title(html_tree)

  @doc """
  Extract authors.

  A binary is first parsed with `parse/1`; any other value is handed directly
  to `Readability.AuthorFinder.find/1`.

  ## Example

      iex> authors = Readability.authors(html_str)
      ["José Valim", "chrismccord"]
  """
  @spec authors(binary | html_tree) :: [binary]
  def authors(html) when is_binary(html) do
    html
    |> parse()
    |> authors()
  end

  def authors(html_tree), do: AuthorFinder.find(html_tree)

  @doc """
  Using a variety of metrics (content score, classname, element types), find the content that is
  most likely to be the stuff a user wants to read.

  `opts` is merged on top of `default_options/0`, so callers only need to pass
  the keys they want to override.

  ## Example

      iex> article_tree = Readability.article(html_str)
      # returns article that is tuple

  """
  @spec article(binary, options) :: html_tree
  def article(raw_html, opts \\ []) do
    merged_opts = Keyword.merge(@default_options, opts)

    raw_html
    |> Helper.normalize()
    |> ArticleBuilder.build(merged_opts)
  end

  @doc """
  Return the html binary of `html_tree` after cleaning its attributes.

  Attributes are stripped with `Readability.Helper.remove_attrs/2`, driven by
  the `:protect_attrs` regex, and the result is serialized with `raw_html/1`.
  """
  @spec readable_html(html_tree) :: binary
  def readable_html(html_tree) do
    html_tree
    |> Helper.remove_attrs(regexes(:protect_attrs))
    |> raw_html()
  end

  @doc """
  Return only the text binary from `html_tree`.

  A newline is inserted before every closing `p`, `div`, `article` and heading
  tag so that block boundaries survive text extraction; the result is trimmed.
  """
  @spec readable_text(html_tree) :: binary
  def readable_text(html_tree) do
    # TODO: Remove image caption when extract only text
    tags_to_br = ~r/<\/(p|div|article|h\d)/i
    html_str = raw_html(html_tree)

    tags_to_br
    |> Regex.replace(html_str, &"\n#{&1}")
    |> Floki.parse()
    |> Floki.text()
    |> String.trim()
  end

  @doc """
  Return the raw html binary from `html_tree`.

  Serialization is done by `Floki.raw_html/2` with `encode: false`.
  """
  @spec raw_html(html_tree) :: binary
  def raw_html(html_tree) do
    Floki.raw_html(html_tree, encode: false)
  end

  @doc """
  Parse a raw html binary with `Floki.parse/1`.
  """
  @spec parse(binary) :: html_tree
  def parse(raw_html) when is_binary(raw_html), do: Floki.parse(raw_html)

  @doc """
  Return the regular expression registered under `key`, or `nil` when the key
  is unknown.
  """
  @spec regexes(atom) :: Regex.t() | nil
  def regexes(key), do: @regexes[key]

  @doc """
  Return the default options that `article/2` merges caller options into.
  """
  @spec default_options() :: options
  def default_options, do: @default_options
end
initial commit 2016-04-15 11:51:29 +00:00			`defmodule Readability do`
add candidate builder add test 2016-04-17 12:26:51 +00:00			`@moduledoc """`
add doc 2016-04-24 07:14:31 +00:00			`Readability library for extracting & curating articles.`

			`## Example`

			```elixir
			`@type html :: binary`

add summarize function this closes #4, closes #3 2016-05-07 09:23:19 +00:00			`# Just pass url`
			`%Readability.Summary{title: title, authors: authors, article_html: article} = Readability.summarize(url)`

add document 2016-04-24 09:40:35 +00:00			`# Extract title`
add doc 2016-04-24 07:14:31 +00:00			`Readability.title(html)`

add authors extractor doc 2016-04-28 06:19:11 +00:00			`# Extract authors.`
			`Readability.authors(html)`

add document 2016-04-24 09:40:35 +00:00			`# Extract only text from article`
			`article = html`
			`\|> Readability.article`
add doc 2016-04-24 07:14:31 +00:00			`\|> Readability.readable_text`

add document 2016-04-24 09:40:35 +00:00			`# Extract article with transformed html`
			`article = html`
			`\|> Readability.article`
add doc 2016-04-24 07:14:31 +00:00			`\|> Readability.raw_html`
			```
add candidate builder add test 2016-04-17 12:26:51 +00:00			`"""`

initial commit 2016-04-15 11:51:29 +00:00			`alias Readability.TitleFinder`
add authors finder 2016-04-28 06:13:03 +00:00			`alias Readability.AuthorFinder`
add candidate builder add test 2016-04-17 12:26:51 +00:00			`alias Readability.ArticleBuilder`
add summarize function this closes #4, closes #3 2016-05-07 09:23:19 +00:00			`alias Readability.Summary`
add authors finder 2016-04-28 06:13:03 +00:00			`alias Readability.Helper`
add candidate builder add test 2016-04-17 12:26:51 +00:00
Add Elixir 1.6 formatter config file and formatted the codebase 2018-02-09 03:42:08 +00:00			`@default_options [`
			`retry_length: 250,`
			`min_text_length: 25,`
			`remove_unlikely_candidates: true,`
			`weight_classes: true,`
			`clean_conditionally: true,`
			`remove_empty_nodes: true,`
			`min_image_width: 130,`
			`min_image_height: 80,`
			`ignore_image_format: [],`
			`blacklist: nil,`
			`whitelist: nil,`
			`page_url: nil`
			`]`

			`@regexes [`
			`unlikely_candidate:`
			`~r/combx\|comment\|community\|disqus\|extra\|foot\|header\|hidden\|lightbox\|modal\|menu\|meta\|nav\|remark\|rss\|shoutbox\|sidebar\|sponsor\|ad-break\|agegate\|pagination\|pager\|popup/i,`
			`ok_maybe_its_a_candidate: ~r/and\|article\|body\|column\|main\|shadow/i,`
			`positive: ~r/article\|body\|content\|entry\|hentry\|main\|page\|pagination\|post\|text\|blog\|story/i,`
			`negative:`
			`~r/hidden\|^hid\|combx\|comment\|com-\|contact\|foot\|footer\|footnote\|link\|masthead\|media\|meta\|outbrain\|promo\|related\|scroll\|shoutbox\|sidebar\|sponsor\|shopping\|tags\|tool\|utility\|widget/i,`
			`div_to_p_elements: ~r/<(a\|blockquote\|dl\|div\|img\|ol\|p\|pre\|table\|ul)/i,`
			`replace_brs: ~r/(<br[^>]>[ \n\r\t]){2,}/i,`
			`replace_fonts: ~r/<(\/?)font[^>]*>/i,`
			`replace_xml_version: ~r/<\?xml.*\?>/i,`
			`normalize: ~r/\s{2,}/,`
			`video: ~r/\/\/(www\.)?(dailymotion\|youtube\|youtube-nocookie\|player\.vimeo)\.com/i,`
Convert relative img paths into absolute Fixes #27 2018-06-30 00:59:43 +00:00			`protect_attrs: ~r/^(?!id\|rel\|for\|summary\|title\|href\|src\|alt\|srcdoc)/i,`
			`img_tag_src: ~r/(<img.src=['"])([^'"]+)(['"][^>]>)/Ui`
Add Elixir 1.6 formatter config file and formatted the codebase 2018-02-09 03:42:08 +00:00			`]`
initial commit 2016-04-15 11:51:29 +00:00
When we regex-check the MIME header we should also support zero space between the type and the charset, say "text/html;charset=utf-8". 2018-02-09 03:22:17 +00:00			`@markup_mimes ~r/^(application\|text)\/[a-z\-_\.\+]+ml(;\scharset=.)?$/i`
added ability to handle text-based responses added fix for content-type with charset updated function names to match elixir naming conventions (is_ vs ?) minor version bump added default content-type of text/plain when header is missing 2017-04-14 13:11:44 +00:00
initial commit 2016-04-15 11:51:29 +00:00			`@type html_tree :: tuple \| list`
add summarize function this closes #4, closes #3 2016-05-07 09:23:19 +00:00			`@type raw_html :: binary`
			`@type url :: binary`
add candidate builder add test 2016-04-17 12:26:51 +00:00			`@type options :: list`
added ability to handle text-based responses added fix for content-type with charset updated function names to match elixir naming conventions (is_ vs ?) minor version bump added default content-type of text/plain when header is missing 2017-04-14 13:11:44 +00:00			`@type headers :: list[tuple]`
initial commit 2016-04-15 11:51:29 +00:00
added ability to handle text-based responses added fix for content-type with charset updated function names to match elixir naming conventions (is_ vs ?) minor version bump added default content-type of text/plain when header is missing 2017-04-14 13:11:44 +00:00			`@doc """`
			`Extract MIME Type from headers`

			`## Example`

			`iex> mime = Readability.mime(headers_list)`
			`"text/html"`
			`"""`
			`@spec mime(headers) :: String.t()`
			`def mime(headers \\ []) do`
			`headers`
			`\|> Enum.find(`
Add Elixir 1.6 formatter config file and formatted the codebase 2018-02-09 03:42:08 +00:00			`# default`
			`{"Content-Type", "text/plain"},`
			`fn {key, _} -> String.downcase(key) == "content-type" end`
			`)`
added ability to handle text-based responses added fix for content-type with charset updated function names to match elixir naming conventions (is_ vs ?) minor version bump added default content-type of text/plain when header is missing 2017-04-14 13:11:44 +00:00			`\|> elem(1)`
			`end`

			`@doc """`
			`Return true if Content-Type in provided headers list is a markup type,`
			`else false`

			`## Example`

			`iex> Readability.is_response_markup?([{"Content-Type", "text/html"}])`
			`true`
			`"""`
			`@spec is_response_markup(headers) :: boolean`
			`def is_response_markup(headers) do`
			`mime(headers) =~ @markup_mimes`
add summarize function this closes #4, closes #3 2016-05-07 09:23:19 +00:00			`end`

add document 2016-04-24 09:40:35 +00:00			`@doc """`
			`Extract title`

			`## Example`

			`iex> title = Readability.title(html_str)`
			`"Some title in html"`
			`"""`
add authors finder 2016-04-28 06:13:03 +00:00			`@spec title(binary \| html_tree) :: binary`
add summarize function this closes #4, closes #3 2016-05-07 09:23:19 +00:00			`def title(raw_html) when is_binary(raw_html) do`
Add Elixir 1.6 formatter config file and formatted the codebase 2018-02-09 03:42:08 +00:00			`raw_html`
			`\|> Helper.normalize()`
			`\|> title`
add summarize function this closes #4, closes #3 2016-05-07 09:23:19 +00:00			`end`
initial commit 2016-04-15 11:51:29 +00:00
Add Elixir 1.6 formatter config file and formatted the codebase 2018-02-09 03:42:08 +00:00			`def title(html_tree), do: TitleFinder.title(html_tree)`
add authors finder 2016-04-28 06:13:03 +00:00
			`@doc """`
			`Extract authors`

			`## Example`

			`iex> authors = Readability.authors(html_str)`
			`["José Valim", "chrismccord"]`
			`"""`
			`@spec authors(binary \| html_tree) :: list[binary]`
			`def authors(html) when is_binary(html), do: html \|> parse \|> authors`
			`def authors(html_tree), do: AuthorFinder.find(html_tree)`

add candidate builder add test 2016-04-17 12:26:51 +00:00			`@doc """`
			`Using a variety of metrics (content score, classname, element types), find the content that is`
			`most likely to be the stuff a user wants to read`
add document 2016-04-24 09:40:35 +00:00
			`## Example`

			`iex> article_tree = Redability(html_str)`
			`# returns article that is tuple`

add candidate builder add test 2016-04-17 12:26:51 +00:00			`"""`
add document 2016-04-24 09:40:35 +00:00			`@spec article(binary, options) :: html_tree`
			`def article(raw_html, opts \\ []) do`
add candidate builder add test 2016-04-17 12:26:51 +00:00			`opts = Keyword.merge(@default_options, opts)`
Add Elixir 1.6 formatter config file and formatted the codebase 2018-02-09 03:42:08 +00:00
add candidate builder add test 2016-04-17 12:26:51 +00:00			`raw_html`
Add Elixir 1.6 formatter config file and formatted the codebase 2018-02-09 03:42:08 +00:00			`\|> Helper.normalize()`
add candidate builder add test 2016-04-17 12:26:51 +00:00			`\|> ArticleBuilder.build(opts)`
			`end`

			`@doc """`
add authors finder 2016-04-28 06:13:03 +00:00			`return attributes, tags cleaned html`
add candidate builder add test 2016-04-17 12:26:51 +00:00			`"""`
add authors finder 2016-04-28 06:13:03 +00:00			`@spec readable_html(html_tree) :: binary`
			`def readable_html(html_tree) do`
			`html_tree`
fix some bug and update deps 2017-02-05 09:48:26 +00:00			`\|> Helper.remove_attrs(regexes(:protect_attrs))`
add authors finder 2016-04-28 06:13:03 +00:00			`\|> raw_html`
add candidate builder add test 2016-04-17 12:26:51 +00:00			`end`

			`@doc """`
add document 2016-04-24 09:40:35 +00:00			`return only text binary from html_tree`
add candidate builder add test 2016-04-17 12:26:51 +00:00			`"""`
add authors finder 2016-04-28 06:13:03 +00:00			`@spec readable_text(html_tree) :: binary`
add doc 2016-04-24 07:14:31 +00:00			`def readable_text(html_tree) do`
add test 2016-04-24 05:32:43 +00:00			`# TODO: Remove image caption when extract only text`
add candidate builder add test 2016-04-17 12:26:51 +00:00			`tags_to_br = ~r/<\/(p\|div\|article\|h\d)/i`
			`html_str = html_tree \|> raw_html`
Add Elixir 1.6 formatter config file and formatted the codebase 2018-02-09 03:42:08 +00:00
			`Regex.replace(tags_to_br, html_str, &"\n#{&1}")`
			`\|> Floki.parse()`
			`\|> Floki.text()`
update deps and deprecated 2018-07-24 09:13:08 +00:00			`\|> String.trim()`
add candidate builder add test 2016-04-17 12:26:51 +00:00			`end`

add authors finder 2016-04-28 06:13:03 +00:00			`@doc """`
			`return raw html binary from html_tree`
			`"""`
			`@spec raw_html(html_tree) :: binary`
			`def raw_html(html_tree) do`
Made tests pass, Floki updated to allow encoding of special characters of entities, update readability to disable this. 2018-07-18 15:00:47 +00:00			`html_tree \|> Floki.raw_html(encode: false)`
add authors finder 2016-04-28 06:13:03 +00:00			`end`

			`def parse(raw_html) when is_binary(raw_html), do: Floki.parse(raw_html)`

fix some bug and update deps 2017-02-05 09:48:26 +00:00			`def regexes(key), do: @regexes[key]`
add candidate builder add test 2016-04-17 12:26:51 +00:00
			`def default_options, do: @default_options`
initial commit 2016-04-15 11:51:29 +00:00			`end`