defmodule Readability.Helper do
  @moduledoc """
  Helpers for parsing, updating, removing html tree.
  """

  # An html node is a `{tag, attrs, children}` tuple; a tree may also be a
  # bare list of nodes or a text binary.
  @type html_tree :: tuple | list

  @doc """
  Change existing tags by selector.

  Walks the tree and replaces the tag name of every node whose tag equals
  `selector` with `tag`, recursing into children. Text nodes pass through.
  """
  @spec change_tag(html_tree, String.t(), String.t()) :: html_tree
  def change_tag(content, _, _) when is_binary(content), do: content
  def change_tag([], _, _), do: []

  def change_tag([h | t], selector, tag) do
    [change_tag(h, selector, tag) | change_tag(t, selector, tag)]
  end

  # Matching node: the first and second parameters bind the same `tag_name`,
  # so this clause only fires when the node's tag equals the selector.
  def change_tag({tag_name, attrs, inner_tree}, tag_name, tag) do
    {tag, attrs, change_tag(inner_tree, tag_name, tag)}
  end

  def change_tag({tag_name, attrs, html_tree}, selector, tag) do
    {tag_name, attrs, change_tag(html_tree, selector, tag)}
  end

  @doc """
  Remove html attributes.

  `target_attr` may be an exact attribute name, a list of names, or a regex
  matched against each attribute name. Matching attributes are stripped from
  every node in the tree.
  """
  @spec remove_attrs(html_tree, String.t() | [String.t()] | Regex.t()) :: html_tree
  def remove_attrs(content, _) when is_binary(content), do: content
  def remove_attrs([], _), do: []

  def remove_attrs([h | t], t_attrs) do
    [remove_attrs(h, t_attrs) | remove_attrs(t, t_attrs)]
  end

  def remove_attrs({tag_name, attrs, inner_tree}, target_attr) do
    # Pick the rejection predicate from the shape of `target_attr`.
    reject_fun =
      cond do
        is_binary(target_attr) ->
          fn attr -> elem(attr, 0) == target_attr end

        # `match?/2` on the struct replaces the deprecated `Regex.regex?/1`;
        # behavior is identical.
        match?(%Regex{}, target_attr) ->
          fn attr -> elem(attr, 0) =~ target_attr end

        is_list(target_attr) ->
          fn attr -> Enum.member?(target_attr, elem(attr, 0)) end

        true ->
          fn attr -> attr end
      end

    {tag_name, Enum.reject(attrs, reject_fun), remove_attrs(inner_tree, target_attr)}
  end

  @doc """
  Remove tags.

  `fun` receives each `{tag, attrs, children}` node and returns true when the
  node (with its whole subtree) should be dropped.
  """
  @spec remove_tag(html_tree, fun) :: html_tree
  def remove_tag(content, _) when is_binary(content), do: content
  def remove_tag([], _), do: []

  def remove_tag([h | t], fun) do
    node = remove_tag(h, fun)

    # A removed node comes back as `[]`; drop it from the rebuilt list.
    if node == [] do
      remove_tag(t, fun)
    else
      [node | remove_tag(t, fun)]
    end
  end

  def remove_tag({tag, attrs, inner_tree} = html_tree, fun) do
    if fun.(html_tree) do
      []
    else
      {tag, attrs, remove_tag(inner_tree, fun)}
    end
  end

  @doc """
  Count only text length.
  """
  @spec text_length(html_tree) :: number
  def text_length(html_tree) do
    html_tree |> Floki.text() |> String.trim() |> String.length()
  end

  @doc """
  Check html_tree can be candidate or not.

  A node is a candidate when it is a `p` or `td` tag whose trimmed text is at
  least the configured `:min_text_length`.
  """
  @spec candidate_tag?(html_tree) :: boolean
  def candidate_tag?({tag, _, _} = html_tree) do
    Enum.any?(["p", "td"], fn candidate_tag ->
      tag == candidate_tag &&
        text_length(html_tree) >= Readability.default_options()[:min_text_length]
    end)
  end

  @doc """
  Normalize and Parse to html tree (tuple or list) from binary html.

  Pass `url: base_url` in `opts` to rewrite relative `img` paths to absolute.
  """
  @spec normalize(binary, list) :: html_tree
  def normalize(raw_html, opts \\ []) do
    raw_html
    |> String.replace(Readability.regexes(:replace_xml_version), "")
    |> String.replace(Readability.regexes(:replace_brs), "</p><p>")
    # `\\1` keeps the captured `/` so both `<font>` and `</font>` become
    # span tags. (The corrupted source had `\1`, i.e. a literal 0x01 byte.)
    |> String.replace(Readability.regexes(:replace_fonts), "<\\1span>")
    |> String.replace(Readability.regexes(:normalize), " ")
    |> transform_img_paths(opts[:url])
    |> Floki.parse()
    |> Floki.filter_out(:comment)
    |> remove_tag(fn {tag, _, _} -> is_atom(tag) end)
  end

  # Turn relative `img` tag paths into absolute if possible.
  defp transform_img_paths(html_str, nil), do: html_str

  defp transform_img_paths(html_str, url) do
    Readability.regexes(:img_tag_src)
    |> Regex.replace(html_str, &build_img_path(url, &1, &2, &3, &4))
  end

  # Rewrite one `src` value: relative paths (no host) are prefixed with the
  # page's base url; absolute urls pass through untouched.
  defp build_img_path(url, _str, pre_src, src, post_src) do
    new_src =
      case URI.parse(src) do
        %URI{host: nil} ->
          base_url = base_url(url)
          scrubbed_src = String.trim_leading(src, "/")

          base_url <> "/" <> scrubbed_src

        _ ->
          src
      end

    pre_src <> new_src <> post_src
  end

  # Get the base url of a given url, including its scheme.
  # E.g: both http://elixir-lang.org/guides and elixir-lang.org/guides
  # would return http://elixir-lang.org
  defp base_url(url) do
    scheme_regex = ~r/^(https?:\/\/)?(.*)/i
    path_regex = ~r/^([^\/]+)(.*)/i

    url_without_scheme = Regex.replace(scheme_regex, url, "\\2")
    base_url = Regex.replace(path_regex, url_without_scheme, "\\1")

    # Default to http when the input url carries no scheme.
    scheme = URI.parse(url).scheme || "http"

    scheme <> "://" <> base_url
  end
end