diff --git a/lib/readability.ex b/lib/readability.ex index 94e1716..6c869fe 100644 --- a/lib/readability.ex +++ b/lib/readability.ex @@ -62,7 +62,8 @@ defmodule Readability do replace_xml_version: ~r/<\?xml.*\?>/i, normalize: ~r/\s{2,}/, video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i, - protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i + protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i, + img_tag_src: ~r/(]*>)/Ui ] @markup_mimes ~r/^(application|text)\/[a-z\-_\.\+]+ml(;\s*charset=.*)?$/i @@ -84,7 +85,9 @@ defmodule Readability do case is_response_markup(headers) do true -> - html_tree = Helper.normalize(raw) + html_tree = + raw + |> Helper.normalize(url: url) article_tree = html_tree diff --git a/lib/readability/helper.ex b/lib/readability/helper.ex index afce5dd..c82880c 100644 --- a/lib/readability/helper.ex +++ b/lib/readability/helper.ex @@ -101,14 +101,54 @@ defmodule Readability.Helper do @doc """ Normalize and Parse to html tree(tuple or list)) from binary html """ - @spec normalize(binary) :: html_tree - def normalize(raw_html) do + @spec normalize(binary, list) :: html_tree + def normalize(raw_html, opts \\ []) do raw_html |> String.replace(Readability.regexes(:replace_xml_version), "") |> String.replace(Readability.regexes(:replace_brs), "

") |> String.replace(Readability.regexes(:replace_fonts), "<\1span>") |> String.replace(Readability.regexes(:normalize), " ") + |> transform_img_paths(opts[:url]) |> Floki.parse() |> Floki.filter_out(:comment) end + + # Turn relative `img` tag paths into absolute if possible + defp transform_img_paths(html_str, nil), do: html_str + + defp transform_img_paths(html_str, url) do + Readability.regexes(:img_tag_src) + |> Regex.replace(html_str, &build_img_path(url, &1, &2, &3, &4)) + end + + defp build_img_path(url, _str, pre_src, src, post_src) do + new_src = + case URI.parse(src) do + %URI{host: nil} -> + base_url = base_url(url) + scrubbed_src = String.trim_leading(src, "/") + + base_url <> "/" <> scrubbed_src + + _ -> + src + end + + pre_src <> new_src <> post_src + end + + # Get the base url of a given url, including its scheme. + # E.g: both http://elixir-lang.org/guides and elixir-lang.org/guides + # would return http://elixir-lang.org + defp base_url(url) do + scheme_regex = ~r/^(https?:\/\/)?(.*)/i + path_regex = ~r/^([^\/]+)(.*)/i + + url_without_scheme = Regex.replace(scheme_regex, url, "\\2") + base_url = Regex.replace(path_regex, url_without_scheme, "\\1") + + scheme = URI.parse(url).scheme || "http" + + scheme <> "://" <> base_url + end end diff --git a/test/readability/helper_test.exs b/test/readability/helper_test.exs index 7eef33b..a0a60b2 100644 --- a/test/readability/helper_test.exs +++ b/test/readability/helper_test.exs @@ -11,10 +11,12 @@ defmodule Readability.HelperTest do a

abc +

b + alt

@@ -43,8 +45,30 @@ defmodule Readability.HelperTest do assert result == expected end - test "inner text lengt", %{html_tree: html_tree} do + test "inner text length", %{html_tree: html_tree} do result = html_tree |> Helper.text_length() assert result == 5 end + + test "transform img relative paths into absolute" do + foo_url = "https://example.org/images/foo.png" + bar_url_http = "http://example.org/images/bar.png" + bar_url_https = "https://example.org/images/bar.png" + + result_without_scheme = + @sample + |> Helper.normalize(url: "example.org/blog/a-blog-post") + |> Floki.raw_html() + + result_with_scheme = + @sample + |> Helper.normalize(url: "https://example.org/blog/a-blog-post") + |> Floki.raw_html() + + assert result_without_scheme =~ foo_url + assert result_without_scheme =~ bar_url_http + + assert result_with_scheme =~ foo_url + assert result_with_scheme =~ bar_url_https + end end