Merge pull request #37 from fribmendes/frm/img-tags

Convert relative img paths into absolute
This commit is contained in:
Jaehyun Shin 2018-07-24 18:00:43 +09:00 committed by GitHub
commit 133044f50c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 72 additions and 5 deletions

View File

@ -62,7 +62,8 @@ defmodule Readability do
replace_xml_version: ~r/<\?xml.*\?>/i, replace_xml_version: ~r/<\?xml.*\?>/i,
normalize: ~r/\s{2,}/, normalize: ~r/\s{2,}/,
video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i, video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i,
img_tag_src: ~r/(<img.*src=['"])([^'"]+)(['"][^>]*>)/Ui
] ]
@markup_mimes ~r/^(application|text)\/[a-z\-_\.\+]+ml(;\s*charset=.*)?$/i @markup_mimes ~r/^(application|text)\/[a-z\-_\.\+]+ml(;\s*charset=.*)?$/i
@ -84,7 +85,9 @@ defmodule Readability do
case is_response_markup(headers) do case is_response_markup(headers) do
true -> true ->
html_tree = Helper.normalize(raw) html_tree =
raw
|> Helper.normalize(url: url)
article_tree = article_tree =
html_tree html_tree

View File

@ -101,14 +101,54 @@ defmodule Readability.Helper do
@doc """ @doc """
Normalize and Parse to html tree(tuple or list)) from binary html Normalize and Parse to html tree(tuple or list)) from binary html
""" """
@spec normalize(binary) :: html_tree @spec normalize(binary, list) :: html_tree
def normalize(raw_html) do def normalize(raw_html, opts \\ []) do
raw_html raw_html
|> String.replace(Readability.regexes(:replace_xml_version), "") |> String.replace(Readability.regexes(:replace_xml_version), "")
|> String.replace(Readability.regexes(:replace_brs), "</p><p>") |> String.replace(Readability.regexes(:replace_brs), "</p><p>")
|> String.replace(Readability.regexes(:replace_fonts), "<\1span>") |> String.replace(Readability.regexes(:replace_fonts), "<\1span>")
|> String.replace(Readability.regexes(:normalize), " ") |> String.replace(Readability.regexes(:normalize), " ")
|> transform_img_paths(opts[:url])
|> Floki.parse() |> Floki.parse()
|> Floki.filter_out(:comment) |> Floki.filter_out(:comment)
end end
# Rewrites the `src` of every `<img>` tag matched by the `:img_tag_src` regex
# by handing the match and its three captures to `build_img_path/5`.
# When no page url is given there is nothing to resolve against, so the
# markup is returned untouched.
defp transform_img_paths(html_str, url) when is_nil(url), do: html_str

defp transform_img_paths(html_str, url) do
  img_regex = Readability.regexes(:img_tag_src)

  Regex.replace(img_regex, html_str, fn whole_tag, before_src, src, after_src ->
    build_img_path(url, whole_tag, before_src, src, after_src)
  end)
end
# Rebuilds a single matched `<img>` tag with an absolute `src`.
#
#   url      - url of the page the markup came from
#   _str     - the whole matched tag (unused; the tag is reassembled from parts)
#   pre_src  - everything in the tag up to and including the opening quote
#   src      - the current value of the `src` attribute
#   post_src - the closing quote and the remainder of the tag
#
# Returns the tag as a binary.
defp build_img_path(url, _str, pre_src, src, post_src) do
  new_src =
    case URI.parse(src) do
      # Neither scheme nor host: a site-local path, so anchor it at the
      # site root of the page url.
      # NOTE(review): document-relative paths ("img/a.png") are anchored at
      # the site root as well, not at the page's directory.
      %URI{scheme: nil, host: nil} ->
        base_url(url) <> "/" <> String.trim_leading(src, "/")

      # A src with a host (absolute or protocol-relative) is already
      # resolvable. A src with a scheme but no host (`data:`, `blob:`, ...)
      # is self-contained and must not be prefixed, or inline images break.
      _ ->
        src
    end

  pre_src <> new_src <> post_src
end
# Get the base url (scheme plus authority) of a given url.
# E.g: both http://elixir-lang.org/guides and elixir-lang.org/guides
# would return http://elixir-lang.org
#
# The scheme is taken only from an explicit `scheme://` prefix and defaults
# to "http". URI.parse/1 is deliberately not used to detect it: for a
# scheme-less "localhost:4000/page" it reports "localhost" as the scheme.
defp base_url(url) do
  {scheme, rest} =
    case Regex.run(~r/\A([a-z][a-z0-9+.\-]*):\/\/(.*)\z/is, url) do
      [_match, scheme, rest] -> {String.downcase(scheme), rest}
      nil -> {"http", url}
    end

  # The authority ends at the first path, query or fragment delimiter, so
  # "example.org?x=1" does not leak its query into the base.
  [authority | _] = String.split(rest, ["/", "?", "#"], parts: 2)

  scheme <> "://" <> authority
end
end end

View File

@ -11,10 +11,12 @@ defmodule Readability.HelperTest do
<font>a</fond> <font>a</fond>
<p> <p>
<font>abc</font> <font>abc</font>
<img src="https://example.org/images/foo.png">
</p> </p>
</p> </p>
<p> <p>
<font>b</font> <font>b</font>
<img class="img" src="/images/bar.png" alt="alt" />
</p> </p>
</body> </body>
</html> </html>
@ -43,8 +45,30 @@ defmodule Readability.HelperTest do
assert result == expected assert result == expected
end end
test "inner text lengt", %{html_tree: html_tree} do test "inner text length", %{html_tree: html_tree} do
result = html_tree |> Helper.text_length() result = html_tree |> Helper.text_length()
assert result == 5 assert result == 5
end end
test "transform img relative paths into absolute" do
  # Normalizes the shared sample against the given page url and renders the
  # resulting tree back to an html string.
  render = fn page_url ->
    @sample
    |> Helper.normalize(url: page_url)
    |> Floki.raw_html()
  end

  html_from_bare_url = render.("example.org/blog/a-blog-post")
  html_from_https_url = render.("https://example.org/blog/a-blog-post")

  # The sample's first image is already absolute; the second is "/images/bar.png".
  absolute_img = "https://example.org/images/foo.png"

  assert html_from_bare_url =~ absolute_img
  assert html_from_bare_url =~ "http://example.org/images/bar.png"

  assert html_from_https_url =~ absolute_img
  assert html_from_https_url =~ "https://example.org/images/bar.png"
end
end end