parent
4f2449558d
commit
ebc8c90e71
|
@ -62,7 +62,8 @@ defmodule Readability do
|
||||||
replace_xml_version: ~r/<\?xml.*\?>/i,
|
replace_xml_version: ~r/<\?xml.*\?>/i,
|
||||||
normalize: ~r/\s{2,}/,
|
normalize: ~r/\s{2,}/,
|
||||||
video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
|
video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
|
||||||
protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i
|
protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i,
|
||||||
|
img_tag_src: ~r/(<img.*src=['"])([^'"]+)(['"][^>]*>)/Ui
|
||||||
]
|
]
|
||||||
|
|
||||||
@markup_mimes ~r/^(application|text)\/[a-z\-_\.\+]+ml(;\s*charset=.*)?$/i
|
@markup_mimes ~r/^(application|text)\/[a-z\-_\.\+]+ml(;\s*charset=.*)?$/i
|
||||||
|
@ -84,7 +85,9 @@ defmodule Readability do
|
||||||
|
|
||||||
case is_response_markup(headers) do
|
case is_response_markup(headers) do
|
||||||
true ->
|
true ->
|
||||||
html_tree = Helper.normalize(raw)
|
html_tree =
|
||||||
|
raw
|
||||||
|
|> Helper.normalize(url: url)
|
||||||
|
|
||||||
article_tree =
|
article_tree =
|
||||||
html_tree
|
html_tree
|
||||||
|
|
|
@ -101,14 +101,54 @@ defmodule Readability.Helper do
|
||||||
@doc """
|
@doc """
|
||||||
Normalize and Parse to html tree(tuple or list)) from binary html
|
Normalize and Parse to html tree(tuple or list)) from binary html
|
||||||
"""
|
"""
|
||||||
@spec normalize(binary) :: html_tree
|
@spec normalize(binary, list) :: html_tree
|
||||||
def normalize(raw_html) do
|
def normalize(raw_html, opts \\ []) do
|
||||||
raw_html
|
raw_html
|
||||||
|> String.replace(Readability.regexes(:replace_xml_version), "")
|
|> String.replace(Readability.regexes(:replace_xml_version), "")
|
||||||
|> String.replace(Readability.regexes(:replace_brs), "</p><p>")
|
|> String.replace(Readability.regexes(:replace_brs), "</p><p>")
|
||||||
|> String.replace(Readability.regexes(:replace_fonts), "<\1span>")
|
|> String.replace(Readability.regexes(:replace_fonts), "<\1span>")
|
||||||
|> String.replace(Readability.regexes(:normalize), " ")
|
|> String.replace(Readability.regexes(:normalize), " ")
|
||||||
|
|> transform_img_paths(opts[:url])
|
||||||
|> Floki.parse()
|
|> Floki.parse()
|
||||||
|> Floki.filter_out(:comment)
|
|> Floki.filter_out(:comment)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Rewrites every `img` tag matched by the `:img_tag_src` regex through
# `build_img_path/5`, so that relative image paths can be made absolute.
# Without a page url (`nil`) there is nothing to resolve against, and the
# html string is handed back untouched.
defp transform_img_paths(html_str, url) when is_nil(url), do: html_str

defp transform_img_paths(html_str, url) do
  img_regex = Readability.regexes(:img_tag_src)

  # The callback receives the whole match followed by the three capture
  # groups of the regex: text before the src value, the src value itself,
  # and the rest of the tag.
  Regex.replace(img_regex, html_str, fn whole, before_src, src, after_src ->
    build_img_path(url, whole, before_src, src, after_src)
  end)
end
|
||||||
|
|
||||||
|
# Rebuilds a single matched `img` tag, making its `src` absolute if needed.
#
#   * `url`      - url of the page the html came from
#   * `_str`     - the whole regex match (unused)
#   * `pre_src`  - tag text up to and including the opening quote of `src`
#   * `src`      - the raw `src` value
#   * `post_src` - closing quote and the remainder of the tag
#
# Only references with neither a scheme nor a host are treated as relative
# and prefixed with the base url of the page. Matching on the host alone is
# not enough: `data:` URIs (inline images) parse with a scheme but a `nil`
# host, and prefixing them would corrupt the image.
defp build_img_path(url, _str, pre_src, src, post_src) do
  new_src =
    case URI.parse(src) do
      %URI{scheme: nil, host: nil} ->
        # Drop leading slashes so the join never yields a double slash.
        base_url(url) <> "/" <> String.trim_leading(src, "/")

      _ ->
        src
    end

  pre_src <> new_src <> post_src
end
|
||||||
|
|
||||||
|
# Get the base url of a given url, including its scheme.
# E.g: both http://elixir-lang.org/guides and elixir-lang.org/guides
# would return http://elixir-lang.org
#
# The scheme is taken from an explicit `scheme://` prefix and defaults to
# "http" when there is none. It is deliberately not read from `URI.parse/1`:
# for a scheme-less `host:port/path` input, `URI.parse/1` reports the host
# as the scheme, which would produce a nonsensical base url.
defp base_url(url) do
  {scheme, rest} =
    case Regex.run(~r/\A([a-z][a-z0-9+.\-]*):\/\//i, url) do
      [prefix, scheme] ->
        {String.downcase(scheme), String.replace_prefix(url, prefix, "")}

      nil ->
        {"http", url}
    end

  # The authority ends at the first path, query or fragment delimiter.
  [authority | _] = String.split(rest, ["/", "?", "#"], parts: 2)

  scheme <> "://" <> authority
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -11,10 +11,12 @@ defmodule Readability.HelperTest do
|
||||||
<font>a</fond>
|
<font>a</fond>
|
||||||
<p>
|
<p>
|
||||||
<font>abc</font>
|
<font>abc</font>
|
||||||
|
<img src="https://example.org/images/foo.png">
|
||||||
</p>
|
</p>
|
||||||
</p>
|
</p>
|
||||||
<p>
|
<p>
|
||||||
<font>b</font>
|
<font>b</font>
|
||||||
|
<img class="img" src="/images/bar.png" alt="alt" />
|
||||||
</p>
|
</p>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
@ -43,8 +45,30 @@ defmodule Readability.HelperTest do
|
||||||
assert result == expected
|
assert result == expected
|
||||||
end
|
end
|
||||||
|
|
||||||
test "inner text lengt", %{html_tree: html_tree} do
|
test "inner text length", %{html_tree: html_tree} do
|
||||||
result = html_tree |> Helper.text_length()
|
result = html_tree |> Helper.text_length()
|
||||||
assert result == 5
|
assert result == 5
|
||||||
end
|
end
|
||||||
|
|
||||||
|
test "transform img relative paths into absolute" do
  # Normalizes the shared sample against a page url and renders it back to
  # a string so the resulting `src` values can be matched.
  render = fn page_url ->
    @sample
    |> Helper.normalize(url: page_url)
    |> Floki.raw_html()
  end

  without_scheme = render.("example.org/blog/a-blog-post")
  with_scheme = render.("https://example.org/blog/a-blog-post")

  # Already absolute in the sample, so it must come through as-is.
  absolute_foo = "https://example.org/images/foo.png"

  assert without_scheme =~ absolute_foo
  assert without_scheme =~ "http://example.org/images/bar.png"

  assert with_scheme =~ absolute_foo
  assert with_scheme =~ "https://example.org/images/bar.png"
end
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue