parent
4f2449558d
commit
ebc8c90e71
|
@ -62,7 +62,8 @@ defmodule Readability do
|
|||
replace_xml_version: ~r/<\?xml.*\?>/i,
|
||||
normalize: ~r/\s{2,}/,
|
||||
video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
|
||||
protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i
|
||||
protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i,
|
||||
img_tag_src: ~r/(<img.*src=['"])([^'"]+)(['"][^>]*>)/Ui
|
||||
]
|
||||
|
||||
@markup_mimes ~r/^(application|text)\/[a-z\-_\.\+]+ml(;\s*charset=.*)?$/i
|
||||
|
@ -84,7 +85,9 @@ defmodule Readability do
|
|||
|
||||
case is_response_markup(headers) do
|
||||
true ->
|
||||
html_tree = Helper.normalize(raw)
|
||||
html_tree =
|
||||
raw
|
||||
|> Helper.normalize(url: url)
|
||||
|
||||
article_tree =
|
||||
html_tree
|
||||
|
|
|
@ -101,14 +101,54 @@ defmodule Readability.Helper do
|
|||
@doc """
Normalizes raw HTML and parses it into an html tree (a tuple or a list).

The binary is cleaned with the `Readability.regexes/1` patterns
(`:replace_xml_version`, `:replace_brs`, `:replace_fonts`, `:normalize`),
then parsed with `Floki.parse/1`, and comment nodes are filtered out.

## Options

  * `:url` - handed to `transform_img_paths/2` so that relative `img`
    sources can be made absolute. When it is absent (`nil`) the image
    paths are left untouched.
"""
@spec normalize(binary, list) :: html_tree
def normalize(raw_html, opts \\ []) do
  raw_html
  |> String.replace(Readability.regexes(:replace_xml_version), "")
  |> String.replace(Readability.regexes(:replace_brs), "</p><p>")
  # `\\1` is a back-reference to the first capture group of `:replace_fonts`.
  # With a single backslash the string literal is just "<1span>", which
  # corrupts the tag instead of renaming it.
  |> String.replace(Readability.regexes(:replace_fonts), "<\\1span>")
  |> String.replace(Readability.regexes(:normalize), " ")
  |> transform_img_paths(opts[:url])
  |> Floki.parse()
  |> Floki.filter_out(:comment)
end
|
||||
|
||||
# Rewrites the `src` of every `img` tag matched by the `:img_tag_src` regex,
# delegating each match to `build_img_path/5`. Without a page url (`nil`)
# there is nothing to resolve against, so the markup is returned unchanged.
defp transform_img_paths(html_str, url) do
  case url do
    nil ->
      html_str

    page_url ->
      Regex.replace(
        Readability.regexes(:img_tag_src),
        html_str,
        fn whole_match, before_src, src, after_src ->
          build_img_path(page_url, whole_match, before_src, src, after_src)
        end
      )
  end
end
|
||||
|
||||
# Builds the replacement text for a single `img` tag match.
#
# `pre_src` and `post_src` are the pieces of markup around the `src` value
# and are passed through untouched; the whole-match argument is ignored.
#
# Only a `src` that has neither a scheme nor a host is rewritten: it is
# appended to the base of `url`, with leading slashes dropped so exactly one
# "/" joins the two parts. A `src` that carries a scheme (e.g. `data:`) or a
# host is kept verbatim. Checking the host alone is not enough, because
# scheme-only sources such as inline `data:` images have no host either and
# would be turned into broken links.
defp build_img_path(url, _str, pre_src, src, post_src) do
  new_src =
    case URI.parse(src) do
      %URI{scheme: nil, host: nil} ->
        base_url(url) <> "/" <> String.trim_leading(src, "/")

      _ ->
        src
    end

  pre_src <> new_src <> post_src
end
|
||||
|
||||
# Get the base url of a given url, including its scheme.
# E.g: both http://elixir-lang.org/guides and elixir-lang.org/guides
# would return http://elixir-lang.org
#
# A url without a scheme is treated as "http". The url is first made
# absolute and only then parsed, so that a scheme-less "host:port/path" is
# not mistaken for "scheme:path". Path, query and fragment are dropped;
# scheme, userinfo, host and port are kept as `URI.to_string/1` renders them.
defp base_url(url) do
  absolute =
    cond do
      # Already has an explicit "scheme://" prefix.
      Regex.match?(~r/^[a-z][a-z0-9+.\-]*:\/\//i, url) -> url
      # Protocol-relative url: only the scheme is missing.
      String.starts_with?(url, "//") -> "http:" <> url
      true -> "http://" <> url
    end

  parsed = URI.parse(absolute)

  URI.to_string(%URI{parsed | path: nil, query: nil, fragment: nil})
end
|
||||
end
|
||||
|
|
|
@ -11,10 +11,12 @@ defmodule Readability.HelperTest do
|
|||
<font>a</fond>
|
||||
<p>
|
||||
<font>abc</font>
|
||||
<img src="https://example.org/images/foo.png">
|
||||
</p>
|
||||
</p>
|
||||
<p>
|
||||
<font>b</font>
|
||||
<img class="img" src="/images/bar.png" alt="alt" />
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
||||
|
@ -43,8 +45,30 @@ defmodule Readability.HelperTest do
|
|||
assert result == expected
|
||||
end
|
||||
|
||||
# `Helper.text_length/1` on the html tree from the test context must be 5.
test "inner text length", %{html_tree: html_tree} do
  assert Helper.text_length(html_tree) == 5
end
|
||||
|
||||
# Normalizes `@sample` against a page url with and without a scheme and
# checks the image urls that end up in the rendered html.
test "transform img relative paths into absolute" do
  render = fn page_url ->
    @sample
    |> Helper.normalize(url: page_url)
    |> Floki.raw_html()
  end

  html_without_scheme = render.("example.org/blog/a-blog-post")
  html_with_scheme = render.("https://example.org/blog/a-blog-post")

  # A page url without a scheme is expected to produce "http" image urls.
  assert html_without_scheme =~ "https://example.org/images/foo.png"
  assert html_without_scheme =~ "http://example.org/images/bar.png"

  # A page url with a scheme is expected to pass that scheme on.
  assert html_with_scheme =~ "https://example.org/images/foo.png"
  assert html_with_scheme =~ "https://example.org/images/bar.png"
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue