Strip out atom tags
Standard tags are returned by Mochiweb as binaries. The atom tags are
for special case parsing (such as PHP includes). Since that's not going
to be part of the article, simply exclude those while normalizing.
Fixes #30
See also:
Mochiweb parser: 9608d786ef/src/mochiweb_html.erl (L345)
This commit is contained in:
parent
c2dbdf14e8
commit
b35746bfed
|
@ -111,6 +111,7 @@ defmodule Readability.Helper do
|
||||||
|> transform_img_paths(opts[:url])
|
|> transform_img_paths(opts[:url])
|
||||||
|> Floki.parse()
|
|> Floki.parse()
|
||||||
|> Floki.filter_out(:comment)
|
|> Floki.filter_out(:comment)
|
||||||
|
|> remove_tag(fn {tag, _, _} -> is_atom(tag) end)
|
||||||
end
|
end
|
||||||
|
|
||||||
# Turn relative `img` tag paths into absolute if possible
|
# Turn relative `img` tag paths into absolute if possible
|
||||||
|
|
|
@ -50,6 +50,15 @@ defmodule Readability.HelperTest do
|
||||||
assert result == 5
|
assert result == 5
|
||||||
end
|
end
|
||||||
|
|
||||||
|
test "strips out special case tags" do
|
||||||
|
expected_html =
|
||||||
|
"<html><body><p>Hello <? echo esc_html( wired_get_the_byline_name( $related_video ) ); ?></p></body></html>"
|
||||||
|
|> Helper.normalize()
|
||||||
|
|> Floki.raw_html()
|
||||||
|
|
||||||
|
assert expected_html == "<html><body><p>Hello </p></body></html>"
|
||||||
|
end
|
||||||
|
|
||||||
test "transform img relative paths into absolute" do
|
test "transform img relative paths into absolute" do
|
||||||
foo_url = "https://example.org/images/foo.png"
|
foo_url = "https://example.org/images/foo.png"
|
||||||
bar_url_http = "http://example.org/images/bar.png"
|
bar_url_http = "http://example.org/images/bar.png"
|
||||||
|
|
Loading…
Reference in New Issue