From b35746bfed977d9a260976f704b8f073894ae9aa Mon Sep 17 00:00:00 2001 From: Ben Olive Date: Wed, 10 Oct 2018 21:10:29 -0400 Subject: [PATCH] Strip out atom tags Standard tags are returned by Mochiweb as binaries. The atom tags are for special case parsing (such as php includes). Since that's not going to be part of the article, simply exclude those while normalizing. Fixes #30 See also: Mochiweb parser: https://github.com/mochi/mochiweb/blob/9608d786efe474b862d3399d99c200bd36fc8942/src/mochiweb_html.erl#L345 --- lib/readability/helper.ex | 1 + test/readability/helper_test.exs | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/lib/readability/helper.ex b/lib/readability/helper.ex index 9861143..7562dbc 100644 --- a/lib/readability/helper.ex +++ b/lib/readability/helper.ex @@ -111,6 +111,7 @@ defmodule Readability.Helper do |> transform_img_paths(opts[:url]) |> Floki.parse() |> Floki.filter_out(:comment) + |> remove_tag(fn {tag, _, _} -> is_atom(tag) end) end # Turn relative `img` tag paths into absolute if possible diff --git a/test/readability/helper_test.exs b/test/readability/helper_test.exs index a0a60b2..dd85b59 100644 --- a/test/readability/helper_test.exs +++ b/test/readability/helper_test.exs @@ -50,6 +50,15 @@ defmodule Readability.HelperTest do assert result == 5 end + test "strips out special case tags" do + expected_html = + "

Hello

" + |> Helper.normalize() + |> Floki.raw_html() + + assert expected_html == "

Hello

" + end + test "transform img relative paths into absolute" do foo_url = "https://example.org/images/foo.png" bar_url_http = "http://example.org/images/bar.png"