Strip out atom tags

Standard tags are returned by Mochiweb as binaries. The atom tags are for special-case parsing (such as PHP includes). Since that's not going to be part of the article, simply exclude those while normalizing. Fixes #30. See also: Mochiweb parser: 9608d786ef/src/mochiweb_html.erl (L345)
2018-10-10 21:10:29 -04:00 · 2018-10-10 21:10:29 -04:00 · b35746bfed
commit b35746bfed
parent c2dbdf14e8
2 changed files with 10 additions and 0 deletions
--- a/lib/readability/helper.ex
+++ b/lib/readability/helper.ex
@ -111,6 +111,7 @@ defmodule Readability.Helper do
    |> transform_img_paths(opts[:url])
    |> Floki.parse()
    |> Floki.filter_out(:comment)
+    |> remove_tag(fn {tag, _, _} -> is_atom(tag) end)
  end

  # Turn relative `img` tag paths into absolute if possible
--- a/test/readability/helper_test.exs
+++ b/test/readability/helper_test.exs
@ -50,6 +50,15 @@ defmodule Readability.HelperTest do
    assert result == 5
  end

+  test "strips out special case tags" do
+    expected_html =
+      "<html><body><p>Hello <? echo esc_html( wired_get_the_byline_name( $related_video ) ); ?></p></body></html>"
+      |> Helper.normalize()
+      |> Floki.raw_html()
+
+    assert expected_html == "<html><body><p>Hello </p></body></html>"
+  end
+
  test "transform img relative paths into absolute" do
    foo_url = "https://example.org/images/foo.png"
    bar_url_http = "http://example.org/images/bar.png"