From b35746bfed977d9a260976f704b8f073894ae9aa Mon Sep 17 00:00:00 2001
From: Ben Olive <ben.olive@salesloft.com>
Date: Wed, 10 Oct 2018 21:10:29 -0400
Subject: [PATCH] Strip out atom tags

Standard tags are returned by Mochiweb as binaries. The atom tags are
for special case parsing (such as PHP includes). Since that's not going
to be part of the article, simply exclude those while normalizing.

Fixes #30

See also:

Mochiweb parser: https://github.com/mochi/mochiweb/blob/9608d786efe474b862d3399d99c200bd36fc8942/src/mochiweb_html.erl#L345
---
 lib/readability/helper.ex        | 1 +
 test/readability/helper_test.exs | 9 +++++++++
 2 files changed, 10 insertions(+)
diff --git a/lib/readability/helper.ex b/lib/readability/helper.ex
index 9861143..7562dbc 100644
--- a/lib/readability/helper.ex
+++ b/lib/readability/helper.ex
@@ -111,6 +111,7 @@ defmodule Readability.Helper do
     |> transform_img_paths(opts[:url])
     |> Floki.parse()
     |> Floki.filter_out(:comment)
+    |> remove_tag(fn {tag, _, _} -> is_atom(tag) end)
   end
 
   # Turn relative `img` tag paths into absolute if possible
diff --git a/test/readability/helper_test.exs b/test/readability/helper_test.exs
index a0a60b2..dd85b59 100644
--- a/test/readability/helper_test.exs
+++ b/test/readability/helper_test.exs
@@ -50,6 +50,15 @@ defmodule Readability.HelperTest do
     assert result == 5
   end
 
+  test "strips out special case tags" do
+    expected_html =
+      "<html><body><p>Hello <? echo esc_html( wired_get_the_byline_name( $related_video ) ); ?></p></body></html>"
+      |> Helper.normalize()
+      |> Floki.raw_html()
+
+    assert expected_html == "<html><body><p>Hello </p></body></html>"
+  end
+
   test "transform img relative paths into absolute" do
     foo_url = "https://example.org/images/foo.png"
     bar_url_http = "http://example.org/images/bar.png"