Strip out atom tags

Standard tags are returned by Mochiweb as binaries. Atom tags are used for special-case parsing (such as PHP includes). Since that's not going to be part of the article, simply exclude those while normalizing. Fixes #30. See also: Mochiweb parser: 9608d786ef/src/mochiweb_html.erl (L345)
2018-10-10 21:10:29 -04:00 · 2018-10-10 21:10:29 -04:00 · b35746bfed
parent c2dbdf14e8
commit b35746bfed
2 changed files with 10 additions and 0 deletions
--- a/lib/readability/helper.ex
+++ b/lib/readability/helper.ex
@ -111,6 +111,7 @@ defmodule Readability.Helper do
    |> transform_img_paths(opts[:url])
    |> Floki.parse()
    |> Floki.filter_out(:comment)
    |> remove_tag(fn {tag, _, _} -> is_atom(tag) end)
  end
  # Turn relative `img` tag paths into absolute if possible
--- a/test/readability/helper_test.exs
+++ b/test/readability/helper_test.exs
@ -50,6 +50,15 @@ defmodule Readability.HelperTest do
    assert result == 5
  end
  test "strips out special case tags" do
    # The `<? ... ?>` fragment is special-case markup that is not part of the
    # article text; normalizing is expected to drop it and keep the rest.
    source =
      "<html><body><p>Hello <? echo esc_html( wired_get_the_byline_name( $related_video ) ); ?></p></body></html>"

    normalized_html = Floki.raw_html(Helper.normalize(source))

    assert normalized_html == "<html><body><p>Hello </p></body></html>"
  end
  test "transform img relative paths into absolute" do
    foo_url = "https://example.org/images/foo.png"
    bar_url_http = "http://example.org/images/bar.png"