From 6840a9d0d722b4e20783077c560337f140897230 Mon Sep 17 00:00:00 2001
From: Eason Goodale
]*>[ \n\r\t]*){2,}/i,
replace_fonts: ~r/<(\/?)font[^>]*>/i,
+ replace_xml_version: ~r/<\?xml.*\?>/i,
normalize: ~r/\s{2,}/,
video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i
diff --git a/lib/readability/helper.ex b/lib/readability/helper.ex
index 604be68..f77cfff 100644
--- a/lib/readability/helper.ex
+++ b/lib/readability/helper.ex
@@ -92,6 +92,7 @@ defmodule Readability.Helper do
@spec normalize(binary) :: html_tree
def normalize(raw_html) do
raw_html
+ |> String.replace(Readability.regexes[:replace_xml_version], "")
|> String.replace(Readability.regexes[:replace_brs], "
") |> String.replace(Readability.regexes[:replace_fonts], "<\1span>") |> String.replace(Readability.regexes[:normalize], " ") diff --git a/test/fixtures/pubmed.html b/test/fixtures/pubmed.html new file mode 100644 index 0000000..a38adbc --- /dev/null +++ b/test/fixtures/pubmed.html @@ -0,0 +1,308 @@ + + + + +
+ + + + + + +