diff --git a/lib/readability.ex b/lib/readability.ex
index 5687c98..4d10f41 100644
--- a/lib/readability.ex
+++ b/lib/readability.ex
@@ -55,6 +55,7 @@ defmodule Readability do
div_to_p_elements: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
replace_brs: ~r/(
]*>[ \n\r\t]*){2,}/i,
replace_fonts: ~r/<(\/?)font[^>]*>/i,
+ replace_xml_version: ~r/<\?xml.*?\?>/i, # lazy match: stop at the first "?>" so later markup on the same line is kept
normalize: ~r/\s{2,}/,
video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i
diff --git a/lib/readability/helper.ex b/lib/readability/helper.ex
index 604be68..f77cfff 100644
--- a/lib/readability/helper.ex
+++ b/lib/readability/helper.ex
@@ -92,6 +92,7 @@ defmodule Readability.Helper do
@spec normalize(binary) :: html_tree
def normalize(raw_html) do
raw_html
+ |> String.replace(Readability.regexes[:replace_xml_version], "")
|> String.replace(Readability.regexes[:replace_brs], "
") |> String.replace(Readability.regexes[:replace_fonts], "<\1span>") |> String.replace(Readability.regexes[:normalize], " ") diff --git a/test/fixtures/pubmed.html b/test/fixtures/pubmed.html new file mode 100644 index 0000000..a38adbc --- /dev/null +++ b/test/fixtures/pubmed.html @@ -0,0 +1,308 @@ + + + + +
+ + + + + + +