Fixes crash when HTML starts with an XML declaration (`<?xml ... ?>`) by stripping it out

This commit is contained in:
Eason Goodale 2016-08-13 22:11:01 -07:00
parent e2b7cd1a24
commit 6840a9d0d7
4 changed files with 325 additions and 0 deletions

View File

@ -55,6 +55,7 @@ defmodule Readability do
div_to_p_elements: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
replace_brs: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i,
replace_fonts: ~r/<(\/?)font[^>]*>/i,
# Lazy `.*?` stops at the first `?>`, so only the XML declaration itself is
# removed even when another `?>` appears later on the same line.
replace_xml_version: ~r/<\?xml.*?\?>/i,
normalize: ~r/\s{2,}/,
video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i

View File

@ -92,6 +92,7 @@ defmodule Readability.Helper do
@spec normalize(binary) :: html_tree
def normalize(raw_html) do
raw_html
|> String.replace(Readability.regexes[:replace_xml_version], "")
|> String.replace(Readability.regexes[:replace_brs], "</p><p>")
# `\\1` re-inserts the optional "/" captured by the regex, so <font ...> becomes
# <span> and </font> becomes </span>. A single backslash here would be consumed
# by the string literal and emit a literal "1" instead of the capture.
|> String.replace(Readability.regexes[:replace_fonts], "<\\1span>")
|> String.replace(Readability.regexes[:normalize], " ")

308
test/fixtures/pubmed.html vendored Normal file

File diff suppressed because one or more lines are too long

View File

@ -59,4 +59,19 @@ defmodule ReadabilityTest do
assert buzzfeed_text =~ ~r/^The FBI no longer needs Apples help/
assert buzzfeed_text =~ ~r/issue of court orders and encrypted devices.$/
end
# Extracts the article from the pubmed.html fixture and checks the beginning
# and end of both the HTML rendering and the plain-text rendering.
test "readability for pubmed" do
  article =
    "pubmed.html"
    |> TestHelper.read_fixture()
    |> Readability.article()

  # HTML rendering: anchored at the start (^) and at the end ($).
  rendered_html = Readability.readable_html(article)
  assert rendered_html =~ ~r/^<div><div><h4>BACKGROUND AND OBJECTIVES: <\/h4><p><abstracttext>Although strict blood pressure/
  assert rendered_html =~ ~r/different mechanisms yielded potent antihypertensive efficacy with safety and decreased plasma BNP levels.<\/abstracttext><\/p><\/div><\/div>$/

  # Plain-text rendering: the same content with the markup removed.
  rendered_text = Readability.readable_text(article)
  assert rendered_text =~ ~r/^BACKGROUND AND OBJECTIVES: \nAlthough strict blood pressure/
  assert rendered_text =~ ~r/with different mechanisms yielded potent antihypertensive efficacy with safety and decreased plasma BNP levels.$/
end
end