Fixes crash when the HTML contains an XML version tag (`<?xml ... ?>`) by stripping it out
This commit is contained in:
parent
e2b7cd1a24
commit
6840a9d0d7
|
@ -55,6 +55,7 @@ defmodule Readability do
|
|||
div_to_p_elements: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
|
||||
replace_brs: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i,
|
||||
replace_fonts: ~r/<(\/?)font[^>]*>/i,
|
||||
replace_xml_version: ~r/<\?xml.*\?>/i,
|
||||
normalize: ~r/\s{2,}/,
|
||||
video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
|
||||
protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i
|
||||
|
|
|
@ -92,6 +92,7 @@ defmodule Readability.Helper do
|
|||
@spec normalize(binary) :: html_tree
|
||||
def normalize(raw_html) do
|
||||
raw_html
|
||||
|> String.replace(Readability.regexes[:replace_xml_version], "")
|
||||
|> String.replace(Readability.regexes[:replace_brs], "</p><p>")
|
||||
|> String.replace(Readability.regexes[:replace_fonts], "<\1span>")
|
||||
|> String.replace(Readability.regexes[:normalize], " ")
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -59,4 +59,19 @@ defmodule ReadabilityTest do
|
|||
assert buzzfeed_text =~ ~r/^The FBI no longer needs Apple’s help/
|
||||
assert buzzfeed_text =~ ~r/issue of court orders and encrypted devices.$/
|
||||
end
|
||||
|
||||
test "readability for pubmed" do
  # Regression test added together with the XML-version-tag stripping fix.
  # NOTE(review): the fixture presumably contains an `<?xml ... ?>` tag; its
  # diff is suppressed in this view, so confirm against pubmed.html.
  html = TestHelper.read_fixture("pubmed.html")
  pubmed = Readability.article(html)

  pubmed_html = Readability.readable_html(pubmed)

  assert pubmed_html =~ ~r/^<div><div><h4>BACKGROUND AND OBJECTIVES: <\/h4><p><abstracttext>Although strict blood pressure/
  # The sentence-ending period is escaped so that only a literal "." matches;
  # an unescaped "." would accept any character in that position.
  assert pubmed_html =~ ~r/different mechanisms yielded potent antihypertensive efficacy with safety and decreased plasma BNP levels\.<\/abstracttext><\/p><\/div><\/div>$/

  pubmed_text = Readability.readable_text(pubmed)

  assert pubmed_text =~ ~r/^BACKGROUND AND OBJECTIVES: \nAlthough strict blood pressure/
  # Same escaping as above: the readable text must end with a literal period.
  assert pubmed_text =~ ~r/with different mechanisms yielded potent antihypertensive efficacy with safety and decreased plasma BNP levels\.$/
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue