Fixes crash when HTML contains an XML declaration (`<?xml ... ?>`) by stripping it out
This commit is contained in:
parent
e2b7cd1a24
commit
6840a9d0d7
|
@ -55,6 +55,7 @@ defmodule Readability do
|
||||||
div_to_p_elements: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
|
div_to_p_elements: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
|
||||||
replace_brs: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i,
|
replace_brs: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i,
|
||||||
replace_fonts: ~r/<(\/?)font[^>]*>/i,
|
replace_fonts: ~r/<(\/?)font[^>]*>/i,
|
||||||
|
replace_xml_version: ~r/<\?xml.*\?>/i,
|
||||||
normalize: ~r/\s{2,}/,
|
normalize: ~r/\s{2,}/,
|
||||||
video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
|
video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
|
||||||
protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i
|
protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i
|
||||||
|
|
|
@ -92,6 +92,7 @@ defmodule Readability.Helper do
|
||||||
@spec normalize(binary) :: html_tree
|
@spec normalize(binary) :: html_tree
|
||||||
def normalize(raw_html) do
|
def normalize(raw_html) do
|
||||||
raw_html
|
raw_html
|
||||||
|
|> String.replace(Readability.regexes[:replace_xml_version], "")
|
||||||
|> String.replace(Readability.regexes[:replace_brs], "</p><p>")
|
|> String.replace(Readability.regexes[:replace_brs], "</p><p>")
|
||||||
|> String.replace(Readability.regexes[:replace_fonts], "<\1span>")
|
|> String.replace(Readability.regexes[:replace_fonts], "<\1span>")
|
||||||
|> String.replace(Readability.regexes[:normalize], " ")
|
|> String.replace(Readability.regexes[:normalize], " ")
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -59,4 +59,19 @@ defmodule ReadabilityTest do
|
||||||
assert buzzfeed_text =~ ~r/^The FBI no longer needs Apple’s help/
|
assert buzzfeed_text =~ ~r/^The FBI no longer needs Apple’s help/
|
||||||
assert buzzfeed_text =~ ~r/issue of court orders and encrypted devices.$/
|
assert buzzfeed_text =~ ~r/issue of court orders and encrypted devices.$/
|
||||||
end
|
end
|
||||||
|
|
||||||
|
test "readability for pubmed" do
|
||||||
|
html = TestHelper.read_fixture("pubmed.html")
|
||||||
|
pubmed = Readability.article(html)
|
||||||
|
|
||||||
|
pubmed_html = Readability.readable_html(pubmed)
|
||||||
|
|
||||||
|
assert pubmed_html =~ ~r/^<div><div><h4>BACKGROUND AND OBJECTIVES: <\/h4><p><abstracttext>Although strict blood pressure/
|
||||||
|
assert pubmed_html =~ ~r/different mechanisms yielded potent antihypertensive efficacy with safety and decreased plasma BNP levels.<\/abstracttext><\/p><\/div><\/div>$/
|
||||||
|
|
||||||
|
pubmed_text = Readability.readable_text(pubmed)
|
||||||
|
|
||||||
|
assert pubmed_text =~ ~r/^BACKGROUND AND OBJECTIVES: \nAlthough strict blood pressure/
|
||||||
|
assert pubmed_text =~ ~r/with different mechanisms yielded potent antihypertensive efficacy with safety and decreased plasma BNP levels.$/
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue