diff --git a/lib/readability.ex b/lib/readability.ex index 720f42e..1a317e7 100644 --- a/lib/readability.ex +++ b/lib/readability.ex @@ -73,11 +73,13 @@ defmodule Readability do """ @spec raw_html(html_tree) :: binary def readabl_text(html_tree) do + # TODO: Remove image caption when extract only text tags_to_br = ~r/<\/(p|div|article|h\d)/i html_str = html_tree |> raw_html Regex.replace(tags_to_br, html_str, &("\n#{&1}")) |> Floki.parse |> Floki.text + |> String.strip end def regexes, do: @regexes diff --git a/test/fixtures/bbc.html b/test/fixtures/bbc.html index db47cc4..dd44887 100644 --- a/test/fixtures/bbc.html +++ b/test/fixtures/bbc.html @@ -1,103 +1,97 @@ - - - - Obama admits US gun laws are his 'biggest frustration' - BBC News - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - + + - + - - + + + + - - + + + + + + + + - + - - + + + - -
+ +
-
-
+
+
-
- - - - -
- - - US & Canada - - - - -