From d8677a599ccfe602e0222f1e4a1a2938a32964cd Mon Sep 17 00:00:00 2001 From: keepcosmos Date: Sun, 24 Apr 2016 14:32:43 +0900 Subject: [PATCH] add test --- lib/readability.ex | 2 + test/fixtures/bbc.html | 2612 ++------ test/fixtures/buzzfeed.html | 6570 +++++++++++++++++++ test/fixtures/medium.html | 8 + test/readability/candidate/_builder.exs | 53 - test/readability/candidate/_finder.ex | 77 - test/readability/candidate/cleaner_test.exs | 7 +- test/readability_test.exs | 48 +- test/test_helper.exs | 9 + 9 files changed, 7272 insertions(+), 2114 deletions(-) create mode 100644 test/fixtures/buzzfeed.html create mode 100644 test/fixtures/medium.html delete mode 100644 test/readability/candidate/_builder.exs delete mode 100644 test/readability/candidate/_finder.ex diff --git a/lib/readability.ex b/lib/readability.ex index 720f42e..1a317e7 100644 --- a/lib/readability.ex +++ b/lib/readability.ex @@ -73,11 +73,13 @@ defmodule Readability do """ @spec raw_html(html_tree) :: binary def readabl_text(html_tree) do + # TODO: Remove image caption when extract only text tags_to_br = ~r/<\/(p|div|article|h\d)/i html_str = html_tree |> raw_html Regex.replace(tags_to_br, html_str, &("\n#{&1}")) |> Floki.parse |> Floki.text + |> String.strip end def regexes, do: @regexes diff --git a/test/fixtures/bbc.html b/test/fixtures/bbc.html index db47cc4..dd44887 100644 --- a/test/fixtures/bbc.html +++ b/test/fixtures/bbc.html @@ -1,103 +1,97 @@ - - - - Obama admits US gun laws are his 'biggest frustration' - BBC News - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - + + - + - - + + + + - - + + + + + + + + - + - - + + + - -
+ +
-
-
+
+
-
- - - - -
- - - US & Canada - - - - -