From 747e0495eda67811585245a0e9a8f6c882225472 Mon Sep 17 00:00:00 2001 From: Jeff Browning Date: Fri, 4 Nov 2016 14:49:25 -0400 Subject: [PATCH 1/3] Fix detection of title suffix --- lib/readability/title_finder.ex | 2 +- test/readability/title_finder_test.exs | 40 ++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/lib/readability/title_finder.ex b/lib/readability/title_finder.ex index 4cb0f64..4273da5 100644 --- a/lib/readability/title_finder.ex +++ b/lib/readability/title_finder.ex @@ -3,7 +3,7 @@ defmodule Readability.TitleFinder do The TitleFinder engine traverses HTML tree searching for finding title. """ - @title_suffix ~r/(\-)|(\:\:)|(\|)/ + @title_suffix ~r/\s(?:\-|\:\:|\|)\s/ @h_tag_selector "h1, h2, h3" @type html_tree :: tuple | list diff --git a/test/readability/title_finder_test.exs b/test/readability/title_finder_test.exs index af724ba..7ba6f87 100644 --- a/test/readability/title_finder_test.exs +++ b/test/readability/title_finder_test.exs @@ -31,6 +31,46 @@ defmodule Readability.TitleFinderTest do test "extract tag title" do title = Readability.TitleFinder.tag_title(@html) assert title == "Tag title" + + html = """ + + + Tag title :: test + + + """ + title = Readability.TitleFinder.tag_title(html) + assert title == "Tag title" + + html = """ + + + Tag title | test + + + """ + title = Readability.TitleFinder.tag_title(html) + assert title == "Tag title" + + html = """ + + + Tag title-tag + + + """ + title = Readability.TitleFinder.tag_title(html) + assert title == "Tag title-tag" + + html = """ + + + Tag title-tag-title - test + + + """ + title = Readability.TitleFinder.tag_title(html) + assert title == "Tag title-tag-title" end test "extract h1 tag title" do From 2f8e84eb8afe2f83a24fdf36daee2b2ed77f1227 Mon Sep 17 00:00:00 2001 From: Jeff Browning Date: Fri, 4 Nov 2016 14:49:42 -0400 Subject: [PATCH 2/3] Clean up and fix warnings --- lib/readability/title_finder.ex | 39 ++++++++++++++++----------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/lib/readability/title_finder.ex b/lib/readability/title_finder.ex index 4273da5..86f6aaf 100644 --- a/lib/readability/title_finder.ex +++ b/lib/readability/title_finder.ex @@ -13,19 +13,18 @@ defmodule Readability.TitleFinder do """ @spec title(html_tree) :: binary def title(html_tree) do - maybe_title = og_title(html_tree) - if String.length(String.strip(maybe_title)) == 0 do - maybe_title = tag_title(html_tree) - end + case og_title(html_tree) do + "" -> + title = tag_title(html_tree) - unless good_title?(maybe_title) do - h_title = h_tag_title(html_tree) - if good_title?(h_title) do - maybe_title = h_title - end + if good_title?(title) do + title + else + h_tag_title(html_tree) + end + title when is_binary(title) -> + title end - - maybe_title end @doc """ @@ -35,7 +34,7 @@ defmodule Readability.TitleFinder do def tag_title(html_tree) do html_tree |> Floki.find("title") - |> clean_title + |> clean_title() end @doc """ @@ -46,7 +45,7 @@ defmodule Readability.TitleFinder do html_tree |> Floki.find("meta[property=og:title]") |> Floki.attribute("content") - |> clean_title + |> clean_title() end @doc """ @@ -56,16 +55,16 @@ defmodule Readability.TitleFinder do def h_tag_title(html_tree, selector \\ @h_tag_selector) do html_tree |> Floki.find(selector) - |> hd - |> clean_title + |> hd() + |> clean_title() end defp clean_title(html_tree) do - title_text = html_tree - |> Floki.text - |> String.split(@title_suffix) - |> hd - |> String.strip + html_tree + |> Floki.text() + |> String.split(@title_suffix) + |> hd() + |> String.strip() end defp good_title?(title) do From d3be3bdd82eb6ad1205d44a244391333311006a6 Mon Sep 17 00:00:00 2001 From: Jeff Browning Date: Fri, 4 Nov 2016 14:51:24 -0400 Subject: [PATCH 3/3] Only split title suffix for tag titles --- lib/readability/title_finder.ex | 4 ++-- test/readability/title_finder_test.exs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/readability/title_finder.ex b/lib/readability/title_finder.ex index 86f6aaf..ca4d970 100644 --- a/lib/readability/title_finder.ex +++ b/lib/readability/title_finder.ex @@ -35,6 +35,8 @@ defmodule Readability.TitleFinder do html_tree |> Floki.find("title") |> clean_title() + |> String.split(@title_suffix) + |> hd() end @doc """ @@ -62,8 +64,6 @@ defmodule Readability.TitleFinder do defp clean_title(html_tree) do html_tree |> Floki.text() - |> String.split(@title_suffix) - |> hd() |> String.strip() end diff --git a/test/readability/title_finder_test.exs b/test/readability/title_finder_test.exs index 7ba6f87..7201221 100644 --- a/test/readability/title_finder_test.exs +++ b/test/readability/title_finder_test.exs @@ -7,7 +7,7 @@ defmodule Readability.TitleFinderTest do Tag title - test - +