Merge pull request #14 from pineconellc/fix_title_suffix_detection

Fix title suffix detection
This commit is contained in:
Jaehyun Shin 2016-11-05 15:56:41 +09:00 committed by GitHub
commit ba9027ee7e
2 changed files with 61 additions and 22 deletions

View File

@ -3,7 +3,7 @@ defmodule Readability.TitleFinder do
The TitleFinder engine traverses HTML tree searching for finding title. The TitleFinder engine traverses HTML tree searching for finding title.
""" """
@title_suffix ~r/(\-)|(\:\:)|(\|)/ @title_suffix ~r/\s(?:\-|\:\:|\|)\s/
@h_tag_selector "h1, h2, h3" @h_tag_selector "h1, h2, h3"
@type html_tree :: tuple | list @type html_tree :: tuple | list
@ -13,19 +13,18 @@ defmodule Readability.TitleFinder do
""" """
@spec title(html_tree) :: binary @spec title(html_tree) :: binary
def title(html_tree) do def title(html_tree) do
maybe_title = og_title(html_tree) case og_title(html_tree) do
if String.length(String.strip(maybe_title)) == 0 do "" ->
maybe_title = tag_title(html_tree) title = tag_title(html_tree)
end
unless good_title?(maybe_title) do if good_title?(title) do
h_title = h_tag_title(html_tree) title
if good_title?(h_title) do else
maybe_title = h_title h_tag_title(html_tree)
end end
title when is_binary(title) ->
title
end end
maybe_title
end end
@doc """ @doc """
@ -35,7 +34,9 @@ defmodule Readability.TitleFinder do
def tag_title(html_tree) do def tag_title(html_tree) do
html_tree html_tree
|> Floki.find("title") |> Floki.find("title")
|> clean_title |> clean_title()
|> String.split(@title_suffix)
|> hd()
end end
@doc """ @doc """
@ -46,7 +47,7 @@ defmodule Readability.TitleFinder do
html_tree html_tree
|> Floki.find("meta[property=og:title]") |> Floki.find("meta[property=og:title]")
|> Floki.attribute("content") |> Floki.attribute("content")
|> clean_title |> clean_title()
end end
@doc """ @doc """
@ -56,16 +57,14 @@ defmodule Readability.TitleFinder do
def h_tag_title(html_tree, selector \\ @h_tag_selector) do def h_tag_title(html_tree, selector \\ @h_tag_selector) do
html_tree html_tree
|> Floki.find(selector) |> Floki.find(selector)
|> hd |> hd()
|> clean_title |> clean_title()
end end
defp clean_title(html_tree) do defp clean_title(html_tree) do
title_text = html_tree html_tree
|> Floki.text |> Floki.text()
|> String.split(@title_suffix) |> String.strip()
|> hd
|> String.strip
end end
defp good_title?(title) do defp good_title?(title) do

View File

@ -7,7 +7,7 @@ defmodule Readability.TitleFinderTest do
<html> <html>
<head> <head>
<title>Tag title - test</title> <title>Tag title - test</title>
<meta property='og:title' content='og title | test'> <meta property='og:title' content='og title'>
</head> </head>
<body> <body>
<p> <p>
@ -31,6 +31,46 @@ defmodule Readability.TitleFinderTest do
test "extract tag title" do test "extract tag title" do
title = Readability.TitleFinder.tag_title(@html) title = Readability.TitleFinder.tag_title(@html)
assert title == "Tag title" assert title == "Tag title"
html = """
<html>
<head>
<title>Tag title :: test</title>
</head>
</html>
"""
title = Readability.TitleFinder.tag_title(html)
assert title == "Tag title"
html = """
<html>
<head>
<title>Tag title | test</title>
</head>
</html>
"""
title = Readability.TitleFinder.tag_title(html)
assert title == "Tag title"
html = """
<html>
<head>
<title>Tag title-tag</title>
</head>
</html>
"""
title = Readability.TitleFinder.tag_title(html)
assert title == "Tag title-tag"
html = """
<html>
<head>
<title>Tag title-tag-title - test</title>
</head>
</html>
"""
title = Readability.TitleFinder.tag_title(html)
assert title == "Tag title-tag-title"
end end
test "extract h1 tag title" do test "extract h1 tag title" do