Merge pull request #14 from pineconellc/fix_title_suffix_detection
Fix title suffix detection
This commit is contained in:
commit
ba9027ee7e
|
@ -3,7 +3,7 @@ defmodule Readability.TitleFinder do
|
||||||
The TitleFinder engine traverses HTML tree searching for finding title.
|
The TitleFinder engine traverses HTML tree searching for finding title.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@title_suffix ~r/(\-)|(\:\:)|(\|)/
|
@title_suffix ~r/\s(?:\-|\:\:|\|)\s/
|
||||||
@h_tag_selector "h1, h2, h3"
|
@h_tag_selector "h1, h2, h3"
|
||||||
|
|
||||||
@type html_tree :: tuple | list
|
@type html_tree :: tuple | list
|
||||||
|
@ -13,19 +13,18 @@ defmodule Readability.TitleFinder do
|
||||||
"""
|
"""
|
||||||
@spec title(html_tree) :: binary
|
@spec title(html_tree) :: binary
|
||||||
def title(html_tree) do
|
def title(html_tree) do
|
||||||
maybe_title = og_title(html_tree)
|
case og_title(html_tree) do
|
||||||
if String.length(String.strip(maybe_title)) == 0 do
|
"" ->
|
||||||
maybe_title = tag_title(html_tree)
|
title = tag_title(html_tree)
|
||||||
end
|
|
||||||
|
|
||||||
unless good_title?(maybe_title) do
|
if good_title?(title) do
|
||||||
h_title = h_tag_title(html_tree)
|
title
|
||||||
if good_title?(h_title) do
|
else
|
||||||
maybe_title = h_title
|
h_tag_title(html_tree)
|
||||||
end
|
end
|
||||||
|
title when is_binary(title) ->
|
||||||
|
title
|
||||||
end
|
end
|
||||||
|
|
||||||
maybe_title
|
|
||||||
end
|
end
|
||||||
|
|
||||||
@doc """
|
@doc """
|
||||||
|
@ -35,7 +34,9 @@ defmodule Readability.TitleFinder do
|
||||||
def tag_title(html_tree) do
|
def tag_title(html_tree) do
|
||||||
html_tree
|
html_tree
|
||||||
|> Floki.find("title")
|
|> Floki.find("title")
|
||||||
|> clean_title
|
|> clean_title()
|
||||||
|
|> String.split(@title_suffix)
|
||||||
|
|> hd()
|
||||||
end
|
end
|
||||||
|
|
||||||
@doc """
|
@doc """
|
||||||
|
@ -46,7 +47,7 @@ defmodule Readability.TitleFinder do
|
||||||
html_tree
|
html_tree
|
||||||
|> Floki.find("meta[property=og:title]")
|
|> Floki.find("meta[property=og:title]")
|
||||||
|> Floki.attribute("content")
|
|> Floki.attribute("content")
|
||||||
|> clean_title
|
|> clean_title()
|
||||||
end
|
end
|
||||||
|
|
||||||
@doc """
|
@doc """
|
||||||
|
@ -56,16 +57,14 @@ defmodule Readability.TitleFinder do
|
||||||
def h_tag_title(html_tree, selector \\ @h_tag_selector) do
|
def h_tag_title(html_tree, selector \\ @h_tag_selector) do
|
||||||
html_tree
|
html_tree
|
||||||
|> Floki.find(selector)
|
|> Floki.find(selector)
|
||||||
|> hd
|
|> hd()
|
||||||
|> clean_title
|
|> clean_title()
|
||||||
end
|
end
|
||||||
|
|
||||||
defp clean_title(html_tree) do
|
defp clean_title(html_tree) do
|
||||||
title_text = html_tree
|
html_tree
|
||||||
|> Floki.text
|
|> Floki.text()
|
||||||
|> String.split(@title_suffix)
|
|> String.strip()
|
||||||
|> hd
|
|
||||||
|> String.strip
|
|
||||||
end
|
end
|
||||||
|
|
||||||
defp good_title?(title) do
|
defp good_title?(title) do
|
||||||
|
|
|
@ -7,7 +7,7 @@ defmodule Readability.TitleFinderTest do
|
||||||
<html>
|
<html>
|
||||||
<head>
|
<head>
|
||||||
<title>Tag title - test</title>
|
<title>Tag title - test</title>
|
||||||
<meta property='og:title' content='og title | test'>
|
<meta property='og:title' content='og title'>
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<p>
|
<p>
|
||||||
|
@ -31,6 +31,46 @@ defmodule Readability.TitleFinderTest do
|
||||||
test "extract tag title" do
|
test "extract tag title" do
|
||||||
title = Readability.TitleFinder.tag_title(@html)
|
title = Readability.TitleFinder.tag_title(@html)
|
||||||
assert title == "Tag title"
|
assert title == "Tag title"
|
||||||
|
|
||||||
|
html = """
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Tag title :: test</title>
|
||||||
|
</head>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
title = Readability.TitleFinder.tag_title(html)
|
||||||
|
assert title == "Tag title"
|
||||||
|
|
||||||
|
html = """
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Tag title | test</title>
|
||||||
|
</head>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
title = Readability.TitleFinder.tag_title(html)
|
||||||
|
assert title == "Tag title"
|
||||||
|
|
||||||
|
html = """
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Tag title-tag</title>
|
||||||
|
</head>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
title = Readability.TitleFinder.tag_title(html)
|
||||||
|
assert title == "Tag title-tag"
|
||||||
|
|
||||||
|
html = """
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Tag title-tag-title - test</title>
|
||||||
|
</head>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
title = Readability.TitleFinder.tag_title(html)
|
||||||
|
assert title == "Tag title-tag-title"
|
||||||
end
|
end
|
||||||
|
|
||||||
test "extract h1 tag title" do
|
test "extract h1 tag title" do
|
||||||
|
|
Loading…
Reference in New Issue