Merge pull request #21 from pineconellc/fix_multiple_title_tags

Fix multi-match and no-match Title extractor issues
This commit is contained in:
Jaehyun Shin 2016-11-24 19:04:29 +09:00 committed by GitHub
commit 1ea6f138ba
3 changed files with 69 additions and 5 deletions

View File

@@ -4,6 +4,8 @@ All notable changes to this project will be documented in this file.
This project adheres to [Semantic Versioning](http://semver.org/). This project adheres to [Semantic Versioning](http://semver.org/).
## Unreleased ## Unreleased
- Fix concatenation of multiple matching tags in Title extractor
- Fix exception when no matches are found in Title extractor
## [0.6.1] - 2015.11.07 ## [0.6.1] - 2015.11.07
- Fix `httpoison_options` default option error - Fix `httpoison_options` default option error

View File

@@ -33,7 +33,7 @@ defmodule Readability.TitleFinder do
@spec tag_title(html_tree) :: binary @spec tag_title(html_tree) :: binary
def tag_title(html_tree) do def tag_title(html_tree) do
html_tree html_tree
|> Floki.find("head title") |> find_tag("head title")
|> clean_title() |> clean_title()
|> String.split(@title_suffix) |> String.split(@title_suffix)
|> hd() |> hd()
@@ -45,7 +45,7 @@ defmodule Readability.TitleFinder do
@spec og_title(html_tree) :: binary @spec og_title(html_tree) :: binary
def og_title(html_tree) do def og_title(html_tree) do
html_tree html_tree
|> Floki.find("meta[property=og:title]") |> find_tag("meta[property=og:title]")
|> Floki.attribute("content") |> Floki.attribute("content")
|> clean_title() |> clean_title()
end end
@@ -56,11 +56,22 @@ defmodule Readability.TitleFinder do
@spec h_tag_title(html_tree, String.t) :: binary @spec h_tag_title(html_tree, String.t) :: binary
def h_tag_title(html_tree, selector \\ @h_tag_selector) do def h_tag_title(html_tree, selector \\ @h_tag_selector) do
html_tree html_tree
|> Floki.find(selector) |> find_tag(selector)
|> hd()
|> clean_title() |> clean_title()
end end
# Looks up `selector` in `html_tree` with Floki and keeps only the first
# match, so several matching tags are never merged into a single title.
# Returns `[]` when nothing matches.
defp find_tag(html_tree, selector) do
  found = Floki.find(html_tree, selector)

  case found do
    [first_match | _others] -> first_match
    [] -> []
  end
end
# Nothing to clean: an empty list (what find_tag/2 returns when no tag
# matches) becomes an empty title.
defp clean_title([]) do
""
end
defp clean_title(html_tree) do defp clean_title(html_tree) do
html_tree html_tree
|> Floki.text() |> Floki.text()

View File

@@ -28,6 +28,19 @@ defmodule Readability.TitleFinderTest do
assert title == "og title" assert title == "og title"
end end
test "does not merge multiple matching og:title tags" do
  # Two og:title metas: only the first one's content should be returned.
  duplicated_og_html = """
  <html>
  <head>
  <meta property='og:title' content='og title 1'>
  <meta property='og:title' content='og title 2'>
  </head>
  </html>
  """

  assert Readability.TitleFinder.og_title(duplicated_og_html) == "og title 1"
end
test "extract tag title" do test "extract tag title" do
title = Readability.TitleFinder.tag_title(@html) title = Readability.TitleFinder.tag_title(@html)
assert title == "Tag title" assert title == "Tag title"
@@ -86,13 +99,51 @@ defmodule Readability.TitleFinderTest do
assert title == "Tag title" assert title == "Tag title"
end end
test "does not merge multiple title tags" do
  # Two <title> elements: only the first one's text should be returned.
  duplicated_title_html = """
  <html>
  <head>
  <title>tag title 1</title>
  <title>tag title 2</title>
  </head>
  </html>
  """

  assert Readability.TitleFinder.tag_title(duplicated_title_html) == "tag title 1"
end
test "extract h1 tag title" do test "extract h1 tag title" do
title = Readability.TitleFinder.h_tag_title(@html) title = Readability.TitleFinder.h_tag_title(@html)
assert title == "h1 title" assert title == "h1 title"
end end
test "extrat h2 tag title" do test "extract h2 tag title" do
title = Readability.TitleFinder.h_tag_title(@html, "h2") title = Readability.TitleFinder.h_tag_title(@html, "h2")
assert title == "h2 title" assert title == "h2 title"
end end
test "does not merge multiple header tags" do
  # Two <h1> elements: only the first one's text should become the title.
  html = """
  <html>
  <body>
  <h1>header 1</h1>
  <h1>header 2</h1>
  </body>
  </html>
  """

  title = Readability.TitleFinder.h_tag_title(html)
  assert title == "header 1"
end
test "returns an empty string when no title tag can be found" do
  # An empty document has no <title>; the extractor must not raise.
  title = Readability.TitleFinder.tag_title("")
  assert title == ""
end
test "returns an empty string when no og:title tag can be found" do
  # An empty document has no og:title meta; the extractor must not raise.
  title = Readability.TitleFinder.og_title("")
  assert title == ""
end
test "returns an empty string when no header tag can be found" do
  # An empty document has no header tag; the extractor must not raise.
  title = Readability.TitleFinder.h_tag_title("")
  assert title == ""
end
end end