readability/lib/readability/title_finder.ex

93 lines
1.8 KiB
Elixir
Raw Normal View History

2016-04-15 11:51:29 +00:00
defmodule Readability.TitleFinder do
  @moduledoc """
  Finds the title of a document in a parsed HTML tree.

  Three candidates are considered, in order of preference:

    1. the `content` attribute of the `og:title` meta tag,
    2. the text of the `<title>` tag inside `<head>`, with any site-name
       suffix removed,
    3. the text of the first node matched by the `h1, h2, h3` selector.
  """

  # Separator between the page title and a trailing site name,
  # e.g. "Article - Site", "Article :: Site" or "Article | Site".
  @title_suffix ~r/\s(?:\-|\:\:|\|)\s/
  @h_tag_selector "h1, h2, h3"

  # A `<title>` candidate with at least this many words is accepted without
  # consulting the heading tags.
  @min_title_words 4

  @type html_tree :: tuple | list

  @doc """
  Finds the most suitable title for `html_tree`.

  A non-empty `og:title` is returned as-is. Otherwise the `<title>` tag is
  used when it has at least #{@min_title_words} words or when no heading
  title exists; in the remaining case the heading title is returned.

  Returns `""` when no candidate yields any text.
  """
  @spec title(html_tree) :: binary
  def title(html_tree) do
    case og_title(html_tree) do
      "" -> fallback_title(html_tree)
      og when is_binary(og) -> og
    end
  end

  @doc """
  Finds the title from the `<title>` tag inside `<head>`.

  The text is trimmed and everything from the first ` - `, ` :: ` or ` | `
  separator onwards is dropped. Returns `""` when there is no such tag.
  """
  @spec tag_title(html_tree) :: binary
  def tag_title(html_tree) do
    html_tree
    |> find_tag("head title")
    |> clean_title()
    # Only the part before the first separator is kept, so there is no need
    # to split the remainder any further.
    |> String.split(@title_suffix, parts: 2)
    |> hd()
  end

  @doc """
  Finds the title from the `content` attribute of the `og:title` meta tag.

  Returns `""` when the tag or its `content` attribute is missing.
  """
  @spec og_title(html_tree) :: binary
  def og_title(html_tree) do
    html_tree
    |> find_tag("meta[property='og:title']")
    |> Floki.attribute("content")
    |> clean_title()
  end

  @doc """
  Finds the title from the first node matching `selector`.

  `selector` defaults to `#{inspect(@h_tag_selector)}`. Returns `""` when
  nothing matches.
  """
  @spec h_tag_title(html_tree, String.t()) :: binary
  def h_tag_title(html_tree, selector \\ @h_tag_selector) do
    html_tree
    |> find_tag(selector)
    |> clean_title()
  end

  # Chooses between the <title> tag and the heading tags once og:title has
  # turned out to be empty. The heading is only looked up when the <title>
  # candidate is not good enough on its own.
  defp fallback_title(html_tree) do
    tag = tag_title(html_tree)

    if good_title?(tag) do
      tag
    else
      case h_tag_title(html_tree) do
        "" -> tag
        heading -> heading
      end
    end
  end

  # Returns the first node Floki finds for `selector`, or `[]` when there is
  # no match, so the result can be fed straight into `clean_title/1`.
  defp find_tag(html_tree, selector) do
    case Floki.find(html_tree, selector) do
      [] -> []
      [first | _rest] -> first
    end
  end

  # Normalises a lookup result into a trimmed string:
  #   * `[]`        - nothing was found,
  #   * `[binary]`  - a single attribute value (from `Floki.attribute/2`),
  #   * otherwise   - an HTML node whose text is extracted with Floki.
  defp clean_title([]), do: ""

  defp clean_title([title]) when is_binary(title), do: String.trim(title)

  defp clean_title(html_tree) do
    html_tree
    |> Floki.text()
    |> String.trim()
  end

  # A title counts as "good" when it has at least @min_title_words words.
  # `String.split/1` splits on runs of whitespace, so repeated spaces, tabs
  # and newlines do not inflate or deflate the word count.
  defp good_title?(title) do
    length(String.split(title)) >= @min_title_words
  end
end