readability/lib/readability/title_finder.ex

63 lines
1.4 KiB
Elixir
Raw Normal View History

2016-04-15 11:51:29 +00:00
defmodule Readability.TitleFinder do
  @moduledoc """
  The TitleFinder engine traverses the HTML tree to find the document title.

  Candidate titles are read from the `<title>` tag, the `og:title` meta tag
  and the heading tags. Every candidate is cleaned by cutting it at the first
  suffix separator (`-`, `::` or `|`) and trimming surrounding whitespace.
  """

  # Separators that usually introduce a site-name suffix ("Article - Site").
  # NOTE(review): this also matches a hyphen inside a word ("Spider-Man"), so
  # such titles are cut short — confirm whether that is acceptable.
  @title_suffix ~r/(\-)|(\:\:)|(\|)/
  @h_tag_selector "h1, h2, h3"
  # A `<title>` with this many words or fewer is considered too short to trust.
  @short_title_max_words 4

  @type html_tree :: tuple | list

  @doc """
  Finds the most suitable title for the document.

  The cleaned `<title>` text is returned when it has more than
  #{@short_title_max_words} words. Otherwise the first non-empty value among
  the `og:title` meta tag, the first heading tag and the short `<title>`
  itself is returned.

  Returns `""` when none of the candidates yields any text.
  """
  @spec title(html_tree) :: binary
  def title(html_tree) do
    page_title = tag_title(html_tree)

    if short_title?(page_title) do
      # The short <title> is kept as the last resort so that a document
      # without og:title and headings still gets the best available text.
      first_present([og_title(html_tree), h_tag_title(html_tree), page_title])
    else
      page_title
    end
  end

  @doc """
  Finds the title from the first `<title>` tag.

  Returns `""` when the document has no `<title>` tag.
  """
  @spec tag_title(html_tree) :: binary
  def tag_title(html_tree) do
    html_tree
    |> first_match("title")
    |> clean_title()
  end

  @doc """
  Finds the title from the `content` attribute of the first `og:title` meta tag.

  Returns `""` when the document has no such meta tag.
  """
  @spec og_title(html_tree) :: binary
  def og_title(html_tree) do
    html_tree
    |> first_match("meta[property=og:title]")
    |> Floki.attribute("content")
    |> clean_title()
  end

  @doc """
  Finds the title from the first tag matching `selector`.

  `selector` defaults to `"#{@h_tag_selector}"`. Returns `""` when nothing
  matches instead of raising.
  """
  @spec h_tag_title(html_tree, String.t()) :: binary
  def h_tag_title(html_tree, selector \\ @h_tag_selector) do
    html_tree
    |> first_match(selector)
    |> clean_title()
  end

  # Keeps only the first node matching `selector`, as a (possibly empty) list,
  # so that several matches do not have their texts glued together and an
  # empty result does not crash the caller.
  defp first_match(html_tree, selector) do
    html_tree
    |> Floki.find(selector)
    |> Enum.take(1)
  end

  # Extracts the text, drops everything from the first suffix separator on and
  # trims surrounding whitespace.
  defp clean_title(html_tree) do
    html_tree
    |> Floki.text()
    |> String.split(@title_suffix)
    # String.split/2 always returns at least one element, so hd/1 is safe here.
    |> hd()
    |> String.trim()
  end

  # Splits on any run of whitespace so that repeated spaces or newlines inside
  # the title do not inflate the word count.
  defp short_title?(title) do
    length(String.split(title)) <= @short_title_max_words
  end

  # Returns the first non-empty candidate, or "" when all of them are empty.
  defp first_present(candidates) do
    Enum.find(candidates, "", &(&1 != ""))
  end
end