add authors finder

This commit is contained in:
keepcosmos 2016-04-28 15:13:03 +09:00
parent cc5e07271a
commit 4aa8f6ecea
9 changed files with 145 additions and 39 deletions

View File

@ -1,5 +0,0 @@
# Change log
## [0.3.0] - 2016.04.24
- Release!!

14
CHANGELOG.md Normal file
View File

@ -0,0 +1,14 @@
# Change log
All notable changes to this project will be documented in this file.
This project adheres to [Semantic Versioning](http://semver.org/).
## [0.4.0] - 2016.04.28
### Added
- Add author extractor function
- Add `readable_html` function
## [0.3.1] - 2016.04.24
- Release!!

View File

@ -14,7 +14,7 @@ If [available in Hex](https://hex.pm/docs/publish), the package can be installed
```elixir ```elixir
def deps do def deps do
[{:readability, "~> 0.3"}] [{:readability, "~> 0.4"}]
end end
``` ```
@ -28,23 +28,29 @@ If [available in Hex](https://hex.pm/docs/publish), the package can be installed
## Usage ## Usage
The example below, `html` variable is the html source from blog content "[Elixir Design Goals](http://elixir-lang.org/blog/2013/08/08/elixir-design-goals/)".
### Examples ### Examples
```elixir ```elixir
### Get example page.
%{status_code: 200, body: html} = HTTPoison.get!("https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58")
### Extract the title ### Extract the title.
Readability.title(html) Readability.title(html)
#=> Elixir Design Goals #=> "Why Im betting on Elixir"
### Extract authors.
Readability.authors(html)
#=> ["Ken Mazaika"]
### Extract the primary content with transformed html. ### Extract the primary content with transformed html.
html html
|> Readability.article |> Readability.article
|> Readability.raw_html |> Readability.readable_html
#=> #=>
# <div><div class=\"entry-content\"><p>During the last year, # <div><div><p id=\"3476\"><strong><em>Background: </em></strong><em>Ive spent...
# ... # ...
# ... out our sidebar for other learning resources.</p></div></div> # ...button!</em></h3></div></div>
### Extract only text from the primary content. ### Extract only text from the primary content.
html html
@ -52,9 +58,9 @@ html
|> Readability.readable_text |> Readability.readable_text
#=> #=>
# During the last year, we have spoken at many conferences spreading the word about Elixir. We usually s..... # Background: Ive spent the past 6 years building web applications in Ruby and.....
# ... # ...
# ... started guide, or check out our sidebar for other learning resources. # ... value in this article, it would mean a lot to me if you hit the recommend button!
``` ```
### Options ### Options
@ -75,7 +81,7 @@ To run the test suite:
$ mix test $ mix test
## Todo ## Todo
* [ ] Extract authors * [x] Extract authors
* [ ] Extract Images * [ ] Extract Images
* [ ] Extract Videos * [ ] Extract Videos
* [ ] Convert relative paths into absolute paths of `img#src` and `a#href` * [ ] Convert relative paths into absolute paths of `img#src` and `a#href`

View File

@ -23,7 +23,9 @@ defmodule Readability do
""" """
alias Readability.TitleFinder alias Readability.TitleFinder
alias Readability.AuthorFinder
alias Readability.ArticleBuilder alias Readability.ArticleBuilder
alias Readability.Helper
@default_options [retry_length: 250, @default_options [retry_length: 250,
min_text_length: 25, min_text_length: 25,
@ -46,7 +48,8 @@ defmodule Readability do
replace_brs: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i, replace_brs: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i,
replace_fonts: ~r/<(\/?)font[^>]*>/i, replace_fonts: ~r/<(\/?)font[^>]*>/i,
normalize: ~r/\s{2,}/, normalize: ~r/\s{2,}/,
video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i
] ]
@type html_tree :: tuple | list @type html_tree :: tuple | list
@ -60,10 +63,23 @@ defmodule Readability do
iex> title = Readability.title(html_str) iex> title = Readability.title(html_str)
"Some title in html" "Some title in html"
""" """
@spec title(binary) :: binary @spec title(binary | html_tree) :: binary
def title(html) when is_binary(html), do: html |> parse |> title def title(html) when is_binary(html), do: html |> normalize |> title
def title(html_tree), do: TitleFinder.title(html_tree) def title(html_tree), do: TitleFinder.title(html_tree)
@doc """
Extract authors.

Accepts either a raw html binary or an already parsed html tree. A binary is
first turned into a tree with `parse/1`; the tree is then handed to
`Readability.AuthorFinder.find/1`.

## Example

    iex> authors = Readability.authors(html_str)
    ["José Valim", "chrismccord"]

"""
@spec authors(binary | html_tree) :: [binary]
def authors(html) when is_binary(html), do: html |> parse |> authors
def authors(html_tree), do: AuthorFinder.find(html_tree)
@doc """ @doc """
Using a variety of metrics (content score, classname, element types), find the content that is Using a variety of metrics (content score, classname, element types), find the content that is
most likely to be the stuff a user wants to read most likely to be the stuff a user wants to read
@ -78,23 +94,24 @@ defmodule Readability do
def article(raw_html, opts \\ []) do def article(raw_html, opts \\ []) do
opts = Keyword.merge(@default_options, opts) opts = Keyword.merge(@default_options, opts)
raw_html raw_html
|> parse |> normalize
|> ArticleBuilder.build(opts) |> ArticleBuilder.build(opts)
end end
@doc """ @doc """
return raw html binary from html_tree return attributes, tags cleaned html
""" """
@spec raw_html(html_tree) :: binary @spec readable_html(html_tree) :: binary
def raw_html(html_tree) do def readable_html(html_tree) do
html_tree |> Floki.raw_html html_tree
|> Helper.remove_attrs(regexes[:protect_attrs])
|> raw_html
end end
@doc """ @doc """
return only text binary from html_tree return only text binary from html_tree
""" """
@spec raw_html(html_tree) :: binary @spec readable_text(html_tree) :: binary
def readable_text(html_tree) do def readable_text(html_tree) do
# TODO: Remove image caption when extract only text # TODO: Remove image caption when extract only text
tags_to_br = ~r/<\/(p|div|article|h\d)/i tags_to_br = ~r/<\/(p|div|article|h\d)/i
@ -105,11 +122,19 @@ defmodule Readability do
|> String.strip |> String.strip
end end
@doc """
Serialize an html_tree back into its raw html binary.

This is a thin wrapper that delegates to `Floki.raw_html/1`.
"""
@spec raw_html(html_tree) :: binary
def raw_html(html_tree), do: Floki.raw_html(html_tree)
@doc """ @doc """
Normalize and Parse to html tree(tuple or list)) from binary html Normalize and Parse to html tree(tuple or list)) from binary html
""" """
@spec parse(binary) :: html_tree @spec parse(binary) :: html_tree
def parse(raw_html) do def normalize(raw_html) do
raw_html raw_html
|> String.replace(Readability.regexes[:replace_brs], "</p><p>") |> String.replace(Readability.regexes[:replace_brs], "</p><p>")
|> String.replace(Readability.regexes[:replace_fonts], "<\1span>") |> String.replace(Readability.regexes[:replace_fonts], "<\1span>")
@ -118,6 +143,8 @@ defmodule Readability do
|> Floki.filter_out(:comment) |> Floki.filter_out(:comment)
end end
# Parse a raw html binary into an html tree by delegating to Floki.parse/1.
# Only binaries are accepted (guarded by is_binary/1); any other argument
# raises a FunctionClauseError because there is no other clause.
def parse(raw_html) when is_binary(raw_html), do: Floki.parse(raw_html)
def regexes, do: @regexes def regexes, do: @regexes
def default_options, do: @default_options def default_options, do: @default_options

View File

@ -0,0 +1,38 @@
defmodule Readability.AuthorFinder do
  @moduledoc """
  AuthorFinder extracts authors from the author-related `<meta>` tags of a document.
  """

  @type html_tree :: tuple | list

  @doc """
  Extract authors.

  Looks up the author string with `find_by_meta_tag/1` and splits it into
  individual names. Returns `[]` when the document has no usable author
  meta tag.
  """
  @spec find(html_tree) :: [binary]
  def find(html_tree) do
    case find_by_meta_tag(html_tree) do
      # No author meta tag: return an empty list instead of crashing in String.split/2.
      nil -> []
      author_names -> split_author_names(author_names)
    end
  end

  @doc """
  Return the stripped `content` of the first `<meta>` tag whose `name` or
  `property` attribute contains "author" and whose content is not blank.

  Returns `nil` when no such tag exists.
  """
  @spec find_by_meta_tag(html_tree) :: binary | nil
  def find_by_meta_tag(html_tree) do
    html_tree
    |> Floki.find("meta[name*=author], meta[property*=author]")
    |> Enum.map(fn meta ->
      meta
      |> Floki.attribute("content")
      |> Floki.text
      |> String.strip
    end)
    |> Enum.find(&(&1 != ""))
  end

  # Split a raw author string such as "By Jane Doe, John Roe and Ann Poe" into names.
  # Separators are ", ", " and " and the word "by " (case-insensitive). The `\b`
  # keeps "by" from matching inside a name ("Abby Smith" stays intact).
  defp split_author_names(author_name) do
    author_name
    |> String.split(~r/,\s|\sand\s|\bby\s/i)
    |> Enum.map(&String.strip/1)
    |> Enum.reject(&(&1 == ""))
  end
end

View File

@ -24,8 +24,7 @@ defmodule Readability.Sanitizer do
html_tree = html_tree html_tree = html_tree
|> Helper.remove_tag(conditionally_cleaing_fn(candidates)) |> Helper.remove_tag(conditionally_cleaing_fn(candidates))
end end
html_tree
html_tree |> Helper.remove_attrs("style")
end end
defp conditionally_cleaing_fn(candidates) do defp conditionally_cleaing_fn(candidates) do

View File

@ -2,7 +2,7 @@ defmodule Readability.Mixfile do
@moduledoc """ @moduledoc """
""" """
@version "0.3.1" @version "0.4.0"
@description """ @description """
Readability library for extracting and curating articles. Readability library for extracting and curating articles.
""" """
@ -25,7 +25,8 @@ defmodule Readability.Mixfile do
# Type "mix help compile.app" for more information # Type "mix help compile.app" for more information
def application do def application do
[applications: [:logger, [applications: [:logger,
:floki :floki,
:httpoison
]] ]]
end end
@ -40,6 +41,7 @@ defmodule Readability.Mixfile do
# Type "mix help deps" for more examples and options # Type "mix help deps" for more examples and options
defp deps do defp deps do
[{:floki, "~> 0.8.0"}, [{:floki, "~> 0.8.0"},
{:httpoison, "~> 0.8.0"},
{:earmark, "~> 0.1", only: :dev}, {:earmark, "~> 0.1", only: :dev},
{:ex_doc, "~> 0.11", only: :dev}, {:ex_doc, "~> 0.11", only: :dev},
{:credo, "~> 0.3", only: [:dev, :test]}, {:credo, "~> 0.3", only: [:dev, :test]},

View File

@ -0,0 +1,25 @@
defmodule Readability.AuthorFinderTest do
  use ExUnit.Case, async: true

  alias Readability.AuthorFinder

  # Each test loads a saved page through TestHelper.read_fixture/1 and checks
  # the exact list of names AuthorFinder.find/1 extracts from it.

  test "extracting bbc format author" do
    html = TestHelper.read_fixture("bbc.html")
    assert AuthorFinder.find(html) == ["BBC News"]
  end

  test "extracting buzzfeed format author" do
    html = TestHelper.read_fixture("buzzfeed.html")
    assert AuthorFinder.find(html) == ["Salvador Hernandez", "Hamza Shaban"]
  end

  test "extracting medium format author" do
    html = TestHelper.read_fixture("medium.html")
    assert AuthorFinder.find(html) == ["Ken Mazaika"]
  end

  test "extracting nytimes format author" do
    html = TestHelper.read_fixture("nytimes.html")
    assert AuthorFinder.find(html) == ["Judith H. Dobrzynski"]
  end
end

View File

@ -6,8 +6,8 @@ defmodule ReadabilityTest do
opts = [clean_conditionally: false] opts = [clean_conditionally: false]
nytimes = Readability.article(html, opts) nytimes = Readability.article(html, opts)
nytimes_html = Readability.raw_html(nytimes) nytimes_html = Readability.readable_html(nytimes)
assert nytimes_html =~ ~r/^<div><div class=\"story-body\">/ assert nytimes_html =~ ~r/^<div><div><figure id=\"media-100000004245260\"><div><img src=\"https/
assert nytimes_html =~ ~r/major priorities.<\/p><\/div><\/div>$/ assert nytimes_html =~ ~r/major priorities.<\/p><\/div><\/div>$/
nytimes_text = Readability.readable_text(nytimes) nytimes_text = Readability.readable_text(nytimes)
@ -19,9 +19,9 @@ defmodule ReadabilityTest do
html = TestHelper.read_fixture("bbc.html") html = TestHelper.read_fixture("bbc.html")
bbc = Readability.article(html) bbc = Readability.article(html)
bbc_html = Readability.raw_html(bbc) bbc_html = Readability.readable_html(bbc)
assert bbc_html =~ ~r/^<div><div class=\"story-body__inner\" property=\"articleBody\">/ assert bbc_html =~ ~r/^<div><div><figure><span><img alt=\"A Microsoft logo/
assert bbc_html =~ ~r/connected computing devices\".<\/p><\/div><\/div>$/ assert bbc_html =~ ~r/connected computing devices\".<\/p><\/div><\/div>$/
bbc_text = Readability.readable_text(bbc) bbc_text = Readability.readable_text(bbc)
@ -34,9 +34,9 @@ defmodule ReadabilityTest do
html = TestHelper.read_fixture("medium.html") html = TestHelper.read_fixture("medium.html")
medium = Readability.article(html) medium = Readability.article(html)
medium_html = Readability.raw_html(medium) medium_html = Readability.readable_html(medium)
assert medium_html =~ ~r/^<div><div class=\"section-inner layoutSingleColumn\">/ assert medium_html =~ ~r/^<div><div><p id=\"3476\"><strong><em>Background:/
assert medium_html =~ ~r/recommend button!<\/em><\/h3><\/div><\/div>$/ assert medium_html =~ ~r/recommend button!<\/em><\/h3><\/div><\/div>$/
medium_text = Readability.readable_text(medium) medium_text = Readability.readable_text(medium)
@ -49,9 +49,9 @@ defmodule ReadabilityTest do
html = TestHelper.read_fixture("buzzfeed.html") html = TestHelper.read_fixture("buzzfeed.html")
buzzfeed = Readability.article(html) buzzfeed = Readability.article(html)
buzzfeed_html = Readability.raw_html(buzzfeed) buzzfeed_html = Readability.readable_html(buzzfeed)
assert buzzfeed_html =~ ~r/^<div><div class=\"buzz_superlist_item_text\"><p>/ assert buzzfeed_html =~ ~r/^<div><div><p>The FBI no longer needs Apples help/
assert buzzfeed_html =~ ~r/encrypted devices.<\/p><hr\/><hr\/><hr\/><hr\/><\/div><\/div>$/ assert buzzfeed_html =~ ~r/encrypted devices.<\/p><hr\/><hr\/><hr\/><hr\/><\/div><\/div>$/
buzzfeed_text = Readability.readable_text(buzzfeed) buzzfeed_text = Readability.readable_text(buzzfeed)