add authors finder
This commit is contained in:
parent
cc5e07271a
commit
4aa8f6ecea
|
@ -1,5 +0,0 @@
|
||||||
# Change log
|
|
||||||
|
|
||||||
## [0.3.0] - 2016.04.24
|
|
||||||
|
|
||||||
- Release!!
|
|
|
@ -0,0 +1,14 @@
|
||||||
|
# Change log
|
||||||
|
|
||||||
|
All notable changes to this project will be documented in this file.
|
||||||
|
This project adheres to [Semantic Versioning](http://semver.org/).
|
||||||
|
|
||||||
|
## [0.4.0] - 2016.04.28
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- Add author extractor function
|
||||||
|
- Add `readable_html` function
|
||||||
|
|
||||||
|
## [0.3.1] - 2016.04.24
|
||||||
|
|
||||||
|
- Release!!
|
28
README.md
28
README.md
|
@ -14,7 +14,7 @@ If [available in Hex](https://hex.pm/docs/publish), the package can be installed
|
||||||
|
|
||||||
```elixir
|
```elixir
|
||||||
def deps do
|
def deps do
|
||||||
[{:readability, "~> 0.3"}]
|
[{:readability, "~> 0.4"}]
|
||||||
end
|
end
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -28,23 +28,29 @@ If [available in Hex](https://hex.pm/docs/publish), the package can be installed
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
The example below, `html` variable is the html source from blog content "[Elixir Design Goals](http://elixir-lang.org/blog/2013/08/08/elixir-design-goals/)".
|
|
||||||
|
|
||||||
### Examples
|
### Examples
|
||||||
```elixir
|
```elixir
|
||||||
|
### Get example page.
|
||||||
|
%{status_code: 200, body: html} = HTTPoison.get!("https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58")
|
||||||
|
|
||||||
### Extract the title
|
### Extract the title.
|
||||||
Readability.title(html)
|
Readability.title(html)
|
||||||
#=> Elixir Design Goals
|
#=> "Why I’m betting on Elixir"
|
||||||
|
|
||||||
|
### Extract authors.
|
||||||
|
Readability.authors(html)
|
||||||
|
#=> ["Ken Mazaika"]
|
||||||
|
|
||||||
|
|
||||||
### Extract the primary content with transformed html.
|
### Extract the primary content with transformed html.
|
||||||
html
|
html
|
||||||
|> Readability.article
|
|> Readability.article
|
||||||
|> Readability.raw_html
|
|> Readability.readable_html
|
||||||
#=>
|
#=>
|
||||||
# <div><div class=\"entry-content\"><p>During the last year,
|
# <div><div><p id=\"3476\"><strong><em>Background: </em></strong><em>I’ve spent...
|
||||||
# ...
|
# ...
|
||||||
# ... out our sidebar for other learning resources.</p></div></div>
|
# ...button!</em></h3></div></div>
|
||||||
|
|
||||||
|
|
||||||
### Extract only text from the primary content.
|
### Extract only text from the primary content.
|
||||||
html
|
html
|
||||||
|
@ -52,9 +58,9 @@ html
|
||||||
|> Readability.readable_text
|
|> Readability.readable_text
|
||||||
|
|
||||||
#=>
|
#=>
|
||||||
# During the last year, we have spoken at many conferences spreading the word about Elixir. We usually s.....
|
# Background: I’ve spent the past 6 years building web applications in Ruby and.....
|
||||||
# ...
|
# ...
|
||||||
# ... started guide, or check out our sidebar for other learning resources.
|
# ... value in this article, it would mean a lot to me if you hit the recommend button!
|
||||||
```
|
```
|
||||||
|
|
||||||
### Options
|
### Options
|
||||||
|
@ -75,7 +81,7 @@ To run the test suite:
|
||||||
$ mix test
|
$ mix test
|
||||||
|
|
||||||
## Todo
|
## Todo
|
||||||
* [ ] Extract authors
|
* [x] Extract authors
|
||||||
* [ ] Extract Images
|
* [ ] Extract Images
|
||||||
* [ ] Extract Videos
|
* [ ] Extract Videos
|
||||||
* [ ] Convert relative paths into absolute paths of `img#src` and `a#href`
|
* [ ] Convert relative paths into absolute paths of `img#src` and `a#href`
|
||||||
|
|
|
@ -23,7 +23,9 @@ defmodule Readability do
|
||||||
"""
|
"""
|
||||||
|
|
||||||
alias Readability.TitleFinder
|
alias Readability.TitleFinder
|
||||||
|
alias Readability.AuthorFinder
|
||||||
alias Readability.ArticleBuilder
|
alias Readability.ArticleBuilder
|
||||||
|
alias Readability.Helper
|
||||||
|
|
||||||
@default_options [retry_length: 250,
|
@default_options [retry_length: 250,
|
||||||
min_text_length: 25,
|
min_text_length: 25,
|
||||||
|
@ -46,7 +48,8 @@ defmodule Readability do
|
||||||
replace_brs: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i,
|
replace_brs: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i,
|
||||||
replace_fonts: ~r/<(\/?)font[^>]*>/i,
|
replace_fonts: ~r/<(\/?)font[^>]*>/i,
|
||||||
normalize: ~r/\s{2,}/,
|
normalize: ~r/\s{2,}/,
|
||||||
video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i
|
video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
|
||||||
|
protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i
|
||||||
]
|
]
|
||||||
|
|
||||||
@type html_tree :: tuple | list
|
@type html_tree :: tuple | list
|
||||||
|
@ -60,10 +63,23 @@ defmodule Readability do
|
||||||
iex> title = Readability.title(html_str)
|
iex> title = Readability.title(html_str)
|
||||||
"Some title in html"
|
"Some title in html"
|
||||||
"""
|
"""
|
||||||
@spec title(binary) :: binary
|
@spec title(binary | html_tree) :: binary
|
||||||
def title(html) when is_binary(html), do: html |> parse |> title
|
def title(html) when is_binary(html), do: html |> normalize |> title
|
||||||
def title(html_tree), do: TitleFinder.title(html_tree)
|
def title(html_tree), do: TitleFinder.title(html_tree)
|
||||||
|
|
||||||
|
|
||||||
|
@doc """
|
||||||
|
Extract authors
|
||||||
|
|
||||||
|
## Example
|
||||||
|
|
||||||
|
iex> authors = Readability.authors(html_str)
|
||||||
|
["José Valim", "chrismccord"]
|
||||||
|
"""
|
||||||
|
@spec authors(binary | html_tree) :: list[binary]
|
||||||
|
def authors(html) when is_binary(html), do: html |> parse |> authors
|
||||||
|
def authors(html_tree), do: AuthorFinder.find(html_tree)
|
||||||
|
|
||||||
@doc """
|
@doc """
|
||||||
Using a variety of metrics (content score, classname, element types), find the content that is
|
Using a variety of metrics (content score, classname, element types), find the content that is
|
||||||
most likely to be the stuff a user wants to read
|
most likely to be the stuff a user wants to read
|
||||||
|
@ -78,23 +94,24 @@ defmodule Readability do
|
||||||
def article(raw_html, opts \\ []) do
|
def article(raw_html, opts \\ []) do
|
||||||
opts = Keyword.merge(@default_options, opts)
|
opts = Keyword.merge(@default_options, opts)
|
||||||
raw_html
|
raw_html
|
||||||
|> parse
|
|> normalize
|
||||||
|> ArticleBuilder.build(opts)
|
|> ArticleBuilder.build(opts)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
@doc """
|
@doc """
|
||||||
return raw html binary from html_tree
|
Return html with attributes and tags cleaned.
|
||||||
"""
|
"""
|
||||||
@spec raw_html(html_tree) :: binary
|
@spec readable_html(html_tree) :: binary
|
||||||
def raw_html(html_tree) do
|
def readable_html(html_tree) do
|
||||||
html_tree |> Floki.raw_html
|
html_tree
|
||||||
|
|> Helper.remove_attrs(regexes[:protect_attrs])
|
||||||
|
|> raw_html
|
||||||
end
|
end
|
||||||
|
|
||||||
@doc """
|
@doc """
|
||||||
return only text binary from html_tree
|
return only text binary from html_tree
|
||||||
"""
|
"""
|
||||||
@spec raw_html(html_tree) :: binary
|
@spec readable_text(html_tree) :: binary
|
||||||
def readable_text(html_tree) do
|
def readable_text(html_tree) do
|
||||||
# TODO: Remove image caption when extract only text
|
# TODO: Remove image caption when extract only text
|
||||||
tags_to_br = ~r/<\/(p|div|article|h\d)/i
|
tags_to_br = ~r/<\/(p|div|article|h\d)/i
|
||||||
|
@ -105,11 +122,19 @@ defmodule Readability do
|
||||||
|> String.strip
|
|> String.strip
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@doc """
|
||||||
|
return raw html binary from html_tree
|
||||||
|
"""
|
||||||
|
@spec raw_html(html_tree) :: binary
|
||||||
|
def raw_html(html_tree) do
|
||||||
|
html_tree |> Floki.raw_html
|
||||||
|
end
|
||||||
|
|
||||||
@doc """
|
@doc """
|
||||||
Normalize and Parse to html tree(tuple or list)) from binary html
|
Normalize and parse binary html into an html tree (tuple or list).
|
||||||
"""
|
"""
|
||||||
@spec parse(binary) :: html_tree
|
@spec parse(binary) :: html_tree
|
||||||
def parse(raw_html) do
|
def normalize(raw_html) do
|
||||||
raw_html
|
raw_html
|
||||||
|> String.replace(Readability.regexes[:replace_brs], "</p><p>")
|
|> String.replace(Readability.regexes[:replace_brs], "</p><p>")
|
||||||
|> String.replace(Readability.regexes[:replace_fonts], "<\1span>")
|
|> String.replace(Readability.regexes[:replace_fonts], "<\1span>")
|
||||||
|
@ -118,6 +143,8 @@ defmodule Readability do
|
||||||
|> Floki.filter_out(:comment)
|
|> Floki.filter_out(:comment)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def parse(raw_html) when is_binary(raw_html), do: Floki.parse(raw_html)
|
||||||
|
|
||||||
def regexes, do: @regexes
|
def regexes, do: @regexes
|
||||||
|
|
||||||
def default_options, do: @default_options
|
def default_options, do: @default_options
|
||||||
|
|
|
@ -0,0 +1,38 @@
|
||||||
|
defmodule Readability.AuthorFinder do
  @moduledoc """
  AuthorFinder extracts author names from the author-related `<meta>` tags
  of an html document.
  """

  @type html_tree :: tuple | list

  @doc """
  Extract authors.

  Takes the author string returned by `find_by_meta_tag/1` and splits it into
  individual names on `", "`, `" and "` and a leading `"by "`.

  Returns a list of names. When no author meta tag with content is found the
  result is an empty list.
  """
  @spec find(html_tree) :: [binary]
  def find(html_tree) do
    html_tree
    |> find_by_meta_tag
    |> split_author_names
  end

  @doc """
  Return the stripped `content` of the first `<meta>` tag whose `name` or
  `property` attribute contains `author` and whose content is not empty.

  Returns `nil` when there is no such tag.
  """
  @spec find_by_meta_tag(html_tree) :: binary | nil
  def find_by_meta_tag(html_tree) do
    html_tree
    |> Floki.find("meta[name*=author], meta[property*=author]")
    |> Enum.map(fn(meta) ->
      meta
      |> Floki.attribute("content")
      |> Floki.text
      |> String.strip
    end)
    |> Enum.reject(&(&1 == ""))
    # List.first/1 yields nil for an empty list, so no length check is needed.
    |> List.first
  end

  # No author meta tag was found, so there is nothing to split.
  defp split_author_names(nil), do: []
  defp split_author_names(author_name) do
    # `\b` keeps "by " from matching inside a name such as "Bobby Jones".
    author_name
    |> String.split(~r/,\s|\sand\s|\bby\s/i)
    |> Enum.reject(&(&1 == ""))
  end
end
|
|
@ -24,8 +24,7 @@ defmodule Readability.Sanitizer do
|
||||||
html_tree = html_tree
|
html_tree = html_tree
|
||||||
|> Helper.remove_tag(conditionally_cleaing_fn(candidates))
|
|> Helper.remove_tag(conditionally_cleaing_fn(candidates))
|
||||||
end
|
end
|
||||||
|
html_tree
|
||||||
html_tree |> Helper.remove_attrs("style")
|
|
||||||
end
|
end
|
||||||
|
|
||||||
defp conditionally_cleaing_fn(candidates) do
|
defp conditionally_cleaing_fn(candidates) do
|
||||||
|
|
6
mix.exs
6
mix.exs
|
@ -2,7 +2,7 @@ defmodule Readability.Mixfile do
|
||||||
@moduledoc """
|
@moduledoc """
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@version "0.3.1"
|
@version "0.4.0"
|
||||||
@description """
|
@description """
|
||||||
Readability library for extracting and curating articles.
|
Readability library for extracting and curating articles.
|
||||||
"""
|
"""
|
||||||
|
@ -25,7 +25,8 @@ defmodule Readability.Mixfile do
|
||||||
# Type "mix help compile.app" for more information
|
# Type "mix help compile.app" for more information
|
||||||
def application do
|
def application do
|
||||||
[applications: [:logger,
|
[applications: [:logger,
|
||||||
:floki
|
:floki,
|
||||||
|
:httpoison
|
||||||
]]
|
]]
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -40,6 +41,7 @@ defmodule Readability.Mixfile do
|
||||||
# Type "mix help deps" for more examples and options
|
# Type "mix help deps" for more examples and options
|
||||||
defp deps do
|
defp deps do
|
||||||
[{:floki, "~> 0.8.0"},
|
[{:floki, "~> 0.8.0"},
|
||||||
|
{:httpoison, "~> 0.8.0"},
|
||||||
{:earmark, "~> 0.1", only: :dev},
|
{:earmark, "~> 0.1", only: :dev},
|
||||||
{:ex_doc, "~> 0.11", only: :dev},
|
{:ex_doc, "~> 0.11", only: :dev},
|
||||||
{:credo, "~> 0.3", only: [:dev, :test]},
|
{:credo, "~> 0.3", only: [:dev, :test]},
|
||||||
|
|
|
@ -0,0 +1,25 @@
|
||||||
|
defmodule Readability.AuthoFinderTest do
  use ExUnit.Case, async: true

  alias Readability.AuthorFinder

  # Each test loads one fixture page and checks the authors extracted from it.

  test "extracting bbc format author" do
    authors = "bbc.html" |> TestHelper.read_fixture |> AuthorFinder.find
    assert authors == ["BBC News"]
  end

  test "extracting buzzfeed format author" do
    authors = "buzzfeed.html" |> TestHelper.read_fixture |> AuthorFinder.find
    assert authors == ["Salvador Hernandez", "Hamza Shaban"]
  end

  test "extracting medium format author" do
    authors = "medium.html" |> TestHelper.read_fixture |> AuthorFinder.find
    assert authors == ["Ken Mazaika"]
  end

  test "extracting nytimes format author" do
    authors = "nytimes.html" |> TestHelper.read_fixture |> AuthorFinder.find
    assert authors == ["Judith H. Dobrzynski"]
  end
end
|
|
@ -6,8 +6,8 @@ defmodule ReadabilityTest do
|
||||||
opts = [clean_conditionally: false]
|
opts = [clean_conditionally: false]
|
||||||
nytimes = Readability.article(html, opts)
|
nytimes = Readability.article(html, opts)
|
||||||
|
|
||||||
nytimes_html = Readability.raw_html(nytimes)
|
nytimes_html = Readability.readable_html(nytimes)
|
||||||
assert nytimes_html =~ ~r/^<div><div class=\"story-body\">/
|
assert nytimes_html =~ ~r/^<div><div><figure id=\"media-100000004245260\"><div><img src=\"https/
|
||||||
assert nytimes_html =~ ~r/major priorities.<\/p><\/div><\/div>$/
|
assert nytimes_html =~ ~r/major priorities.<\/p><\/div><\/div>$/
|
||||||
|
|
||||||
nytimes_text = Readability.readable_text(nytimes)
|
nytimes_text = Readability.readable_text(nytimes)
|
||||||
|
@ -19,9 +19,9 @@ defmodule ReadabilityTest do
|
||||||
html = TestHelper.read_fixture("bbc.html")
|
html = TestHelper.read_fixture("bbc.html")
|
||||||
bbc = Readability.article(html)
|
bbc = Readability.article(html)
|
||||||
|
|
||||||
bbc_html = Readability.raw_html(bbc)
|
bbc_html = Readability.readable_html(bbc)
|
||||||
|
|
||||||
assert bbc_html =~ ~r/^<div><div class=\"story-body__inner\" property=\"articleBody\">/
|
assert bbc_html =~ ~r/^<div><div><figure><span><img alt=\"A Microsoft logo/
|
||||||
assert bbc_html =~ ~r/connected computing devices\".<\/p><\/div><\/div>$/
|
assert bbc_html =~ ~r/connected computing devices\".<\/p><\/div><\/div>$/
|
||||||
|
|
||||||
bbc_text = Readability.readable_text(bbc)
|
bbc_text = Readability.readable_text(bbc)
|
||||||
|
@ -34,9 +34,9 @@ defmodule ReadabilityTest do
|
||||||
html = TestHelper.read_fixture("medium.html")
|
html = TestHelper.read_fixture("medium.html")
|
||||||
medium = Readability.article(html)
|
medium = Readability.article(html)
|
||||||
|
|
||||||
medium_html = Readability.raw_html(medium)
|
medium_html = Readability.readable_html(medium)
|
||||||
|
|
||||||
assert medium_html =~ ~r/^<div><div class=\"section-inner layoutSingleColumn\">/
|
assert medium_html =~ ~r/^<div><div><p id=\"3476\"><strong><em>Background:/
|
||||||
assert medium_html =~ ~r/recommend button!<\/em><\/h3><\/div><\/div>$/
|
assert medium_html =~ ~r/recommend button!<\/em><\/h3><\/div><\/div>$/
|
||||||
|
|
||||||
medium_text = Readability.readable_text(medium)
|
medium_text = Readability.readable_text(medium)
|
||||||
|
@ -49,9 +49,9 @@ defmodule ReadabilityTest do
|
||||||
html = TestHelper.read_fixture("buzzfeed.html")
|
html = TestHelper.read_fixture("buzzfeed.html")
|
||||||
buzzfeed = Readability.article(html)
|
buzzfeed = Readability.article(html)
|
||||||
|
|
||||||
buzzfeed_html = Readability.raw_html(buzzfeed)
|
buzzfeed_html = Readability.readable_html(buzzfeed)
|
||||||
|
|
||||||
assert buzzfeed_html =~ ~r/^<div><div class=\"buzz_superlist_item_text\"><p>/
|
assert buzzfeed_html =~ ~r/^<div><div><p>The FBI no longer needs Apple’s help/
|
||||||
assert buzzfeed_html =~ ~r/encrypted devices.<\/p><hr\/><hr\/><hr\/><hr\/><\/div><\/div>$/
|
assert buzzfeed_html =~ ~r/encrypted devices.<\/p><hr\/><hr\/><hr\/><hr\/><\/div><\/div>$/
|
||||||
|
|
||||||
buzzfeed_text = Readability.readable_text(buzzfeed)
|
buzzfeed_text = Readability.readable_text(buzzfeed)
|
||||||
|
|
Loading…
Reference in New Issue