parent
23970d5b82
commit
93bdf48b8c
|
@ -3,11 +3,16 @@
|
||||||
All notable changes to this project will be documented in this file.
|
All notable changes to this project will be documented in this file.
|
||||||
This project adheres to [Semantic Versioning](http://semver.org/).
|
This project adheres to [Semantic Versioning](http://semver.org/).
|
||||||
|
|
||||||
|
## [0.5.0] - 2016.05.07
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- Add `Readability.summarize` function
|
||||||
|
|
||||||
## [0.4.0] - 2016.04.28
|
## [0.4.0] - 2016.04.28
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
- Add author extractor function
|
- Add author extractor function
|
||||||
- Add `readable_html` function
|
- Add `Readability.readable_html` function
|
||||||
|
|
||||||
## [0.3.1] - 2016.04.24
|
## [0.3.1] - 2016.04.24
|
||||||
|
|
||||||
|
|
79
README.md
79
README.md
|
@ -29,50 +29,74 @@ If [available in Hex](https://hex.pm/docs/publish), the package can be installed
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
### Examples
|
### Examples
|
||||||
```elixir
|
|
||||||
### Get example page.
|
|
||||||
%{status_code: 200, body: html} = HTTPoison.get!("https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58")
|
|
||||||
|
|
||||||
### Extract the title.
|
#### Just pass url
|
||||||
Readability.title(html)
|
```elixir
|
||||||
|
url = "https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58"
|
||||||
|
summary = Readability.summarize(url)
|
||||||
|
|
||||||
|
summary.title
|
||||||
#=> "Why I’m betting on Elixir"
|
#=> "Why I’m betting on Elixir"
|
||||||
|
|
||||||
### Extract authors.
|
summary.authors
|
||||||
Readability.authors(html)
|
|
||||||
#=> ["Ken Mazaika"]
|
#=> ["Ken Mazaika"]
|
||||||
|
|
||||||
|
summary.article_html
|
||||||
### Extract the primary content with transformed html.
|
|
||||||
html
|
|
||||||
|> Readability.article
|
|
||||||
|> Readability.readable_html
|
|
||||||
#=>
|
#=>
|
||||||
# <div><div><p id=\"3476\"><strong><em>Background: </em></strong><em>I’ve spent...
|
# <div><div><p id=\"3476\"><strong><em>Background: </em></strong><em>I’ve spent...
|
||||||
# ...
|
# ...
|
||||||
# ...button!</em></h3></div></div>
|
# ...button!</em></h3></div></div>
|
||||||
|
|
||||||
|
summary.article_text
|
||||||
### Extract only text from the primary content.
|
|
||||||
html
|
|
||||||
|> Readability.article
|
|
||||||
|> Readability.readable_text
|
|
||||||
|
|
||||||
#=>
|
#=>
|
||||||
# Background: I’ve spent the past 6 years building web applications in Ruby and.....
|
# Background: I’ve spent the past 6 years building web applications in Ruby and.....
|
||||||
# ...
|
# ...
|
||||||
# ... value in this article, it would mean a lot to me if you hit the recommend button!
|
# ... value in this article, it would mean a lot to me if you hit the recommend button!
|
||||||
```
|
```
|
||||||
|
|
||||||
|
#### From raw html
|
||||||
|
|
||||||
|
```elixir
|
||||||
|
### Extract the title.
|
||||||
|
Readability.title(html)
|
||||||
|
|
||||||
|
### Extract authors.
|
||||||
|
Readability.authors(html)
|
||||||
|
|
||||||
|
### Extract the primary content with transformed html.
|
||||||
|
html
|
||||||
|
|> Readability.article
|
||||||
|
|> Readability.readable_html
|
||||||
|
|
||||||
|
### Extract only text from the primary content.
|
||||||
|
html
|
||||||
|
|> Readability.article
|
||||||
|
|> Readability.readable_text
|
||||||
|
|
||||||
|
### Extract the primary images with Floki.
|
||||||
|
html
|
||||||
|
|> Readability.article
|
||||||
|
|> Floki.find("img")
|
||||||
|
|> Floki.attribute("src")
|
||||||
|
```
|
||||||
|
|
||||||
### Options
|
### Options
|
||||||
|
|
||||||
You may provide options (a keyword list) to `Readability.article`, including:
|
If the result differs from what you expect, you can pass options.
|
||||||
|
|
||||||
|
#### Example
|
||||||
|
```elixir
|
||||||
|
url = "https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58"
|
||||||
|
summary = Readability.summarize(url, [clean_conditionally: false])
|
||||||
|
```
|
||||||
|
|
||||||
* retry_length \\\\ 250
|
|
||||||
* min_text_length \\\\ 25
|
* min_text_length \\\\ 25
|
||||||
* remove_unlikely_candidates \\\\ true,
|
* remove_unlikely_candidates \\\\ true
|
||||||
* weight_classes \\\\ true,
|
* weight_classes \\\\ true
|
||||||
* clean_conditionally \\\\ true,
|
* clean_conditionally \\\\ true
|
||||||
* remove_empty_nodes \\\\ true,
|
* retry_length \\\\ 250
|
||||||
|
|
||||||
|
**You can find other algorithm and regex options in `readability.ex`**
|
||||||
|
|
||||||
## Test
|
## Test
|
||||||
|
|
||||||
|
@ -81,10 +105,17 @@ To run the test suite:
|
||||||
$ mix test
|
$ mix test
|
||||||
|
|
||||||
## Todo
|
## Todo
|
||||||
|
<<<<<<< HEAD
|
||||||
|
|
||||||
**Contributions are welcome!**
|
**Contributions are welcome!**
|
||||||
|
|
||||||
Check out [the main features milestone](https://github.com/keepcosmos/readability/milestones)
|
Check out [the main features milestone](https://github.com/keepcosmos/readability/milestones)
|
||||||
|
=======
|
||||||
|
* [x] Extract authors
|
||||||
|
* [x] More configurable
|
||||||
|
* [x] Summarize function
|
||||||
|
* [ ] Convert relative paths into absolute paths of `img#src` and `a#href`
|
||||||
|
>>>>>>> cb86b7a... add summarize function
|
||||||
|
|
||||||
## Related and Inspired Projects
|
## Related and Inspired Projects
|
||||||
|
|
||||||
|
|
|
@ -7,6 +7,9 @@ defmodule Readability do
|
||||||
```elixir
|
```elixir
|
||||||
@type html :: binary
|
@type html :: binary
|
||||||
|
|
||||||
|
# Just pass url
|
||||||
|
%Readability.Summary{title: title, authors: authors, article_html: article} = Readability.summarize(url)
|
||||||
|
|
||||||
# Extract title
|
# Extract title
|
||||||
Readability.title(html)
|
Readability.title(html)
|
||||||
|
|
||||||
|
@ -28,6 +31,7 @@ defmodule Readability do
|
||||||
alias Readability.TitleFinder
|
alias Readability.TitleFinder
|
||||||
alias Readability.AuthorFinder
|
alias Readability.AuthorFinder
|
||||||
alias Readability.ArticleBuilder
|
alias Readability.ArticleBuilder
|
||||||
|
alias Readability.Summary
|
||||||
alias Readability.Helper
|
alias Readability.Helper
|
||||||
|
|
||||||
@default_options [retry_length: 250,
|
@default_options [retry_length: 250,
|
||||||
|
@ -40,7 +44,8 @@ defmodule Readability do
|
||||||
min_image_height: 80,
|
min_image_height: 80,
|
||||||
ignore_image_format: [],
|
ignore_image_format: [],
|
||||||
blacklist: nil,
|
blacklist: nil,
|
||||||
whitelist: nil
|
whitelist: nil,
|
||||||
|
page_url: nil
|
||||||
]
|
]
|
||||||
|
|
||||||
@regexes [unlikely_candidate: ~r/combx|comment|community|disqus|extra|foot|header|hidden|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
|
@regexes [unlikely_candidate: ~r/combx|comment|community|disqus|extra|foot|header|hidden|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
|
||||||
|
@ -56,8 +61,28 @@ defmodule Readability do
|
||||||
]
|
]
|
||||||
|
|
||||||
@type html_tree :: tuple | list
|
@type html_tree :: tuple | list
|
||||||
|
@type raw_html :: binary
|
||||||
|
@type url :: binary
|
||||||
@type options :: list
|
@type options :: list
|
||||||
|
|
||||||
|
@doc """
|
||||||
|
Summarize the primary readable content of a webpage.
|
||||||
|
"""
|
||||||
|
@spec summarize(url, options) :: Summary.t
|
||||||
|
def summarize(url, opts \\ []) do
|
||||||
|
opts = Keyword.merge(opts, [page_url: url])
|
||||||
|
%{status_code: _, body: raw_html} = HTTPoison.get!(url)
|
||||||
|
html_tree = Helper.normalize(raw_html)
|
||||||
|
article_tree = html_tree
|
||||||
|
|> ArticleBuilder.build(opts)
|
||||||
|
|
||||||
|
%Summary{title: title(html_tree),
|
||||||
|
authors: authors(html_tree),
|
||||||
|
article_html: readable_html(article_tree),
|
||||||
|
article_text: readable_text(article_tree)
|
||||||
|
}
|
||||||
|
end
|
||||||
|
|
||||||
@doc """
|
@doc """
|
||||||
Extract title
|
Extract title
|
||||||
|
|
||||||
|
@ -67,7 +92,11 @@ defmodule Readability do
|
||||||
"Some title in html"
|
"Some title in html"
|
||||||
"""
|
"""
|
||||||
@spec title(binary | html_tree) :: binary
|
@spec title(binary | html_tree) :: binary
|
||||||
def title(html) when is_binary(html), do: html |> normalize |> title
|
def title(raw_html) when is_binary(raw_html) do
|
||||||
|
raw_html
|
||||||
|
|> Helper.normalize
|
||||||
|
|> title
|
||||||
|
end
|
||||||
def title(html_tree), do: TitleFinder.title(html_tree)
|
def title(html_tree), do: TitleFinder.title(html_tree)
|
||||||
|
|
||||||
|
|
||||||
|
@ -97,7 +126,7 @@ defmodule Readability do
|
||||||
def article(raw_html, opts \\ []) do
|
def article(raw_html, opts \\ []) do
|
||||||
opts = Keyword.merge(@default_options, opts)
|
opts = Keyword.merge(@default_options, opts)
|
||||||
raw_html
|
raw_html
|
||||||
|> normalize
|
|> Helper.normalize
|
||||||
|> ArticleBuilder.build(opts)
|
|> ArticleBuilder.build(opts)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -133,19 +162,6 @@ defmodule Readability do
|
||||||
html_tree |> Floki.raw_html
|
html_tree |> Floki.raw_html
|
||||||
end
|
end
|
||||||
|
|
||||||
@doc """
|
|
||||||
Normalize binary html and parse it into an html tree (tuple or list).
|
|
||||||
"""
|
|
||||||
@spec parse(binary) :: html_tree
|
|
||||||
def normalize(raw_html) do
|
|
||||||
raw_html
|
|
||||||
|> String.replace(Readability.regexes[:replace_brs], "</p><p>")
|
|
||||||
|> String.replace(Readability.regexes[:replace_fonts], "<\1span>")
|
|
||||||
|> String.replace(Readability.regexes[:normalize], " ")
|
|
||||||
|> Floki.parse
|
|
||||||
|> Floki.filter_out(:comment)
|
|
||||||
end
|
|
||||||
|
|
||||||
def parse(raw_html) when is_binary(raw_html), do: Floki.parse(raw_html)
|
def parse(raw_html) when is_binary(raw_html), do: Floki.parse(raw_html)
|
||||||
|
|
||||||
def regexes, do: @regexes
|
def regexes, do: @regexes
|
||||||
|
|
|
@ -11,7 +11,9 @@ defmodule Readability.AuthorFinder do
|
||||||
@spec find(html_tree) :: [binary]
|
@spec find(html_tree) :: [binary]
|
||||||
def find(html_tree) do
|
def find(html_tree) do
|
||||||
author_names = find_by_meta_tag(html_tree)
|
author_names = find_by_meta_tag(html_tree)
|
||||||
split_author_names(author_names)
|
if author_names do
|
||||||
|
split_author_names(author_names)
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def find_by_meta_tag(html_tree) do
|
def find_by_meta_tag(html_tree) do
|
||||||
|
|
|
@ -86,6 +86,19 @@ defmodule Readability.Helper do
|
||||||
end)
|
end)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@doc """
|
||||||
|
Normalize binary html and parse it into an html tree (tuple or list).
|
||||||
|
"""
|
||||||
|
@spec normalize(binary) :: html_tree
|
||||||
|
def normalize(raw_html) do
|
||||||
|
raw_html
|
||||||
|
|> String.replace(Readability.regexes[:replace_brs], "</p><p>")
|
||||||
|
|> String.replace(Readability.regexes[:replace_fonts], "<\1span>")
|
||||||
|
|> String.replace(Readability.regexes[:normalize], " ")
|
||||||
|
|> Floki.parse
|
||||||
|
|> Floki.filter_out(:comment)
|
||||||
|
end
|
||||||
|
|
||||||
defp candidates_selector do
|
defp candidates_selector do
|
||||||
["p", "td"]
|
["p", "td"]
|
||||||
|> Enum.map(fn(s) ->
|
|> Enum.map(fn(s) ->
|
||||||
|
|
|
@ -0,0 +1,3 @@
|
||||||
|
defmodule Readability.Summary do
|
||||||
|
defstruct title: nil, authors: [], article_html: nil, article_text: nil
|
||||||
|
end
|
Loading…
Reference in New Issue