commit
7e99bb3b6a
|
@ -3,11 +3,16 @@
|
|||
All notable changes to this project will be documented in this file.
|
||||
This project adheres to [Semantic Versioning](http://semver.org/).
|
||||
|
||||
## [0.5.0] - 2016.05.07
|
||||
|
||||
### Added
|
||||
- Add `Readability.summarize` function
|
||||
|
||||
## [0.4.0] - 2016.04.28
|
||||
|
||||
### Added
|
||||
- Add author extractor function
|
||||
- Add `readable_html` function
|
||||
- Add `Readability.readable_html` function
|
||||
|
||||
## [0.3.1] - 2016.04.24
|
||||
|
||||
|
|
79
README.md
79
README.md
|
@ -29,50 +29,74 @@ If [available in Hex](https://hex.pm/docs/publish), the package can be installed
|
|||
## Usage
|
||||
|
||||
### Examples
|
||||
```elixir
|
||||
### Get example page.
|
||||
%{status_code: 200, body: html} = HTTPoison.get!("https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58")
|
||||
|
||||
### Extract the title.
|
||||
Readability.title(html)
|
||||
#### Just pass url
|
||||
```elixir
|
||||
url = "https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58"
|
||||
summary = Readability.summarize(url)
|
||||
|
||||
summary.title
|
||||
#=> "Why I’m betting on Elixir"
|
||||
|
||||
### Extract authors.
|
||||
Readability.authors(html)
|
||||
summary.authors
|
||||
#=> ["Ken Mazaika"]
|
||||
|
||||
|
||||
### Extract the primary content with transformed html.
|
||||
html
|
||||
|> Readability.article
|
||||
|> Readability.readable_html
|
||||
summary.article_html
|
||||
#=>
|
||||
# <div><div><p id=\"3476\"><strong><em>Background: </em></strong><em>I’ve spent...
|
||||
# ...
|
||||
# ...button!</em></h3></div></div>
|
||||
|
||||
|
||||
### Extract only text from the primary content.
|
||||
html
|
||||
|> Readability.article
|
||||
|> Readability.readable_text
|
||||
|
||||
summary.article_text
|
||||
#=>
|
||||
# Background: I’ve spent the past 6 years building web applications in Ruby and.....
|
||||
# ...
|
||||
# ... value in this article, it would mean a lot to me if you hit the recommend button!
|
||||
```
|
||||
|
||||
#### From raw html
|
||||
|
||||
```elixir
|
||||
### Extract the title.
|
||||
Readability.title(html)
|
||||
|
||||
### Extract authors.
|
||||
Readability.authors(html)
|
||||
|
||||
### Extract the primary content with transformed html.
|
||||
html
|
||||
|> Readability.article
|
||||
|> Readability.readable_html
|
||||
|
||||
### Extract only text from the primary content.
|
||||
html
|
||||
|> Readability.article
|
||||
|> Readability.readable_text
|
||||
|
||||
### You can also extract the primary images with Floki.
|
||||
html
|
||||
|> Readability.article
|
||||
|> Floki.find("img")
|
||||
|> Floki.attribute("src")
|
||||
```
|
||||
|
||||
### Options
|
||||
|
||||
You may provide options (as a keyword list) to `Readability.article`, including:
|
||||
If the result is different from what you expect, you can pass options.
|
||||
|
||||
#### Example
|
||||
```elixir
|
||||
url = "https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58"
|
||||
summary = Readability.summarize(url, [clean_conditionally: false])
|
||||
```
|
||||
|
||||
* retry_length \\\\ 250
|
||||
* min_text_length \\\\ 25
|
||||
* remove_unlikely_candidates \\\\ true,
|
||||
* weight_classes \\\\ true,
|
||||
* clean_conditionally \\\\ true,
|
||||
* remove_empty_nodes \\\\ true,
|
||||
* remove_unlikely_candidates \\\\ true
|
||||
* weight_classes \\\\ true
|
||||
* clean_conditionally \\\\ true
|
||||
* retry_length \\\\ 250
|
||||
|
||||
**You can find other algorithm and regex options in `readability.ex`**
|
||||
|
||||
## Test
|
||||
|
||||
|
@ -81,10 +105,17 @@ To run the test suite:
|
|||
$ mix test
|
||||
|
||||
## Todo
|
||||
|
||||
**Contributions are welcome!**
|
||||
|
||||
Check out [the main features milestone](https://github.com/keepcosmos/readability/milestones)
|
||||
|
||||
* [x] Extract authors
|
||||
* [x] More configurable
|
||||
* [x] Summarize function
|
||||
* [ ] Convert relative paths into absolute paths of `img#src` and `a#href`
|
||||
|
||||
## Related and Inspired Projects
|
||||
|
||||
|
|
|
@ -7,6 +7,9 @@ defmodule Readability do
|
|||
```elixir
|
||||
@type html :: binary
|
||||
|
||||
# Just pass url
|
||||
%Readability.Summary{title: title, authors: authors, article_html: article} = Readability.summarize(url)
|
||||
|
||||
# Extract title
|
||||
Readability.title(html)
|
||||
|
||||
|
@ -28,6 +31,7 @@ defmodule Readability do
|
|||
alias Readability.TitleFinder
|
||||
alias Readability.AuthorFinder
|
||||
alias Readability.ArticleBuilder
|
||||
alias Readability.Summary
|
||||
alias Readability.Helper
|
||||
|
||||
@default_options [retry_length: 250,
|
||||
|
@ -40,7 +44,8 @@ defmodule Readability do
|
|||
min_image_height: 80,
|
||||
ignore_image_format: [],
|
||||
blacklist: nil,
|
||||
whitelist: nil
|
||||
whitelist: nil,
|
||||
page_url: nil
|
||||
]
|
||||
|
||||
@regexes [unlikely_candidate: ~r/combx|comment|community|disqus|extra|foot|header|hidden|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
|
||||
|
@ -56,8 +61,28 @@ defmodule Readability do
|
|||
]
|
||||
|
||||
@type html_tree :: tuple | list
|
||||
@type raw_html :: binary
|
||||
@type url :: binary
|
||||
@type options :: list
|
||||
|
||||
@doc """
Summarizes the primary readable content of the webpage at `url`.

The page is fetched with `HTTPoison.get!/1`, normalized into an html tree
with `Helper.normalize/1`, and the article tree is built with
`ArticleBuilder.build/2`. The result is a `Readability.Summary` struct
holding the title, the authors, the readable html and the readable text.

`opts` is merged over the module's default options (the same way
`article/2` does it), so any option that is not given falls back to its
default. `:page_url` is always set to `url`, even if `opts` contains
another value for it.

The request uses the raising `HTTPoison.get!/1`, and the response status
code is matched but not checked.
"""
@spec summarize(url, options) :: Summary.t
def summarize(url, opts \\ []) do
  # Later merges win: defaults < caller options < the page url itself.
  opts =
    @default_options
    |> Keyword.merge(opts)
    |> Keyword.merge(page_url: url)

  %{status_code: _, body: raw_html} = HTTPoison.get!(url)

  html_tree = Helper.normalize(raw_html)
  article_tree = ArticleBuilder.build(html_tree, opts)

  # Title and authors are looked up in the whole page, the readable
  # html/text are rendered from the extracted article tree only.
  %Summary{
    title: title(html_tree),
    authors: authors(html_tree),
    article_html: readable_html(article_tree),
    article_text: readable_text(article_tree)
  }
end
|
||||
|
||||
@doc """
|
||||
Extract title
|
||||
|
||||
|
@ -67,7 +92,11 @@ defmodule Readability do
|
|||
"Some title in html"
|
||||
"""
|
||||
@spec title(binary | html_tree) :: binary
|
||||
def title(html) when is_binary(html), do: html |> normalize |> title
|
||||
def title(raw_html) when is_binary(raw_html) do
|
||||
raw_html
|
||||
|> Helper.normalize
|
||||
|> title
|
||||
end
|
||||
def title(html_tree), do: TitleFinder.title(html_tree)
|
||||
|
||||
|
||||
|
@ -97,7 +126,7 @@ defmodule Readability do
|
|||
def article(raw_html, opts \\ []) do
|
||||
opts = Keyword.merge(@default_options, opts)
|
||||
raw_html
|
||||
|> normalize
|
||||
|> Helper.normalize
|
||||
|> ArticleBuilder.build(opts)
|
||||
end
|
||||
|
||||
|
@ -133,19 +162,6 @@ defmodule Readability do
|
|||
html_tree |> Floki.raw_html
|
||||
end
|
||||
|
||||
@doc """
|
||||
Normalize binary html and parse it into an html tree (tuple or list).
|
||||
"""
|
||||
@spec parse(binary) :: html_tree
|
||||
def normalize(raw_html) do
|
||||
raw_html
|
||||
|> String.replace(Readability.regexes[:replace_brs], "</p><p>")
|
||||
|> String.replace(Readability.regexes[:replace_fonts], "<\1span>")
|
||||
|> String.replace(Readability.regexes[:normalize], " ")
|
||||
|> Floki.parse
|
||||
|> Floki.filter_out(:comment)
|
||||
end
|
||||
|
||||
def parse(raw_html) when is_binary(raw_html), do: Floki.parse(raw_html)
|
||||
|
||||
def regexes, do: @regexes
|
||||
|
|
|
@ -11,7 +11,9 @@ defmodule Readability.AuthorFinder do
|
|||
@spec find(html_tree) :: [binary]
|
||||
def find(html_tree) do
|
||||
author_names = find_by_meta_tag(html_tree)
|
||||
split_author_names(author_names)
|
||||
if author_names do
|
||||
split_author_names(author_names)
|
||||
end
|
||||
end
|
||||
|
||||
def find_by_meta_tag(html_tree) do
|
||||
|
|
|
@ -86,6 +86,19 @@ defmodule Readability.Helper do
|
|||
end)
|
||||
end
|
||||
|
||||
@doc """
Normalizes binary html and parses it into an html tree (tuple or list).

Before parsing, three regex replacements from `Readability.regexes/0` are
applied to the raw string:

  * matches of `:replace_brs` become `"</p><p>"`
  * matches of `:replace_fonts` become a `span` tag, keeping whatever the
    regex captured in its first group
  * matches of `:normalize` become a single space

The string is then parsed with `Floki.parse/1` and comment nodes are
filtered out of the resulting tree.
"""
@spec normalize(binary) :: html_tree
def normalize(raw_html) do
  raw_html
  |> String.replace(Readability.regexes[:replace_brs], "</p><p>")
  # The backslash must be escaped: "\\1" reaches String.replace/3 as the
  # backreference \1, while "\1" inside a double-quoted string does not.
  |> String.replace(Readability.regexes[:replace_fonts], "<\\1span>")
  |> String.replace(Readability.regexes[:normalize], " ")
  |> Floki.parse
  |> Floki.filter_out(:comment)
end
|
||||
|
||||
defp candidates_selector do
|
||||
["p", "td"]
|
||||
|> Enum.map(fn(s) ->
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
defmodule Readability.Summary do
  @moduledoc """
  Result of `Readability.summarize/2`.

  Fields:

    * `:title` - title of the page
    * `:authors` - author names found in the page
    * `:article_html` - primary content rendered as html
    * `:article_text` - primary content rendered as plain text
  """

  defstruct title: nil, authors: [], article_html: nil, article_text: nil

  # NOTE(review): the field types follow how `Readability.summarize/2` fills
  # the struct; `nil` is allowed everywhere a finder may come back empty.
  # Confirm against the finder modules.
  @type t :: %__MODULE__{
          title: binary | nil,
          authors: [binary] | nil,
          article_html: binary | nil,
          article_text: binary | nil
        }
end
|
Loading…
Reference in New Issue