Merge pull request #7 from keepcosmos/summarize

add summarize function
This commit is contained in:
Jaehyun Shin 2016-05-07 18:31:56 +09:00
commit 7e99bb3b6a
7 changed files with 113 additions and 43 deletions

View File

@ -3,11 +3,16 @@
All notable changes to this project will be documented in this file.
This project adheres to [Semantic Versioning](http://semver.org/).
## [0.5.0] - 2016.05.07
### Added
- Add `Readability.summarize` function
## [0.4.0] - 2016.04.28
### Added
- Add author extractor function
- Add `readable_html` function
- Add `Readability.readable_html` function
## [0.3.1] - 2016.04.24

View File

@ -29,50 +29,74 @@ If [available in Hex](https://hex.pm/docs/publish), the package can be installed
## Usage
### Examples
```elixir
### Get example page.
%{status_code: 200, body: html} = HTTPoison.get!("https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58")
### Extract the title.
Readability.title(html)
#### Just pass url
```elixir
url = "https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58"
summary = Readability.summarize(url)
summary.title
#=> "Why I'm betting on Elixir"
### Extract authors.
Readability.authors(html)
summary.authors
#=> ["Ken Mazaika"]
### Extract the primary content with transformed html.
html
|> Readability.article
|> Readability.readable_html
summary.article_html
#=>
# <div><div><p id=\"3476\"><strong><em>Background: </em></strong><em>Ive spent...
# ...
# ...button!</em></h3></div></div>
### Extract only text from the primary content.
html
|> Readability.article
|> Readability.readable_text
summary.article_text
#=>
# Background: I've spent the past 6 years building web applications in Ruby and.....
# ...
# ... value in this article, it would mean a lot to me if you hit the recommend button!
```
#### From raw html
```elixir
### Extract the title.
Readability.title(html)
### Extract authors.
Readability.authors(html)
### Extract the primary content with transformed html.
html
|> Readability.article
|> Readability.readable_html
### Extract only text from the primary content.
html
|> Readability.article
|> Readability.readable_text
### you can extract the primary images with Floki
html
|> Readability.article
|> Floki.find("img")
|> Floki.attribute("src")
```
### Options
You may provide options(Keyword type) to `Readability.article`, including:
If the result differs from what you expect, you can pass options.
#### Example
```elixir
url = "https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58"
summary = Readability.summarize(url, [clean_conditionally: false])
```
* retry_length \\\\ 250
* min_text_length \\\\ 25
* remove_unlikely_candidates \\\\ true,
* weight_classes \\\\ true,
* clean_conditionally \\\\ true,
* remove_empty_nodes \\\\ true,
* remove_unlikely_candidates \\\\ true
* weight_classes \\\\ true
* clean_conditionally \\\\ true
* retry_length \\\\ 250
**You can find other algorithm and regex options in `readability.ex`**
## Test
@ -81,10 +105,17 @@ To run the test suite:
$ mix test
## Todo
**Contributions are welcome!**
Check out [the main features milestone](https://github.com/keepcosmos/readability/milestones)

* [x] Extract authors
* [x] More configurable
* [x] Summarize function
* [ ] Convert relative paths into absolute paths of `img#src` and `a#href`
## Related and Inspired Projects

View File

@ -7,6 +7,9 @@ defmodule Readability do
```elixir
@type html :: binary
# Just pass url
%Readability.Summary{title: title, authors: authors, article_html: article} = Readability.summarize(url)
# Extract title
Readability.title(html)
@ -28,6 +31,7 @@ defmodule Readability do
alias Readability.TitleFinder
alias Readability.AuthorFinder
alias Readability.ArticleBuilder
alias Readability.Summary
alias Readability.Helper
@default_options [retry_length: 250,
@ -40,7 +44,8 @@ defmodule Readability do
min_image_height: 80,
ignore_image_format: [],
blacklist: nil,
whitelist: nil
whitelist: nil,
page_url: nil
]
@regexes [unlikely_candidate: ~r/combx|comment|community|disqus|extra|foot|header|hidden|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
@ -56,8 +61,28 @@ defmodule Readability do
]
@type html_tree :: tuple | list
@type raw_html :: binary
@type url :: binary
@type options :: list
@doc """
Fetches the page at `url` and summarizes its primary readable content.

The page is downloaded with `HTTPoison.get!/1`, normalized into an html tree,
and run through `ArticleBuilder.build/2`. The response status code is ignored,
so whatever body comes back is summarized.

`opts` are merged over the module's default options, and `:page_url` is always
set to `url`, replacing any `:page_url` given in `opts`.

Returns a `Readability.Summary` struct holding the title, the authors, and the
article rendered both as html and as plain text.
"""
@spec summarize(url, options) :: Summary.t
def summarize(url, opts \\ []) do
  # Start from the defaults, as `article/2` does, so the builder sees every
  # option even when the caller passes only a few (or none).
  opts =
    @default_options
    |> Keyword.merge(opts)
    |> Keyword.put(:page_url, url)

  %{status_code: _, body: raw_html} = HTTPoison.get!(url)
  html_tree = Helper.normalize(raw_html)
  article_tree = ArticleBuilder.build(html_tree, opts)

  %Summary{
    title: title(html_tree),
    authors: authors(html_tree),
    article_html: readable_html(article_tree),
    article_text: readable_text(article_tree)
  }
end
@doc """
Extract title
@ -67,7 +92,11 @@ defmodule Readability do
"Some title in html"
"""
@spec title(binary | html_tree) :: binary
def title(html) when is_binary(html), do: html |> normalize |> title
# Binary input: parse it into an html tree first, then re-dispatch to the
# clause that handles an already-parsed tree.
def title(raw_html) when is_binary(raw_html) do
  html_tree = Helper.normalize(raw_html)
  title(html_tree)
end
def title(html_tree), do: TitleFinder.title(html_tree)
@ -97,7 +126,7 @@ defmodule Readability do
def article(raw_html, opts \\ []) do
opts = Keyword.merge(@default_options, opts)
raw_html
|> normalize
|> Helper.normalize
|> ArticleBuilder.build(opts)
end
@ -133,19 +162,6 @@ defmodule Readability do
html_tree |> Floki.raw_html
end
@doc """
Normalize and Parse to html tree(tuple or list)) from binary html
"""
@spec parse(binary) :: html_tree
def normalize(raw_html) do
raw_html
|> String.replace(Readability.regexes[:replace_brs], "</p><p>")
|> String.replace(Readability.regexes[:replace_fonts], "<\1span>")
|> String.replace(Readability.regexes[:normalize], " ")
|> Floki.parse
|> Floki.filter_out(:comment)
end
def parse(raw_html) when is_binary(raw_html), do: Floki.parse(raw_html)
def regexes, do: @regexes

View File

@ -11,7 +11,9 @@ defmodule Readability.AuthorFinder do
@spec find(html_tree) :: [binary]
def find(html_tree) do
author_names = find_by_meta_tag(html_tree)
split_author_names(author_names)
if author_names do
split_author_names(author_names)
end
end
def find_by_meta_tag(html_tree) do

View File

@ -86,6 +86,19 @@ defmodule Readability.Helper do
end)
end
@doc """
Normalizes raw binary html and parses it into an html tree (tuple or list).

Before parsing, the binary is rewritten with three patterns taken from
`Readability.regexes/0`:

  * `:replace_brs` matches are replaced with `</p><p>`
  * `:replace_fonts` matches are replaced with a `span` tag
  * `:normalize` matches are replaced with a single space

The result is parsed with `Floki.parse/1` and comment nodes are filtered out.
"""
@spec normalize(binary) :: html_tree
def normalize(raw_html) do
  raw_html
  |> String.replace(Readability.regexes[:replace_brs], "</p><p>")
  # "\\1" reaches the regex engine as the backreference \1, re-inserting the
  # pattern's first capture group (presumably the optional "/" of a closing
  # tag - confirm against the :replace_fonts regex). A single backslash here
  # would be consumed by the string literal and never act as a backreference.
  |> String.replace(Readability.regexes[:replace_fonts], "<\\1span>")
  |> String.replace(Readability.regexes[:normalize], " ")
  |> Floki.parse
  |> Floki.filter_out(:comment)
end
defp candidates_selector do
["p", "td"]
|> Enum.map(fn(s) ->

View File

@ -0,0 +1,3 @@
defmodule Readability.Summary do
  @moduledoc """
  Struct built by `Readability.summarize/2`.

  Fields:

    * `:title` - the page title (defaults to `nil`)
    * `:authors` - the list of author names (defaults to `[]`)
    * `:article_html` - the primary content as html (defaults to `nil`)
    * `:article_text` - the primary content as plain text (defaults to `nil`)
  """

  defstruct title: nil, authors: [], article_html: nil, article_text: nil

  # NOTE(review): field types are inferred from how `Readability.summarize/2`
  # fills the struct; `authors` allows `nil` because the author finder can
  # return nothing when no author is found - confirm against the finders.
  @type t :: %__MODULE__{
          title: binary | nil,
          authors: [binary] | nil,
          article_html: binary | nil,
          article_text: binary | nil
        }
end

View File

@ -2,7 +2,7 @@ defmodule Readability.Mixfile do
@moduledoc """
"""
@version "0.4.0"
@version "0.5.0"
@description """
Readability library for extracting and curating articles.
"""