Merge pull request #7 from keepcosmos/summarize

add summarize function
This commit is contained in:
Jaehyun Shin 2016-05-07 18:31:56 +09:00
commit 7e99bb3b6a
7 changed files with 113 additions and 43 deletions

View File

@ -3,11 +3,16 @@
All notable changes to this project will be documented in this file. All notable changes to this project will be documented in this file.
This project adheres to [Semantic Versioning](http://semver.org/). This project adheres to [Semantic Versioning](http://semver.org/).
## [0.5.0] - 2016.05.07
### Added
- Add `Readability.summarize` function
## [0.4.0] - 2016.04.28 ## [0.4.0] - 2016.04.28
### Added ### Added
- Add author extractor function - Add author extractor function
- Add `readable_html` function - Add `Readability.readable_html` function
## [0.3.1] - 2016.04.24 ## [0.3.1] - 2016.04.24

View File

@ -29,50 +29,74 @@ If [available in Hex](https://hex.pm/docs/publish), the package can be installed
## Usage ## Usage
### Examples ### Examples
```elixir
### Get example page.
%{status_code: 200, body: html} = HTTPoison.get!("https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58")
### Extract the title. #### Just pass url
Readability.title(html) ```elixir
url = "https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58"
summary = Readability.summarize(url)
summary.title
#=> "Why Im betting on Elixir" #=> "Why Im betting on Elixir"
### Extract authors. summary.authors
Readability.authors(html)
#=> ["Ken Mazaika"] #=> ["Ken Mazaika"]
summary.article_html
### Extract the primary content with transformed html.
html
|> Readability.article
|> Readability.readable_html
#=> #=>
# <div><div><p id=\"3476\"><strong><em>Background: </em></strong><em>Ive spent... # <div><div><p id=\"3476\"><strong><em>Background: </em></strong><em>Ive spent...
# ... # ...
# ...button!</em></h3></div></div> # ...button!</em></h3></div></div>
summary.article_text
### Extract only text from the primary content.
html
|> Readability.article
|> Readability.readable_text
#=> #=>
# Background: Ive spent the past 6 years building web applications in Ruby and..... # Background: Ive spent the past 6 years building web applications in Ruby and.....
# ... # ...
# ... value in this article, it would mean a lot to me if you hit the recommend button! # ... value in this article, it would mean a lot to me if you hit the recommend button!
``` ```
#### From raw html
```elixir
### Extract the title.
Readability.title(html)
### Extract authors.
Readability.authors(html)
### Extract the primary content with transformed html.
html
|> Readability.article
|> Readability.readable_html
### Extract only text from the primary content.
html
|> Readability.article
|> Readability.readable_text
### you can extract the primary images with Floki
html
|> Readability.article
|> Floki.find("img")
|> Floki.attribute("src")
```
### Options ### Options
You may provide options(Keyword type) to `Readability.article`, including: If the result differs from what you expect, you can pass options.
#### Example
```elixir
url = "https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58"
summary = Readability.summarize(url, [clean_conditionally: false])
```
* retry_length \\\\ 250
* min_text_length \\\\ 25 * min_text_length \\\\ 25
* remove_unlikely_candidates \\\\ true, * remove_unlikely_candidates \\\\ true
* weight_classes \\\\ true, * weight_classes \\\\ true
* clean_conditionally \\\\ true, * clean_conditionally \\\\ true
* remove_empty_nodes \\\\ true, * retry_length \\\\ 250
**You can find other algorithm and regex options in `readability.ex`**
## Test ## Test
@ -81,10 +105,17 @@ To run the test suite:
$ mix test $ mix test
## Todo ## Todo
**Contributions are welcome!** **Contributions are welcome!**
Check out [the main features milestone](https://github.com/keepcosmos/readability/milestones) Check out [the main features milestone](https://github.com/keepcosmos/readability/milestones)
* [x] Extract authors
* [x] More configurable
* [x] Summarize function
* [ ] Convert relative paths into absolute paths of `img#src` and `a#href`
## Related and Inspired Projects ## Related and Inspired Projects

View File

@ -7,6 +7,9 @@ defmodule Readability do
```elixir ```elixir
@type html :: binary @type html :: binary
# Just pass url
%Readability.Summary{title: title, authors: authors, article_html: article} = Readability.summarize(url)
# Extract title # Extract title
Readability.title(html) Readability.title(html)
@ -28,6 +31,7 @@ defmodule Readability do
alias Readability.TitleFinder alias Readability.TitleFinder
alias Readability.AuthorFinder alias Readability.AuthorFinder
alias Readability.ArticleBuilder alias Readability.ArticleBuilder
alias Readability.Summary
alias Readability.Helper alias Readability.Helper
@default_options [retry_length: 250, @default_options [retry_length: 250,
@ -40,7 +44,8 @@ defmodule Readability do
min_image_height: 80, min_image_height: 80,
ignore_image_format: [], ignore_image_format: [],
blacklist: nil, blacklist: nil,
whitelist: nil whitelist: nil,
page_url: nil
] ]
@regexes [unlikely_candidate: ~r/combx|comment|community|disqus|extra|foot|header|hidden|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i, @regexes [unlikely_candidate: ~r/combx|comment|community|disqus|extra|foot|header|hidden|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
@ -56,8 +61,28 @@ defmodule Readability do
] ]
@type html_tree :: tuple | list @type html_tree :: tuple | list
@type raw_html :: binary
@type url :: binary
@type options :: list @type options :: list
@doc """
Summarize the primary readable content of a webpage.

Fetches `url` with `HTTPoison.get!/1`, normalizes the response body into an
html tree and returns a `Readability.Summary` struct holding the title, the
authors, the readable html and the readable text of the article.

`opts` is merged over the module's default options, the same way
`article/2` does it, and `:page_url` is always set to `url`.

Only the `body` of the response is used; the status code is ignored.
"""
@spec summarize(url, options) :: Summary.t
def summarize(url, opts \\ []) do
  # Start from the defaults so the article builder sees every option even
  # when the caller passes only a few (or none); `url` always wins for
  # `:page_url`.
  opts =
    @default_options
    |> Keyword.merge(opts)
    |> Keyword.put(:page_url, url)

  %{status_code: _, body: raw_html} = HTTPoison.get!(url)
  html_tree = Helper.normalize(raw_html)
  article_tree = ArticleBuilder.build(html_tree, opts)

  %Summary{
    title: title(html_tree),
    authors: authors(html_tree),
    article_html: readable_html(article_tree),
    article_text: readable_text(article_tree)
  }
end
@doc """ @doc """
Extract title Extract title
@ -67,7 +92,11 @@ defmodule Readability do
"Some title in html" "Some title in html"
""" """
@spec title(binary | html_tree) :: binary @spec title(binary | html_tree) :: binary
def title(html) when is_binary(html), do: html |> normalize |> title def title(raw_html) when is_binary(raw_html) do
raw_html
|> Helper.normalize
|> title
end
def title(html_tree), do: TitleFinder.title(html_tree) def title(html_tree), do: TitleFinder.title(html_tree)
@ -97,7 +126,7 @@ defmodule Readability do
def article(raw_html, opts \\ []) do def article(raw_html, opts \\ []) do
opts = Keyword.merge(@default_options, opts) opts = Keyword.merge(@default_options, opts)
raw_html raw_html
|> normalize |> Helper.normalize
|> ArticleBuilder.build(opts) |> ArticleBuilder.build(opts)
end end
@ -133,19 +162,6 @@ defmodule Readability do
html_tree |> Floki.raw_html html_tree |> Floki.raw_html
end end
@doc """
Normalize and Parse to html tree(tuple or list)) from binary html
"""
@spec parse(binary) :: html_tree
def normalize(raw_html) do
raw_html
|> String.replace(Readability.regexes[:replace_brs], "</p><p>")
|> String.replace(Readability.regexes[:replace_fonts], "<\1span>")
|> String.replace(Readability.regexes[:normalize], " ")
|> Floki.parse
|> Floki.filter_out(:comment)
end
def parse(raw_html) when is_binary(raw_html), do: Floki.parse(raw_html) def parse(raw_html) when is_binary(raw_html), do: Floki.parse(raw_html)
def regexes, do: @regexes def regexes, do: @regexes

View File

@ -11,7 +11,9 @@ defmodule Readability.AuthorFinder do
@spec find(html_tree) :: [binary] @spec find(html_tree) :: [binary]
def find(html_tree) do def find(html_tree) do
author_names = find_by_meta_tag(html_tree) author_names = find_by_meta_tag(html_tree)
split_author_names(author_names) if author_names do
split_author_names(author_names)
end
end end
def find_by_meta_tag(html_tree) do def find_by_meta_tag(html_tree) do

View File

@ -86,6 +86,19 @@ defmodule Readability.Helper do
end) end)
end end
@doc """
Normalize raw html and parse it into an html tree (tuple or list).

Applies the `:replace_brs`, `:replace_fonts` and `:normalize` patterns from
`Readability.regexes/0` to the binary, parses the result with Floki and
filters out comment nodes.
"""
@spec normalize(binary) :: html_tree
def normalize(raw_html) do
  raw_html
  |> String.replace(Readability.regexes[:replace_brs], "</p><p>")
  # The backslash must be escaped so that `\1` reaches String.replace as a
  # backreference to the first capture group instead of being consumed as a
  # string escape.
  |> String.replace(Readability.regexes[:replace_fonts], "<\\1span>")
  |> String.replace(Readability.regexes[:normalize], " ")
  |> Floki.parse
  |> Floki.filter_out(:comment)
end
defp candidates_selector do defp candidates_selector do
["p", "td"] ["p", "td"]
|> Enum.map(fn(s) -> |> Enum.map(fn(s) ->

View File

@ -0,0 +1,3 @@
defmodule Readability.Summary do
  @moduledoc """
  Result struct for a summarized page.

  Fields:

    * `title` - title of the page
    * `authors` - list of author names
    * `article_html` - primary content as readable html
    * `article_text` - primary content as plain text
  """

  defstruct title: nil, authors: [], article_html: nil, article_text: nil

  # `nil` is allowed everywhere because it is the struct default for the
  # binary fields; NOTE(review): authors may also be nil when no author is
  # found - confirm against the author finder.
  @type t :: %__MODULE__{
          title: binary | nil,
          authors: [binary] | nil,
          article_html: binary | nil,
          article_text: binary | nil
        }
end

View File

@ -2,7 +2,7 @@ defmodule Readability.Mixfile do
@moduledoc """ @moduledoc """
""" """
@version "0.4.0" @version "0.5.0"
@description """ @description """
Readability library for extracting and curating articles. Readability library for extracting and curating articles.
""" """