diff --git a/CHANGELOG.md b/CHANGELOG.md
index f66f8cc..f43f287 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,11 +3,16 @@
All notable changes to this project will be documented in this file.
This project adheres to [Semantic Versioning](http://semver.org/).
+## [0.5.0] - 2016.05.07
+
+### Added
+- Add `Readability.summarize` function
+
## [0.4.0] - 2016.04.28
### Added
- Add author extractor function
-- Add `readable_html` function
+- Add `Readability.readable_html` function
## [0.3.1] - 2016.04.24
diff --git a/README.md b/README.md
index eebd38d..f2c4069 100644
--- a/README.md
+++ b/README.md
@@ -29,50 +29,74 @@ If [available in Hex](https://hex.pm/docs/publish), the package can be installed
## Usage
### Examples
-```elixir
-### Get example page.
-%{status_code: 200, body: html} = HTTPoison.get!("https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58")
-### Extract the title.
-Readability.title(html)
+#### Just pass a URL
+```elixir
+url = "https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58"
+summary = Readability.summarize(url)
+
+summary.title
#=> "Why I’m betting on Elixir"
-### Extract authors.
-Readability.authors(html)
+summary.authors
#=> ["Ken Mazaika"]
-
-### Extract the primary content with transformed html.
-html
-|> Readability.article
-|> Readability.readable_html
+summary.article_html
#=>
#
Background: I’ve spent...
# ...
# ...button!
-
-### Extract only text from the primary content.
-html
-|> Readability.article
-|> Readability.readable_text
-
+summary.article_text
#=>
# Background: I’ve spent the past 6 years building web applications in Ruby and.....
# ...
# ... value in this article, it would mean a lot to me if you hit the recommend button!
```
+#### From raw html
+
+```elixir
+### Extract the title.
+Readability.title(html)
+
+### Extract authors.
+Readability.authors(html)
+
+### Extract the primary content with transformed html.
+html
+|> Readability.article
+|> Readability.readable_html
+
+### Extract only text from the primary content.
+html
+|> Readability.article
+|> Readability.readable_text
+
+### Extract the primary images with Floki.
+html
+|> Readability.article
+|> Floki.find("img")
+|> Floki.attribute("src")
+```
+
### Options
-You may provide options(Keyword type) to `Readability.article`, including:
+If the result differs from what you expect, you can pass options.
+
+#### Example
+```elixir
+url = "https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58"
+summary = Readability.summarize(url, [clean_conditionally: false])
+```
-* retry_length \\\\ 250
* min_text_length \\\\ 25
-* remove_unlikely_candidates \\\\ true,
-* weight_classes \\\\ true,
-* clean_conditionally \\\\ true,
-* remove_empty_nodes \\\\ true,
+* remove_unlikely_candidates \\\\ true
+* weight_classes \\\\ true
+* clean_conditionally \\\\ true
+* retry_length \\\\ 250
+
+**You can find other algorithm and regex options in `readability.ex`**
## Test
@@ -81,10 +105,17 @@ To run the test suite:
$ mix test
## Todo
**Contributions are welcome!**
Check out [the main features milestone](https://github.com/keepcosmos/readability/milestones)
+
+* [x] Extract authors
+* [x] More configurable
+* [x] Summarize function
+* [ ] Convert relative paths into absolute paths of `img#src` and `a#href`
## Related and Inpired Projects
diff --git a/lib/readability.ex b/lib/readability.ex
index 8030cb7..5687c98 100644
--- a/lib/readability.ex
+++ b/lib/readability.ex
@@ -7,6 +7,9 @@ defmodule Readability do
```elixir
@type html :: binary
+ # Just pass url
+ %Readability.Summary{title: title, authors: authors, article_html: article} = Readability.summarize(url)
+
# Extract title
Readability.title(html)
@@ -28,6 +31,7 @@ defmodule Readability do
alias Readability.TitleFinder
alias Readability.AuthorFinder
alias Readability.ArticleBuilder
+ alias Readability.Summary
alias Readability.Helper
@default_options [retry_length: 250,
@@ -40,7 +44,8 @@ defmodule Readability do
min_image_height: 80,
ignore_image_format: [],
blacklist: nil,
- whitelist: nil
+ whitelist: nil,
+ page_url: nil
]
@regexes [unlikely_candidate: ~r/combx|comment|community|disqus|extra|foot|header|hidden|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
@@ -56,8 +61,28 @@ defmodule Readability do
]
@type html_tree :: tuple | list
+ @type raw_html :: binary
+ @type url :: binary
@type options :: list
+  @doc """
+  Fetches `url` and summarizes the primary readable content of the page.
+
+  `opts` are merged over the module defaults, and `:page_url` is always set to
+  `url`. The page is fetched with `HTTPoison.get!/1` (which raises when the
+  request itself fails); the response status code is not inspected.
+
+  Returns a `Readability.Summary` struct holding the title, the authors, the
+  readable article HTML and the readable article text.
+
+      summary = Readability.summarize(url, clean_conditionally: false)
+      summary.title
+  """
+  @spec summarize(url, options) :: Summary.t
+  def summarize(url, opts \\ []) do
+    # Start from @default_options, as article/2 does, so ArticleBuilder receives
+    # every option even when the caller overrides only a few of them.
+    opts =
+      @default_options
+      |> Keyword.merge(opts)
+      |> Keyword.put(:page_url, url)
+
+    %{body: raw_html} = HTTPoison.get!(url)
+    html_tree = Helper.normalize(raw_html)
+    article_tree = ArticleBuilder.build(html_tree, opts)
+
+    %Summary{
+      title: title(html_tree),
+      authors: authors(html_tree),
+      article_html: readable_html(article_tree),
+      article_text: readable_text(article_tree)
+    }
+  end
+
@doc """
Extract title
@@ -67,7 +92,11 @@ defmodule Readability do
"Some title in html"
"""
@spec title(binary | html_tree) :: binary
- def title(html) when is_binary(html), do: html |> normalize |> title
+  def title(raw_html) when is_binary(raw_html) do
+    # Parse the raw markup first, then defer to the html_tree clause below.
+    html_tree = Helper.normalize(raw_html)
+    title(html_tree)
+  end
def title(html_tree), do: TitleFinder.title(html_tree)
@@ -97,7 +126,7 @@ defmodule Readability do
def article(raw_html, opts \\ []) do
opts = Keyword.merge(@default_options, opts)
raw_html
- |> normalize
+ |> Helper.normalize
|> ArticleBuilder.build(opts)
end
@@ -133,19 +162,6 @@ defmodule Readability do
html_tree |> Floki.raw_html
end
- @doc """
- Normalize and Parse to html tree(tuple or list)) from binary html
- """
- @spec parse(binary) :: html_tree
- def normalize(raw_html) do
- raw_html
- |> String.replace(Readability.regexes[:replace_brs], "")
- |> String.replace(Readability.regexes[:replace_fonts], "<\1span>")
- |> String.replace(Readability.regexes[:normalize], " ")
- |> Floki.parse
- |> Floki.filter_out(:comment)
- end
-
def parse(raw_html) when is_binary(raw_html), do: Floki.parse(raw_html)
def regexes, do: @regexes
diff --git a/lib/readability/author_finder.ex b/lib/readability/author_finder.ex
index 8f9eaa6..3fb366c 100644
--- a/lib/readability/author_finder.ex
+++ b/lib/readability/author_finder.ex
@@ -11,7 +11,9 @@ defmodule Readability.AuthorFinder do
@spec find(html_tree) :: [binary]
def find(html_tree) do
author_names = find_by_meta_tag(html_tree)
- split_author_names(author_names)
+    # find_by_meta_tag/1 may yield a falsy value (presumably when the page has
+    # no author meta tag); return an empty list in that case so the result
+    # honours the `[binary]` spec instead of leaking `nil` to callers.
+    if author_names do
+      split_author_names(author_names)
+    else
+      []
+    end
end
def find_by_meta_tag(html_tree) do
diff --git a/lib/readability/helper.ex b/lib/readability/helper.ex
index fe9dd3f..604be68 100644
--- a/lib/readability/helper.ex
+++ b/lib/readability/helper.ex
@@ -86,6 +86,19 @@ defmodule Readability.Helper do
end)
end
+  @doc """
+  Normalizes raw HTML and parses it into an html tree (tuple or list).
+
+  Matches of the `:replace_brs`, `:replace_fonts` and `:normalize` regexes from
+  `Readability.regexes/0` are replaced in that order, the result is parsed with
+  `Floki.parse/1`, and comment nodes are filtered out of the tree.
+  """
+  @spec normalize(binary) :: html_tree
+  def normalize(raw_html) do
+    raw_html
+    # NOTE(review): this replacement literal was garbled in the patch text (its
+    # tags were stripped); `</p><p>` is reconstructed from context - confirm.
+    |> String.replace(Readability.regexes[:replace_brs], "</p><p>")
+    # `\\1` re-inserts the first capture group of the regex. A bare `\1` inside
+    # a double-quoted string is just the character `1`, which would emit
+    # `<1span>` for every match.
+    |> String.replace(Readability.regexes[:replace_fonts], "<\\1span>")
+    |> String.replace(Readability.regexes[:normalize], " ")
+    |> Floki.parse
+    |> Floki.filter_out(:comment)
+  end
+
defp candidates_selector do
["p", "td"]
|> Enum.map(fn(s) ->
diff --git a/lib/readability/summary.ex b/lib/readability/summary.ex
new file mode 100644
index 0000000..e46f6f5
--- /dev/null
+++ b/lib/readability/summary.ex
@@ -0,0 +1,3 @@
+defmodule Readability.Summary do
+  @moduledoc """
+  Struct returned by `Readability.summarize/2`.
+
+  Fields:
+
+    * `:title` - title extracted from the page
+    * `:authors` - author names extracted from the page
+    * `:article_html` - readable article rendered as HTML
+    * `:article_text` - readable article rendered as plain text
+  """
+
+  defstruct title: nil, authors: [], article_html: nil, article_text: nil
+
+  # `Readability.summarize/2` is specced to return `Summary.t`, so the type
+  # has to exist for that spec to resolve.
+  @type t :: %__MODULE__{
+          title: binary | nil,
+          authors: [binary] | nil,
+          article_html: binary | nil,
+          article_text: binary | nil
+        }
+end
diff --git a/mix.exs b/mix.exs
index 2a3b728..74bc285 100644
--- a/mix.exs
+++ b/mix.exs
@@ -2,7 +2,7 @@ defmodule Readability.Mixfile do
@moduledoc """
"""
- @version "0.4.0"
+ @version "0.5.0"
@description """
Readability library for extracting and curating articles.
"""