Merge pull request #7 from keepcosmos/summarize

add summarize function
2016-05-07 18:31:56 +09:00 · 2016-05-07 18:31:56 +09:00 · 7e99bb3b6a
commit 7e99bb3b6a
parent 23970d5b82 93bdf48b8c
7 changed files with 113 additions and 43 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -3,11 +3,16 @@
 All notable changes to this project will be documented in this file.
 This project adheres to [Semantic Versioning](http://semver.org/).

+## [0.5.0] - 2016.05.07
+
+### Added
+- Add `Readability.summarize` function
+
 ## [0.4.0] - 2016.04.28

 ### Added
 - Add author extractor function
- Add `readable_html` function
+- Add `Readability.readable_html` function

 ## [0.3.1] - 2016.04.24

--- a/README.md
+++ b/README.md
@ -29,50 +29,74 @@ If [available in Hex](https://hex.pm/docs/publish), the package can be installed
 ## Usage

 ### Examples
-```elixir
-### Get example page.
-%{status_code: 200, body: html} = HTTPoison.get!("https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58")

-### Extract the title.
-Readability.title(html)
+#### Just pass a URL
+```elixir
+url = "https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58"
+summary = Readability.summarize(url)
+
+summary.title
 #=> "Why I’m betting on Elixir"

-### Extract authors.
-Readability.authors(html)
+summary.authors
 #=> ["Ken Mazaika"]

-
-### Extract the primary content with transformed html.
-html
-|> Readability.article
-|> Readability.readable_html
+summary.article_html
 #=>
 # <div><div><p id=\"3476\"><strong><em>Background: </em></strong><em>I’ve spent...
 # ...
 # ...button!</em></h3></div></div>

-
-### Extract only text from the primary content.
-html
-|> Readability.article
-|> Readability.readable_text
-
+summary.article_text
 #=>
 # Background: I’ve spent the past 6 years building web applications in Ruby and.....
 # ...
 # ... value in this article, it would mean a lot to me if you hit the recommend button!
 ```

+#### From raw html
+
+```elixir
+### Extract the title.
+Readability.title(html)
+
+### Extract authors.
+Readability.authors(html)
+
+### Extract the primary content with transformed html.
+html
+|> Readability.article
+|> Readability.readable_html
+
+### Extract only text from the primary content.
+html
+|> Readability.article
+|> Readability.readable_text
+
+### Extract the primary images with Floki.
+html
+|> Readability.article
+|> Floki.find("img")
+|> Floki.attribute("src")
+```
+
 ### Options

-You may provide options(Keyword type) to `Readability.article`, including:
+If the result differs from what you expect, you can pass options.
+
+#### Example
+```elixir
+url = "https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58"
+summary = Readability.summarize(url, [clean_conditionally: false])
+```

-* retry_length \\\\ 250
 * min_text_length \\\\ 25
-* remove_unlikely_candidates \\\\ true,
-* weight_classes \\\\ true,
-* clean_conditionally \\\\ true,
-* remove_empty_nodes \\\\ true,
+* remove_unlikely_candidates \\\\ true
+* weight_classes \\\\ true
+* clean_conditionally \\\\ true
+* retry_length \\\\ 250
+
+**You can find other algorithm and regex options in `readability.ex`**

 ## Test

@ -81,10 +105,17 @@ To run the test suite:
    $ mix test

 ## Todo

 **Contributions are welcome!**

 Check out [the main features milestone](https://github.com/keepcosmos/readability/milestones)
+
+* [x] Extract authors
+* [x] More configurable
+* [x] Summarize function
+* [ ] Convert relative paths into absolute paths of `img#src` and `a#href`

 ## Related and Inspired Projects

--- a/lib/readability.ex
+++ b/lib/readability.ex
@ -7,6 +7,9 @@ defmodule Readability do
  ```elixir
  @type html :: binary

+  # Just pass url
+  %Readability.Summary{title: title, authors: authors, article_html: article} = Readability.summarize(url)
+
  # Extract title
  Readability.title(html)

@ -28,6 +31,7 @@ defmodule Readability do
  alias Readability.TitleFinder
  alias Readability.AuthorFinder
  alias Readability.ArticleBuilder
+  alias Readability.Summary
  alias Readability.Helper

  @default_options [retry_length: 250,
@ -40,7 +44,8 @@ defmodule Readability do
                    min_image_height: 80,
                    ignore_image_format: [],
                    blacklist: nil,
-                    whitelist: nil
+                    whitelist: nil,
+                    page_url: nil
                   ]

  @regexes [unlikely_candidate: ~r/combx|comment|community|disqus|extra|foot|header|hidden|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
@ -56,8 +61,28 @@ defmodule Readability do
           ]

  @type html_tree :: tuple | list
+  @type raw_html :: binary
+  @type url :: binary
  @type options :: list

+  @doc """
+  Summarizes the primary readable content of the webpage at `url`.
+
+  The page is fetched with `HTTPoison.get!/1`, its body is normalized with
+  `Helper.normalize/1`, and the article is built with `ArticleBuilder.build/2`.
+
+  `opts` are layered on top of the module's default options (the same way
+  `article/2` does it), and `:page_url` is always set to `url`.
+
+  Returns a `Readability.Summary` struct holding the title, the authors, the
+  article as html and the article as plain text.
+  """
+  @spec summarize(url, options) :: Summary.t
+  def summarize(url, opts \\ []) do
+    # Start from the defaults so the article builder receives a complete option
+    # set, exactly as it does when called through `article/2`.
+    opts =
+      @default_options
+      |> Keyword.merge(opts)
+      |> Keyword.merge([page_url: url])
+
+    # NOTE(review): `get!` is the raising variant, so a failed request is
+    # presumably surfaced as an exception rather than a return value - confirm.
+    %{status_code: _, body: raw_html} = HTTPoison.get!(url)
+    html_tree = Helper.normalize(raw_html)
+    article_tree = ArticleBuilder.build(html_tree, opts)
+
+    %Summary{title: title(html_tree),
+             authors: authors(html_tree),
+             article_html: readable_html(article_tree),
+             article_text: readable_text(article_tree)
+           }
+  end
+
  @doc """
  Extract title

@ -67,7 +92,11 @@ defmodule Readability do
      "Some title in html"
  """
  @spec title(binary | html_tree) :: binary
-  def title(html) when is_binary(html), do: html |> normalize |> title
+  def title(raw_html) when is_binary(raw_html) do
+    # Binary input is parsed into an html tree first, then handed to the
+    # tree clause of title/1.
+    html_tree = Helper.normalize(raw_html)
+    title(html_tree)
+  end
  def title(html_tree), do: TitleFinder.title(html_tree)


@ -97,7 +126,7 @@ defmodule Readability do
  def article(raw_html, opts \\ []) do
    opts = Keyword.merge(@default_options, opts)
    raw_html
-    |> normalize
+    |> Helper.normalize
    |> ArticleBuilder.build(opts)
  end

@ -133,19 +162,6 @@ defmodule Readability do
    html_tree |> Floki.raw_html
  end

-  @doc """
-  Normalize and Parse to html tree(tuple or list)) from binary html
-  """
-  @spec parse(binary) :: html_tree
-  def normalize(raw_html) do
-    raw_html
-    |> String.replace(Readability.regexes[:replace_brs], "</p><p>")
-    |> String.replace(Readability.regexes[:replace_fonts], "<\1span>")
-    |> String.replace(Readability.regexes[:normalize], " ")
-    |> Floki.parse
-    |> Floki.filter_out(:comment)
-  end
-
  def parse(raw_html) when is_binary(raw_html), do: Floki.parse(raw_html)

  def regexes, do: @regexes
--- a/lib/readability/author_finder.ex
+++ b/lib/readability/author_finder.ex
@ -11,7 +11,9 @@ defmodule Readability.AuthorFinder do
  @spec find(html_tree) :: [binary]
  def find(html_tree) do
    author_names = find_by_meta_tag(html_tree)
-    split_author_names(author_names)
+    if author_names do
+      split_author_names(author_names)
+    end
  end

  def find_by_meta_tag(html_tree) do
--- a/lib/readability/helper.ex
+++ b/lib/readability/helper.ex
@ -86,6 +86,19 @@ defmodule Readability.Helper do
    end)
  end

+  @doc """
+  Normalizes binary html and parses it into an html tree (tuple or list).
+
+  Three regex replacements are applied to the raw string before parsing, using
+  the `:replace_brs`, `:replace_fonts` and `:normalize` patterns returned by
+  `Readability.regexes/0`. The result is parsed with Floki and comment nodes
+  are filtered out.
+  """
+  @spec normalize(binary) :: html_tree
+  def normalize(raw_html) do
+    raw_html
+    |> String.replace(Readability.regexes[:replace_brs], "</p><p>")
+    # "\\1" (escaped backslash) is the backreference to the first capture group.
+    # A bare "\1" is not a recognised escape and collapses to the literal "1",
+    # which would emit "<1span>".
+    # NOTE(review): presumably the group captures the optional "/" of a closing
+    # font tag - confirm against the :replace_fonts regex.
+    |> String.replace(Readability.regexes[:replace_fonts], "<\\1span>")
+    |> String.replace(Readability.regexes[:normalize], " ")
+    |> Floki.parse
+    |> Floki.filter_out(:comment)
+  end
+
  defp candidates_selector do
    ["p", "td"]
    |> Enum.map(fn(s) ->
--- a/lib/readability/summary.ex
+++ b/lib/readability/summary.ex
@ -0,0 +1,3 @@
+defmodule Readability.Summary do
+  @moduledoc """
+  Result struct built by `Readability.summarize/2`.
+
+  Fields:
+
+    * `:title` - the extracted page title
+    * `:authors` - the extracted author names (defaults to `[]`)
+    * `:article_html` - the primary content as html
+    * `:article_text` - the primary content as plain text
+  """
+
+  defstruct title: nil, authors: [], article_html: nil, article_text: nil
+
+  # `nil` is allowed on every field: it is the struct default for three of
+  # them, and the author finder may yield no value at all.
+  @type t :: %__MODULE__{
+          title: binary | nil,
+          authors: [binary] | nil,
+          article_html: binary | nil,
+          article_text: binary | nil
+        }
+end
--- a/mix.exs
+++ b/mix.exs
@ -2,7 +2,7 @@ defmodule Readability.Mixfile do
  @moduledoc """
  """

-  @version "0.4.0"
+  @version "0.5.0"
  @description """
  Readability library for extracting and curating articles.
  """