Merge pull request #7 from keepcosmos/summarize

add summarize function
2016-05-07 18:31:56 +09:00 · 2016-05-07 18:31:56 +09:00 · 7e99bb3b6a
parent 23970d5b82 93bdf48b8c
commit 7e99bb3b6a
7 changed files with 113 additions and 43 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -3,11 +3,16 @@
 All notable changes to this project will be documented in this file.
 This project adheres to [Semantic Versioning](http://semver.org/).
 ## [0.5.0] - 2016.05.07
 ### Added
 - Add `Readability.summarize` function
 ## [0.4.0] - 2016.04.28
 ### Added
 - Add author extractor function
- Add `readable_html` function
+- Add `Readability.readable_html` function
 ## [0.3.1] - 2016.04.24
--- a/README.md
+++ b/README.md
@ -29,50 +29,74 @@ If [available in Hex](https://hex.pm/docs/publish), the package can be installed
 ## Usage
 ### Examples
 ```elixir
 ### Get example page.
 %{status_code: 200, body: html} = HTTPoison.get!("https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58")
-### Extract the title.
+#### Just pass url
-Readability.title(html)
+```elixir
 url = "https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58"
 summary = Readability.summarize(url)
 summary.title
 #=> "Why I’m betting on Elixir"
-### Extract authors.
+summary.authors
 Readability.authors(html)
 #=> ["Ken Mazaika"]
-
+summary.article_html
 ### Extract the primary content with transformed html.
 html
 |> Readability.article
 |> Readability.readable_html
 #=>
 # <div><div><p id=\"3476\"><strong><em>Background: </em></strong><em>I’ve spent...
 # ...
 # ...button!</em></h3></div></div>
-
+summary.article_text
 ### Extract only text from the primary content.
 html
 |> Readability.article
 |> Readability.readable_text
 #=>
 # Background: I’ve spent the past 6 years building web applications in Ruby and.....
 # ...
 # ... value in this article, it would mean a lot to me if you hit the recommend button!
 ```
 #### From raw html
 ```elixir
 ### Extract the title.
 Readability.title(html)
 ### Extract authors.
 Readability.authors(html)
 ### Extract the primary content with transformed html.
 html
 |> Readability.article
 |> Readability.readable_html
 ### Extract only text from the primary content.
 html
 |> Readability.article
 |> Readability.readable_text
 ### You can also extract the primary images with Floki.
 html
 |> Readability.article
 |> Floki.find("img")
 |> Floki.attribute("src")
 ```
 ### Options
-You may provide options(Keyword type) to `Readability.article`, including:
+If the result differs from what you expect, you can pass options.
 #### Example
 ```elixir
 url = "https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58"
 summary = Readability.summarize(url, [clean_conditionally: false])
 ```
 * retry_length \\\\ 250
 * min_text_length \\\\ 25
-* remove_unlikely_candidates \\\\ true,
+* remove_unlikely_candidates \\\\ true
-* weight_classes \\\\ true,
+* weight_classes \\\\ true
-* clean_conditionally \\\\ true,
+* clean_conditionally \\\\ true
-* remove_empty_nodes \\\\ true,
+* retry_length \\\\ 250
 **You can find other algorithm and regex options in `readability.ex`**
 ## Test
@ -81,10 +105,17 @@ To run the test suite:
    $ mix test
 ## Todo
 **Contributions are welcome!**
 Check out [the main features milestone](https://github.com/keepcosmos/readability/milestones)
 * [x] Extract authors
 * [x] More configurable
 * [x] Summarize function
 * [ ] Convert relative paths into absolute paths of `img#src` and `a#href`
 ## Related and Inspired Projects
--- a/lib/readability.ex
+++ b/lib/readability.ex
@ -7,6 +7,9 @@ defmodule Readability do
  ```elixir
  @type html :: binary
  # Just pass url
  %Readability.Summary{title: title, authors: authors, article_html: article} = Readability.summarize(url)
  # Extract title
  Readability.title(html)
@ -28,6 +31,7 @@ defmodule Readability do
  alias Readability.TitleFinder
  alias Readability.AuthorFinder
  alias Readability.ArticleBuilder
  alias Readability.Summary
  alias Readability.Helper
  @default_options [retry_length: 250,
@ -40,7 +44,8 @@ defmodule Readability do
                    min_image_height: 80,
                    ignore_image_format: [],
                    blacklist: nil,
-                    whitelist: nil
+                    whitelist: nil,
                    page_url: nil
                   ]
  @regexes [unlikely_candidate: ~r/combx|comment|community|disqus|extra|foot|header|hidden|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
@ -56,8 +61,28 @@ defmodule Readability do
           ]
  @type html_tree :: tuple | list
  @type raw_html :: binary
  @type url :: binary
  @type options :: list
  @doc """
  Fetches the page at `url` and summarizes its primary readable content.

  The body is downloaded with `HTTPoison.get!/1`, normalized with
  `Helper.normalize/1`, and the article tree is built with
  `ArticleBuilder.build/2`.

  `opts` are merged over the module's default options, the same way
  `article/2` does, so any option the caller omits keeps its default value.
  `:page_url` is always set to `url`, replacing any caller-supplied value.

  Returns a `%Readability.Summary{}` with `:title`, `:authors`,
  `:article_html` and `:article_text` filled in.

  The request uses the bang variant of `HTTPoison.get`, so a failed request
  raises instead of returning an error tuple.
  """
  @spec summarize(url, options) :: Summary.t
  def summarize(url, opts \\ []) do
    # Start from the defaults so ArticleBuilder sees the same option set it
    # gets from article/2; the url always wins for :page_url.
    opts =
      @default_options
      |> Keyword.merge(opts)
      |> Keyword.merge(page_url: url)

    # The status code is deliberately not checked: whatever body comes back
    # is summarized.
    %{status_code: _, body: raw_html} = HTTPoison.get!(url)

    html_tree = Helper.normalize(raw_html)
    article_tree = ArticleBuilder.build(html_tree, opts)

    %Summary{
      title: title(html_tree),
      authors: authors(html_tree),
      article_html: readable_html(article_tree),
      article_text: readable_text(article_tree)
    }
  end
  @doc """
  Extract title
@ -67,7 +92,11 @@ defmodule Readability do
      "Some title in html"
  """
  @spec title(binary | html_tree) :: binary
-  def title(html) when is_binary(html), do: html |> normalize |> title
+  def title(raw_html) when is_binary(raw_html) do
     raw_html
     |> Helper.normalize
     |> title
  end
  def title(html_tree), do: TitleFinder.title(html_tree)
@ -97,7 +126,7 @@ defmodule Readability do
  def article(raw_html, opts \\ []) do
    opts = Keyword.merge(@default_options, opts)
    raw_html
-    |> normalize
+    |> Helper.normalize
    |> ArticleBuilder.build(opts)
  end
@ -133,19 +162,6 @@ defmodule Readability do
    html_tree |> Floki.raw_html
  end
  @doc """
  Normalize and Parse to html tree(tuple or list)) from binary html
  """
  @spec parse(binary) :: html_tree
  def normalize(raw_html) do
    raw_html
    |> String.replace(Readability.regexes[:replace_brs], "</p><p>")
    |> String.replace(Readability.regexes[:replace_fonts], "<\1span>")
    |> String.replace(Readability.regexes[:normalize], " ")
    |> Floki.parse
    |> Floki.filter_out(:comment)
  end
  # Parses a raw html binary straight into a Floki tree. Unlike
  # Helper.normalize/1 it applies no regex substitutions and does not filter
  # out comment nodes. Only binaries are accepted (guarded by is_binary/1).
  def parse(raw_html) when is_binary(raw_html), do: Floki.parse(raw_html)
  # Exposes the compile-time @regexes keyword list so other modules can read
  # the patterns (Helper.normalize/1 looks up entries through this function).
  def regexes, do: @regexes
--- a/lib/readability/author_finder.ex
+++ b/lib/readability/author_finder.ex
@ -11,8 +11,10 @@ defmodule Readability.AuthorFinder do
  # Looks up author names with find_by_meta_tag/1 and, when something was
  # found, splits them with split_author_names/1.
  #
  # The `if` has no `else`, so the function returns `nil` (not an empty list)
  # when find_by_meta_tag/1 yields nil/false; the spec reflects that.
  @spec find(html_tree) :: [binary] | nil
  def find(html_tree) do
    author_names = find_by_meta_tag(html_tree)

    if author_names do
      split_author_names(author_names)
    end
  end
  def find_by_meta_tag(html_tree) do
    names = html_tree
--- a/lib/readability/helper.ex
+++ b/lib/readability/helper.ex
@ -86,6 +86,19 @@ defmodule Readability.Helper do
    end)
  end
  @doc """
  Normalizes a raw html binary and parses it into an html tree (tuple or list).

  Three regex substitutions run on the binary before parsing, using the
  patterns returned by `Readability.regexes/0`:

    * `:replace_brs` matches are replaced with `</p><p>`
    * `:replace_fonts` matches are rewritten as `span` tags
    * `:normalize` matches are replaced with a single space

  The result is parsed with `Floki.parse/1` and comment nodes are filtered
  out of the tree.
  """
  @spec normalize(binary) :: html_tree
  def normalize(raw_html) do
    raw_html
    |> String.replace(Readability.regexes[:replace_brs], "</p><p>")
    # "\\1" is the backreference String.replace understands. A bare "\1" is
    # consumed by the string literal's own escape handling and never reaches
    # the regex engine as a group reference.
    # NOTE(review): assumes :replace_fonts captures the optional "/" of a
    # closing tag as group 1 - confirm against the regex definition.
    |> String.replace(Readability.regexes[:replace_fonts], "<\\1span>")
    |> String.replace(Readability.regexes[:normalize], " ")
    |> Floki.parse
    |> Floki.filter_out(:comment)
  end
  defp candidates_selector do
    ["p", "td"]
    |> Enum.map(fn(s) ->
--- a/lib/readability/summary.ex
+++ b/lib/readability/summary.ex
@ -0,0 +1,3 @@
 defmodule Readability.Summary do
   @moduledoc """
   Result struct built by `Readability.summarize/2`.

   Fields:

     * `:title` - the page title
     * `:authors` - the extracted author names
     * `:article_html` - the readable article as html
     * `:article_text` - the readable article as text

   Every field defaults to `nil`, except `:authors`, which defaults to `[]`.
   """

   # `authors` admits nil because the author finder returns nil when it finds
   # nothing. NOTE(review): article_text is presumably a binary - confirm
   # against readable_text/1.
   @type t :: %__MODULE__{
           title: binary | nil,
           authors: [binary] | nil,
           article_html: binary | nil,
           article_text: binary | nil
         }

   defstruct title: nil, authors: [], article_html: nil, article_text: nil
 end
--- a/mix.exs
+++ b/mix.exs
@ -2,7 +2,7 @@ defmodule Readability.Mixfile do
  @moduledoc """
  """
-  @version "0.4.0"
+  @version "0.5.0"
  @description """
  Readability library for extracting and curating articles.
  """