add document

2016-04-24 18:40:35 +09:00 · 2016-04-24 18:40:35 +09:00 · 23db20bbf0
parent 46ac9dddde
commit 23db20bbf0
8 changed files with 90 additions and 70 deletions
--- a/CANGELOG.md
+++ b/CANGELOG.md
@ -0,0 +1,5 @@
 # Change log
 ## [0.3.0] - 2016.04.24
 - Release!!
--- a/README.md
+++ b/README.md
@ -3,7 +3,7 @@
 [![Build Status](https://travis-ci.org/keepcosmos/readability.svg?branch=master)](https://travis-ci.org/keepcosmos/readability)
 [![Readability version](https://img.shields.io/hexpm/v/readability.svg)](https://hex.pm/packages/readability)
-Readability library for extracting and curating articles.  
+Readability is an Elixir library for extracting and curating articles.  
 Check out the [Documentation](https://hexdocs.pm/readability/Readability.html) for full and detailed guides.
 ## Installation
@ -29,7 +29,7 @@ If [available in Hex](https://hex.pm/docs/publish), the package can be installed
 ## Usage
 To parse a document, you must prepare an html string.
-The below example below, `html` variable is the html code of page from [Elixir Design Goals](http://elixir-lang.org/blog/2013/08/08/elixir-design-goals/)
+In the example below, the `html` variable is the html source of [Elixir Design Goals](http://elixir-lang.org/blog/2013/08/08/elixir-design-goals/)
 ### Examples
 ```elixir
@ -39,33 +39,35 @@ Readability.title(html)
 #=> Elixir Design Goals
 ### Extract the content with transformed html.
-content = Readability.content(html)
+html
-Readability.raw_html(content)
+|> Readability.article
 |> Readability.raw_html
 #=>
 # <div><div class=\"entry-content\"><p>During the last year,
 # ...
-# ...
+# ... out our sidebar for other learning resources.</p></div></div>
 # or check out our sidebar for other learning resources.</p></div></div>
 ### Extract the text only content.
-Readability.readable_text(content)
+html
 |> Readability.article
 |> Readability.readable_text
 #=>
 # During the last year, we have spoken at many conferences spreading the word about Elixir. We usually s.....
 # ...
-# ...
+# ... started guide, or check out our sidebar for other learning resources.
 # started guide, or check out our sidebar for other learning resources.
 ```
 ### Options
-You may provide options(Keyword type) to `Readability.content`, including:
+You may provide options (a keyword list) to `Readability.article`, including:
-* retry_length: 250(default),
+* retry_length \\\\ 250
-* min_text_length: 25(default),
+* min_text_length \\\\ 25
-* remove_unlikely_candidates: true(default),
+* remove_unlikely_candidates \\\\ true,
-* weight_classes: true(default),
+* weight_classes \\\\ true,
-* clean_conditionally: true(default),
+* clean_conditionally \\\\ true,
-* remove_empty_nodes: true(default),
+* remove_empty_nodes \\\\ true,
 ## Test
@ -73,9 +75,10 @@ To run the test suite:
    $ mix test
-## TODO
+## Todo
-* [ ] Extract a author
+* [ ] Extract authors
 * [ ] Extract Images
 * [ ] Extract Videos
 * [ ] Convert relative paths into absolute paths of `img#src` and `a#href`
 * [ ] More configurable
 * [ ] Command line interface
--- a/lib/readability.ex
+++ b/lib/readability.ex
@ -7,17 +7,17 @@ defmodule Readability do
  ```elixir
  @type html :: binary
-  # extract title
+  # Extract title
  Readability.title(html)
-  # extract only text from content
+  # Extract only text from article
-  content = html
+  article = html
-            |> Readability.content
+            |> Readability.article
            |> Readability.readable_text
-  # extract content with transformed html
+  # Extract article with transformed html
-  content = html
+  article = html
-            |> Readability.content
+            |> Readability.article
            |> Readability.raw_html
  ```
  """
@ -52,21 +52,59 @@ defmodule Readability do
  @type html_tree :: tuple | list
  @type options :: list
  @doc """
  Extract the title of an html document.

  Accepts either a raw html binary, which is parsed first, or an already
  parsed html tree, which is passed to `TitleFinder.title/1` as-is.

  ## Example

      iex> title = Readability.title(html_str)
      "Some title in html"
  """
  # The spec covers both clauses: the binary clause re-enters `title/1` with
  # the parsed tree, so a `binary`-only spec would contradict the code.
  @spec title(binary | html_tree) :: binary
  def title(html) when is_binary(html), do: html |> parse() |> title()
  def title(html_tree), do: TitleFinder.title(html_tree)
  @doc """
  Using a variety of metrics (content score, classname, element types), find the content that is
  most likely to be the stuff a user wants to read.
  ## Example
      iex> article_tree = Readability.article(html_str)
      # returns the article as an html tree (tuple or list)
  """
-  @spec content(binary, options) :: binary
+  @spec article(binary, options) :: html_tree
-  def content(raw_html, opts \\ []) do
+  def article(raw_html, opts \\ []) do
    opts = Keyword.merge(@default_options, opts)
    raw_html
    |> parse
    |> ArticleBuilder.build(opts)
  end
  @doc """
  Serialize an html tree back into a raw html binary using `Floki.raw_html/1`.
  """
  @spec raw_html(html_tree) :: binary
  def raw_html(html_tree), do: Floki.raw_html(html_tree)
  @doc """
  Return only the text of an html tree as a binary.

  The tree is first serialized with `raw_html/1`. A newline is then inserted
  in front of every closing `p`, `div`, `article` and `h<digit>` tag, the
  result is parsed again and its text is extracted. Leading and trailing
  whitespace is stripped from the final text.
  """
  # The spec must carry this function's own name; it was previously declared
  # as a second `raw_html/1` spec, leaving `readable_text/1` without one.
  @spec readable_text(html_tree) :: binary
  def readable_text(html_tree) do
    # TODO: Remove image caption when extract only text
    tags_to_br = ~r/<\/(p|div|article|h\d)/i
    html_str = raw_html(html_tree)

    # The replacement function receives the whole match (the start of a
    # closing tag) and puts a newline in front of it.
    tags_to_br
    |> Regex.replace(html_str, &("\n#{&1}"))
    |> Floki.parse
    |> Floki.text
    |> String.strip
  end
  @doc """
  Normalize and parse binary html into an html tree (tuple or list)
  """
@ -80,28 +118,6 @@ defmodule Readability do
    |> Floki.filter_out(:comment)
  end
  @doc """
  Serialize an html tree back into a raw html binary using `Floki.raw_html/1`.
  """
  @spec raw_html(html_tree) :: binary
  def raw_html(html_tree), do: Floki.raw_html(html_tree)
  @doc """
  Return only the text of an html tree as a binary.

  The tree is first serialized with `raw_html/1`. A newline is then inserted
  in front of every closing `p`, `div`, `article` and `h<digit>` tag, the
  result is parsed again and its text is extracted. Leading and trailing
  whitespace is stripped from the final text.
  """
  # The spec must carry this function's own name; it was previously declared
  # as a second `raw_html/1` spec, leaving `readable_text/1` without one.
  @spec readable_text(html_tree) :: binary
  def readable_text(html_tree) do
    # TODO: Remove image caption when extract only text
    tags_to_br = ~r/<\/(p|div|article|h\d)/i
    html_str = raw_html(html_tree)

    # The replacement function receives the whole match (the start of a
    # closing tag) and puts a newline in front of it.
    tags_to_br
    |> Regex.replace(html_str, &("\n#{&1}"))
    |> Floki.parse
    |> Floki.text
    |> String.strip
  end
  @doc """
  Return the value of the module's `@regexes` attribute.
  """
  def regexes do
    @regexes
  end
  @doc """
  Return the value of the module's `@default_options` attribute.
  """
  def default_options do
    @default_options
  end
--- a/lib/readability/article_builder.ex
+++ b/lib/readability/article_builder.ex
@ -1,6 +1,6 @@
 defmodule Readability.ArticleBuilder do
  @moduledoc """
-  build article for readability
+  Build article for readability
  """
  alias Readability.Helper
--- a/lib/readability/helper.ex
+++ b/lib/readability/helper.ex
@ -21,6 +21,9 @@ defmodule Readability.Helper do
    {tag_name, attrs, change_tag(html_tree, selector, tag)}
  end
  @doc """
  Remove html attributes
  """
  @spec remove_attrs(html_tree, String.t | [String.t] | Regex.t) :: html_tree
  def remove_attrs(content, _) when is_binary(content), do: content
  def remove_attrs([], _), do: []
@ -65,7 +68,7 @@ defmodule Readability.Helper do
  end
  @doc """
-  count only text length
+  Count only text length
  """
  @spec text_length(html_tree) :: number
  def text_length(html_tree) do
--- a/lib/readability/title_finder.ex
+++ b/lib/readability/title_finder.ex
@ -1,6 +1,6 @@
 defmodule Readability.TitleFinder do
  @moduledoc """
-  The TitleFinder engine traverse the HTML tree searching for finding title.
+  The TitleFinder engine traverses the HTML tree searching for the title.
  """
  @title_suffix ~r/(\-)|(\:\:)|(\|)/
--- a/mix.exs
+++ b/mix.exs
@ -2,13 +2,18 @@ defmodule Readability.Mixfile do
  @moduledoc """
  """
  @version "0.3.1"
  @description """
  Readability library for extracting and curating articles.
  """
  use Mix.Project
  def project do
    [app: :readability,
-     version: "0.3.1",
+     version: @version,
     elixir: "~> 1.2",
-     description: description,
+     description: @description,
     package: package,
     build_embedded: Mix.env == :prod,
     start_permanent: Mix.env == :prod,
@ -42,12 +47,6 @@ defmodule Readability.Mixfile do
    ]
  end
  # One-line package description; the trailing newline matches what the
  # heredoc form produced.
  defp description, do: "Readability library for extracting and curating articles.\n"
  defp package do
    [files: ["lib", "mix.exs", "README*", "LICENSE*", "doc"],
     maintainers: ["Jaehyun Shin"],
--- a/test/readability_test.exs
+++ b/test/readability_test.exs
@ -4,7 +4,7 @@ defmodule ReadabilityTest do
  test "readability for NY Times" do
    html = TestHelper.read_fixture("nytimes.html")
    opts = [clean_conditionally: false]
-    nytimes = Readability.content(html, opts)
+    nytimes = Readability.article(html, opts)
    nytimes_html = Readability.raw_html(nytimes)
    assert nytimes_html =~ ~r/^<div><div class=\"story-body\">/
@ -17,7 +17,7 @@ defmodule ReadabilityTest do
  test "readability for BBC" do
    html = TestHelper.read_fixture("bbc.html")
-    bbc = Readability.content(html)
+    bbc = Readability.article(html)
    bbc_html = Readability.raw_html(bbc)
@ -32,7 +32,7 @@ defmodule ReadabilityTest do
  test "readability for medium" do
    html = TestHelper.read_fixture("medium.html")
-    medium = Readability.content(html)
+    medium = Readability.article(html)
    medium_html = Readability.raw_html(medium)
@ -47,7 +47,7 @@ defmodule ReadabilityTest do
  test "readability for buzzfeed" do
    html = TestHelper.read_fixture("buzzfeed.html")
-    buzzfeed = Readability.content(html)
+    buzzfeed = Readability.article(html)
    buzzfeed_html = Readability.raw_html(buzzfeed)
@ -59,10 +59,4 @@ defmodule ReadabilityTest do
    assert buzzfeed_text =~ ~r/^The FBI no longer needs Apple’s help/
    assert buzzfeed_text =~ ~r/issue of court orders and encrypted devices.$/
  end
  # Smoke test: runs the extraction pipeline on the "elixir.html" fixture and
  # prints the resulting text. It contains no assertions, so it can only fail
  # if one of the calls raises.
  test "readability elixir blog" do
    html = TestHelper.read_fixture("elixir.html")
    # `html` is rebound here to the value returned by `Readability.content/1`.
    html =  Readability.content(html)
    IO.inspect Readability.readable_text(html)
  end
 end