add document

2016-04-24 18:40:35 +09:00 · 2016-04-24 18:40:35 +09:00 · 23db20bbf0
parent 46ac9dddde
commit 23db20bbf0
8 changed files with 90 additions and 70 deletions
--- a/CANGELOG.md
+++ b/CANGELOG.md
@ -0,0 +1,5 @@
+# Change log
+
+## [0.3.0] - 2016.04.24
+
+- Release!!
--- a/README.md
+++ b/README.md
@ -3,7 +3,7 @@
 [![Build Status](https://travis-ci.org/keepcosmos/readability.svg?branch=master)](https://travis-ci.org/keepcosmos/readability)
 [![Readability version](https://img.shields.io/hexpm/v/readability.svg)](https://hex.pm/packages/readability)

-Readability library for extracting and curating articles.  
+Readability is an Elixir library for extracting and curating articles.  
 Check out The [Documentation](https://hexdocs.pm/readability/Readability.html) for full and detailed guides

 ## Installation
@ -29,7 +29,7 @@ If [available in Hex](https://hex.pm/docs/publish), the package can be installed
 ## Usage

 To parse document, you must prepare html string.
-The below example below, `html` variable is the html code of page from [Elixir Design Goals](http://elixir-lang.org/blog/2013/08/08/elixir-design-goals/)
+In the example below, the `html` variable holds the HTML source of the [Elixir Design Goals](http://elixir-lang.org/blog/2013/08/08/elixir-design-goals/) page.

 ### Examples
 ```elixir
@ -39,33 +39,35 @@ Readability.title(html)
 #=> Elixir Design Goals

 ### Extract the content with transformed html.
-content = Readability.content(html)
-Readability.raw_html(content)
+html
+|> Readability.article
+|> Readability.raw_html
 #=>
 # <div><div class=\"entry-content\"><p>During the last year,
 # ...
-# ...
-# or check out our sidebar for other learning resources.</p></div></div>
+# ... out our sidebar for other learning resources.</p></div></div>

 ### Extract the text only content.
-Readability.readable_text(content)
+html
+|> Readability.article
+|> Readability.readable_text
+
 #=>
 # During the last year, we have spoken at many conferences spreading the word about Elixir. We usually s.....
 # ...
-# ...
-# started guide, or check out our sidebar for other learning resources.
+# ... started guide, or check out our sidebar for other learning resources.
 ```

 ### Options

-You may provide options(Keyword type) to `Readability.content`, including:
+You may provide options (a keyword list) to `Readability.article`, including:

-* retry_length: 250(default),
-* min_text_length: 25(default),
-* remove_unlikely_candidates: true(default),
-* weight_classes: true(default),
-* clean_conditionally: true(default),
-* remove_empty_nodes: true(default),
+* retry_length \\\\ 250
+* min_text_length \\\\ 25
+* remove_unlikely_candidates \\\\ true
+* weight_classes \\\\ true
+* clean_conditionally \\\\ true
+* remove_empty_nodes \\\\ true

 ## Test

@ -73,9 +75,10 @@ To run the test suite:

    $ mix test

-## TODO
-* [ ] Extract a author
+## Todo
+* [ ] Extract authors
 * [ ] Extract Images
+* [ ] Extract Videos
 * [ ] Convert relative paths into absolute paths of `img#src` and `a#href`
 * [ ] More configurable
 * [ ] Command line interface
--- a/lib/readability.ex
+++ b/lib/readability.ex
@ -7,17 +7,17 @@ defmodule Readability do
  ```elixir
  @type html :: binary

-  # extract title
+  # Extract title
  Readability.title(html)

-  # extract only text from content
-  content = html
-            |> Readability.content
+  # Extract only text from article
+  article = html
+            |> Readability.article
            |> Readability.readable_text

-  # extract content with transformed html
-  content = html
-            |> Readability.content
+  # Extract article with transformed html
+  article = html
+            |> Readability.article
            |> Readability.raw_html
  ```
  """
@ -52,21 +52,59 @@ defmodule Readability do
  @type html_tree :: tuple | list
  @type options :: list

+  @doc """
+  Extract title
+
+  ## Example
+
+      iex> title = Readability.title(html_str)
+      "Some title in html"
+  """
+  @spec title(binary) :: binary
  def title(html) when is_binary(html), do: html |> parse |> title
  def title(html_tree), do: TitleFinder.title(html_tree)

  @doc """
  Using a variety of metrics (content score, classname, element types), find the content that is
  most likely to be the stuff a user wants to read
+
+  ## Example
+
+      iex> article_tree = Readability.article(html_str)
+      # returns the article as an html tree (tuple or list)
+
  """
-  @spec content(binary, options) :: binary
-  def content(raw_html, opts \\ []) do
+  @spec article(binary, options) :: html_tree
+  def article(raw_html, opts \\ []) do
    opts = Keyword.merge(@default_options, opts)
    raw_html
    |> parse
    |> ArticleBuilder.build(opts)
  end

+
+  @doc """
+  Returns the raw HTML binary for the given html_tree.
+  """
+  @spec raw_html(html_tree) :: binary
+  def raw_html(html_tree) do
+    html_tree |> Floki.raw_html
+  end
+
+  @doc """
+  Returns only the text of the given html_tree as a binary.
+  """
+  @spec readable_text(html_tree) :: binary
+  def readable_text(html_tree) do
+    # TODO: Remove image caption when extract only text
+    tags_to_br = ~r/<\/(p|div|article|h\d)/i
+    html_str = html_tree |> raw_html
+    Regex.replace(tags_to_br, html_str, &("\n#{&1}"))
+    |> Floki.parse
+    |> Floki.text
+    |> String.strip
+  end
+
  @doc """
  Normalize and Parse to html tree(tuple or list)) from binary html
  """
@ -80,28 +118,6 @@ defmodule Readability do
    |> Floki.filter_out(:comment)
  end

-  @doc """
-  return raw html binary from html tree tuple
-  """
-  @spec raw_html(html_tree) :: binary
-  def raw_html(html_tree) do
-    html_tree |> Floki.raw_html
-  end
-
-  @doc """
-  return only text binary from html tree tuple
-  """
-  @spec raw_html(html_tree) :: binary
-  def readable_text(html_tree) do
-    # TODO: Remove image caption when extract only text
-    tags_to_br = ~r/<\/(p|div|article|h\d)/i
-    html_str = html_tree |> raw_html
-    Regex.replace(tags_to_br, html_str, &("\n#{&1}"))
-    |> Floki.parse
-    |> Floki.text
-    |> String.strip
-  end
-
  def regexes, do: @regexes

  def default_options, do: @default_options
--- a/lib/readability/article_builder.ex
+++ b/lib/readability/article_builder.ex
@ -1,6 +1,6 @@
 defmodule Readability.ArticleBuilder do
  @moduledoc """
-  build article for readability
+  Build article for readability
  """

  alias Readability.Helper
--- a/lib/readability/helper.ex
+++ b/lib/readability/helper.ex
@ -21,6 +21,9 @@ defmodule Readability.Helper do
    {tag_name, attrs, change_tag(html_tree, selector, tag)}
  end

+  @doc """
+  Remove html attributes
+  """
  @spec remove_attrs(html_tree, String.t | [String.t] | Regex.t) :: html_tree
  def remove_attrs(content, _) when is_binary(content), do: content
  def remove_attrs([], _), do: []
@ -65,7 +68,7 @@ defmodule Readability.Helper do
  end

  @doc """
-  count only text length
+  Count only text length
  """
  @spec text_length(html_tree) :: number
  def text_length(html_tree) do
--- a/lib/readability/title_finder.ex
+++ b/lib/readability/title_finder.ex
@ -1,6 +1,6 @@
 defmodule Readability.TitleFinder do
  @moduledoc """
-  The TitleFinder engine traverse the HTML tree searching for finding title.
+  The TitleFinder engine traverses the HTML tree to find the title.
  """

  @title_suffix ~r/(\-)|(\:\:)|(\|)/
--- a/mix.exs
+++ b/mix.exs
@ -2,13 +2,18 @@ defmodule Readability.Mixfile do
  @moduledoc """
  """

+  @version "0.3.1"
+  @description """
+  Readability library for extracting and curating articles.
+  """
+
  use Mix.Project

  def project do
    [app: :readability,
-     version: "0.3.1",
+     version: @version,
     elixir: "~> 1.2",
-     description: description,
+     description: @description,
     package: package,
     build_embedded: Mix.env == :prod,
     start_permanent: Mix.env == :prod,
@ -42,12 +47,6 @@ defmodule Readability.Mixfile do
    ]
  end

-  defp description do
-    """
-    Readability library for extracting and curating articles.
-    """
-  end
-
  defp package do
    [files: ["lib", "mix.exs", "README*", "LICENSE*", "doc"],
     maintainers: ["Jaehyun Shin"],
--- a/test/readability_test.exs
+++ b/test/readability_test.exs
@ -4,7 +4,7 @@ defmodule ReadabilityTest do
  test "readability for NY Times" do
    html = TestHelper.read_fixture("nytimes.html")
    opts = [clean_conditionally: false]
-    nytimes = Readability.content(html, opts)
+    nytimes = Readability.article(html, opts)

    nytimes_html = Readability.raw_html(nytimes)
    assert nytimes_html =~ ~r/^<div><div class=\"story-body\">/
@ -17,7 +17,7 @@ defmodule ReadabilityTest do

  test "readability for BBC" do
    html = TestHelper.read_fixture("bbc.html")
-    bbc = Readability.content(html)
+    bbc = Readability.article(html)

    bbc_html = Readability.raw_html(bbc)

@ -32,7 +32,7 @@ defmodule ReadabilityTest do

  test "readability for medium" do
    html = TestHelper.read_fixture("medium.html")
-    medium = Readability.content(html)
+    medium = Readability.article(html)

    medium_html = Readability.raw_html(medium)

@ -47,7 +47,7 @@ defmodule ReadabilityTest do

  test "readability for buzzfeed" do
    html = TestHelper.read_fixture("buzzfeed.html")
-    buzzfeed = Readability.content(html)
+    buzzfeed = Readability.article(html)

    buzzfeed_html = Readability.raw_html(buzzfeed)

@ -59,10 +59,4 @@ defmodule ReadabilityTest do
    assert buzzfeed_text =~ ~r/^The FBI no longer needs Apple’s help/
    assert buzzfeed_text =~ ~r/issue of court orders and encrypted devices.$/
  end
-
-  test "readability elixir blog" do
-    html = TestHelper.read_fixture("elixir.html")
-    html =  Readability.content(html)
-    IO.inspect Readability.readable_text(html)
-  end
 end