add document

This commit is contained in:
keepcosmos 2016-04-24 18:40:35 +09:00
parent 46ac9dddde
commit 23db20bbf0
8 changed files with 90 additions and 70 deletions

5
CANGELOG.md Normal file
View File

@ -0,0 +1,5 @@
# Change log
## [0.3.0] - 2016.04.24
- Release!!

View File

@ -3,7 +3,7 @@
[![Build Status](https://travis-ci.org/keepcosmos/readability.svg?branch=master)](https://travis-ci.org/keepcosmos/readability) [![Build Status](https://travis-ci.org/keepcosmos/readability.svg?branch=master)](https://travis-ci.org/keepcosmos/readability)
[![Readability version](https://img.shields.io/hexpm/v/readability.svg)](https://hex.pm/packages/readability) [![Readability version](https://img.shields.io/hexpm/v/readability.svg)](https://hex.pm/packages/readability)
Readability library for extracting and curating articles. Readability is an Elixir library for extracting and curating articles.
Check out The [Documentation](https://hexdocs.pm/readability/Readability.html) for full and detailed guides Check out The [Documentation](https://hexdocs.pm/readability/Readability.html) for full and detailed guides
## Installation ## Installation
@ -29,7 +29,7 @@ If [available in Hex](https://hex.pm/docs/publish), the package can be installed
## Usage ## Usage
To parse document, you must prepare html string. To parse document, you must prepare html string.
The below example below, `html` variable is the html code of page from [Elixir Design Goals](http://elixir-lang.org/blog/2013/08/08/elixir-design-goals/) In the example below, the `html` variable is the HTML source of [Elixir Design Goals](http://elixir-lang.org/blog/2013/08/08/elixir-design-goals/)
### Examples ### Examples
```elixir ```elixir
@ -39,33 +39,35 @@ Readability.title(html)
#=> Elixir Design Goals #=> Elixir Design Goals
### Extract the content with transformed html. ### Extract the content with transformed html.
content = Readability.content(html) html
Readability.raw_html(content) |> Readability.article
|> Readability.raw_html
#=> #=>
# <div><div class=\"entry-content\"><p>During the last year, # <div><div class=\"entry-content\"><p>During the last year,
# ... # ...
# ... # ... out our sidebar for other learning resources.</p></div></div>
# or check out our sidebar for other learning resources.</p></div></div>
### Extract the text only content. ### Extract the text only content.
Readability.readable_text(content) html
|> Readability.article
|> Readability.readable_text
#=> #=>
# During the last year, we have spoken at many conferences spreading the word about Elixir. We usually s..... # During the last year, we have spoken at many conferences spreading the word about Elixir. We usually s.....
# ... # ...
# ... # ... started guide, or check out our sidebar for other learning resources.
# started guide, or check out our sidebar for other learning resources.
``` ```
### Options ### Options
You may provide options(Keyword type) to `Readability.content`, including: You may provide options(Keyword type) to `Readability.article`, including:
* retry_length: 250(default), * retry_length \\\\ 250
* min_text_length: 25(default), * min_text_length \\\\ 25
* remove_unlikely_candidates: true(default), * remove_unlikely_candidates \\\\ true,
* weight_classes: true(default), * weight_classes \\\\ true,
* clean_conditionally: true(default), * clean_conditionally \\\\ true,
* remove_empty_nodes: true(default), * remove_empty_nodes \\\\ true,
## Test ## Test
@ -73,9 +75,10 @@ To run the test suite:
$ mix test $ mix test
## TODO ## Todo
* [ ] Extract a author * [ ] Extract authors
* [ ] Extract Images * [ ] Extract Images
* [ ] Extract Videos
* [ ] Convert relative paths into absolute paths of `img#src` and `a#href` * [ ] Convert relative paths into absolute paths of `img#src` and `a#href`
* [ ] More configurable * [ ] More configurable
* [ ] Command line interface * [ ] Command line interface

View File

@ -7,17 +7,17 @@ defmodule Readability do
```elixir ```elixir
@type html :: binary @type html :: binary
# extract title # Extract title
Readability.title(html) Readability.title(html)
# extract only text from content # Extract only text from article
content = html article = html
|> Readability.content |> Readability.article
|> Readability.readable_text |> Readability.readable_text
# extract content with transformed html # Extract article with transformed html
content = html article = html
|> Readability.content |> Readability.article
|> Readability.raw_html |> Readability.raw_html
``` ```
""" """
@ -52,21 +52,59 @@ defmodule Readability do
@type html_tree :: tuple | list @type html_tree :: tuple | list
@type options :: list @type options :: list
@doc """
Extract title
## Example
iex> title = Readability.title(html_str)
"Some title in html"
"""
@spec title(binary) :: binary
def title(html) when is_binary(html), do: html |> parse |> title def title(html) when is_binary(html), do: html |> parse |> title
def title(html_tree), do: TitleFinder.title(html_tree) def title(html_tree), do: TitleFinder.title(html_tree)
@doc """ @doc """
Using a variety of metrics (content score, classname, element types), find the content that is Using a variety of metrics (content score, classname, element types), find the content that is
most likely to be the stuff a user wants to read most likely to be the stuff a user wants to read
## Example
iex> article_tree = Readability.article(html_str)
# returns the article as an html_tree (tuple or list)
""" """
@spec content(binary, options) :: binary @spec article(binary, options) :: html_tree
def content(raw_html, opts \\ []) do def article(raw_html, opts \\ []) do
opts = Keyword.merge(@default_options, opts) opts = Keyword.merge(@default_options, opts)
raw_html raw_html
|> parse |> parse
|> ArticleBuilder.build(opts) |> ArticleBuilder.build(opts)
end end
@doc """
Serializes an `html_tree` back into its raw HTML binary.

Delegates to `Floki.raw_html/1`.
"""
@spec raw_html(html_tree) :: binary
def raw_html(html_tree), do: Floki.raw_html(html_tree)
@doc """
Returns only the readable text of an `html_tree` as a binary.

The tree is first rendered to raw HTML. A newline is inserted in front of
every closing `p`, `div`, `article` and `h<digit>` tag so that block
elements end up on separate lines. The markup is then re-parsed with
Floki, reduced to its text, and stripped of leading/trailing whitespace.
"""
@spec readable_text(html_tree) :: binary
def readable_text(html_tree) do
  # TODO: Remove image caption when extract only text
  tags_to_br = ~r/<\/(p|div|article|h\d)/i
  html_str = html_tree |> raw_html

  # The capture receives the whole match (e.g. "</p") and prefixes it with "\n".
  Regex.replace(tags_to_br, html_str, &("\n#{&1}"))
  |> Floki.parse
  |> Floki.text
  |> String.strip
end
@doc """ @doc """
Normalize and parse binary html into an html tree (tuple or list) Normalize and parse binary html into an html tree (tuple or list)
""" """
@ -80,28 +118,6 @@ defmodule Readability do
|> Floki.filter_out(:comment) |> Floki.filter_out(:comment)
end end
@doc """
Serializes an html tree tuple back into its raw HTML binary.

Delegates to `Floki.raw_html/1`.
"""
@spec raw_html(html_tree) :: binary
def raw_html(html_tree), do: Floki.raw_html(html_tree)
@doc """
Returns only the readable text of an html tree tuple as a binary.

The tree is first rendered to raw HTML. A newline is inserted in front of
every closing `p`, `div`, `article` and `h<digit>` tag so that block
elements end up on separate lines. The markup is then re-parsed with
Floki, reduced to its text, and stripped of leading/trailing whitespace.
"""
@spec readable_text(html_tree) :: binary
def readable_text(html_tree) do
  # TODO: Remove image caption when extract only text
  tags_to_br = ~r/<\/(p|div|article|h\d)/i
  html_str = html_tree |> raw_html

  # The capture receives the whole match (e.g. "</p") and prefixes it with "\n".
  Regex.replace(tags_to_br, html_str, &("\n#{&1}"))
  |> Floki.parse
  |> Floki.text
  |> String.strip
end
def regexes, do: @regexes def regexes, do: @regexes
def default_options, do: @default_options def default_options, do: @default_options

View File

@ -1,6 +1,6 @@
defmodule Readability.ArticleBuilder do defmodule Readability.ArticleBuilder do
@moduledoc """ @moduledoc """
build article for readability Build article for readability
""" """
alias Readability.Helper alias Readability.Helper

View File

@ -21,6 +21,9 @@ defmodule Readability.Helper do
{tag_name, attrs, change_tag(html_tree, selector, tag)} {tag_name, attrs, change_tag(html_tree, selector, tag)}
end end
@doc """
Remove html attributes
"""
@spec remove_attrs(html_tree, String.t | [String.t] | Regex.t) :: html_tree @spec remove_attrs(html_tree, String.t | [String.t] | Regex.t) :: html_tree
def remove_attrs(content, _) when is_binary(content), do: content def remove_attrs(content, _) when is_binary(content), do: content
def remove_attrs([], _), do: [] def remove_attrs([], _), do: []
@ -65,7 +68,7 @@ defmodule Readability.Helper do
end end
@doc """ @doc """
count only text length Count only text length
""" """
@spec text_length(html_tree) :: number @spec text_length(html_tree) :: number
def text_length(html_tree) do def text_length(html_tree) do

View File

@ -1,6 +1,6 @@
defmodule Readability.TitleFinder do defmodule Readability.TitleFinder do
@moduledoc """ @moduledoc """
The TitleFinder engine traverse the HTML tree searching for finding title. The TitleFinder engine traverses the HTML tree searching for the title.
""" """
@title_suffix ~r/(\-)|(\:\:)|(\|)/ @title_suffix ~r/(\-)|(\:\:)|(\|)/

15
mix.exs
View File

@ -2,13 +2,18 @@ defmodule Readability.Mixfile do
@moduledoc """ @moduledoc """
""" """
@version "0.3.1"
@description """
Readability library for extracting and curating articles.
"""
use Mix.Project use Mix.Project
def project do def project do
[app: :readability, [app: :readability,
version: "0.3.1", version: @version,
elixir: "~> 1.2", elixir: "~> 1.2",
description: description, description: @description,
package: package, package: package,
build_embedded: Mix.env == :prod, build_embedded: Mix.env == :prod,
start_permanent: Mix.env == :prod, start_permanent: Mix.env == :prod,
@ -42,12 +47,6 @@ defmodule Readability.Mixfile do
] ]
end end
# Package description text: a single sentence followed by a newline.
defp description do
  "Readability library for extracting and curating articles.\n"
end
defp package do defp package do
[files: ["lib", "mix.exs", "README*", "LICENSE*", "doc"], [files: ["lib", "mix.exs", "README*", "LICENSE*", "doc"],
maintainers: ["Jaehyun Shin"], maintainers: ["Jaehyun Shin"],

View File

@ -4,7 +4,7 @@ defmodule ReadabilityTest do
test "readability for NY Times" do test "readability for NY Times" do
html = TestHelper.read_fixture("nytimes.html") html = TestHelper.read_fixture("nytimes.html")
opts = [clean_conditionally: false] opts = [clean_conditionally: false]
nytimes = Readability.content(html, opts) nytimes = Readability.article(html, opts)
nytimes_html = Readability.raw_html(nytimes) nytimes_html = Readability.raw_html(nytimes)
assert nytimes_html =~ ~r/^<div><div class=\"story-body\">/ assert nytimes_html =~ ~r/^<div><div class=\"story-body\">/
@ -17,7 +17,7 @@ defmodule ReadabilityTest do
test "readability for BBC" do test "readability for BBC" do
html = TestHelper.read_fixture("bbc.html") html = TestHelper.read_fixture("bbc.html")
bbc = Readability.content(html) bbc = Readability.article(html)
bbc_html = Readability.raw_html(bbc) bbc_html = Readability.raw_html(bbc)
@ -32,7 +32,7 @@ defmodule ReadabilityTest do
test "readability for medium" do test "readability for medium" do
html = TestHelper.read_fixture("medium.html") html = TestHelper.read_fixture("medium.html")
medium = Readability.content(html) medium = Readability.article(html)
medium_html = Readability.raw_html(medium) medium_html = Readability.raw_html(medium)
@ -47,7 +47,7 @@ defmodule ReadabilityTest do
test "readability for buzzfeed" do test "readability for buzzfeed" do
html = TestHelper.read_fixture("buzzfeed.html") html = TestHelper.read_fixture("buzzfeed.html")
buzzfeed = Readability.content(html) buzzfeed = Readability.article(html)
buzzfeed_html = Readability.raw_html(buzzfeed) buzzfeed_html = Readability.raw_html(buzzfeed)
@ -59,10 +59,4 @@ defmodule ReadabilityTest do
assert buzzfeed_text =~ ~r/^The FBI no longer needs Apples help/ assert buzzfeed_text =~ ~r/^The FBI no longer needs Apples help/
assert buzzfeed_text =~ ~r/issue of court orders and encrypted devices.$/ assert buzzfeed_text =~ ~r/issue of court orders and encrypted devices.$/
end end
# Smoke test: prints the readable text extracted from the Elixir blog
# fixture. It makes no assertions.
test "readability elixir blog" do
  article =
    "elixir.html"
    |> TestHelper.read_fixture()
    |> Readability.content()

  article
  |> Readability.readable_text()
  |> IO.inspect()
end
end end