add document

This commit is contained in:
keepcosmos 2016-04-24 18:40:35 +09:00
parent 46ac9dddde
commit 23db20bbf0
8 changed files with 90 additions and 70 deletions

5
CANGELOG.md Normal file
View File

@ -0,0 +1,5 @@
# Change log
## [0.3.0] - 2016.04.24
- Release!!

View File

@ -3,7 +3,7 @@
[![Build Status](https://travis-ci.org/keepcosmos/readability.svg?branch=master)](https://travis-ci.org/keepcosmos/readability)
[![Readability version](https://img.shields.io/hexpm/v/readability.svg)](https://hex.pm/packages/readability)
Readability library for extracting and curating articles.
Readability is an Elixir library for extracting and curating articles.
Check out the [documentation](https://hexdocs.pm/readability/Readability.html) for full and detailed guides.
## Installation
@ -29,7 +29,7 @@ If [available in Hex](https://hex.pm/docs/publish), the package can be installed
## Usage
To parse a document, you must first prepare an HTML string.
The below example below, `html` variable is the html code of page from [Elixir Design Goals](http://elixir-lang.org/blog/2013/08/08/elixir-design-goals/)
In the example below, the `html` variable holds the HTML source of [Elixir Design Goals](http://elixir-lang.org/blog/2013/08/08/elixir-design-goals/).
### Examples
```elixir
@ -39,33 +39,35 @@ Readability.title(html)
#=> Elixir Design Goals
### Extract the content with transformed html.
content = Readability.content(html)
Readability.raw_html(content)
html
|> Readability.article
|> Readability.raw_html
#=>
# <div><div class=\"entry-content\"><p>During the last year,
# ...
# ...
# or check out our sidebar for other learning resources.</p></div></div>
# ... out our sidebar for other learning resources.</p></div></div>
### Extract the text only content.
Readability.readable_text(content)
html
|> Readability.article
|> Readability.readable_text
#=>
# During the last year, we have spoken at many conferences spreading the word about Elixir. We usually s.....
# ...
# ...
# started guide, or check out our sidebar for other learning resources.
# ... started guide, or check out our sidebar for other learning resources.
```
### Options
You may provide options(Keyword type) to `Readability.content`, including:
You may provide options (a keyword list) to `Readability.article`, including:
* retry_length: 250(default),
* min_text_length: 25(default),
* remove_unlikely_candidates: true(default),
* weight_classes: true(default),
* clean_conditionally: true(default),
* remove_empty_nodes: true(default),
* retry_length \\\\ 250
* min_text_length \\\\ 25
* remove_unlikely_candidates \\\\ true
* weight_classes \\\\ true
* clean_conditionally \\\\ true
* remove_empty_nodes \\\\ true
## Test
@ -73,9 +75,10 @@ To run the test suite:
$ mix test
## TODO
* [ ] Extract a author
## Todo
* [ ] Extract authors
* [ ] Extract Images
* [ ] Extract Videos
* [ ] Convert relative paths into absolute paths of `img#src` and `a#href`
* [ ] More configurable
* [ ] Command line interface

View File

@ -7,17 +7,17 @@ defmodule Readability do
```elixir
@type html :: binary
# extract title
# Extract title
Readability.title(html)
# extract only text from content
content = html
|> Readability.content
# Extract only text from article
article = html
|> Readability.article
|> Readability.readable_text
# extract content with transformed html
content = html
|> Readability.content
# Extract article with transformed html
article = html
|> Readability.article
|> Readability.raw_html
```
"""
@ -52,21 +52,59 @@ defmodule Readability do
@type html_tree :: tuple | list
@type options :: list
@doc """
Extract the title of a document.

Accepts either a raw html binary, which is run through `parse` first, or an
already parsed html tree, which is handed directly to `TitleFinder.title/1`.

## Example

    title = Readability.title(html_str)
    #=> "Some title in html"
"""
@spec title(binary | html_tree) :: binary
def title(html) when is_binary(html), do: html |> parse |> title
def title(html_tree), do: TitleFinder.title(html_tree)
@doc """
Using a variety of metrics (content score, classname, element types), find the content that is
most likely to be the stuff a user wants to read
## Example
iex> article_tree = Readability.article(html_str)
# returns article that is tuple
"""
@spec content(binary, options) :: binary
def content(raw_html, opts \\ []) do
@spec article(binary, options) :: html_tree
def article(raw_html, opts \\ []) do
opts = Keyword.merge(@default_options, opts)
raw_html
|> parse
|> ArticleBuilder.build(opts)
end
@doc """
Serialize an html tree back into a raw html binary.

The tree is passed unchanged to `Floki.raw_html/1`.
"""
@spec raw_html(html_tree) :: binary
def raw_html(html_tree), do: Floki.raw_html(html_tree)
@doc """
Return only the text of an html tree as a binary.

The tree is rendered to raw html, a newline is inserted in front of every
closing `p`, `div`, `article` and `h<digit>` tag (presumably so that the
text of adjacent blocks stays on separate lines), and the result is parsed
again and reduced to its text with leading and trailing whitespace stripped.
"""
@spec readable_text(html_tree) :: binary
def readable_text(html_tree) do
  # TODO: Remove image caption when extract only text
  tags_to_br = ~r/<\/(p|div|article|h\d)/i
  html_str = raw_html(html_tree)

  tags_to_br
  |> Regex.replace(html_str, &("\n#{&1}"))
  |> Floki.parse
  |> Floki.text
  |> String.strip
end
@doc """
Normalize and parse binary html into an html tree (tuple or list)
"""
@ -80,28 +118,6 @@ defmodule Readability do
|> Floki.filter_out(:comment)
end
@doc """
Turn an html tree tuple back into a raw html binary.

Delegates to `Floki.raw_html/1` without touching the tree.
"""
@spec raw_html(html_tree) :: binary
def raw_html(html_tree), do: Floki.raw_html(html_tree)
@doc """
Return only the text of an html tree tuple as a binary.

The tree is rendered to raw html, a newline is placed before every closing
`p`, `div`, `article` and `h<digit>` tag, and the result is parsed again and
reduced to its text with leading and trailing whitespace stripped.
"""
@spec readable_text(html_tree) :: binary
def readable_text(html_tree) do
  # TODO: Remove image caption when extract only text
  tags_to_br = ~r/<\/(p|div|article|h\d)/i
  html_str = raw_html(html_tree)

  tags_to_br
  |> Regex.replace(html_str, &("\n#{&1}"))
  |> Floki.parse
  |> Floki.text
  |> String.strip
end
def regexes, do: @regexes
def default_options, do: @default_options

View File

@ -1,6 +1,6 @@
defmodule Readability.ArticleBuilder do
@moduledoc """
build article for readability
Build article for readability
"""
alias Readability.Helper

View File

@ -21,6 +21,9 @@ defmodule Readability.Helper do
{tag_name, attrs, change_tag(html_tree, selector, tag)}
end
@doc """
Remove html attributes
"""
@spec remove_attrs(html_tree, String.t | [String.t] | Regex.t) :: html_tree
def remove_attrs(content, _) when is_binary(content), do: content
def remove_attrs([], _), do: []
@ -65,7 +68,7 @@ defmodule Readability.Helper do
end
@doc """
count only text length
Count only text length
"""
@spec text_length(html_tree) :: number
def text_length(html_tree) do

View File

@ -1,6 +1,6 @@
defmodule Readability.TitleFinder do
@moduledoc """
The TitleFinder engine traverse the HTML tree searching for finding title.
The TitleFinder engine traverses the HTML tree to find the title.
"""
@title_suffix ~r/(\-)|(\:\:)|(\|)/

15
mix.exs
View File

@ -2,13 +2,18 @@ defmodule Readability.Mixfile do
@moduledoc """
"""
@version "0.3.1"
@description """
Readability library for extracting and curating articles.
"""
use Mix.Project
def project do
[app: :readability,
version: "0.3.1",
version: @version,
elixir: "~> 1.2",
description: description,
description: @description,
package: package,
build_embedded: Mix.env == :prod,
start_permanent: Mix.env == :prod,
@ -42,12 +47,6 @@ defmodule Readability.Mixfile do
]
end
# Short package description, referenced from `project/0` as the
# `:description` entry. Returns the heredoc text below.
defp description do
"""
Readability library for extracting and curating articles.
"""
end
defp package do
[files: ["lib", "mix.exs", "README*", "LICENSE*", "doc"],
maintainers: ["Jaehyun Shin"],

View File

@ -4,7 +4,7 @@ defmodule ReadabilityTest do
test "readability for NY Times" do
html = TestHelper.read_fixture("nytimes.html")
opts = [clean_conditionally: false]
nytimes = Readability.content(html, opts)
nytimes = Readability.article(html, opts)
nytimes_html = Readability.raw_html(nytimes)
assert nytimes_html =~ ~r/^<div><div class=\"story-body\">/
@ -17,7 +17,7 @@ defmodule ReadabilityTest do
test "readability for BBC" do
html = TestHelper.read_fixture("bbc.html")
bbc = Readability.content(html)
bbc = Readability.article(html)
bbc_html = Readability.raw_html(bbc)
@ -32,7 +32,7 @@ defmodule ReadabilityTest do
test "readability for medium" do
html = TestHelper.read_fixture("medium.html")
medium = Readability.content(html)
medium = Readability.article(html)
medium_html = Readability.raw_html(medium)
@ -47,7 +47,7 @@ defmodule ReadabilityTest do
test "readability for buzzfeed" do
html = TestHelper.read_fixture("buzzfeed.html")
buzzfeed = Readability.content(html)
buzzfeed = Readability.article(html)
buzzfeed_html = Readability.raw_html(buzzfeed)
@ -59,10 +59,4 @@ defmodule ReadabilityTest do
assert buzzfeed_text =~ ~r/^The FBI no longer needs Apples help/
assert buzzfeed_text =~ ~r/issue of court orders and encrypted devices.$/
end
# Manual smoke test: prints the readable text extracted from the
# "elixir.html" fixture with IO.inspect and makes no assertions.
test "readability elixir blog" do
html = TestHelper.read_fixture("elixir.html")
html = Readability.content(html)
IO.inspect Readability.readable_text(html)
end
end