add document
This commit is contained in:
parent
46ac9dddde
commit
23db20bbf0
|
@ -0,0 +1,5 @@
|
||||||
|
# Change log
|
||||||
|
|
||||||
|
## [0.3.0] - 2016.04.24
|
||||||
|
|
||||||
|
- Release!!
|
39
README.md
39
README.md
|
@ -3,7 +3,7 @@
|
||||||
[![Build Status](https://travis-ci.org/keepcosmos/readability.svg?branch=master)](https://travis-ci.org/keepcosmos/readability)
|
[![Build Status](https://travis-ci.org/keepcosmos/readability.svg?branch=master)](https://travis-ci.org/keepcosmos/readability)
|
||||||
[![Readability version](https://img.shields.io/hexpm/v/readability.svg)](https://hex.pm/packages/readability)
|
[![Readability version](https://img.shields.io/hexpm/v/readability.svg)](https://hex.pm/packages/readability)
|
||||||
|
|
||||||
Readability library for extracting and curating articles.
|
Readability is an Elixir library for extracting and curating articles.
|
||||||
Check out The [Documentation](https://hexdocs.pm/readability/Readability.html) for full and detailed guides
|
Check out The [Documentation](https://hexdocs.pm/readability/Readability.html) for full and detailed guides
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
@ -29,7 +29,7 @@ If [available in Hex](https://hex.pm/docs/publish), the package can be installed
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
To parse document, you must prepare html string.
|
To parse document, you must prepare html string.
|
||||||
The below example below, `html` variable is the html code of page from [Elixir Design Goals](http://elixir-lang.org/blog/2013/08/08/elixir-design-goals/)
|
In the example below, the `html` variable is the HTML source of [Elixir Design Goals](http://elixir-lang.org/blog/2013/08/08/elixir-design-goals/)
|
||||||
|
|
||||||
### Examples
|
### Examples
|
||||||
```elixir
|
```elixir
|
||||||
|
@ -39,33 +39,35 @@ Readability.title(html)
|
||||||
#=> Elixir Design Goals
|
#=> Elixir Design Goals
|
||||||
|
|
||||||
### Extract the content with transformed html.
|
### Extract the content with transformed html.
|
||||||
content = Readability.content(html)
|
html
|
||||||
Readability.raw_html(content)
|
|> Readability.article
|
||||||
|
|> Readability.raw_html
|
||||||
#=>
|
#=>
|
||||||
# <div><div class=\"entry-content\"><p>During the last year,
|
# <div><div class=\"entry-content\"><p>During the last year,
|
||||||
# ...
|
# ...
|
||||||
# ...
|
# ... out our sidebar for other learning resources.</p></div></div>
|
||||||
# or check out our sidebar for other learning resources.</p></div></div>
|
|
||||||
|
|
||||||
### Extract the text only content.
|
### Extract the text only content.
|
||||||
Readability.readable_text(content)
|
html
|
||||||
|
|> Readability.article
|
||||||
|
|> Readability.readable_text
|
||||||
|
|
||||||
#=>
|
#=>
|
||||||
# During the last year, we have spoken at many conferences spreading the word about Elixir. We usually s.....
|
# During the last year, we have spoken at many conferences spreading the word about Elixir. We usually s.....
|
||||||
# ...
|
# ...
|
||||||
# ...
|
# ... started guide, or check out our sidebar for other learning resources.
|
||||||
# started guide, or check out our sidebar for other learning resources.
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### Options
|
### Options
|
||||||
|
|
||||||
You may provide options(Keyword type) to `Readability.content`, including:
|
You may provide options(Keyword type) to `Readability.article`, including:
|
||||||
|
|
||||||
* retry_length: 250(default),
|
* retry_length \\\\ 250
|
||||||
* min_text_length: 25(default),
|
* min_text_length \\\\ 25
|
||||||
* remove_unlikely_candidates: true(default),
|
* remove_unlikely_candidates \\\\ true,
|
||||||
* weight_classes: true(default),
|
* weight_classes \\\\ true,
|
||||||
* clean_conditionally: true(default),
|
* clean_conditionally \\\\ true,
|
||||||
* remove_empty_nodes: true(default),
|
* remove_empty_nodes \\\\ true,
|
||||||
|
|
||||||
## Test
|
## Test
|
||||||
|
|
||||||
|
@ -73,9 +75,10 @@ To run the test suite:
|
||||||
|
|
||||||
$ mix test
|
$ mix test
|
||||||
|
|
||||||
## TODO
|
## Todo
|
||||||
* [ ] Extract a author
|
* [ ] Extract authors
|
||||||
* [ ] Extract Images
|
* [ ] Extract Images
|
||||||
|
* [ ] Extract Videos
|
||||||
* [ ] Convert relative paths into absolute paths of `img#src` and `a#href`
|
* [ ] Convert relative paths into absolute paths of `img#src` and `a#href`
|
||||||
* [ ] More configurable
|
* [ ] More configurable
|
||||||
* [ ] Command line interface
|
* [ ] Command line interface
|
||||||
|
|
|
@ -7,17 +7,17 @@ defmodule Readability do
|
||||||
```elixir
|
```elixir
|
||||||
@type html :: binary
|
@type html :: binary
|
||||||
|
|
||||||
# extract title
|
# Extract title
|
||||||
Readability.title(html)
|
Readability.title(html)
|
||||||
|
|
||||||
# extract only text from content
|
# Extract only text from article
|
||||||
content = html
|
article = html
|
||||||
|> Readability.content
|
|> Readability.article
|
||||||
|> Readability.readable_text
|
|> Readability.readable_text
|
||||||
|
|
||||||
# extract content with transformed html
|
# Extract article with transformed html
|
||||||
content = html
|
article = html
|
||||||
|> Readability.content
|
|> Readability.article
|
||||||
|> Readability.raw_html
|
|> Readability.raw_html
|
||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
|
@ -52,21 +52,59 @@ defmodule Readability do
|
||||||
@type html_tree :: tuple | list
|
@type html_tree :: tuple | list
|
||||||
@type options :: list
|
@type options :: list
|
||||||
|
|
||||||
|
@doc """
|
||||||
|
Extract title
|
||||||
|
|
||||||
|
## Example
|
||||||
|
|
||||||
|
iex> title = Readability.title(html_str)
|
||||||
|
"Some title in html"
|
||||||
|
"""
|
||||||
|
@spec title(binary) :: binary
|
||||||
def title(html) when is_binary(html), do: html |> parse |> title
|
def title(html) when is_binary(html), do: html |> parse |> title
|
||||||
def title(html_tree), do: TitleFinder.title(html_tree)
|
def title(html_tree), do: TitleFinder.title(html_tree)
|
||||||
|
|
||||||
@doc """
|
@doc """
|
||||||
Using a variety of metrics (content score, classname, element types), find the content that is
|
Using a variety of metrics (content score, classname, element types), find the content that is
|
||||||
most likely to be the stuff a user wants to read
|
most likely to be the stuff a user wants to read
|
||||||
|
|
||||||
|
## Example
|
||||||
|
|
||||||
|
iex> article_tree = Readability.article(html_str)
|
||||||
|
# returns article that is tuple
|
||||||
|
|
||||||
"""
|
"""
|
||||||
@spec content(binary, options) :: binary
|
@spec article(binary, options) :: html_tree
|
||||||
def content(raw_html, opts \\ []) do
|
def article(raw_html, opts \\ []) do
|
||||||
opts = Keyword.merge(@default_options, opts)
|
opts = Keyword.merge(@default_options, opts)
|
||||||
raw_html
|
raw_html
|
||||||
|> parse
|
|> parse
|
||||||
|> ArticleBuilder.build(opts)
|
|> ArticleBuilder.build(opts)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
|
@doc """
|
||||||
|
return raw html binary from html_tree
|
||||||
|
"""
|
||||||
|
@spec raw_html(html_tree) :: binary
|
||||||
|
def raw_html(html_tree) do
|
||||||
|
html_tree |> Floki.raw_html
|
||||||
|
end
|
||||||
|
|
||||||
|
@doc """
|
||||||
|
return only text binary from html_tree
|
||||||
|
"""
|
||||||
|
@spec raw_html(html_tree) :: binary
|
||||||
|
def readable_text(html_tree) do
|
||||||
|
# TODO: Remove image caption when extract only text
|
||||||
|
tags_to_br = ~r/<\/(p|div|article|h\d)/i
|
||||||
|
html_str = html_tree |> raw_html
|
||||||
|
Regex.replace(tags_to_br, html_str, &("\n#{&1}"))
|
||||||
|
|> Floki.parse
|
||||||
|
|> Floki.text
|
||||||
|
|> String.strip
|
||||||
|
end
|
||||||
|
|
||||||
@doc """
|
@doc """
|
||||||
Normalize and Parse to html tree(tuple or list)) from binary html
|
Normalize and Parse to html tree(tuple or list)) from binary html
|
||||||
"""
|
"""
|
||||||
|
@ -80,28 +118,6 @@ defmodule Readability do
|
||||||
|> Floki.filter_out(:comment)
|
|> Floki.filter_out(:comment)
|
||||||
end
|
end
|
||||||
|
|
||||||
@doc """
|
|
||||||
return raw html binary from html tree tuple
|
|
||||||
"""
|
|
||||||
@spec raw_html(html_tree) :: binary
|
|
||||||
def raw_html(html_tree) do
|
|
||||||
html_tree |> Floki.raw_html
|
|
||||||
end
|
|
||||||
|
|
||||||
@doc """
|
|
||||||
return only text binary from html tree tuple
|
|
||||||
"""
|
|
||||||
@spec raw_html(html_tree) :: binary
|
|
||||||
def readable_text(html_tree) do
|
|
||||||
# TODO: Remove image caption when extract only text
|
|
||||||
tags_to_br = ~r/<\/(p|div|article|h\d)/i
|
|
||||||
html_str = html_tree |> raw_html
|
|
||||||
Regex.replace(tags_to_br, html_str, &("\n#{&1}"))
|
|
||||||
|> Floki.parse
|
|
||||||
|> Floki.text
|
|
||||||
|> String.strip
|
|
||||||
end
|
|
||||||
|
|
||||||
def regexes, do: @regexes
|
def regexes, do: @regexes
|
||||||
|
|
||||||
def default_options, do: @default_options
|
def default_options, do: @default_options
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
defmodule Readability.ArticleBuilder do
|
defmodule Readability.ArticleBuilder do
|
||||||
@moduledoc """
|
@moduledoc """
|
||||||
build article for readability
|
Build article for readability
|
||||||
"""
|
"""
|
||||||
|
|
||||||
alias Readability.Helper
|
alias Readability.Helper
|
||||||
|
|
|
@ -21,6 +21,9 @@ defmodule Readability.Helper do
|
||||||
{tag_name, attrs, change_tag(html_tree, selector, tag)}
|
{tag_name, attrs, change_tag(html_tree, selector, tag)}
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@doc """
|
||||||
|
Remove html attributes
|
||||||
|
"""
|
||||||
@spec remove_attrs(html_tree, String.t | [String.t] | Regex.t) :: html_tree
|
@spec remove_attrs(html_tree, String.t | [String.t] | Regex.t) :: html_tree
|
||||||
def remove_attrs(content, _) when is_binary(content), do: content
|
def remove_attrs(content, _) when is_binary(content), do: content
|
||||||
def remove_attrs([], _), do: []
|
def remove_attrs([], _), do: []
|
||||||
|
@ -65,7 +68,7 @@ defmodule Readability.Helper do
|
||||||
end
|
end
|
||||||
|
|
||||||
@doc """
|
@doc """
|
||||||
count only text length
|
Count only text length
|
||||||
"""
|
"""
|
||||||
@spec text_length(html_tree) :: number
|
@spec text_length(html_tree) :: number
|
||||||
def text_length(html_tree) do
|
def text_length(html_tree) do
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
defmodule Readability.TitleFinder do
|
defmodule Readability.TitleFinder do
|
||||||
@moduledoc """
|
@moduledoc """
|
||||||
The TitleFinder engine traverse the HTML tree searching for finding title.
|
The TitleFinder engine traverses the HTML tree searching for the title.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@title_suffix ~r/(\-)|(\:\:)|(\|)/
|
@title_suffix ~r/(\-)|(\:\:)|(\|)/
|
||||||
|
|
15
mix.exs
15
mix.exs
|
@ -2,13 +2,18 @@ defmodule Readability.Mixfile do
|
||||||
@moduledoc """
|
@moduledoc """
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@version "0.3.1"
|
||||||
|
@description """
|
||||||
|
Readability library for extracting and curating articles.
|
||||||
|
"""
|
||||||
|
|
||||||
use Mix.Project
|
use Mix.Project
|
||||||
|
|
||||||
def project do
|
def project do
|
||||||
[app: :readability,
|
[app: :readability,
|
||||||
version: "0.3.1",
|
version: @version,
|
||||||
elixir: "~> 1.2",
|
elixir: "~> 1.2",
|
||||||
description: description,
|
description: @description,
|
||||||
package: package,
|
package: package,
|
||||||
build_embedded: Mix.env == :prod,
|
build_embedded: Mix.env == :prod,
|
||||||
start_permanent: Mix.env == :prod,
|
start_permanent: Mix.env == :prod,
|
||||||
|
@ -42,12 +47,6 @@ defmodule Readability.Mixfile do
|
||||||
]
|
]
|
||||||
end
|
end
|
||||||
|
|
||||||
defp description do
|
|
||||||
"""
|
|
||||||
Readability library for extracting and curating articles.
|
|
||||||
"""
|
|
||||||
end
|
|
||||||
|
|
||||||
defp package do
|
defp package do
|
||||||
[files: ["lib", "mix.exs", "README*", "LICENSE*", "doc"],
|
[files: ["lib", "mix.exs", "README*", "LICENSE*", "doc"],
|
||||||
maintainers: ["Jaehyun Shin"],
|
maintainers: ["Jaehyun Shin"],
|
||||||
|
|
|
@ -4,7 +4,7 @@ defmodule ReadabilityTest do
|
||||||
test "readability for NY Times" do
|
test "readability for NY Times" do
|
||||||
html = TestHelper.read_fixture("nytimes.html")
|
html = TestHelper.read_fixture("nytimes.html")
|
||||||
opts = [clean_conditionally: false]
|
opts = [clean_conditionally: false]
|
||||||
nytimes = Readability.content(html, opts)
|
nytimes = Readability.article(html, opts)
|
||||||
|
|
||||||
nytimes_html = Readability.raw_html(nytimes)
|
nytimes_html = Readability.raw_html(nytimes)
|
||||||
assert nytimes_html =~ ~r/^<div><div class=\"story-body\">/
|
assert nytimes_html =~ ~r/^<div><div class=\"story-body\">/
|
||||||
|
@ -17,7 +17,7 @@ defmodule ReadabilityTest do
|
||||||
|
|
||||||
test "readability for BBC" do
|
test "readability for BBC" do
|
||||||
html = TestHelper.read_fixture("bbc.html")
|
html = TestHelper.read_fixture("bbc.html")
|
||||||
bbc = Readability.content(html)
|
bbc = Readability.article(html)
|
||||||
|
|
||||||
bbc_html = Readability.raw_html(bbc)
|
bbc_html = Readability.raw_html(bbc)
|
||||||
|
|
||||||
|
@ -32,7 +32,7 @@ defmodule ReadabilityTest do
|
||||||
|
|
||||||
test "readability for medium" do
|
test "readability for medium" do
|
||||||
html = TestHelper.read_fixture("medium.html")
|
html = TestHelper.read_fixture("medium.html")
|
||||||
medium = Readability.content(html)
|
medium = Readability.article(html)
|
||||||
|
|
||||||
medium_html = Readability.raw_html(medium)
|
medium_html = Readability.raw_html(medium)
|
||||||
|
|
||||||
|
@ -47,7 +47,7 @@ defmodule ReadabilityTest do
|
||||||
|
|
||||||
test "readability for buzzfeed" do
|
test "readability for buzzfeed" do
|
||||||
html = TestHelper.read_fixture("buzzfeed.html")
|
html = TestHelper.read_fixture("buzzfeed.html")
|
||||||
buzzfeed = Readability.content(html)
|
buzzfeed = Readability.article(html)
|
||||||
|
|
||||||
buzzfeed_html = Readability.raw_html(buzzfeed)
|
buzzfeed_html = Readability.raw_html(buzzfeed)
|
||||||
|
|
||||||
|
@ -59,10 +59,4 @@ defmodule ReadabilityTest do
|
||||||
assert buzzfeed_text =~ ~r/^The FBI no longer needs Apple’s help/
|
assert buzzfeed_text =~ ~r/^The FBI no longer needs Apple’s help/
|
||||||
assert buzzfeed_text =~ ~r/issue of court orders and encrypted devices.$/
|
assert buzzfeed_text =~ ~r/issue of court orders and encrypted devices.$/
|
||||||
end
|
end
|
||||||
|
|
||||||
test "readability elixir blog" do
|
|
||||||
html = TestHelper.read_fixture("elixir.html")
|
|
||||||
html = Readability.content(html)
|
|
||||||
IO.inspect Readability.readable_text(html)
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue