add document
This commit is contained in:
parent
46ac9dddde
commit
23db20bbf0
|
@ -0,0 +1,5 @@
|
|||
# Change log
|
||||
|
||||
## [0.3.0] - 2016.04.24
|
||||
|
||||
- Release!!
|
39
README.md
39
README.md
|
@ -3,7 +3,7 @@
|
|||
[![Build Status](https://travis-ci.org/keepcosmos/readability.svg?branch=master)](https://travis-ci.org/keepcosmos/readability)
|
||||
[![Readability version](https://img.shields.io/hexpm/v/readability.svg)](https://hex.pm/packages/readability)
|
||||
|
||||
Readability library for extracting and curating articles.
|
||||
Readability is an Elixir library for extracting and curating articles.
|
||||
Check out the [Documentation](https://hexdocs.pm/readability/Readability.html) for full and detailed guides.
|
||||
|
||||
## Installation
|
||||
|
@ -29,7 +29,7 @@ If [available in Hex](https://hex.pm/docs/publish), the package can be installed
|
|||
## Usage
|
||||
|
||||
To parse a document, you must first prepare an HTML string.
|
||||
The below example below, `html` variable is the html code of page from [Elixir Design Goals](http://elixir-lang.org/blog/2013/08/08/elixir-design-goals/)
|
||||
In the example below, the `html` variable is the HTML source of [Elixir Design Goals](http://elixir-lang.org/blog/2013/08/08/elixir-design-goals/)
|
||||
|
||||
### Examples
|
||||
```elixir
|
||||
|
@ -39,33 +39,35 @@ Readability.title(html)
|
|||
#=> Elixir Design Goals
|
||||
|
||||
### Extract the content with transformed html.
|
||||
content = Readability.content(html)
|
||||
Readability.raw_html(content)
|
||||
html
|
||||
|> Readability.article
|
||||
|> Readability.raw_html
|
||||
#=>
|
||||
# <div><div class=\"entry-content\"><p>During the last year,
|
||||
# ...
|
||||
# ...
|
||||
# or check out our sidebar for other learning resources.</p></div></div>
|
||||
# ... out our sidebar for other learning resources.</p></div></div>
|
||||
|
||||
### Extract the text only content.
|
||||
Readability.readable_text(content)
|
||||
html
|
||||
|> Readability.article
|
||||
|> Readability.readable_text
|
||||
|
||||
#=>
|
||||
# During the last year, we have spoken at many conferences spreading the word about Elixir. We usually s.....
|
||||
# ...
|
||||
# ...
|
||||
# started guide, or check out our sidebar for other learning resources.
|
||||
# ... started guide, or check out our sidebar for other learning resources.
|
||||
```
|
||||
|
||||
### Options
|
||||
|
||||
You may provide options(Keyword type) to `Readability.content`, including:
|
||||
You may provide options (a keyword list) to `Readability.article`, including:
|
||||
|
||||
* retry_length: 250(default),
|
||||
* min_text_length: 25(default),
|
||||
* remove_unlikely_candidates: true(default),
|
||||
* weight_classes: true(default),
|
||||
* clean_conditionally: true(default),
|
||||
* remove_empty_nodes: true(default),
|
||||
* retry_length \\\\ 250
|
||||
* min_text_length \\\\ 25
|
||||
* remove_unlikely_candidates \\\\ true
|
||||
* weight_classes \\\\ true
|
||||
* clean_conditionally \\\\ true
|
||||
* remove_empty_nodes \\\\ true
|
||||
|
||||
## Test
|
||||
|
||||
|
@ -73,9 +75,10 @@ To run the test suite:
|
|||
|
||||
$ mix test
|
||||
|
||||
## TODO
|
||||
* [ ] Extract a author
|
||||
## Todo
|
||||
* [ ] Extract authors
|
||||
* [ ] Extract Images
|
||||
* [ ] Extract Videos
|
||||
* [ ] Convert relative paths into absolute paths of `img#src` and `a#href`
|
||||
* [ ] More configurable
|
||||
* [ ] Command line interface
|
||||
|
|
|
@ -7,17 +7,17 @@ defmodule Readability do
|
|||
```elixir
|
||||
@type html :: binary
|
||||
|
||||
# extract title
|
||||
# Extract title
|
||||
Readability.title(html)
|
||||
|
||||
# extract only text from content
|
||||
content = html
|
||||
|> Readability.content
|
||||
# Extract only text from article
|
||||
article = html
|
||||
|> Readability.article
|
||||
|> Readability.readable_text
|
||||
|
||||
# extract content with transformed html
|
||||
content = html
|
||||
|> Readability.content
|
||||
# Extract article with transformed html
|
||||
article = html
|
||||
|> Readability.article
|
||||
|> Readability.raw_html
|
||||
```
|
||||
"""
|
||||
|
@ -52,21 +52,59 @@ defmodule Readability do
|
|||
@type html_tree :: tuple | list
|
||||
@type options :: list
|
||||
|
||||
@doc """
|
||||
Extract title
|
||||
|
||||
## Example
|
||||
|
||||
iex> title = Readability.title(html_str)
|
||||
"Some title in html"
|
||||
"""
|
||||
@spec title(binary) :: binary
|
||||
def title(html) when is_binary(html), do: html |> parse |> title
|
||||
def title(html_tree), do: TitleFinder.title(html_tree)
|
||||
|
||||
@doc """
|
||||
Using a variety of metrics (content score, classname, element types), find the content that is
|
||||
most likely to be the stuff a user wants to read
|
||||
|
||||
## Example
|
||||
|
||||
iex> article_tree = Readability.article(html_str)
|
||||
# returns the article as an html tree (tuple)
|
||||
|
||||
"""
|
||||
@spec content(binary, options) :: binary
|
||||
def content(raw_html, opts \\ []) do
|
||||
@spec article(binary, options) :: html_tree
|
||||
def article(raw_html, opts \\ []) do
|
||||
opts = Keyword.merge(@default_options, opts)
|
||||
raw_html
|
||||
|> parse
|
||||
|> ArticleBuilder.build(opts)
|
||||
end
|
||||
|
||||
|
||||
@doc """
|
||||
return raw html binary from html_tree
|
||||
"""
|
||||
@spec raw_html(html_tree) :: binary
|
||||
def raw_html(html_tree) do
|
||||
html_tree |> Floki.raw_html
|
||||
end
|
||||
|
||||
@doc """
|
||||
return only text binary from html_tree
|
||||
"""
|
||||
@spec readable_text(html_tree) :: binary
|
||||
def readable_text(html_tree) do
|
||||
# TODO: Remove image caption when extract only text
|
||||
tags_to_br = ~r/<\/(p|div|article|h\d)/i
|
||||
html_str = html_tree |> raw_html
|
||||
Regex.replace(tags_to_br, html_str, &("\n#{&1}"))
|
||||
|> Floki.parse
|
||||
|> Floki.text
|
||||
|> String.strip
|
||||
end
|
||||
|
||||
@doc """
|
||||
Normalize and parse binary html into an html tree (tuple or list)
|
||||
"""
|
||||
|
@ -80,28 +118,6 @@ defmodule Readability do
|
|||
|> Floki.filter_out(:comment)
|
||||
end
|
||||
|
||||
@doc """
|
||||
return raw html binary from html tree tuple
|
||||
"""
|
||||
@spec raw_html(html_tree) :: binary
|
||||
def raw_html(html_tree) do
|
||||
html_tree |> Floki.raw_html
|
||||
end
|
||||
|
||||
@doc """
|
||||
return only text binary from html tree tuple
|
||||
"""
|
||||
@spec raw_html(html_tree) :: binary
|
||||
def readable_text(html_tree) do
|
||||
# TODO: Remove image caption when extract only text
|
||||
tags_to_br = ~r/<\/(p|div|article|h\d)/i
|
||||
html_str = html_tree |> raw_html
|
||||
Regex.replace(tags_to_br, html_str, &("\n#{&1}"))
|
||||
|> Floki.parse
|
||||
|> Floki.text
|
||||
|> String.strip
|
||||
end
|
||||
|
||||
def regexes, do: @regexes
|
||||
|
||||
def default_options, do: @default_options
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
defmodule Readability.ArticleBuilder do
|
||||
@moduledoc """
|
||||
build article for readability
|
||||
Build article for readability
|
||||
"""
|
||||
|
||||
alias Readability.Helper
|
||||
|
|
|
@ -21,6 +21,9 @@ defmodule Readability.Helper do
|
|||
{tag_name, attrs, change_tag(html_tree, selector, tag)}
|
||||
end
|
||||
|
||||
@doc """
|
||||
Remove html attributes
|
||||
"""
|
||||
@spec remove_attrs(html_tree, String.t | [String.t] | Regex.t) :: html_tree
|
||||
def remove_attrs(content, _) when is_binary(content), do: content
|
||||
def remove_attrs([], _), do: []
|
||||
|
@ -65,7 +68,7 @@ defmodule Readability.Helper do
|
|||
end
|
||||
|
||||
@doc """
|
||||
count only text length
|
||||
Count only text length
|
||||
"""
|
||||
@spec text_length(html_tree) :: number
|
||||
def text_length(html_tree) do
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
defmodule Readability.TitleFinder do
|
||||
@moduledoc """
|
||||
The TitleFinder engine traverse the HTML tree searching for finding title.
|
||||
The TitleFinder engine traverses the HTML tree searching for the title.
|
||||
"""
|
||||
|
||||
@title_suffix ~r/(\-)|(\:\:)|(\|)/
|
||||
|
|
15
mix.exs
15
mix.exs
|
@ -2,13 +2,18 @@ defmodule Readability.Mixfile do
|
|||
@moduledoc """
|
||||
"""
|
||||
|
||||
@version "0.3.1"
|
||||
@description """
|
||||
Readability library for extracting and curating articles.
|
||||
"""
|
||||
|
||||
use Mix.Project
|
||||
|
||||
def project do
|
||||
[app: :readability,
|
||||
version: "0.3.1",
|
||||
version: @version,
|
||||
elixir: "~> 1.2",
|
||||
description: description,
|
||||
description: @description,
|
||||
package: package,
|
||||
build_embedded: Mix.env == :prod,
|
||||
start_permanent: Mix.env == :prod,
|
||||
|
@ -42,12 +47,6 @@ defmodule Readability.Mixfile do
|
|||
]
|
||||
end
|
||||
|
||||
defp description do
|
||||
"""
|
||||
Readability library for extracting and curating articles.
|
||||
"""
|
||||
end
|
||||
|
||||
defp package do
|
||||
[files: ["lib", "mix.exs", "README*", "LICENSE*", "doc"],
|
||||
maintainers: ["Jaehyun Shin"],
|
||||
|
|
|
@ -4,7 +4,7 @@ defmodule ReadabilityTest do
|
|||
test "readability for NY Times" do
|
||||
html = TestHelper.read_fixture("nytimes.html")
|
||||
opts = [clean_conditionally: false]
|
||||
nytimes = Readability.content(html, opts)
|
||||
nytimes = Readability.article(html, opts)
|
||||
|
||||
nytimes_html = Readability.raw_html(nytimes)
|
||||
assert nytimes_html =~ ~r/^<div><div class=\"story-body\">/
|
||||
|
@ -17,7 +17,7 @@ defmodule ReadabilityTest do
|
|||
|
||||
test "readability for BBC" do
|
||||
html = TestHelper.read_fixture("bbc.html")
|
||||
bbc = Readability.content(html)
|
||||
bbc = Readability.article(html)
|
||||
|
||||
bbc_html = Readability.raw_html(bbc)
|
||||
|
||||
|
@ -32,7 +32,7 @@ defmodule ReadabilityTest do
|
|||
|
||||
test "readability for medium" do
|
||||
html = TestHelper.read_fixture("medium.html")
|
||||
medium = Readability.content(html)
|
||||
medium = Readability.article(html)
|
||||
|
||||
medium_html = Readability.raw_html(medium)
|
||||
|
||||
|
@ -47,7 +47,7 @@ defmodule ReadabilityTest do
|
|||
|
||||
test "readability for buzzfeed" do
|
||||
html = TestHelper.read_fixture("buzzfeed.html")
|
||||
buzzfeed = Readability.content(html)
|
||||
buzzfeed = Readability.article(html)
|
||||
|
||||
buzzfeed_html = Readability.raw_html(buzzfeed)
|
||||
|
||||
|
@ -59,10 +59,4 @@ defmodule ReadabilityTest do
|
|||
assert buzzfeed_text =~ ~r/^The FBI no longer needs Apple’s help/
|
||||
assert buzzfeed_text =~ ~r/issue of court orders and encrypted devices.$/
|
||||
end
|
||||
|
||||
test "readability elixir blog" do
|
||||
html = TestHelper.read_fixture("elixir.html")
|
||||
html = Readability.content(html)
|
||||
IO.inspect Readability.readable_text(html)
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue