From 23db20bbf03b218a895c9b1ce989da2193ef610c Mon Sep 17 00:00:00 2001 From: keepcosmos Date: Sun, 24 Apr 2016 18:40:35 +0900 Subject: [PATCH] add document --- CANGELOG.md | 5 ++ README.md | 39 ++++++++------- lib/readability.ex | 78 ++++++++++++++++++------------ lib/readability/article_builder.ex | 2 +- lib/readability/helper.ex | 5 +- lib/readability/title_finder.ex | 2 +- mix.exs | 15 +++--- test/readability_test.exs | 14 ++---- 8 files changed, 90 insertions(+), 70 deletions(-) create mode 100644 CANGELOG.md diff --git a/CANGELOG.md b/CANGELOG.md new file mode 100644 index 0000000..01d1c0e --- /dev/null +++ b/CANGELOG.md @@ -0,0 +1,5 @@ +# Change log + +## [0.3.0] - 2016.04.24 + +- Release!! diff --git a/README.md b/README.md index b97b6be..087b404 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![Build Status](https://travis-ci.org/keepcosmos/readability.svg?branch=master)](https://travis-ci.org/keepcosmos/readability) [![Readability version](https://img.shields.io/hexpm/v/readability.svg)](https://hex.pm/packages/readability) -Readability library for extracting and curating articles. +Readability is Elixir library for extracting and curating articles. Check out The [Documentation](https://hexdocs.pm/readability/Readability.html) for full and detailed guides ## Installation @@ -29,7 +29,7 @@ If [available in Hex](https://hex.pm/docs/publish), the package can be installed ## Usage To parse document, you must prepare html string. -The below example below, `html` variable is the html code of page from [Elixir Design Goals](http://elixir-lang.org/blog/2013/08/08/elixir-design-goals/) +The example below, `html` variable is the html source from [Elixir Design Goals](http://elixir-lang.org/blog/2013/08/08/elixir-design-goals/) ### Examples ```elixir @@ -39,33 +39,35 @@ Readability.title(html) #=> Elixir Design Goals ### Extract the content with transformed html. -content = Readability.content(html) -Readability.raw_html(content) +html +|> Readability.article +|> Readability.raw_html #=> #

During the last year, # ... -# ... -# or check out our sidebar for other learning resources.

+# ... out our sidebar for other learning resources.

### Extract the text only content. -Readability.readable_text(content) +html +|> Readability.article +|> Readability.readable_text + #=> # During the last year, we have spoken at many conferences spreading the word about Elixir. We usually s..... # ... -# ... -# started guide, or check out our sidebar for other learning resources. +# ... started guide, or check out our sidebar for other learning resources. ``` ### Options -You may provide options(Keyword type) to `Readability.content`, including: +You may provide options(Keyword type) to `Readability.article`, including: -* retry_length: 250(default), -* min_text_length: 25(default), -* remove_unlikely_candidates: true(default), -* weight_classes: true(default), -* clean_conditionally: true(default), -* remove_empty_nodes: true(default), +* retry_length \\\\ 250 +* min_text_length \\\\ 25 +* remove_unlikely_candidates \\\\ true, +* weight_classes \\\\ true, +* clean_conditionally \\\\ true, +* remove_empty_nodes \\\\ true, ## Test @@ -73,9 +75,10 @@ To run the test suite: $ mix test -## TODO -* [ ] Extract a author +## Todo +* [ ] Extract authors * [ ] Extract Images +* [ ] Extract Videos * [ ] Convert relative paths into absolute paths of `img#src` and `a#href` * [ ] More configurable * [ ] Command line interface diff --git a/lib/readability.ex b/lib/readability.ex index ead36fe..ba1d2fd 100644 --- a/lib/readability.ex +++ b/lib/readability.ex @@ -7,17 +7,17 @@ defmodule Readability do ```elixir @type html :: binary - # extract title + # Extract title Readability.title(html) - # extract only text from content - content = html - |> Readability.content + # Extract only text from article + article = html + |> Readability.article |> Readability.readable_text - # extract content with transformed html - content = html - |> Readability.content + # Extract article with transformed html + article = html + |> Readability.article |> Readability.raw_html ``` """ @@ -52,21 +52,59 @@ defmodule Readability do @type html_tree :: tuple | list @type options :: list + @doc """ + Extract title + + ## Example + + iex> title = Readability.title(html_str) + "Some title in html" + """ + @spec title(binary) :: binary def title(html) when is_binary(html), do: html |> parse |> title def title(html_tree), do: TitleFinder.title(html_tree) @doc """ Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff a user wants to read + + ## Example + + iex> article_tree = Redability(html_str) + # returns article that is tuple + """ - @spec content(binary, options) :: binary - def content(raw_html, opts \\ []) do + @spec article(binary, options) :: html_tree + def article(raw_html, opts \\ []) do opts = Keyword.merge(@default_options, opts) raw_html |> parse |> ArticleBuilder.build(opts) end + + @doc """ + return raw html binary from html_tree + """ + @spec raw_html(html_tree) :: binary + def raw_html(html_tree) do + html_tree |> Floki.raw_html + end + + @doc """ + return only text binary from html_tree + """ + @spec raw_html(html_tree) :: binary + def readable_text(html_tree) do + # TODO: Remove image caption when extract only text + tags_to_br = ~r/<\/(p|div|article|h\d)/i + html_str = html_tree |> raw_html + Regex.replace(tags_to_br, html_str, &("\n#{&1}")) + |> Floki.parse + |> Floki.text + |> String.strip + end + @doc """ Normalize and Parse to html tree(tuple or list)) from binary html """ @@ -80,28 +118,6 @@ defmodule Readability do |> Floki.filter_out(:comment) end - @doc """ - return raw html binary from html tree tuple - """ - @spec raw_html(html_tree) :: binary - def raw_html(html_tree) do - html_tree |> Floki.raw_html - end - - @doc """ - return only text binary from html tree tuple - """ - @spec raw_html(html_tree) :: binary - def readable_text(html_tree) do - # TODO: Remove image caption when extract only text - tags_to_br = ~r/<\/(p|div|article|h\d)/i - html_str = html_tree |> raw_html - Regex.replace(tags_to_br, html_str, &("\n#{&1}")) - |> Floki.parse - |> Floki.text - |> String.strip - end - def regexes, do: @regexes def default_options, do: @default_options diff --git a/lib/readability/article_builder.ex b/lib/readability/article_builder.ex index e7b6254..4ccc7de 100644 --- a/lib/readability/article_builder.ex +++ b/lib/readability/article_builder.ex @@ -1,6 +1,6 @@ defmodule Readability.ArticleBuilder do @moduledoc """ - build article for readability + Build article for readability """ alias Readability.Helper diff --git a/lib/readability/helper.ex b/lib/readability/helper.ex index 9551da3..fe9dd3f 100644 --- a/lib/readability/helper.ex +++ b/lib/readability/helper.ex @@ -21,6 +21,9 @@ defmodule Readability.Helper do {tag_name, attrs, change_tag(html_tree, selector, tag)} end + @doc """ + Remove html attributes + """ @spec remove_attrs(html_tree, String.t | [String.t] | Regex.t) :: html_tree def remove_attrs(content, _) when is_binary(content), do: content def remove_attrs([], _), do: [] @@ -65,7 +68,7 @@ defmodule Readability.Helper do end @doc """ - count only text length + Count only text length """ @spec text_length(html_tree) :: number def text_length(html_tree) do diff --git a/lib/readability/title_finder.ex b/lib/readability/title_finder.ex index f3d2fab..4cb0f64 100644 --- a/lib/readability/title_finder.ex +++ b/lib/readability/title_finder.ex @@ -1,6 +1,6 @@ defmodule Readability.TitleFinder do @moduledoc """ - The TitleFinder engine traverse the HTML tree searching for finding title. + The TitleFinder engine traverses HTML tree searching for finding title. """ @title_suffix ~r/(\-)|(\:\:)|(\|)/ diff --git a/mix.exs b/mix.exs index 4ed76a4..c99b111 100644 --- a/mix.exs +++ b/mix.exs @@ -2,13 +2,18 @@ defmodule Readability.Mixfile do @moduledoc """ """ + @version "0.3.1" + @description """ + Readability library for extracting and curating articles. + """ + use Mix.Project def project do [app: :readability, - version: "0.3.1", + version: @version, elixir: "~> 1.2", - description: description, + description: @description, package: package, build_embedded: Mix.env == :prod, start_permanent: Mix.env == :prod, @@ -42,12 +47,6 @@ defmodule Readability.Mixfile do ] end - defp description do - """ - Readability library for extracting and curating articles. - """ - end - defp package do [files: ["lib", "mix.exs", "README*", "LICENSE*", "doc"], maintainers: ["Jaehyun Shin"], diff --git a/test/readability_test.exs b/test/readability_test.exs index 128b628..79b0712 100644 --- a/test/readability_test.exs +++ b/test/readability_test.exs @@ -4,7 +4,7 @@ defmodule ReadabilityTest do test "readability for NY Times" do html = TestHelper.read_fixture("nytimes.html") opts = [clean_conditionally: false] - nytimes = Readability.content(html, opts) + nytimes = Readability.article(html, opts) nytimes_html = Readability.raw_html(nytimes) assert nytimes_html =~ ~r/^
/ @@ -17,7 +17,7 @@ defmodule ReadabilityTest do test "readability for BBC" do html = TestHelper.read_fixture("bbc.html") - bbc = Readability.content(html) + bbc = Readability.article(html) bbc_html = Readability.raw_html(bbc) @@ -32,7 +32,7 @@ defmodule ReadabilityTest do test "readability for medium" do html = TestHelper.read_fixture("medium.html") - medium = Readability.content(html) + medium = Readability.article(html) medium_html = Readability.raw_html(medium) @@ -47,7 +47,7 @@ defmodule ReadabilityTest do test "readability for buzzfeed" do html = TestHelper.read_fixture("buzzfeed.html") - buzzfeed = Readability.content(html) + buzzfeed = Readability.article(html) buzzfeed_html = Readability.raw_html(buzzfeed) @@ -59,10 +59,4 @@ defmodule ReadabilityTest do assert buzzfeed_text =~ ~r/^The FBI no longer needs Appleā€™s help/ assert buzzfeed_text =~ ~r/issue of court orders and encrypted devices.$/ end - - test "readability elixir blog" do - html = TestHelper.read_fixture("elixir.html") - html = Readability.content(html) - IO.inspect Readability.readable_text(html) - end end