From 4aa8f6eceae05d3b6bd39600dfe4dbc53ecdec34 Mon Sep 17 00:00:00 2001 From: keepcosmos Date: Thu, 28 Apr 2016 15:13:03 +0900 Subject: [PATCH] add authors finder --- CANGELOG.md | 5 --- CHANGELOG.md | 14 +++++++ README.md | 28 ++++++++------ lib/readability.ex | 49 +++++++++++++++++++------ lib/readability/author_finder.ex | 38 +++++++++++++++++++ lib/readability/sanitizer.ex | 3 +- mix.exs | 6 ++- test/readability/author_finder_test.exs | 25 +++++++++++++ test/readability_test.exs | 16 ++++---- 9 files changed, 145 insertions(+), 39 deletions(-) delete mode 100644 CANGELOG.md create mode 100644 CHANGELOG.md create mode 100644 lib/readability/author_finder.ex create mode 100644 test/readability/author_finder_test.exs diff --git a/CANGELOG.md b/CANGELOG.md deleted file mode 100644 index 01d1c0e..0000000 --- a/CANGELOG.md +++ /dev/null @@ -1,5 +0,0 @@ -# Change log - -## [0.3.0] - 2016.04.24 - -- Release!! diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..f66f8cc --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,14 @@ +# Change log + +All notable changes to this project will be documented in this file. +This project adheres to [Semantic Versioning](http://semver.org/). + +## [0.4.0] - 2016.04.28 + +### Added +- Add author extractor function +- Add `readable_html` function + +## [0.3.1] - 2016.04.24 + +- Release!! diff --git a/README.md b/README.md index 83aa9ff..cf584df 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ If [available in Hex](https://hex.pm/docs/publish), the package can be installed ```elixir def deps do - [{:readability, "~> 0.3"}] + [{:readability, "~> 0.4"}] end ``` @@ -28,23 +28,29 @@ If [available in Hex](https://hex.pm/docs/publish), the package can be installed ## Usage -The example below, `html` variable is the html source from blog content "[Elixir Design Goals](http://elixir-lang.org/blog/2013/08/08/elixir-design-goals/)". - ### Examples ```elixir +### Get example page. +%{status_code: 200, body: html} = HTTPoison.get!("https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58") -### Extract the title +### Extract the title. Readability.title(html) -#=> Elixir Design Goals +#=> "Why I’m betting on Elixir" + +### Extract authors. +Readability.authors(html) +#=> ["Ken Mazaika"] + ### Extract the primary content with transformed html. html |> Readability.article -|> Readability.raw_html +|> Readability.readable_html #=> -#

During the last year, +#

Background: I’ve spent... # ... -# ... out our sidebar for other learning resources.

+# ...button!
+ ### Extract only text from the primary content. html @@ -52,9 +58,9 @@ html |> Readability.readable_text #=> -# During the last year, we have spoken at many conferences spreading the word about Elixir. We usually s..... +# Background: I’ve spent the past 6 years building web applications in Ruby and..... # ... -# ... started guide, or check out our sidebar for other learning resources. +# ... value in this article, it would mean a lot to me if you hit the recommend button! ``` ### Options @@ -75,7 +81,7 @@ To run the test suite: $ mix test ## Todo -* [ ] Extract authors +* [x] Extract authors * [ ] Extract Images * [ ] Extract Videos * [ ] Convert relative paths into absolute paths of `img#src` and `a#href` diff --git a/lib/readability.ex b/lib/readability.ex index ba1d2fd..6b73cb3 100644 --- a/lib/readability.ex +++ b/lib/readability.ex @@ -23,7 +23,9 @@ defmodule Readability do """ alias Readability.TitleFinder + alias Readability.AuthorFinder alias Readability.ArticleBuilder + alias Readability.Helper @default_options [retry_length: 250, min_text_length: 25, @@ -46,7 +48,8 @@ defmodule Readability do replace_brs: ~r/(]*>[ \n\r\t]*){2,}/i, replace_fonts: ~r/<(\/?)font[^>]*>/i, normalize: ~r/\s{2,}/, - video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i + video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i, + protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i ] @type html_tree :: tuple | list @@ -60,10 +63,23 @@ defmodule Readability do iex> title = Readability.title(html_str) "Some title in html" """ - @spec title(binary) :: binary - def title(html) when is_binary(html), do: html |> parse |> title + @spec title(binary | html_tree) :: binary + def title(html) when is_binary(html), do: html |> normalize |> title def title(html_tree), do: TitleFinder.title(html_tree) + + @doc """ + Extract authors + + ## Example + + iex> authors = Readability.authors(html_str) + ["José Valim", "chrismccord"] + """ + @spec authors(binary | html_tree) :: list[binary] + def authors(html) when is_binary(html), do: html |> parse |> authors + def authors(html_tree), do: AuthorFinder.find(html_tree) + @doc """ Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff a user wants to read @@ -78,23 +94,24 @@ defmodule Readability do def article(raw_html, opts \\ []) do opts = Keyword.merge(@default_options, opts) raw_html - |> parse + |> normalize |> ArticleBuilder.build(opts) end - @doc """ - return raw html binary from html_tree + return attributes, tags cleaned html """ - @spec raw_html(html_tree) :: binary - def raw_html(html_tree) do - html_tree |> Floki.raw_html + @spec readable_html(html_tree) :: binary + def readable_html(html_tree) do + html_tree + |> Helper.remove_attrs(regexes[:protect_attrs]) + |> raw_html end @doc """ return only text binary from html_tree """ - @spec raw_html(html_tree) :: binary + @spec readable_text(html_tree) :: binary def readable_text(html_tree) do # TODO: Remove image caption when extract only text tags_to_br = ~r/<\/(p|div|article|h\d)/i @@ -105,11 +122,19 @@ defmodule Readability do |> String.strip end + @doc """ + return raw html binary from html_tree + """ + @spec raw_html(html_tree) :: binary + def raw_html(html_tree) do + html_tree |> Floki.raw_html + end + @doc """ Normalize and Parse to html tree(tuple or list)) from binary html """ @spec parse(binary) :: html_tree - def parse(raw_html) do + def normalize(raw_html) do raw_html |> String.replace(Readability.regexes[:replace_brs], "

") |> String.replace(Readability.regexes[:replace_fonts], "<\1span>") @@ -118,6 +143,8 @@ defmodule Readability do |> Floki.filter_out(:comment) end + def parse(raw_html) when is_binary(raw_html), do: Floki.parse(raw_html) + def regexes, do: @regexes def default_options, do: @default_options diff --git a/lib/readability/author_finder.ex b/lib/readability/author_finder.ex new file mode 100644 index 0000000..8f9eaa6 --- /dev/null +++ b/lib/readability/author_finder.ex @@ -0,0 +1,38 @@ +defmodule Readability.AuthorFinder do + @moduledoc """ + AuthorFinder extracts authors + """ + + @type html_tree :: tuple | list + + @doc """ + Extract authors + """ + @spec find(html_tree) :: [binary] + def find(html_tree) do + author_names = find_by_meta_tag(html_tree) + split_author_names(author_names) + end + + def find_by_meta_tag(html_tree) do + names = html_tree + |> Floki.find("meta[name*=author], meta[property*=author]") + |> Enum.map(fn(meta) -> + meta + |> Floki.attribute("content") + |> Floki.text + |> String.strip + end) + |> Enum.reject(&(is_nil(&1) || String.length(&1) == 0)) + if length(names) > 0 do + hd(names) + else + nil + end + end + + defp split_author_names(author_name) do + String.split(author_name, ~r/,\s|\sand\s|by\s/i) + |> Enum.reject(&(String.length(&1) == 0)) + end +end diff --git a/lib/readability/sanitizer.ex b/lib/readability/sanitizer.ex index b8eae7a..2efceb5 100644 --- a/lib/readability/sanitizer.ex +++ b/lib/readability/sanitizer.ex @@ -24,8 +24,7 @@ defmodule Readability.Sanitizer do html_tree = html_tree |> Helper.remove_tag(conditionally_cleaing_fn(candidates)) end - - html_tree |> Helper.remove_attrs("style") + html_tree end defp conditionally_cleaing_fn(candidates) do diff --git a/mix.exs b/mix.exs index c99b111..2a3b728 100644 --- a/mix.exs +++ b/mix.exs @@ -2,7 +2,7 @@ defmodule Readability.Mixfile do @moduledoc """ """ - @version "0.3.1" + @version "0.4.0" @description """ Readability library for extracting and curating articles. """ @@ -25,7 +25,8 @@ defmodule Readability.Mixfile do # Type "mix help compile.app" for more information def application do [applications: [:logger, - :floki + :floki, + :httpoison ]] end @@ -40,6 +41,7 @@ defmodule Readability.Mixfile do # Type "mix help deps" for more examples and options defp deps do [{:floki, "~> 0.8.0"}, + {:httpoison, "~> 0.8.0"}, {:earmark, "~> 0.1", only: :dev}, {:ex_doc, "~> 0.11", only: :dev}, {:credo, "~> 0.3", only: [:dev, :test]}, diff --git a/test/readability/author_finder_test.exs b/test/readability/author_finder_test.exs new file mode 100644 index 0000000..8a11499 --- /dev/null +++ b/test/readability/author_finder_test.exs @@ -0,0 +1,25 @@ +defmodule Readability.AuthoFinderTest do + use ExUnit.Case, async: true + + alias Readability.AuthorFinder + + test "extracting bbc format author" do + html = TestHelper.read_fixture("bbc.html") + assert AuthorFinder.find(html) == ["BBC News"] + end + + test "extracting buzzfeed format author" do + html = TestHelper.read_fixture("buzzfeed.html") + assert AuthorFinder.find(html) == ["Salvador Hernandez", "Hamza Shaban"] + end + + test "extracting medium format author" do + html = TestHelper.read_fixture("medium.html") + assert AuthorFinder.find(html) == ["Ken Mazaika"] + end + + test "extracting nytimes format author" do + html = TestHelper.read_fixture("nytimes.html") + assert AuthorFinder.find(html) == ["Judith H. Dobrzynski"] + end +end diff --git a/test/readability_test.exs b/test/readability_test.exs index 79b0712..b4b1947 100644 --- a/test/readability_test.exs +++ b/test/readability_test.exs @@ -6,8 +6,8 @@ defmodule ReadabilityTest do opts = [clean_conditionally: false] nytimes = Readability.article(html, opts) - nytimes_html = Readability.raw_html(nytimes) - assert nytimes_html =~ ~r/^

/ + nytimes_html = Readability.readable_html(nytimes) + assert nytimes_html =~ ~r/^
<\/div><\/div>$/ nytimes_text = Readability.readable_text(nytimes) @@ -19,9 +19,9 @@ defmodule ReadabilityTest do html = TestHelper.read_fixture("bbc.html") bbc = Readability.article(html) - bbc_html = Readability.raw_html(bbc) + bbc_html = Readability.readable_html(bbc) - assert bbc_html =~ ~r/^
/ + assert bbc_html =~ ~r/^
\"A<\/div><\/div>$/ bbc_text = Readability.readable_text(bbc) @@ -34,9 +34,9 @@ defmodule ReadabilityTest do html = TestHelper.read_fixture("medium.html") medium = Readability.article(html) - medium_html = Readability.raw_html(medium) + medium_html = Readability.readable_html(medium) - assert medium_html =~ ~r/^
/ + assert medium_html =~ ~r/^

Background:/ assert medium_html =~ ~r/recommend button!<\/em><\/h3><\/div><\/div>$/ medium_text = Readability.readable_text(medium) @@ -49,9 +49,9 @@ defmodule ReadabilityTest do html = TestHelper.read_fixture("buzzfeed.html") buzzfeed = Readability.article(html) - buzzfeed_html = Readability.raw_html(buzzfeed) + buzzfeed_html = Readability.readable_html(buzzfeed) - assert buzzfeed_html =~ ~r/^

/ + assert buzzfeed_html =~ ~r/^

The FBI no longer needs Apple’s help/ assert buzzfeed_html =~ ~r/encrypted devices.<\/p><\/div><\/div>$/ buzzfeed_text = Readability.readable_text(buzzfeed)