From 4aa8f6eceae05d3b6bd39600dfe4dbc53ecdec34 Mon Sep 17 00:00:00 2001
From: keepcosmos <keepcosmos@gmail.com>
Date: Thu, 28 Apr 2016 15:13:03 +0900
Subject: [PATCH] add authors finder

---
 CANGELOG.md                             |  5 ---
 CHANGELOG.md                            | 14 +++++++
 README.md                               | 28 ++++++++------
 lib/readability.ex                      | 49 +++++++++++++++++++------
 lib/readability/author_finder.ex        | 38 +++++++++++++++++++
 lib/readability/sanitizer.ex            |  3 +-
 mix.exs                                 |  6 ++-
 test/readability/author_finder_test.exs | 25 +++++++++++++
 test/readability_test.exs               | 16 ++++----
 9 files changed, 145 insertions(+), 39 deletions(-)
 delete mode 100644 CANGELOG.md
 create mode 100644 CHANGELOG.md
 create mode 100644 lib/readability/author_finder.ex
 create mode 100644 test/readability/author_finder_test.exs
diff --git a/CANGELOG.md b/CANGELOG.md
deleted file mode 100644
index 01d1c0e..0000000
--- a/CANGELOG.md
+++ /dev/null
@@ -1,5 +0,0 @@
-# Change log
-
-## [0.3.0] - 2016.04.24
-
-- Release!!
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..f66f8cc
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,14 @@
+# Change log
+
+All notable changes to this project will be documented in this file.
+This project adheres to [Semantic Versioning](http://semver.org/).
+
+## [0.4.0] - 2016.04.28
+
+### Added
+- Add author extractor function
+- Add `readable_html` function
+
+## [0.3.1] - 2016.04.24
+
+- Release!!
diff --git a/README.md b/README.md
index 83aa9ff..cf584df 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ If [available in Hex](https://hex.pm/docs/publish), the package can be installed
 
     ```elixir
     def deps do
-      [{:readability, "~> 0.3"}]
+      [{:readability, "~> 0.4"}]
     end
     ```
 
@@ -28,23 +28,29 @@ If [available in Hex](https://hex.pm/docs/publish), the package can be installed
 
 ## Usage
 
-The example below, `html` variable is the html source from blog content "[Elixir Design Goals](http://elixir-lang.org/blog/2013/08/08/elixir-design-goals/)".
-
 ### Examples
 ```elixir
+### Get example page.
+%{status_code: 200, body: html} = HTTPoison.get!("https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58")
 
-### Extract the title
+### Extract the title.
 Readability.title(html)
-#=> Elixir Design Goals
+#=> "Why I’m betting on Elixir"
+
+### Extract authors.
+Readability.authors(html)
+#=> ["Ken Mazaika"]
+
 
 ### Extract the primary content with transformed html.
 html
 |> Readability.article
-|> Readability.raw_html
+|> Readability.readable_html
 #=>
-# <div><div class=\"entry-content\"><p>During the last year,
+# <div><div><p id=\"3476\"><strong><em>Background: </em></strong><em>I’ve spent...
 # ...
-# ... out our sidebar for other learning resources.</p></div></div>
+# ...button!</em></h3></div></div>
+
 
 ### Extract only text from the primary content.
 html
@@ -52,9 +58,9 @@ html
 |> Readability.readable_text
 
 #=>
-# During the last year, we have spoken at many conferences spreading the word about Elixir. We usually s.....
+# Background: I’ve spent the past 6 years building web applications in Ruby and.....
 # ...
-# ... started guide, or check out our sidebar for other learning resources.
+# ... value in this article, it would mean a lot to me if you hit the recommend button!
 ```
 
 ### Options
@@ -75,7 +81,7 @@ To run the test suite:
     $ mix test
 
 ## Todo
-* [ ] Extract authors
+* [x] Extract authors
 * [ ] Extract Images
 * [ ] Extract Videos
 * [ ] Convert relative paths into absolute paths of `img#src` and `a#href`
diff --git a/lib/readability.ex b/lib/readability.ex
index ba1d2fd..6b73cb3 100644
--- a/lib/readability.ex
+++ b/lib/readability.ex
@@ -23,7 +23,9 @@ defmodule Readability do
   """
 
   alias Readability.TitleFinder
+  alias Readability.AuthorFinder
   alias Readability.ArticleBuilder
+  alias Readability.Helper
 
   @default_options [retry_length: 250,
                     min_text_length: 25,
@@ -46,7 +48,8 @@ defmodule Readability do
             replace_brs: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i,
             replace_fonts: ~r/<(\/?)font[^>]*>/i,
             normalize: ~r/\s{2,}/,
-            video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i
+            video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
+            protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i
            ]
 
   @type html_tree :: tuple | list
@@ -60,10 +63,23 @@ defmodule Readability do
       iex> title = Readability.title(html_str)
       "Some title in html"
   """
-  @spec title(binary) :: binary
-  def title(html) when is_binary(html), do: html |> parse |> title
+  @spec title(binary | html_tree) :: binary
+  def title(html) when is_binary(html), do: html |> normalize |> title
   def title(html_tree), do: TitleFinder.title(html_tree)
 
+
+  @doc """
+  Extract the author names from raw html or an already parsed html tree.
+
+  ## Example
+
+      iex> authors = Readability.authors(html_str)
+      ["José Valim", "chrismccord"]
+  """
+  @spec authors(binary | html_tree) :: [binary]
+  def authors(html) when is_binary(html), do: html |> parse |> authors
+  def authors(html_tree), do: AuthorFinder.find(html_tree)
+
   @doc """
   Using a variety of metrics (content score, classname, element types), find the content that is
   most likely to be the stuff a user wants to read
@@ -78,23 +94,24 @@ defmodule Readability do
   def article(raw_html, opts \\ []) do
     opts = Keyword.merge(@default_options, opts)
     raw_html
-    |> parse
+    |> normalize
     |> ArticleBuilder.build(opts)
   end
 
-
   @doc """
-  return raw html binary from html_tree
+  return html binary with all attributes removed except the protected ones
   """
-  @spec raw_html(html_tree) :: binary
-  def raw_html(html_tree) do
-    html_tree |> Floki.raw_html
+  @spec readable_html(html_tree) :: binary
+  def readable_html(html_tree) do
+    html_tree
+    |> Helper.remove_attrs(regexes[:protect_attrs])
+    |> raw_html
   end
 
   @doc """
   return only text binary from html_tree
   """
-  @spec raw_html(html_tree) :: binary
+  @spec readable_text(html_tree) :: binary
   def readable_text(html_tree) do
     # TODO: Remove image caption when extract only text
     tags_to_br = ~r/<\/(p|div|article|h\d)/i
@@ -105,11 +122,19 @@ defmodule Readability do
     |> String.strip
   end
 
+  @doc """
+  Serialize the given html_tree back into a raw html binary using Floki.
+  """
+  @spec raw_html(html_tree) :: binary
+  def raw_html(html_tree) do
+    Floki.raw_html(html_tree)
+  end
+
   @doc """
   Normalize and Parse to html tree(tuple or list)) from binary html
   """
   @spec parse(binary) :: html_tree
-  def parse(raw_html) do
+  def normalize(raw_html) do
     raw_html
     |> String.replace(Readability.regexes[:replace_brs], "</p><p>")
     |> String.replace(Readability.regexes[:replace_fonts], "<\1span>")
@@ -118,6 +143,8 @@ defmodule Readability do
     |> Floki.filter_out(:comment)
   end
 
+  def parse(raw_html) when is_binary(raw_html), do: Floki.parse(raw_html)
+
   def regexes, do: @regexes
 
   def default_options, do: @default_options
diff --git a/lib/readability/author_finder.ex b/lib/readability/author_finder.ex
new file mode 100644
index 0000000..8f9eaa6
--- /dev/null
+++ b/lib/readability/author_finder.ex
@@ -0,0 +1,38 @@
+defmodule Readability.AuthorFinder do
+  @moduledoc """
+  AuthorFinder extracts author names from the author meta tags of a page.
+  """
+
+  @type html_tree :: tuple | list
+
+  @doc """
+  Extract authors; returns `[]` when no author meta tag has any content.
+  """
+  @spec find(html_tree) :: [binary]
+  def find(html_tree) do
+    html_tree |> find_by_meta_tag |> split_author_names
+  end
+
+  @doc """
+  Return the first non-empty, stripped author meta `content`, or `nil`.
+  """
+  @spec find_by_meta_tag(html_tree) :: binary | nil
+  def find_by_meta_tag(html_tree) do
+    html_tree
+    |> Floki.find("meta[name*=author], meta[property*=author]")
+    |> Enum.map(fn(meta) ->
+         meta
+         |> Floki.attribute("content")
+         |> Floki.text
+         |> String.strip
+       end)
+    |> Enum.find(&(&1 != ""))
+  end
+
+  defp split_author_names(nil), do: []
+  defp split_author_names(author_name) do
+    # `\bby\s` matches only a standalone "by", so "Bobby Lee" is not split.
+    String.split(author_name, ~r/,\s|\sand\s|\bby\s/i)
+    |> Enum.reject(&(String.length(&1) == 0))
+  end
+end
diff --git a/lib/readability/sanitizer.ex b/lib/readability/sanitizer.ex
index b8eae7a..2efceb5 100644
--- a/lib/readability/sanitizer.ex
+++ b/lib/readability/sanitizer.ex
@@ -24,8 +24,7 @@ defmodule Readability.Sanitizer do
       html_tree = html_tree
                   |> Helper.remove_tag(conditionally_cleaing_fn(candidates))
     end
-
-    html_tree |> Helper.remove_attrs("style")
+    html_tree
   end
 
   defp conditionally_cleaing_fn(candidates) do
diff --git a/mix.exs b/mix.exs
index c99b111..2a3b728 100644
--- a/mix.exs
+++ b/mix.exs
@@ -2,7 +2,7 @@ defmodule Readability.Mixfile do
   @moduledoc """
   """
 
-  @version "0.3.1"
+  @version "0.4.0"
   @description """
   Readability library for extracting and curating articles.
   """
@@ -25,7 +25,8 @@ defmodule Readability.Mixfile do
   # Type "mix help compile.app" for more information
   def application do
     [applications: [:logger,
-                    :floki
+                    :floki,
+                    :httpoison
                    ]]
   end
 
@@ -40,6 +41,7 @@ defmodule Readability.Mixfile do
   # Type "mix help deps" for more examples and options
   defp deps do
     [{:floki, "~> 0.8.0"},
+     {:httpoison, "~> 0.8.0"},
      {:earmark, "~> 0.1", only: :dev},
      {:ex_doc, "~> 0.11", only: :dev},
      {:credo, "~> 0.3", only: [:dev, :test]},
diff --git a/test/readability/author_finder_test.exs b/test/readability/author_finder_test.exs
new file mode 100644
index 0000000..8a11499
--- /dev/null
+++ b/test/readability/author_finder_test.exs
@@ -0,0 +1,25 @@
+defmodule Readability.AuthorFinderTest do
+  use ExUnit.Case, async: true
+
+  alias Readability.AuthorFinder
+
+  test "extracting bbc format author" do
+    html = TestHelper.read_fixture("bbc.html")
+    assert AuthorFinder.find(html) == ["BBC News"]
+  end
+
+  test "extracting buzzfeed format author" do
+    html = TestHelper.read_fixture("buzzfeed.html")
+    assert AuthorFinder.find(html) == ["Salvador Hernandez", "Hamza Shaban"]
+  end
+
+  test "extracting medium format author" do
+    html = TestHelper.read_fixture("medium.html")
+    assert AuthorFinder.find(html) == ["Ken Mazaika"]
+  end
+
+  test "extracting nytimes format author" do
+    html = TestHelper.read_fixture("nytimes.html")
+    assert AuthorFinder.find(html) == ["Judith H. Dobrzynski"]
+  end
+end
diff --git a/test/readability_test.exs b/test/readability_test.exs
index 79b0712..b4b1947 100644
--- a/test/readability_test.exs
+++ b/test/readability_test.exs
@@ -6,8 +6,8 @@ defmodule ReadabilityTest do
     opts = [clean_conditionally: false]
     nytimes = Readability.article(html, opts)
 
-    nytimes_html = Readability.raw_html(nytimes)
-    assert nytimes_html =~ ~r/^<div><div class=\"story-body\">/
+    nytimes_html = Readability.readable_html(nytimes)
+    assert nytimes_html =~ ~r/^<div><div><figure id=\"media-100000004245260\"><div><img src=\"https/
     assert nytimes_html =~ ~r/major priorities.<\/p><\/div><\/div>$/
 
     nytimes_text = Readability.readable_text(nytimes)
@@ -19,9 +19,9 @@ defmodule ReadabilityTest do
     html = TestHelper.read_fixture("bbc.html")
     bbc = Readability.article(html)
 
-    bbc_html = Readability.raw_html(bbc)
+    bbc_html = Readability.readable_html(bbc)
 
-    assert bbc_html =~ ~r/^<div><div class=\"story-body__inner\" property=\"articleBody\">/
+    assert bbc_html =~ ~r/^<div><div><figure><span><img alt=\"A Microsoft logo/
     assert bbc_html =~ ~r/connected computing devices\".<\/p><\/div><\/div>$/
 
     bbc_text = Readability.readable_text(bbc)
@@ -34,9 +34,9 @@ defmodule ReadabilityTest do
     html = TestHelper.read_fixture("medium.html")
     medium = Readability.article(html)
 
-    medium_html = Readability.raw_html(medium)
+    medium_html = Readability.readable_html(medium)
 
-    assert medium_html =~ ~r/^<div><div class=\"section-inner layoutSingleColumn\">/
+    assert medium_html =~ ~r/^<div><div><p id=\"3476\"><strong><em>Background:/
     assert medium_html =~ ~r/recommend button!<\/em><\/h3><\/div><\/div>$/
 
     medium_text = Readability.readable_text(medium)
@@ -49,9 +49,9 @@ defmodule ReadabilityTest do
     html = TestHelper.read_fixture("buzzfeed.html")
     buzzfeed = Readability.article(html)
 
-    buzzfeed_html = Readability.raw_html(buzzfeed)
+    buzzfeed_html = Readability.readable_html(buzzfeed)
 
-    assert buzzfeed_html =~ ~r/^<div><div class=\"buzz_superlist_item_text\"><p>/
+    assert buzzfeed_html =~ ~r/^<div><div><p>The FBI no longer needs Apple’s help/
     assert buzzfeed_html =~ ~r/encrypted devices.<\/p><hr\/><hr\/><hr\/><hr\/><\/div><\/div>$/
 
     buzzfeed_text = Readability.readable_text(buzzfeed)