During the last year, +#
Background: I’ve spent... # ... -# ... out our sidebar for other learning resources.
diff --git a/CANGELOG.md b/CANGELOG.md deleted file mode 100644 index 01d1c0e..0000000 --- a/CANGELOG.md +++ /dev/null @@ -1,5 +0,0 @@ -# Change log - -## [0.3.0] - 2016.04.24 - -- Release!! diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..f66f8cc --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,14 @@ +# Change log + +All notable changes to this project will be documented in this file. +This project adheres to [Semantic Versioning](http://semver.org/). + +## [0.4.0] - 2016.04.28 + +### Added +- Add author extractor function +- Add `readable_html` function + +## [0.3.1] - 2016.04.24 + +- Release!! diff --git a/README.md b/README.md index 83aa9ff..cf584df 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ If [available in Hex](https://hex.pm/docs/publish), the package can be installed ```elixir def deps do - [{:readability, "~> 0.3"}] + [{:readability, "~> 0.4"}] end ``` @@ -28,23 +28,29 @@ If [available in Hex](https://hex.pm/docs/publish), the package can be installed ## Usage -The example below, `html` variable is the html source from blog content "[Elixir Design Goals](http://elixir-lang.org/blog/2013/08/08/elixir-design-goals/)". - ### Examples ```elixir +### Get example page. +%{status_code: 200, body: html} = HTTPoison.get!("https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58") -### Extract the title +### Extract the title. Readability.title(html) -#=> Elixir Design Goals +#=> "Why I’m betting on Elixir" + +### Extract authors. +Readability.authors(html) +#=> ["Ken Mazaika"] + ### Extract the primary content with transformed html. html |> Readability.article -|> Readability.raw_html +|> Readability.readable_html #=> -#
During the last year, +#
Background: I’ve spent... # ... -# ... out our sidebar for other learning resources.
") |> String.replace(Readability.regexes[:replace_fonts], "<\1span>") @@ -118,6 +143,8 @@ defmodule Readability do |> Floki.filter_out(:comment) end + def parse(raw_html) when is_binary(raw_html), do: Floki.parse(raw_html) + def regexes, do: @regexes def default_options, do: @default_options diff --git a/lib/readability/author_finder.ex b/lib/readability/author_finder.ex new file mode 100644 index 0000000..8f9eaa6 --- /dev/null +++ b/lib/readability/author_finder.ex @@ -0,0 +1,38 @@ +defmodule Readability.AuthorFinder do + @moduledoc """ + AuthorFinder extracts authors + """ + + @type html_tree :: tuple | list + + @doc """ + Extract authors + """ + @spec find(html_tree) :: [binary] + def find(html_tree) do + author_names = find_by_meta_tag(html_tree) + split_author_names(author_names) + end + + def find_by_meta_tag(html_tree) do + names = html_tree + |> Floki.find("meta[name*=author], meta[property*=author]") + |> Enum.map(fn(meta) -> + meta + |> Floki.attribute("content") + |> Floki.text + |> String.strip + end) + |> Enum.reject(&(is_nil(&1) || String.length(&1) == 0)) + if length(names) > 0 do + hd(names) + else + nil + end + end + + defp split_author_names(author_name) do + String.split(author_name, ~r/,\s|\sand\s|by\s/i) + |> Enum.reject(&(String.length(&1) == 0)) + end +end diff --git a/lib/readability/sanitizer.ex b/lib/readability/sanitizer.ex index b8eae7a..2efceb5 100644 --- a/lib/readability/sanitizer.ex +++ b/lib/readability/sanitizer.ex @@ -24,8 +24,7 @@ defmodule Readability.Sanitizer do html_tree = html_tree |> Helper.remove_tag(conditionally_cleaing_fn(candidates)) end - - html_tree |> Helper.remove_attrs("style") + html_tree end defp conditionally_cleaing_fn(candidates) do diff --git a/mix.exs b/mix.exs index c99b111..2a3b728 100644 --- a/mix.exs +++ b/mix.exs @@ -2,7 +2,7 @@ defmodule Readability.Mixfile do @moduledoc """ """ - @version "0.3.1" + @version "0.4.0" @description """ Readability library for extracting and curating articles. """ @@ -25,7 +25,8 @@ defmodule Readability.Mixfile do # Type "mix help compile.app" for more information def application do [applications: [:logger, - :floki + :floki, + :httpoison ]] end @@ -40,6 +41,7 @@ defmodule Readability.Mixfile do # Type "mix help deps" for more examples and options defp deps do [{:floki, "~> 0.8.0"}, + {:httpoison, "~> 0.8.0"}, {:earmark, "~> 0.1", only: :dev}, {:ex_doc, "~> 0.11", only: :dev}, {:credo, "~> 0.3", only: [:dev, :test]}, diff --git a/test/readability/author_finder_test.exs b/test/readability/author_finder_test.exs new file mode 100644 index 0000000..8a11499 --- /dev/null +++ b/test/readability/author_finder_test.exs @@ -0,0 +1,25 @@ +defmodule Readability.AuthoFinderTest do + use ExUnit.Case, async: true + + alias Readability.AuthorFinder + + test "extracting bbc format author" do + html = TestHelper.read_fixture("bbc.html") + assert AuthorFinder.find(html) == ["BBC News"] + end + + test "extracting buzzfeed format author" do + html = TestHelper.read_fixture("buzzfeed.html") + assert AuthorFinder.find(html) == ["Salvador Hernandez", "Hamza Shaban"] + end + + test "extracting medium format author" do + html = TestHelper.read_fixture("medium.html") + assert AuthorFinder.find(html) == ["Ken Mazaika"] + end + + test "extracting nytimes format author" do + html = TestHelper.read_fixture("nytimes.html") + assert AuthorFinder.find(html) == ["Judith H. Dobrzynski"] + end +end diff --git a/test/readability_test.exs b/test/readability_test.exs index 79b0712..b4b1947 100644 --- a/test/readability_test.exs +++ b/test/readability_test.exs @@ -6,8 +6,8 @@ defmodule ReadabilityTest do opts = [clean_conditionally: false] nytimes = Readability.article(html, opts) - nytimes_html = Readability.raw_html(nytimes) - assert nytimes_html =~ ~r/^