During the last year, +#
Background: I’ve spent... # ... -# ... out our sidebar for other learning resources.
From 4aa8f6eceae05d3b6bd39600dfe4dbc53ecdec34 Mon Sep 17 00:00:00 2001
From: keepcosmos During the last year,
+# Background: I’ve spent...
# ...
-# ... out our sidebar for other learning resources.
]*>[ \n\r\t]*){2,}/i,
replace_fonts: ~r/<(\/?)font[^>]*>/i,
normalize: ~r/\s{2,}/,
- video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i
+ video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
+ protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i
]
@type html_tree :: tuple | list
@@ -60,10 +63,23 @@ defmodule Readability do
iex> title = Readability.title(html_str)
"Some title in html"
"""
- @spec title(binary) :: binary
- def title(html) when is_binary(html), do: html |> parse |> title
+ @spec title(binary | html_tree) :: binary
+ def title(html) when is_binary(html), do: html |> normalize |> title
def title(html_tree), do: TitleFinder.title(html_tree)
+
+ @doc """
+ Extract the list of author names from a raw html binary or a parsed html tree
+
+ ## Example
+
+ iex> authors = Readability.authors(html_str)
+ ["José Valim", "chrismccord"]
+ """
+ @spec authors(binary | html_tree) :: [binary]
+ def authors(html) when is_binary(html), do: html |> parse |> authors
+ def authors(html_tree), do: AuthorFinder.find(html_tree)
+
@doc """
Using a variety of metrics (content score, classname, element types), find the content that is
most likely to be the stuff a user wants to read
@@ -78,23 +94,24 @@ defmodule Readability do
def article(raw_html, opts \\ []) do
opts = Keyword.merge(@default_options, opts)
raw_html
- |> parse
+ |> normalize
|> ArticleBuilder.build(opts)
end
-
@doc """
- return raw html binary from html_tree
+ return the html binary of html_tree with its attributes and tags cleaned
"""
- @spec raw_html(html_tree) :: binary
- def raw_html(html_tree) do
- html_tree |> Floki.raw_html
+ @spec readable_html(html_tree) :: binary
+ def readable_html(html_tree) do
+ html_tree
+ |> Helper.remove_attrs(regexes()[:protect_attrs]) # explicit call parens: `regexes` is a zero-arity function, not a variable
+ |> raw_html
end
@doc """
return only text binary from html_tree
"""
- @spec raw_html(html_tree) :: binary
+ @spec readable_text(html_tree) :: binary
def readable_text(html_tree) do
# TODO: Remove image caption when extract only text
tags_to_br = ~r/<\/(p|div|article|h\d)/i
@@ -105,11 +122,19 @@ defmodule Readability do
|> String.strip
end
+ @doc """
+ return raw html binary from html_tree
+ """
+ @spec raw_html(html_tree) :: binary
+ def raw_html(html_tree) do
+ html_tree |> Floki.raw_html
+ end
+
@doc """
Normalize and Parse to html tree(tuple or list)) from binary html
"""
@spec parse(binary) :: html_tree
- def parse(raw_html) do
+ def normalize(raw_html) do
raw_html
|> String.replace(Readability.regexes[:replace_brs], "
") |> String.replace(Readability.regexes[:replace_fonts], "<\1span>") @@ -118,6 +143,8 @@ defmodule Readability do |> Floki.filter_out(:comment) end + def parse(raw_html) when is_binary(raw_html), do: Floki.parse(raw_html) + def regexes, do: @regexes def default_options, do: @default_options diff --git a/lib/readability/author_finder.ex b/lib/readability/author_finder.ex new file mode 100644 index 0000000..8f9eaa6 --- /dev/null +++ b/lib/readability/author_finder.ex @@ -0,0 +1,38 @@ +defmodule Readability.AuthorFinder do + @moduledoc """ + AuthorFinder extracts authors + """ + + @type html_tree :: tuple | list + + @doc """ + Extract authors + """ + @spec find(html_tree) :: [binary] + def find(html_tree) do + author_names = find_by_meta_tag(html_tree) + split_author_names(author_names) + end + + def find_by_meta_tag(html_tree) do + names = html_tree + |> Floki.find("meta[name*=author], meta[property*=author]") + |> Enum.map(fn(meta) -> + meta + |> Floki.attribute("content") + |> Floki.text + |> String.strip + end) + |> Enum.reject(&(is_nil(&1) || String.length(&1) == 0)) + if length(names) > 0 do + hd(names) + else + nil + end + end + + defp split_author_names(author_name) do + String.split(author_name, ~r/,\s|\sand\s|by\s/i) + |> Enum.reject(&(String.length(&1) == 0)) + end +end diff --git a/lib/readability/sanitizer.ex b/lib/readability/sanitizer.ex index b8eae7a..2efceb5 100644 --- a/lib/readability/sanitizer.ex +++ b/lib/readability/sanitizer.ex @@ -24,8 +24,7 @@ defmodule Readability.Sanitizer do html_tree = html_tree |> Helper.remove_tag(conditionally_cleaing_fn(candidates)) end - - html_tree |> Helper.remove_attrs("style") + html_tree end defp conditionally_cleaing_fn(candidates) do diff --git a/mix.exs b/mix.exs index c99b111..2a3b728 100644 --- a/mix.exs +++ b/mix.exs @@ -2,7 +2,7 @@ defmodule Readability.Mixfile do @moduledoc """ """ - @version "0.3.1" + @version "0.4.0" @description """ Readability library for extracting and curating articles. 
""" @@ -25,7 +25,8 @@ defmodule Readability.Mixfile do # Type "mix help compile.app" for more information def application do [applications: [:logger, - :floki + :floki, + :httpoison ]] end @@ -40,6 +41,7 @@ defmodule Readability.Mixfile do # Type "mix help deps" for more examples and options defp deps do [{:floki, "~> 0.8.0"}, + {:httpoison, "~> 0.8.0"}, {:earmark, "~> 0.1", only: :dev}, {:ex_doc, "~> 0.11", only: :dev}, {:credo, "~> 0.3", only: [:dev, :test]}, diff --git a/test/readability/author_finder_test.exs b/test/readability/author_finder_test.exs new file mode 100644 index 0000000..8a11499 --- /dev/null +++ b/test/readability/author_finder_test.exs @@ -0,0 +1,25 @@ +defmodule Readability.AuthoFinderTest do + use ExUnit.Case, async: true + + alias Readability.AuthorFinder + + test "extracting bbc format author" do + html = TestHelper.read_fixture("bbc.html") + assert AuthorFinder.find(html) == ["BBC News"] + end + + test "extracting buzzfeed format author" do + html = TestHelper.read_fixture("buzzfeed.html") + assert AuthorFinder.find(html) == ["Salvador Hernandez", "Hamza Shaban"] + end + + test "extracting medium format author" do + html = TestHelper.read_fixture("medium.html") + assert AuthorFinder.find(html) == ["Ken Mazaika"] + end + + test "extracting nytimes format author" do + html = TestHelper.read_fixture("nytimes.html") + assert AuthorFinder.find(html) == ["Judith H. Dobrzynski"] + end +end diff --git a/test/readability_test.exs b/test/readability_test.exs index 79b0712..b4b1947 100644 --- a/test/readability_test.exs +++ b/test/readability_test.exs @@ -6,8 +6,8 @@ defmodule ReadabilityTest do opts = [clean_conditionally: false] nytimes = Readability.article(html, opts) - nytimes_html = Readability.raw_html(nytimes) - assert nytimes_html =~ ~r/^