From 93bdf48b8c9a39f35ab26138386b343fb3ef53e1 Mon Sep 17 00:00:00 2001 From: keepcosmos Date: Sat, 7 May 2016 18:23:19 +0900 Subject: [PATCH] add summarize function this closes #4, closes #3 --- CHANGELOG.md | 7 ++- README.md | 79 ++++++++++++++++++++++---------- lib/readability.ex | 48 ++++++++++++------- lib/readability/author_finder.ex | 4 +- lib/readability/helper.ex | 13 ++++++ lib/readability/summary.ex | 3 ++ mix.exs | 2 +- 7 files changed, 113 insertions(+), 43 deletions(-) create mode 100644 lib/readability/summary.ex diff --git a/CHANGELOG.md b/CHANGELOG.md index f66f8cc..f43f287 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,11 +3,16 @@ All notable changes to this project will be documented in this file. This project adheres to [Semantic Versioning](http://semver.org/). +## [0.5.0] - 2016.05.07 + +### Added +- Add `Readability.summarize` function + ## [0.4.0] - 2016.04.28 ### Added - Add author extractor function -- Add `readable_html` function +- Add `Readability.readable_html` function ## [0.3.1] - 2016.04.24 diff --git a/README.md b/README.md index eebd38d..f2c4069 100644 --- a/README.md +++ b/README.md @@ -29,50 +29,74 @@ If [available in Hex](https://hex.pm/docs/publish), the package can be installed ## Usage ### Examples -```elixir -### Get example page. -%{status_code: 200, body: html} = HTTPoison.get!("https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58") -### Extract the title. -Readability.title(html) +#### Just pass url +```elixir +url = "https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58" +summary = Readability.summarize(url) + +summary.title #=> "Why I’m betting on Elixir" -### Extract authors. -Readability.authors(html) +summary.authors #=> ["Ken Mazaika"] - -### Extract the primary content with transformed html. -html -|> Readability.article -|> Readability.readable_html +summary.article_html #=> #

Background: I’ve spent... # ... # ...button!

- -### Extract only text from the primary content. -html -|> Readability.article -|> Readability.readable_text - +summary.article_text #=> # Background: I’ve spent the past 6 years building web applications in Ruby and..... # ... # ... value in this article, it would mean a lot to me if you hit the recommend button! ``` +#### From raw html + +```elixir +### Extract the title. +Readability.title(html) + +### Extract authors. +Readability.authors(html) + +### Extract the primary content with transformed html. +html +|> Readability.article +|> Readability.readable_html + +### Extract only text from the primary content. +html +|> Readability.article +|> Readability.readable_text + +### you can extract the primary images with Floki +html +|> Readability.article +|> Floki.find("img") +|> Floki.attribute("src") +``` + ### Options -You may provide options(Keyword type) to `Readability.article`, including: +If result is different with your expectation, you can add options. + +#### Example +```elixir +url = "https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58" +summary = Readability.summarize(url, [clean_conditionally: false]) +``` -* retry_length \\\\ 250 * min_text_length \\\\ 25 -* remove_unlikely_candidates \\\\ true, -* weight_classes \\\\ true, -* clean_conditionally \\\\ true, -* remove_empty_nodes \\\\ true, +* remove_unlikely_candidates \\\\ true +* weight_classes \\\\ true +* clean_conditionally \\\\ true +* retry_length \\\\ 250 + +**You can find other algorithm and regex options in `readability.ex`** ## Test @@ -81,10 +105,17 @@ To run the test suite: $ mix test ## Todo +<<<<<<< HEAD **Contributions are welcome!** Check out [the main features milestone](https://github.com/keepcosmos/readability/milestones) +======= +* [x] Extract authors +* [x] More configurable +* [x] Summarize function +* [ ] Convert relative paths into absolute paths of `img#src` and `a#href` +>>>>>>> cb86b7a... add summarize function ## Related and Inpired Projects diff --git a/lib/readability.ex b/lib/readability.ex index 8030cb7..5687c98 100644 --- a/lib/readability.ex +++ b/lib/readability.ex @@ -7,6 +7,9 @@ defmodule Readability do ```elixir @type html :: binary + # Just pass url + %Readability.Summary{title: title, authors: authors, article_html: article} = Readability.summarize(url) + # Extract title Readability.title(html) @@ -28,6 +31,7 @@ defmodule Readability do alias Readability.TitleFinder alias Readability.AuthorFinder alias Readability.ArticleBuilder + alias Readability.Summary alias Readability.Helper @default_options [retry_length: 250, @@ -40,7 +44,8 @@ defmodule Readability do min_image_height: 80, ignore_image_format: [], blacklist: nil, - whitelist: nil + whitelist: nil, + page_url: nil ] @regexes [unlikely_candidate: ~r/combx|comment|community|disqus|extra|foot|header|hidden|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i, @@ -56,8 +61,28 @@ defmodule Readability do ] @type html_tree :: tuple | list + @type raw_html :: binary + @type url :: binary @type options :: list + @doc """ + summarize the primary readable content of a webpage. + """ + @spec summarize(url, options) :: Summary.t + def summarize(url, opts \\ []) do + opts = Keyword.merge(opts, [page_url: url]) + %{status_code: _, body: raw_html} = HTTPoison.get!(url) + html_tree = Helper.normalize(raw_html) + article_tree = html_tree + |> ArticleBuilder.build(opts) + + %Summary{title: title(html_tree), + authors: authors(html_tree), + article_html: readable_html(article_tree), + article_text: readable_text(article_tree) + } + end + @doc """ Extract title @@ -67,7 +92,11 @@ defmodule Readability do "Some title in html" """ @spec title(binary | html_tree) :: binary - def title(html) when is_binary(html), do: html |> normalize |> title + def title(raw_html) when is_binary(raw_html) do + raw_html + |> Helper.normalize + |> title + end def title(html_tree), do: TitleFinder.title(html_tree) @@ -97,7 +126,7 @@ defmodule Readability do def article(raw_html, opts \\ []) do opts = Keyword.merge(@default_options, opts) raw_html - |> normalize + |> Helper.normalize |> ArticleBuilder.build(opts) end @@ -133,19 +162,6 @@ defmodule Readability do html_tree |> Floki.raw_html end - @doc """ - Normalize and Parse to html tree(tuple or list)) from binary html - """ - @spec parse(binary) :: html_tree - def normalize(raw_html) do - raw_html - |> String.replace(Readability.regexes[:replace_brs], "

") - |> String.replace(Readability.regexes[:replace_fonts], "<\1span>") - |> String.replace(Readability.regexes[:normalize], " ") - |> Floki.parse - |> Floki.filter_out(:comment) - end - def parse(raw_html) when is_binary(raw_html), do: Floki.parse(raw_html) def regexes, do: @regexes diff --git a/lib/readability/author_finder.ex b/lib/readability/author_finder.ex index 8f9eaa6..3fb366c 100644 --- a/lib/readability/author_finder.ex +++ b/lib/readability/author_finder.ex @@ -11,7 +11,9 @@ defmodule Readability.AuthorFinder do @spec find(html_tree) :: [binary] def find(html_tree) do author_names = find_by_meta_tag(html_tree) - split_author_names(author_names) + if author_names do + split_author_names(author_names) + end end def find_by_meta_tag(html_tree) do diff --git a/lib/readability/helper.ex b/lib/readability/helper.ex index fe9dd3f..604be68 100644 --- a/lib/readability/helper.ex +++ b/lib/readability/helper.ex @@ -86,6 +86,19 @@ defmodule Readability.Helper do end) end + @doc """ + Normalize and Parse to html tree(tuple or list)) from binary html + """ + @spec normalize(binary) :: html_tree + def normalize(raw_html) do + raw_html + |> String.replace(Readability.regexes[:replace_brs], "

") + |> String.replace(Readability.regexes[:replace_fonts], "<\1span>") + |> String.replace(Readability.regexes[:normalize], " ") + |> Floki.parse + |> Floki.filter_out(:comment) + end + defp candidates_selector do ["p", "td"] |> Enum.map(fn(s) -> diff --git a/lib/readability/summary.ex b/lib/readability/summary.ex new file mode 100644 index 0000000..e46f6f5 --- /dev/null +++ b/lib/readability/summary.ex @@ -0,0 +1,3 @@ +defmodule Readability.Summary do + defstruct title: nil, authors: [], article_html: nil, article_text: nil +end diff --git a/mix.exs b/mix.exs index 2a3b728..74bc285 100644 --- a/mix.exs +++ b/mix.exs @@ -2,7 +2,7 @@ defmodule Readability.Mixfile do @moduledoc """ """ - @version "0.4.0" + @version "0.5.0" @description """ Readability library for extracting and curating articles. """